diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,147387 @@ +{ + "best_metric": 1.3142019510269165, + "best_model_checkpoint": "/export/data/salmasia/tradutor/checkpoints/hf_phi3_lora/checkpoint-19500", + "epoch": 6.254770193041568, + "eval_steps": 500, + "global_step": 21000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002978461996686461, + "grad_norm": 0.48318326473236084, + "learning_rate": 2e-08, + "loss": 1.845, + "step": 1 + }, + { + "epoch": 0.0005956923993372922, + "grad_norm": 0.48029589653015137, + "learning_rate": 4e-08, + "loss": 1.8713, + "step": 2 + }, + { + "epoch": 0.0008935385990059383, + "grad_norm": 0.4785495400428772, + "learning_rate": 6.000000000000001e-08, + "loss": 1.8476, + "step": 3 + }, + { + "epoch": 0.0011913847986745843, + "grad_norm": 0.47286421060562134, + "learning_rate": 8e-08, + "loss": 1.8556, + "step": 4 + }, + { + "epoch": 0.0014892309983432306, + "grad_norm": 0.4699583649635315, + "learning_rate": 1.0000000000000001e-07, + "loss": 1.8643, + "step": 5 + }, + { + "epoch": 0.0017870771980118766, + "grad_norm": 0.48511743545532227, + "learning_rate": 1.2000000000000002e-07, + "loss": 1.8712, + "step": 6 + }, + { + "epoch": 0.002084923397680523, + "grad_norm": 0.46384984254837036, + "learning_rate": 1.4e-07, + "loss": 1.8585, + "step": 7 + }, + { + "epoch": 0.0023827695973491687, + "grad_norm": 0.4602062404155731, + "learning_rate": 1.6e-07, + "loss": 1.8288, + "step": 8 + }, + { + "epoch": 0.002680615797017815, + "grad_norm": 0.47385460138320923, + "learning_rate": 1.8e-07, + "loss": 1.8433, + "step": 9 + }, + { + "epoch": 0.002978461996686461, + "grad_norm": 0.4577183425426483, + "learning_rate": 2.0000000000000002e-07, + "loss": 1.8512, + "step": 10 + }, + { + "epoch": 0.003276308196355107, + "grad_norm": 0.48740354180336, + "learning_rate": 2.2e-07, + "loss": 1.8633, + "step": 11 + }, + { + "epoch": 0.0035741543960237532, + "grad_norm": 0.47859111428260803, + "learning_rate": 2.4000000000000003e-07, + "loss": 1.8782, + "step": 12 + }, + { + "epoch": 0.0038720005956923995, + "grad_norm": 0.4740448594093323, + "learning_rate": 2.6e-07, + "loss": 1.8873, + "step": 13 + }, + { + "epoch": 0.004169846795361046, + "grad_norm": 0.4691798985004425, + "learning_rate": 2.8e-07, + "loss": 1.8568, + "step": 14 + }, + { + "epoch": 0.004467692995029691, + "grad_norm": 0.4607478678226471, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.8595, + "step": 15 + }, + { + "epoch": 0.004765539194698337, + "grad_norm": 0.4853513836860657, + "learning_rate": 3.2e-07, + "loss": 1.8717, + "step": 16 + }, + { + "epoch": 0.005063385394366984, + "grad_norm": 0.4743955433368683, + "learning_rate": 3.4000000000000003e-07, + "loss": 1.8515, + "step": 17 + }, + { + "epoch": 0.00536123159403563, + "grad_norm": 0.4832974076271057, + "learning_rate": 3.6e-07, + "loss": 1.8849, + "step": 18 + }, + { + "epoch": 0.005659077793704276, + "grad_norm": 0.4619203507900238, + "learning_rate": 3.8e-07, + "loss": 1.8496, + "step": 19 + }, + { + "epoch": 0.005956923993372922, + "grad_norm": 0.4837261736392975, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.8788, + "step": 20 + }, + { + "epoch": 0.006254770193041568, + "grad_norm": 0.46011224389076233, + "learning_rate": 4.2000000000000006e-07, + "loss": 1.8711, + "step": 21 + }, + { + "epoch": 0.006552616392710214, + "grad_norm": 0.46401581168174744, + "learning_rate": 4.4e-07, + "loss": 1.8299, + "step": 22 + }, + { + "epoch": 0.00685046259237886, + "grad_norm": 0.47924378514289856, + "learning_rate": 4.6000000000000004e-07, + "loss": 1.8648, + "step": 23 + }, + { + "epoch": 0.0071483087920475065, + "grad_norm": 0.46203556656837463, + "learning_rate": 4.800000000000001e-07, + "loss": 1.8662, + "step": 24 + }, + { + "epoch": 0.007446154991716153, + "grad_norm": 0.44733723998069763, + "learning_rate": 5.000000000000001e-07, + "loss": 1.8475, + "step": 25 + }, + { + "epoch": 0.007744001191384799, + "grad_norm": 0.45633989572525024, + "learning_rate": 5.2e-07, + "loss": 1.8526, + "step": 26 + }, + { + "epoch": 0.008041847391053444, + "grad_norm": 0.4663243591785431, + "learning_rate": 5.4e-07, + "loss": 1.8574, + "step": 27 + }, + { + "epoch": 0.008339693590722091, + "grad_norm": 0.4683181941509247, + "learning_rate": 5.6e-07, + "loss": 1.8392, + "step": 28 + }, + { + "epoch": 0.008637539790390737, + "grad_norm": 0.46668142080307007, + "learning_rate": 5.800000000000001e-07, + "loss": 1.8743, + "step": 29 + }, + { + "epoch": 0.008935385990059382, + "grad_norm": 0.48022735118865967, + "learning_rate": 6.000000000000001e-07, + "loss": 1.8605, + "step": 30 + }, + { + "epoch": 0.00923323218972803, + "grad_norm": 0.4465586543083191, + "learning_rate": 6.200000000000001e-07, + "loss": 1.8268, + "step": 31 + }, + { + "epoch": 0.009531078389396675, + "grad_norm": 0.46732452511787415, + "learning_rate": 6.4e-07, + "loss": 1.8707, + "step": 32 + }, + { + "epoch": 0.009828924589065322, + "grad_norm": 0.4528926908969879, + "learning_rate": 6.6e-07, + "loss": 1.8549, + "step": 33 + }, + { + "epoch": 0.010126770788733967, + "grad_norm": 0.4607648551464081, + "learning_rate": 6.800000000000001e-07, + "loss": 1.8582, + "step": 34 + }, + { + "epoch": 0.010424616988402614, + "grad_norm": 0.4756196141242981, + "learning_rate": 7.000000000000001e-07, + "loss": 1.8732, + "step": 35 + }, + { + "epoch": 0.01072246318807126, + "grad_norm": 0.4642297923564911, + "learning_rate": 7.2e-07, + "loss": 1.8596, + "step": 36 + }, + { + "epoch": 0.011020309387739905, + "grad_norm": 0.45597583055496216, + "learning_rate": 7.4e-07, + "loss": 1.84, + "step": 37 + }, + { + "epoch": 0.011318155587408552, + "grad_norm": 0.4625261127948761, + "learning_rate": 7.6e-07, + "loss": 1.8599, + "step": 38 + }, + { + "epoch": 0.011616001787077198, + "grad_norm": 0.4891977608203888, + "learning_rate": 7.8e-07, + "loss": 1.8729, + "step": 39 + }, + { + "epoch": 0.011913847986745845, + "grad_norm": 0.47429201006889343, + "learning_rate": 8.000000000000001e-07, + "loss": 1.844, + "step": 40 + }, + { + "epoch": 0.01221169418641449, + "grad_norm": 0.4835318922996521, + "learning_rate": 8.200000000000001e-07, + "loss": 1.8688, + "step": 41 + }, + { + "epoch": 0.012509540386083135, + "grad_norm": 0.49138543009757996, + "learning_rate": 8.400000000000001e-07, + "loss": 1.8678, + "step": 42 + }, + { + "epoch": 0.012807386585751783, + "grad_norm": 0.4650191068649292, + "learning_rate": 8.6e-07, + "loss": 1.853, + "step": 43 + }, + { + "epoch": 0.013105232785420428, + "grad_norm": 0.47995880246162415, + "learning_rate": 8.8e-07, + "loss": 1.8733, + "step": 44 + }, + { + "epoch": 0.013403078985089075, + "grad_norm": 0.46710073947906494, + "learning_rate": 9.000000000000001e-07, + "loss": 1.8289, + "step": 45 + }, + { + "epoch": 0.01370092518475772, + "grad_norm": 0.48132428526878357, + "learning_rate": 9.200000000000001e-07, + "loss": 1.8678, + "step": 46 + }, + { + "epoch": 0.013998771384426368, + "grad_norm": 0.4732246994972229, + "learning_rate": 9.400000000000001e-07, + "loss": 1.8511, + "step": 47 + }, + { + "epoch": 0.014296617584095013, + "grad_norm": 0.48072347044944763, + "learning_rate": 9.600000000000001e-07, + "loss": 1.861, + "step": 48 + }, + { + "epoch": 0.014594463783763658, + "grad_norm": 0.4825969934463501, + "learning_rate": 9.800000000000001e-07, + "loss": 1.8701, + "step": 49 + }, + { + "epoch": 0.014892309983432305, + "grad_norm": 0.4636313319206238, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.8455, + "step": 50 + }, + { + "epoch": 0.01519015618310095, + "grad_norm": 0.47920089960098267, + "learning_rate": 1.02e-06, + "loss": 1.8628, + "step": 51 + }, + { + "epoch": 0.015488002382769598, + "grad_norm": 0.4644438922405243, + "learning_rate": 1.04e-06, + "loss": 1.8411, + "step": 52 + }, + { + "epoch": 0.015785848582438245, + "grad_norm": 0.4573518633842468, + "learning_rate": 1.06e-06, + "loss": 1.8294, + "step": 53 + }, + { + "epoch": 0.01608369478210689, + "grad_norm": 0.4642627239227295, + "learning_rate": 1.08e-06, + "loss": 1.8371, + "step": 54 + }, + { + "epoch": 0.016381540981775536, + "grad_norm": 0.4713428020477295, + "learning_rate": 1.1e-06, + "loss": 1.8422, + "step": 55 + }, + { + "epoch": 0.016679387181444183, + "grad_norm": 0.4599405527114868, + "learning_rate": 1.12e-06, + "loss": 1.8188, + "step": 56 + }, + { + "epoch": 0.016977233381112827, + "grad_norm": 0.47443103790283203, + "learning_rate": 1.14e-06, + "loss": 1.8586, + "step": 57 + }, + { + "epoch": 0.017275079580781474, + "grad_norm": 0.46728289127349854, + "learning_rate": 1.1600000000000001e-06, + "loss": 1.8384, + "step": 58 + }, + { + "epoch": 0.01757292578045012, + "grad_norm": 0.4742635190486908, + "learning_rate": 1.1800000000000001e-06, + "loss": 1.8487, + "step": 59 + }, + { + "epoch": 0.017870771980118764, + "grad_norm": 0.5065075755119324, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.8695, + "step": 60 + }, + { + "epoch": 0.01816861817978741, + "grad_norm": 0.4815598726272583, + "learning_rate": 1.2200000000000002e-06, + "loss": 1.847, + "step": 61 + }, + { + "epoch": 0.01846646437945606, + "grad_norm": 0.5095518827438354, + "learning_rate": 1.2400000000000002e-06, + "loss": 1.8614, + "step": 62 + }, + { + "epoch": 0.018764310579124706, + "grad_norm": 0.484244704246521, + "learning_rate": 1.26e-06, + "loss": 1.8396, + "step": 63 + }, + { + "epoch": 0.01906215677879335, + "grad_norm": 0.5211488604545593, + "learning_rate": 1.28e-06, + "loss": 1.8555, + "step": 64 + }, + { + "epoch": 0.019360002978461997, + "grad_norm": 0.5129069685935974, + "learning_rate": 1.3e-06, + "loss": 1.8668, + "step": 65 + }, + { + "epoch": 0.019657849178130644, + "grad_norm": 0.5392414927482605, + "learning_rate": 1.32e-06, + "loss": 1.9, + "step": 66 + }, + { + "epoch": 0.019955695377799287, + "grad_norm": 0.5067933201789856, + "learning_rate": 1.34e-06, + "loss": 1.857, + "step": 67 + }, + { + "epoch": 0.020253541577467934, + "grad_norm": 0.5195866227149963, + "learning_rate": 1.3600000000000001e-06, + "loss": 1.8268, + "step": 68 + }, + { + "epoch": 0.02055138777713658, + "grad_norm": 0.5253258943557739, + "learning_rate": 1.3800000000000001e-06, + "loss": 1.8535, + "step": 69 + }, + { + "epoch": 0.02084923397680523, + "grad_norm": 0.5204430222511292, + "learning_rate": 1.4000000000000001e-06, + "loss": 1.8414, + "step": 70 + }, + { + "epoch": 0.021147080176473872, + "grad_norm": 0.5310043096542358, + "learning_rate": 1.42e-06, + "loss": 1.8639, + "step": 71 + }, + { + "epoch": 0.02144492637614252, + "grad_norm": 0.5121457576751709, + "learning_rate": 1.44e-06, + "loss": 1.8546, + "step": 72 + }, + { + "epoch": 0.021742772575811167, + "grad_norm": 0.5088541507720947, + "learning_rate": 1.46e-06, + "loss": 1.8422, + "step": 73 + }, + { + "epoch": 0.02204061877547981, + "grad_norm": 0.5058099627494812, + "learning_rate": 1.48e-06, + "loss": 1.8439, + "step": 74 + }, + { + "epoch": 0.022338464975148457, + "grad_norm": 0.4940571188926697, + "learning_rate": 1.5e-06, + "loss": 1.8213, + "step": 75 + }, + { + "epoch": 0.022636311174817104, + "grad_norm": 0.5225253701210022, + "learning_rate": 1.52e-06, + "loss": 1.8536, + "step": 76 + }, + { + "epoch": 0.02293415737448575, + "grad_norm": 0.5128735303878784, + "learning_rate": 1.54e-06, + "loss": 1.8145, + "step": 77 + }, + { + "epoch": 0.023232003574154395, + "grad_norm": 0.5496757626533508, + "learning_rate": 1.56e-06, + "loss": 1.8661, + "step": 78 + }, + { + "epoch": 0.023529849773823042, + "grad_norm": 0.5278938412666321, + "learning_rate": 1.5800000000000001e-06, + "loss": 1.8255, + "step": 79 + }, + { + "epoch": 0.02382769597349169, + "grad_norm": 0.5448312163352966, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.8722, + "step": 80 + }, + { + "epoch": 0.024125542173160333, + "grad_norm": 0.5144855976104736, + "learning_rate": 1.6200000000000002e-06, + "loss": 1.8249, + "step": 81 + }, + { + "epoch": 0.02442338837282898, + "grad_norm": 0.5338228940963745, + "learning_rate": 1.6400000000000002e-06, + "loss": 1.8331, + "step": 82 + }, + { + "epoch": 0.024721234572497627, + "grad_norm": 0.5393568873405457, + "learning_rate": 1.6600000000000002e-06, + "loss": 1.8334, + "step": 83 + }, + { + "epoch": 0.02501908077216627, + "grad_norm": 0.5475087761878967, + "learning_rate": 1.6800000000000002e-06, + "loss": 1.8572, + "step": 84 + }, + { + "epoch": 0.025316926971834918, + "grad_norm": 0.5298764705657959, + "learning_rate": 1.7000000000000002e-06, + "loss": 1.8323, + "step": 85 + }, + { + "epoch": 0.025614773171503565, + "grad_norm": 0.5606328248977661, + "learning_rate": 1.72e-06, + "loss": 1.8654, + "step": 86 + }, + { + "epoch": 0.025912619371172212, + "grad_norm": 0.5514033436775208, + "learning_rate": 1.74e-06, + "loss": 1.8227, + "step": 87 + }, + { + "epoch": 0.026210465570840856, + "grad_norm": 0.559169352054596, + "learning_rate": 1.76e-06, + "loss": 1.8398, + "step": 88 + }, + { + "epoch": 0.026508311770509503, + "grad_norm": 0.5687103867530823, + "learning_rate": 1.7800000000000001e-06, + "loss": 1.8377, + "step": 89 + }, + { + "epoch": 0.02680615797017815, + "grad_norm": 0.5481163859367371, + "learning_rate": 1.8000000000000001e-06, + "loss": 1.8384, + "step": 90 + }, + { + "epoch": 0.027104004169846794, + "grad_norm": 0.5697974562644958, + "learning_rate": 1.8200000000000002e-06, + "loss": 1.8511, + "step": 91 + }, + { + "epoch": 0.02740185036951544, + "grad_norm": 0.5484298467636108, + "learning_rate": 1.8400000000000002e-06, + "loss": 1.8351, + "step": 92 + }, + { + "epoch": 0.027699696569184088, + "grad_norm": 0.5311096906661987, + "learning_rate": 1.8600000000000002e-06, + "loss": 1.8234, + "step": 93 + }, + { + "epoch": 0.027997542768852735, + "grad_norm": 0.5723997950553894, + "learning_rate": 1.8800000000000002e-06, + "loss": 1.8366, + "step": 94 + }, + { + "epoch": 0.02829538896852138, + "grad_norm": 0.5615350604057312, + "learning_rate": 1.9000000000000002e-06, + "loss": 1.8531, + "step": 95 + }, + { + "epoch": 0.028593235168190026, + "grad_norm": 0.5616511702537537, + "learning_rate": 1.9200000000000003e-06, + "loss": 1.8165, + "step": 96 + }, + { + "epoch": 0.028891081367858673, + "grad_norm": 0.5790432095527649, + "learning_rate": 1.94e-06, + "loss": 1.856, + "step": 97 + }, + { + "epoch": 0.029188927567527317, + "grad_norm": 0.5516716837882996, + "learning_rate": 1.9600000000000003e-06, + "loss": 1.8194, + "step": 98 + }, + { + "epoch": 0.029486773767195964, + "grad_norm": 0.57439124584198, + "learning_rate": 1.98e-06, + "loss": 1.8388, + "step": 99 + }, + { + "epoch": 0.02978461996686461, + "grad_norm": 0.5567128658294678, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8182, + "step": 100 + }, + { + "epoch": 0.030082466166533258, + "grad_norm": 0.5853058099746704, + "learning_rate": 2.02e-06, + "loss": 1.8468, + "step": 101 + }, + { + "epoch": 0.0303803123662019, + "grad_norm": 0.5795766115188599, + "learning_rate": 2.04e-06, + "loss": 1.8274, + "step": 102 + }, + { + "epoch": 0.03067815856587055, + "grad_norm": 0.6096683740615845, + "learning_rate": 2.06e-06, + "loss": 1.8367, + "step": 103 + }, + { + "epoch": 0.030976004765539196, + "grad_norm": 0.5890591740608215, + "learning_rate": 2.08e-06, + "loss": 1.8098, + "step": 104 + }, + { + "epoch": 0.03127385096520784, + "grad_norm": 0.6070247888565063, + "learning_rate": 2.1000000000000002e-06, + "loss": 1.8306, + "step": 105 + }, + { + "epoch": 0.03157169716487649, + "grad_norm": 0.6029655337333679, + "learning_rate": 2.12e-06, + "loss": 1.8243, + "step": 106 + }, + { + "epoch": 0.03186954336454513, + "grad_norm": 0.5994901657104492, + "learning_rate": 2.1400000000000003e-06, + "loss": 1.8273, + "step": 107 + }, + { + "epoch": 0.03216738956421378, + "grad_norm": 0.5960265398025513, + "learning_rate": 2.16e-06, + "loss": 1.8279, + "step": 108 + }, + { + "epoch": 0.032465235763882425, + "grad_norm": 0.6401825547218323, + "learning_rate": 2.1800000000000003e-06, + "loss": 1.8318, + "step": 109 + }, + { + "epoch": 0.03276308196355107, + "grad_norm": 0.6169389486312866, + "learning_rate": 2.2e-06, + "loss": 1.8093, + "step": 110 + }, + { + "epoch": 0.03306092816321972, + "grad_norm": 0.6193079352378845, + "learning_rate": 2.2200000000000003e-06, + "loss": 1.8122, + "step": 111 + }, + { + "epoch": 0.033358774362888366, + "grad_norm": 0.657038152217865, + "learning_rate": 2.24e-06, + "loss": 1.8412, + "step": 112 + }, + { + "epoch": 0.03365662056255701, + "grad_norm": 0.6150979399681091, + "learning_rate": 2.2600000000000004e-06, + "loss": 1.812, + "step": 113 + }, + { + "epoch": 0.03395446676222565, + "grad_norm": 0.6213580369949341, + "learning_rate": 2.28e-06, + "loss": 1.8112, + "step": 114 + }, + { + "epoch": 0.0342523129618943, + "grad_norm": 0.6286153793334961, + "learning_rate": 2.3000000000000004e-06, + "loss": 1.7994, + "step": 115 + }, + { + "epoch": 0.03455015916156295, + "grad_norm": 0.6432107090950012, + "learning_rate": 2.3200000000000002e-06, + "loss": 1.8105, + "step": 116 + }, + { + "epoch": 0.034848005361231595, + "grad_norm": 0.629039466381073, + "learning_rate": 2.3400000000000005e-06, + "loss": 1.7944, + "step": 117 + }, + { + "epoch": 0.03514585156090024, + "grad_norm": 0.6385027170181274, + "learning_rate": 2.3600000000000003e-06, + "loss": 1.8105, + "step": 118 + }, + { + "epoch": 0.03544369776056889, + "grad_norm": 0.6555307507514954, + "learning_rate": 2.38e-06, + "loss": 1.824, + "step": 119 + }, + { + "epoch": 0.03574154396023753, + "grad_norm": 0.6707586646080017, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.7936, + "step": 120 + }, + { + "epoch": 0.036039390159906176, + "grad_norm": 0.6686285138130188, + "learning_rate": 2.42e-06, + "loss": 1.8116, + "step": 121 + }, + { + "epoch": 0.03633723635957482, + "grad_norm": 0.6620265245437622, + "learning_rate": 2.4400000000000004e-06, + "loss": 1.8123, + "step": 122 + }, + { + "epoch": 0.03663508255924347, + "grad_norm": 0.6774595975875854, + "learning_rate": 2.46e-06, + "loss": 1.7954, + "step": 123 + }, + { + "epoch": 0.03693292875891212, + "grad_norm": 0.6555871963500977, + "learning_rate": 2.4800000000000004e-06, + "loss": 1.8002, + "step": 124 + }, + { + "epoch": 0.037230774958580765, + "grad_norm": 0.6729901432991028, + "learning_rate": 2.5e-06, + "loss": 1.8022, + "step": 125 + }, + { + "epoch": 0.03752862115824941, + "grad_norm": 0.6566896438598633, + "learning_rate": 2.52e-06, + "loss": 1.7948, + "step": 126 + }, + { + "epoch": 0.03782646735791805, + "grad_norm": 0.6590833067893982, + "learning_rate": 2.5400000000000002e-06, + "loss": 1.7972, + "step": 127 + }, + { + "epoch": 0.0381243135575867, + "grad_norm": 0.6688756942749023, + "learning_rate": 2.56e-06, + "loss": 1.804, + "step": 128 + }, + { + "epoch": 0.038422159757255346, + "grad_norm": 0.6758476495742798, + "learning_rate": 2.5800000000000003e-06, + "loss": 1.7914, + "step": 129 + }, + { + "epoch": 0.03872000595692399, + "grad_norm": 0.6805055737495422, + "learning_rate": 2.6e-06, + "loss": 1.7951, + "step": 130 + }, + { + "epoch": 0.03901785215659264, + "grad_norm": 0.6505172252655029, + "learning_rate": 2.6200000000000003e-06, + "loss": 1.7979, + "step": 131 + }, + { + "epoch": 0.03931569835626129, + "grad_norm": 0.6635544896125793, + "learning_rate": 2.64e-06, + "loss": 1.7859, + "step": 132 + }, + { + "epoch": 0.039613544555929935, + "grad_norm": 0.6838406920433044, + "learning_rate": 2.6600000000000004e-06, + "loss": 1.7956, + "step": 133 + }, + { + "epoch": 0.039911390755598575, + "grad_norm": 0.6474420428276062, + "learning_rate": 2.68e-06, + "loss": 1.7752, + "step": 134 + }, + { + "epoch": 0.04020923695526722, + "grad_norm": 0.6364726424217224, + "learning_rate": 2.7000000000000004e-06, + "loss": 1.7841, + "step": 135 + }, + { + "epoch": 0.04050708315493587, + "grad_norm": 0.6838220357894897, + "learning_rate": 2.7200000000000002e-06, + "loss": 1.7843, + "step": 136 + }, + { + "epoch": 0.040804929354604516, + "grad_norm": 0.660972535610199, + "learning_rate": 2.7400000000000004e-06, + "loss": 1.7815, + "step": 137 + }, + { + "epoch": 0.04110277555427316, + "grad_norm": 0.7104812860488892, + "learning_rate": 2.7600000000000003e-06, + "loss": 1.8164, + "step": 138 + }, + { + "epoch": 0.04140062175394181, + "grad_norm": 0.6517634987831116, + "learning_rate": 2.7800000000000005e-06, + "loss": 1.7552, + "step": 139 + }, + { + "epoch": 0.04169846795361046, + "grad_norm": 0.6587279438972473, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.7679, + "step": 140 + }, + { + "epoch": 0.0419963141532791, + "grad_norm": 0.6743229627609253, + "learning_rate": 2.82e-06, + "loss": 1.7719, + "step": 141 + }, + { + "epoch": 0.042294160352947745, + "grad_norm": 0.6563546061515808, + "learning_rate": 2.84e-06, + "loss": 1.7677, + "step": 142 + }, + { + "epoch": 0.04259200655261639, + "grad_norm": 0.6561146378517151, + "learning_rate": 2.86e-06, + "loss": 1.7399, + "step": 143 + }, + { + "epoch": 0.04288985275228504, + "grad_norm": 0.6793851256370544, + "learning_rate": 2.88e-06, + "loss": 1.7773, + "step": 144 + }, + { + "epoch": 0.043187698951953686, + "grad_norm": 0.7065466642379761, + "learning_rate": 2.9e-06, + "loss": 1.7771, + "step": 145 + }, + { + "epoch": 0.04348554515162233, + "grad_norm": 0.6534376740455627, + "learning_rate": 2.92e-06, + "loss": 1.7494, + "step": 146 + }, + { + "epoch": 0.04378339135129098, + "grad_norm": 0.6629258990287781, + "learning_rate": 2.9400000000000002e-06, + "loss": 1.7584, + "step": 147 + }, + { + "epoch": 0.04408123755095962, + "grad_norm": 0.6547302007675171, + "learning_rate": 2.96e-06, + "loss": 1.7452, + "step": 148 + }, + { + "epoch": 0.04437908375062827, + "grad_norm": 0.6431775093078613, + "learning_rate": 2.9800000000000003e-06, + "loss": 1.7595, + "step": 149 + }, + { + "epoch": 0.044676929950296915, + "grad_norm": 0.6590375900268555, + "learning_rate": 3e-06, + "loss": 1.7349, + "step": 150 + }, + { + "epoch": 0.04497477614996556, + "grad_norm": 0.6398279070854187, + "learning_rate": 3.0200000000000003e-06, + "loss": 1.7334, + "step": 151 + }, + { + "epoch": 0.04527262234963421, + "grad_norm": 0.6175107955932617, + "learning_rate": 3.04e-06, + "loss": 1.7406, + "step": 152 + }, + { + "epoch": 0.045570468549302856, + "grad_norm": 0.6381570100784302, + "learning_rate": 3.0600000000000003e-06, + "loss": 1.7192, + "step": 153 + }, + { + "epoch": 0.0458683147489715, + "grad_norm": 0.5893192291259766, + "learning_rate": 3.08e-06, + "loss": 1.7181, + "step": 154 + }, + { + "epoch": 0.04616616094864014, + "grad_norm": 0.5449604392051697, + "learning_rate": 3.1000000000000004e-06, + "loss": 1.7141, + "step": 155 + }, + { + "epoch": 0.04646400714830879, + "grad_norm": 0.5816596746444702, + "learning_rate": 3.12e-06, + "loss": 1.7301, + "step": 156 + }, + { + "epoch": 0.04676185334797744, + "grad_norm": 0.5603417754173279, + "learning_rate": 3.1400000000000004e-06, + "loss": 1.7215, + "step": 157 + }, + { + "epoch": 0.047059699547646085, + "grad_norm": 0.557874858379364, + "learning_rate": 3.1600000000000002e-06, + "loss": 1.7011, + "step": 158 + }, + { + "epoch": 0.04735754574731473, + "grad_norm": 0.5643253922462463, + "learning_rate": 3.1800000000000005e-06, + "loss": 1.6999, + "step": 159 + }, + { + "epoch": 0.04765539194698338, + "grad_norm": 0.5932505130767822, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.7408, + "step": 160 + }, + { + "epoch": 0.04795323814665202, + "grad_norm": 0.5699681043624878, + "learning_rate": 3.2200000000000005e-06, + "loss": 1.7111, + "step": 161 + }, + { + "epoch": 0.048251084346320666, + "grad_norm": 0.5691033005714417, + "learning_rate": 3.2400000000000003e-06, + "loss": 1.6914, + "step": 162 + }, + { + "epoch": 0.04854893054598931, + "grad_norm": 0.579626739025116, + "learning_rate": 3.2600000000000006e-06, + "loss": 1.7124, + "step": 163 + }, + { + "epoch": 0.04884677674565796, + "grad_norm": 0.580288290977478, + "learning_rate": 3.2800000000000004e-06, + "loss": 1.7117, + "step": 164 + }, + { + "epoch": 0.04914462294532661, + "grad_norm": 0.5958548188209534, + "learning_rate": 3.3000000000000006e-06, + "loss": 1.7044, + "step": 165 + }, + { + "epoch": 0.049442469144995255, + "grad_norm": 0.5895776748657227, + "learning_rate": 3.3200000000000004e-06, + "loss": 1.6968, + "step": 166 + }, + { + "epoch": 0.0497403153446639, + "grad_norm": 0.5660363435745239, + "learning_rate": 3.3400000000000006e-06, + "loss": 1.6819, + "step": 167 + }, + { + "epoch": 0.05003816154433254, + "grad_norm": 0.5647677183151245, + "learning_rate": 3.3600000000000004e-06, + "loss": 1.6939, + "step": 168 + }, + { + "epoch": 0.05033600774400119, + "grad_norm": 0.518638014793396, + "learning_rate": 3.3800000000000007e-06, + "loss": 1.7027, + "step": 169 + }, + { + "epoch": 0.050633853943669836, + "grad_norm": 0.43530601263046265, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.686, + "step": 170 + }, + { + "epoch": 0.05093170014333848, + "grad_norm": 0.4004081189632416, + "learning_rate": 3.4200000000000007e-06, + "loss": 1.6593, + "step": 171 + }, + { + "epoch": 0.05122954634300713, + "grad_norm": 0.4063718020915985, + "learning_rate": 3.44e-06, + "loss": 1.6799, + "step": 172 + }, + { + "epoch": 0.05152739254267578, + "grad_norm": 0.3832969069480896, + "learning_rate": 3.46e-06, + "loss": 1.688, + "step": 173 + }, + { + "epoch": 0.051825238742344425, + "grad_norm": 0.39484286308288574, + "learning_rate": 3.48e-06, + "loss": 1.6716, + "step": 174 + }, + { + "epoch": 0.052123084942013065, + "grad_norm": 0.3989197611808777, + "learning_rate": 3.5e-06, + "loss": 1.6764, + "step": 175 + }, + { + "epoch": 0.05242093114168171, + "grad_norm": 0.38792771100997925, + "learning_rate": 3.52e-06, + "loss": 1.6629, + "step": 176 + }, + { + "epoch": 0.05271877734135036, + "grad_norm": 0.4144127368927002, + "learning_rate": 3.54e-06, + "loss": 1.6654, + "step": 177 + }, + { + "epoch": 0.053016623541019006, + "grad_norm": 0.399394690990448, + "learning_rate": 3.5600000000000002e-06, + "loss": 1.6423, + "step": 178 + }, + { + "epoch": 0.05331446974068765, + "grad_norm": 0.40582484006881714, + "learning_rate": 3.58e-06, + "loss": 1.6579, + "step": 179 + }, + { + "epoch": 0.0536123159403563, + "grad_norm": 0.38763391971588135, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.6463, + "step": 180 + }, + { + "epoch": 0.05391016214002495, + "grad_norm": 0.39296337962150574, + "learning_rate": 3.62e-06, + "loss": 1.6406, + "step": 181 + }, + { + "epoch": 0.05420800833969359, + "grad_norm": 0.3867253065109253, + "learning_rate": 3.6400000000000003e-06, + "loss": 1.6463, + "step": 182 + }, + { + "epoch": 0.054505854539362235, + "grad_norm": 0.36434227228164673, + "learning_rate": 3.66e-06, + "loss": 1.6242, + "step": 183 + }, + { + "epoch": 0.05480370073903088, + "grad_norm": 0.38888224959373474, + "learning_rate": 3.6800000000000003e-06, + "loss": 1.6614, + "step": 184 + }, + { + "epoch": 0.05510154693869953, + "grad_norm": 0.37744617462158203, + "learning_rate": 3.7e-06, + "loss": 1.6095, + "step": 185 + }, + { + "epoch": 0.055399393138368176, + "grad_norm": 0.38917142152786255, + "learning_rate": 3.7200000000000004e-06, + "loss": 1.6236, + "step": 186 + }, + { + "epoch": 0.05569723933803682, + "grad_norm": 0.3825514018535614, + "learning_rate": 3.74e-06, + "loss": 1.6188, + "step": 187 + }, + { + "epoch": 0.05599508553770547, + "grad_norm": 0.3907104432582855, + "learning_rate": 3.7600000000000004e-06, + "loss": 1.6368, + "step": 188 + }, + { + "epoch": 0.05629293173737411, + "grad_norm": 0.37548741698265076, + "learning_rate": 3.7800000000000002e-06, + "loss": 1.6048, + "step": 189 + }, + { + "epoch": 0.05659077793704276, + "grad_norm": 0.38833755254745483, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.6154, + "step": 190 + }, + { + "epoch": 0.056888624136711405, + "grad_norm": 0.3748374879360199, + "learning_rate": 3.820000000000001e-06, + "loss": 1.6299, + "step": 191 + }, + { + "epoch": 0.05718647033638005, + "grad_norm": 0.4172203838825226, + "learning_rate": 3.8400000000000005e-06, + "loss": 1.6216, + "step": 192 + }, + { + "epoch": 0.0574843165360487, + "grad_norm": 0.4116345942020416, + "learning_rate": 3.86e-06, + "loss": 1.6112, + "step": 193 + }, + { + "epoch": 0.057782162735717346, + "grad_norm": 0.4274803698062897, + "learning_rate": 3.88e-06, + "loss": 1.6245, + "step": 194 + }, + { + "epoch": 0.05808000893538599, + "grad_norm": 0.3828435242176056, + "learning_rate": 3.900000000000001e-06, + "loss": 1.6066, + "step": 195 + }, + { + "epoch": 0.058377855135054633, + "grad_norm": 0.35347607731819153, + "learning_rate": 3.920000000000001e-06, + "loss": 1.5996, + "step": 196 + }, + { + "epoch": 0.05867570133472328, + "grad_norm": 0.3211362063884735, + "learning_rate": 3.94e-06, + "loss": 1.5987, + "step": 197 + }, + { + "epoch": 0.05897354753439193, + "grad_norm": 0.3006207346916199, + "learning_rate": 3.96e-06, + "loss": 1.59, + "step": 198 + }, + { + "epoch": 0.059271393734060575, + "grad_norm": 0.29288944602012634, + "learning_rate": 3.980000000000001e-06, + "loss": 1.6042, + "step": 199 + }, + { + "epoch": 0.05956923993372922, + "grad_norm": 0.27601832151412964, + "learning_rate": 4.000000000000001e-06, + "loss": 1.5809, + "step": 200 + }, + { + "epoch": 0.05986708613339787, + "grad_norm": 0.2815020680427551, + "learning_rate": 4.0200000000000005e-06, + "loss": 1.5892, + "step": 201 + }, + { + "epoch": 0.060164932333066516, + "grad_norm": 0.28545090556144714, + "learning_rate": 4.04e-06, + "loss": 1.5847, + "step": 202 + }, + { + "epoch": 0.060462778532735156, + "grad_norm": 0.275897741317749, + "learning_rate": 4.060000000000001e-06, + "loss": 1.5937, + "step": 203 + }, + { + "epoch": 0.0607606247324038, + "grad_norm": 0.27737608551979065, + "learning_rate": 4.08e-06, + "loss": 1.5782, + "step": 204 + }, + { + "epoch": 0.06105847093207245, + "grad_norm": 0.2871304750442505, + "learning_rate": 4.1e-06, + "loss": 1.5841, + "step": 205 + }, + { + "epoch": 0.0613563171317411, + "grad_norm": 0.29327964782714844, + "learning_rate": 4.12e-06, + "loss": 1.5759, + "step": 206 + }, + { + "epoch": 0.061654163331409745, + "grad_norm": 0.2835024893283844, + "learning_rate": 4.14e-06, + "loss": 1.5834, + "step": 207 + }, + { + "epoch": 0.06195200953107839, + "grad_norm": 0.2886221706867218, + "learning_rate": 4.16e-06, + "loss": 1.5973, + "step": 208 + }, + { + "epoch": 0.06224985573074703, + "grad_norm": 0.2900846600532532, + "learning_rate": 4.18e-06, + "loss": 1.6061, + "step": 209 + }, + { + "epoch": 0.06254770193041569, + "grad_norm": 0.28939002752304077, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.5918, + "step": 210 + }, + { + "epoch": 0.06284554813008433, + "grad_norm": 0.316026896238327, + "learning_rate": 4.22e-06, + "loss": 1.5984, + "step": 211 + }, + { + "epoch": 0.06314339432975298, + "grad_norm": 0.30959439277648926, + "learning_rate": 4.24e-06, + "loss": 1.5915, + "step": 212 + }, + { + "epoch": 0.06344124052942161, + "grad_norm": 0.30867230892181396, + "learning_rate": 4.26e-06, + "loss": 1.5838, + "step": 213 + }, + { + "epoch": 0.06373908672909026, + "grad_norm": 0.3157883882522583, + "learning_rate": 4.2800000000000005e-06, + "loss": 1.5959, + "step": 214 + }, + { + "epoch": 0.06403693292875891, + "grad_norm": 0.3085717558860779, + "learning_rate": 4.3e-06, + "loss": 1.5681, + "step": 215 + }, + { + "epoch": 0.06433477912842755, + "grad_norm": 0.30693379044532776, + "learning_rate": 4.32e-06, + "loss": 1.5813, + "step": 216 + }, + { + "epoch": 0.0646326253280962, + "grad_norm": 0.33237066864967346, + "learning_rate": 4.34e-06, + "loss": 1.5841, + "step": 217 + }, + { + "epoch": 0.06493047152776485, + "grad_norm": 0.33629241585731506, + "learning_rate": 4.360000000000001e-06, + "loss": 1.6016, + "step": 218 + }, + { + "epoch": 0.0652283177274335, + "grad_norm": 0.3244820237159729, + "learning_rate": 4.38e-06, + "loss": 1.5914, + "step": 219 + }, + { + "epoch": 0.06552616392710214, + "grad_norm": 0.33084553480148315, + "learning_rate": 4.4e-06, + "loss": 1.5835, + "step": 220 + }, + { + "epoch": 0.06582401012677079, + "grad_norm": 0.34093067049980164, + "learning_rate": 4.42e-06, + "loss": 1.5894, + "step": 221 + }, + { + "epoch": 0.06612185632643944, + "grad_norm": 0.35316869616508484, + "learning_rate": 4.440000000000001e-06, + "loss": 1.5778, + "step": 222 + }, + { + "epoch": 0.06641970252610808, + "grad_norm": 0.3374737799167633, + "learning_rate": 4.4600000000000005e-06, + "loss": 1.5691, + "step": 223 + }, + { + "epoch": 0.06671754872577673, + "grad_norm": 0.34237730503082275, + "learning_rate": 4.48e-06, + "loss": 1.5646, + "step": 224 + }, + { + "epoch": 0.06701539492544538, + "grad_norm": 0.34547626972198486, + "learning_rate": 4.5e-06, + "loss": 1.5657, + "step": 225 + }, + { + "epoch": 0.06731324112511403, + "grad_norm": 0.3584066927433014, + "learning_rate": 4.520000000000001e-06, + "loss": 1.5571, + "step": 226 + }, + { + "epoch": 0.06761108732478266, + "grad_norm": 0.36502501368522644, + "learning_rate": 4.540000000000001e-06, + "loss": 1.5843, + "step": 227 + }, + { + "epoch": 0.0679089335244513, + "grad_norm": 0.35336628556251526, + "learning_rate": 4.56e-06, + "loss": 1.559, + "step": 228 + }, + { + "epoch": 0.06820677972411995, + "grad_norm": 0.38950905203819275, + "learning_rate": 4.58e-06, + "loss": 1.6019, + "step": 229 + }, + { + "epoch": 0.0685046259237886, + "grad_norm": 0.3913187086582184, + "learning_rate": 4.600000000000001e-06, + "loss": 1.5615, + "step": 230 + }, + { + "epoch": 0.06880247212345725, + "grad_norm": 0.3854864835739136, + "learning_rate": 4.620000000000001e-06, + "loss": 1.5799, + "step": 231 + }, + { + "epoch": 0.0691003183231259, + "grad_norm": 0.39519399404525757, + "learning_rate": 4.6400000000000005e-06, + "loss": 1.5983, + "step": 232 + }, + { + "epoch": 0.06939816452279454, + "grad_norm": 0.3765176832675934, + "learning_rate": 4.66e-06, + "loss": 1.5673, + "step": 233 + }, + { + "epoch": 0.06969601072246319, + "grad_norm": 0.37804102897644043, + "learning_rate": 4.680000000000001e-06, + "loss": 1.5628, + "step": 234 + }, + { + "epoch": 0.06999385692213184, + "grad_norm": 0.3661312460899353, + "learning_rate": 4.7e-06, + "loss": 1.5743, + "step": 235 + }, + { + "epoch": 0.07029170312180048, + "grad_norm": 0.4114713966846466, + "learning_rate": 4.7200000000000005e-06, + "loss": 1.5789, + "step": 236 + }, + { + "epoch": 0.07058954932146913, + "grad_norm": 0.4079197645187378, + "learning_rate": 4.74e-06, + "loss": 1.561, + "step": 237 + }, + { + "epoch": 0.07088739552113778, + "grad_norm": 0.4288586378097534, + "learning_rate": 4.76e-06, + "loss": 1.5744, + "step": 238 + }, + { + "epoch": 0.07118524172080642, + "grad_norm": 0.44189968705177307, + "learning_rate": 4.78e-06, + "loss": 1.5642, + "step": 239 + }, + { + "epoch": 0.07148308792047506, + "grad_norm": 0.46231845021247864, + "learning_rate": 4.800000000000001e-06, + "loss": 1.5813, + "step": 240 + }, + { + "epoch": 0.0717809341201437, + "grad_norm": 0.44197338819503784, + "learning_rate": 4.8200000000000004e-06, + "loss": 1.5579, + "step": 241 + }, + { + "epoch": 0.07207878031981235, + "grad_norm": 0.44414064288139343, + "learning_rate": 4.84e-06, + "loss": 1.5589, + "step": 242 + }, + { + "epoch": 0.072376626519481, + "grad_norm": 0.41845178604125977, + "learning_rate": 4.86e-06, + "loss": 1.5465, + "step": 243 + }, + { + "epoch": 0.07267447271914965, + "grad_norm": 0.4510466158390045, + "learning_rate": 4.880000000000001e-06, + "loss": 1.5466, + "step": 244 + }, + { + "epoch": 0.0729723189188183, + "grad_norm": 0.442803293466568, + "learning_rate": 4.9000000000000005e-06, + "loss": 1.5472, + "step": 245 + }, + { + "epoch": 0.07327016511848694, + "grad_norm": 0.45264631509780884, + "learning_rate": 4.92e-06, + "loss": 1.5581, + "step": 246 + }, + { + "epoch": 0.07356801131815559, + "grad_norm": 0.4818227291107178, + "learning_rate": 4.94e-06, + "loss": 1.5708, + "step": 247 + }, + { + "epoch": 0.07386585751782423, + "grad_norm": 0.4921914041042328, + "learning_rate": 4.960000000000001e-06, + "loss": 1.5533, + "step": 248 + }, + { + "epoch": 0.07416370371749288, + "grad_norm": 0.5005349516868591, + "learning_rate": 4.980000000000001e-06, + "loss": 1.5569, + "step": 249 + }, + { + "epoch": 0.07446154991716153, + "grad_norm": 0.5374754667282104, + "learning_rate": 5e-06, + "loss": 1.565, + "step": 250 + }, + { + "epoch": 0.07475939611683018, + "grad_norm": 0.5428454279899597, + "learning_rate": 5.02e-06, + "loss": 1.5404, + "step": 251 + }, + { + "epoch": 0.07505724231649882, + "grad_norm": 0.5481797456741333, + "learning_rate": 5.04e-06, + "loss": 1.5472, + "step": 252 + }, + { + "epoch": 0.07535508851616747, + "grad_norm": 0.5704034566879272, + "learning_rate": 5.060000000000001e-06, + "loss": 1.5569, + "step": 253 + }, + { + "epoch": 0.0756529347158361, + "grad_norm": 0.5555924773216248, + "learning_rate": 5.0800000000000005e-06, + "loss": 1.5427, + "step": 254 + }, + { + "epoch": 0.07595078091550475, + "grad_norm": 0.5683029294013977, + "learning_rate": 5.1e-06, + "loss": 1.5584, + "step": 255 + }, + { + "epoch": 0.0762486271151734, + "grad_norm": 0.5711975693702698, + "learning_rate": 5.12e-06, + "loss": 1.5481, + "step": 256 + }, + { + "epoch": 0.07654647331484205, + "grad_norm": 0.5935271382331848, + "learning_rate": 5.140000000000001e-06, + "loss": 1.5775, + "step": 257 + }, + { + "epoch": 0.07684431951451069, + "grad_norm": 0.5552716851234436, + "learning_rate": 5.1600000000000006e-06, + "loss": 1.541, + "step": 258 + }, + { + "epoch": 0.07714216571417934, + "grad_norm": 0.5461673736572266, + "learning_rate": 5.18e-06, + "loss": 1.5504, + "step": 259 + }, + { + "epoch": 0.07744001191384799, + "grad_norm": 0.5375115871429443, + "learning_rate": 5.2e-06, + "loss": 1.544, + "step": 260 + }, + { + "epoch": 0.07773785811351663, + "grad_norm": 0.5090441107749939, + "learning_rate": 5.220000000000001e-06, + "loss": 1.5501, + "step": 261 + }, + { + "epoch": 0.07803570431318528, + "grad_norm": 0.4780338704586029, + "learning_rate": 5.240000000000001e-06, + "loss": 1.527, + "step": 262 + }, + { + "epoch": 0.07833355051285393, + "grad_norm": 0.4490078389644623, + "learning_rate": 5.2600000000000005e-06, + "loss": 1.5554, + "step": 263 + }, + { + "epoch": 0.07863139671252257, + "grad_norm": 0.39460065960884094, + "learning_rate": 5.28e-06, + "loss": 1.5536, + "step": 264 + }, + { + "epoch": 0.07892924291219122, + "grad_norm": 0.3691290318965912, + "learning_rate": 5.300000000000001e-06, + "loss": 1.5547, + "step": 265 + }, + { + "epoch": 0.07922708911185987, + "grad_norm": 0.3380715847015381, + "learning_rate": 5.320000000000001e-06, + "loss": 1.5509, + "step": 266 + }, + { + "epoch": 0.07952493531152852, + "grad_norm": 0.32584908604621887, + "learning_rate": 5.3400000000000005e-06, + "loss": 1.5476, + "step": 267 + }, + { + "epoch": 0.07982278151119715, + "grad_norm": 0.2909034192562103, + "learning_rate": 5.36e-06, + "loss": 1.5489, + "step": 268 + }, + { + "epoch": 0.0801206277108658, + "grad_norm": 0.2502373456954956, + "learning_rate": 5.380000000000001e-06, + "loss": 1.5511, + "step": 269 + }, + { + "epoch": 0.08041847391053444, + "grad_norm": 0.20297959446907043, + "learning_rate": 5.400000000000001e-06, + "loss": 1.5694, + "step": 270 + }, + { + "epoch": 0.08071632011020309, + "grad_norm": 0.14874856173992157, + "learning_rate": 5.420000000000001e-06, + "loss": 1.5543, + "step": 271 + }, + { + "epoch": 0.08101416630987174, + "grad_norm": 0.13056589663028717, + "learning_rate": 5.4400000000000004e-06, + "loss": 1.5251, + "step": 272 + }, + { + "epoch": 0.08131201250954039, + "grad_norm": 0.12990182638168335, + "learning_rate": 5.460000000000001e-06, + "loss": 1.5272, + "step": 273 + }, + { + "epoch": 0.08160985870920903, + "grad_norm": 0.11918067187070847, + "learning_rate": 5.480000000000001e-06, + "loss": 1.5343, + "step": 274 + }, + { + "epoch": 0.08190770490887768, + "grad_norm": 0.10813926160335541, + "learning_rate": 5.500000000000001e-06, + "loss": 1.532, + "step": 275 + }, + { + "epoch": 0.08220555110854633, + "grad_norm": 0.10963642597198486, + "learning_rate": 5.5200000000000005e-06, + "loss": 1.5594, + "step": 276 + }, + { + "epoch": 0.08250339730821497, + "grad_norm": 0.10151322931051254, + "learning_rate": 5.540000000000001e-06, + "loss": 1.5263, + "step": 277 + }, + { + "epoch": 0.08280124350788362, + "grad_norm": 0.09857906401157379, + "learning_rate": 5.560000000000001e-06, + "loss": 1.5248, + "step": 278 + }, + { + "epoch": 0.08309908970755227, + "grad_norm": 0.09040740132331848, + "learning_rate": 5.580000000000001e-06, + "loss": 1.5135, + "step": 279 + }, + { + "epoch": 0.08339693590722091, + "grad_norm": 0.09191717952489853, + "learning_rate": 5.600000000000001e-06, + "loss": 1.5464, + "step": 280 + }, + { + "epoch": 0.08369478210688955, + "grad_norm": 0.08776262402534485, + "learning_rate": 5.620000000000001e-06, + "loss": 1.5336, + "step": 281 + }, + { + "epoch": 0.0839926283065582, + "grad_norm": 0.08440259844064713, + "learning_rate": 5.64e-06, + "loss": 1.5354, + "step": 282 + }, + { + "epoch": 0.08429047450622684, + "grad_norm": 0.08505409210920334, + "learning_rate": 5.66e-06, + "loss": 1.5324, + "step": 283 + }, + { + "epoch": 0.08458832070589549, + "grad_norm": 0.08075687289237976, + "learning_rate": 5.68e-06, + "loss": 1.5314, + "step": 284 + }, + { + "epoch": 0.08488616690556414, + "grad_norm": 0.08375275880098343, + "learning_rate": 5.7e-06, + "loss": 1.5268, + "step": 285 + }, + { + "epoch": 0.08518401310523278, + "grad_norm": 0.08402302861213684, + "learning_rate": 5.72e-06, + "loss": 1.5414, + "step": 286 + }, + { + "epoch": 0.08548185930490143, + "grad_norm": 0.08408354222774506, + "learning_rate": 5.74e-06, + "loss": 1.5371, + "step": 287 + }, + { + "epoch": 0.08577970550457008, + "grad_norm": 0.08600781112909317, + "learning_rate": 5.76e-06, + "loss": 1.5377, + "step": 288 + }, + { + "epoch": 0.08607755170423873, + "grad_norm": 0.07958939671516418, + "learning_rate": 5.78e-06, + "loss": 1.5353, + "step": 289 + }, + { + "epoch": 0.08637539790390737, + "grad_norm": 0.08092907816171646, + "learning_rate": 5.8e-06, + "loss": 1.5351, + "step": 290 + }, + { + "epoch": 0.08667324410357602, + "grad_norm": 0.07932665199041367, + "learning_rate": 5.82e-06, + "loss": 1.519, + "step": 291 + }, + { + "epoch": 0.08697109030324467, + "grad_norm": 0.0793927013874054, + "learning_rate": 5.84e-06, + "loss": 1.5481, + "step": 292 + }, + { + "epoch": 0.08726893650291331, + "grad_norm": 0.07992880046367645, + "learning_rate": 5.86e-06, + "loss": 1.5377, + "step": 293 + }, + { + "epoch": 0.08756678270258196, + "grad_norm": 0.07722952216863632, + "learning_rate": 5.8800000000000005e-06, + "loss": 1.5277, + "step": 294 + }, + { + "epoch": 0.0878646289022506, + "grad_norm": 0.07810430228710175, + "learning_rate": 5.9e-06, + "loss": 1.5226, + "step": 295 + }, + { + "epoch": 0.08816247510191924, + "grad_norm": 0.07677609473466873, + "learning_rate": 5.92e-06, + "loss": 1.5236, + "step": 296 + }, + { + "epoch": 0.08846032130158789, + "grad_norm": 0.0761965662240982, + "learning_rate": 5.94e-06, + "loss": 1.5195, + "step": 297 + }, + { + "epoch": 0.08875816750125654, + "grad_norm": 0.07856539636850357, + "learning_rate": 5.9600000000000005e-06, + "loss": 1.5277, + "step": 298 + }, + { + "epoch": 0.08905601370092518, + "grad_norm": 0.0727543905377388, + "learning_rate": 5.98e-06, + "loss": 1.5017, + "step": 299 + }, + { + "epoch": 0.08935385990059383, + "grad_norm": 0.07391712069511414, + "learning_rate": 6e-06, + "loss": 1.5229, + "step": 300 + }, + { + "epoch": 0.08965170610026248, + "grad_norm": 0.08318979293107986, + "learning_rate": 6.02e-06, + "loss": 1.5263, + "step": 301 + }, + { + "epoch": 0.08994955229993112, + "grad_norm": 0.06995917111635208, + "learning_rate": 6.040000000000001e-06, + "loss": 1.5201, + "step": 302 + }, + { + "epoch": 0.09024739849959977, + "grad_norm": 0.07505074143409729, + "learning_rate": 6.0600000000000004e-06, + "loss": 1.5303, + "step": 303 + }, + { + "epoch": 0.09054524469926842, + "grad_norm": 0.0731302946805954, + "learning_rate": 6.08e-06, + "loss": 1.5316, + "step": 304 + }, + { + "epoch": 0.09084309089893707, + "grad_norm": 0.06981611996889114, + "learning_rate": 6.1e-06, + "loss": 1.5401, + "step": 305 + }, + { + "epoch": 0.09114093709860571, + "grad_norm": 0.07196632772684097, + "learning_rate": 6.120000000000001e-06, + "loss": 1.5397, + "step": 306 + }, + { + "epoch": 0.09143878329827436, + "grad_norm": 0.07196949422359467, + "learning_rate": 6.1400000000000005e-06, + "loss": 1.5239, + "step": 307 + }, + { + "epoch": 0.091736629497943, + "grad_norm": 0.07190251350402832, + "learning_rate": 6.16e-06, + "loss": 1.532, + "step": 308 + }, + { + "epoch": 0.09203447569761164, + "grad_norm": 0.06778890639543533, + "learning_rate": 6.18e-06, + "loss": 1.5185, + "step": 309 + }, + { + "epoch": 0.09233232189728029, + "grad_norm": 0.07153601199388504, + "learning_rate": 6.200000000000001e-06, + "loss": 1.5356, + "step": 310 + }, + { + "epoch": 0.09263016809694893, + "grad_norm": 0.07012605667114258, + "learning_rate": 6.220000000000001e-06, + "loss": 1.5128, + "step": 311 + }, + { + "epoch": 0.09292801429661758, + "grad_norm": 0.0677528828382492, + "learning_rate": 6.24e-06, + "loss": 1.5458, + "step": 312 + }, + { + "epoch": 0.09322586049628623, + "grad_norm": 0.06769208610057831, + "learning_rate": 6.26e-06, + "loss": 1.5305, + "step": 313 + }, + { + "epoch": 0.09352370669595488, + "grad_norm": 0.0664294883608818, + "learning_rate": 6.280000000000001e-06, + "loss": 1.5381, + "step": 314 + }, + { + "epoch": 0.09382155289562352, + "grad_norm": 0.06461451947689056, + "learning_rate": 6.300000000000001e-06, + "loss": 1.5049, + "step": 315 + }, + { + "epoch": 0.09411939909529217, + "grad_norm": 0.06791339814662933, + "learning_rate": 6.3200000000000005e-06, + "loss": 1.5135, + "step": 316 + }, + { + "epoch": 0.09441724529496082, + "grad_norm": 0.07396858930587769, + "learning_rate": 6.34e-06, + "loss": 1.5102, + "step": 317 + }, + { + "epoch": 0.09471509149462946, + "grad_norm": 0.06848759949207306, + "learning_rate": 6.360000000000001e-06, + "loss": 1.533, + "step": 318 + }, + { + "epoch": 0.09501293769429811, + "grad_norm": 0.06650793552398682, + "learning_rate": 6.380000000000001e-06, + "loss": 1.5063, + "step": 319 + }, + { + "epoch": 0.09531078389396676, + "grad_norm": 0.06422768533229828, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.5267, + "step": 320 + }, + { + "epoch": 0.0956086300936354, + "grad_norm": 0.06654848158359528, + "learning_rate": 6.42e-06, + "loss": 1.5541, + "step": 321 + }, + { + "epoch": 0.09590647629330404, + "grad_norm": 0.06139402836561203, + "learning_rate": 6.440000000000001e-06, + "loss": 1.5116, + "step": 322 + }, + { + "epoch": 0.09620432249297269, + "grad_norm": 0.06376224011182785, + "learning_rate": 6.460000000000001e-06, + "loss": 1.5221, + "step": 323 + }, + { + "epoch": 0.09650216869264133, + "grad_norm": 0.062332574278116226, + "learning_rate": 6.480000000000001e-06, + "loss": 1.5278, + "step": 324 + }, + { + "epoch": 0.09680001489230998, + "grad_norm": 0.06041441112756729, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.5367, + "step": 325 + }, + { + "epoch": 0.09709786109197863, + "grad_norm": 0.06442257761955261, + "learning_rate": 6.520000000000001e-06, + "loss": 1.537, + "step": 326 + }, + { + "epoch": 0.09739570729164727, + "grad_norm": 0.05909942835569382, + "learning_rate": 6.540000000000001e-06, + "loss": 1.5003, + "step": 327 + }, + { + "epoch": 0.09769355349131592, + "grad_norm": 0.060887742787599564, + "learning_rate": 6.560000000000001e-06, + "loss": 1.5192, + "step": 328 + }, + { + "epoch": 0.09799139969098457, + "grad_norm": 0.06028643622994423, + "learning_rate": 6.5800000000000005e-06, + "loss": 1.523, + "step": 329 + }, + { + "epoch": 0.09828924589065322, + "grad_norm": 0.060260046273469925, + "learning_rate": 6.600000000000001e-06, + "loss": 1.5244, + "step": 330 + }, + { + "epoch": 0.09858709209032186, + "grad_norm": 0.05909213423728943, + "learning_rate": 6.620000000000001e-06, + "loss": 1.5181, + "step": 331 + }, + { + "epoch": 0.09888493828999051, + "grad_norm": 0.05903824791312218, + "learning_rate": 6.640000000000001e-06, + "loss": 1.5224, + "step": 332 + }, + { + "epoch": 0.09918278448965916, + "grad_norm": 0.06103455275297165, + "learning_rate": 6.660000000000001e-06, + "loss": 1.5267, + "step": 333 + }, + { + "epoch": 0.0994806306893278, + "grad_norm": 0.06424493342638016, + "learning_rate": 6.680000000000001e-06, + "loss": 1.5262, + "step": 334 + }, + { + "epoch": 0.09977847688899645, + "grad_norm": 0.06069577485322952, + "learning_rate": 6.700000000000001e-06, + "loss": 1.5038, + "step": 335 + }, + { + "epoch": 0.10007632308866508, + "grad_norm": 0.06028122827410698, + "learning_rate": 6.720000000000001e-06, + "loss": 1.5261, + "step": 336 + }, + { + "epoch": 0.10037416928833373, + "grad_norm": 0.06266399472951889, + "learning_rate": 6.740000000000001e-06, + "loss": 1.4917, + "step": 337 + }, + { + "epoch": 0.10067201548800238, + "grad_norm": 0.05822982266545296, + "learning_rate": 6.760000000000001e-06, + "loss": 1.5241, + "step": 338 + }, + { + "epoch": 0.10096986168767103, + "grad_norm": 0.05943974480032921, + "learning_rate": 6.780000000000001e-06, + "loss": 1.5257, + "step": 339 + }, + { + "epoch": 0.10126770788733967, + "grad_norm": 0.05773944407701492, + "learning_rate": 6.800000000000001e-06, + "loss": 1.52, + "step": 340 + }, + { + "epoch": 0.10156555408700832, + "grad_norm": 0.06203080713748932, + "learning_rate": 6.820000000000001e-06, + "loss": 1.5058, + "step": 341 + }, + { + "epoch": 0.10186340028667697, + "grad_norm": 0.10648725926876068, + "learning_rate": 6.8400000000000014e-06, + "loss": 1.5137, + "step": 342 + }, + { + "epoch": 0.10216124648634561, + "grad_norm": 0.058104611933231354, + "learning_rate": 6.860000000000001e-06, + "loss": 1.5397, + "step": 343 + }, + { + "epoch": 0.10245909268601426, + "grad_norm": 0.05618196353316307, + "learning_rate": 6.88e-06, + "loss": 1.5125, + "step": 344 + }, + { + "epoch": 0.10275693888568291, + "grad_norm": 0.056050579994916916, + "learning_rate": 6.9e-06, + "loss": 1.5162, + "step": 345 + }, + { + "epoch": 0.10305478508535156, + "grad_norm": 0.05842519551515579, + "learning_rate": 6.92e-06, + "loss": 1.5138, + "step": 346 + }, + { + "epoch": 0.1033526312850202, + "grad_norm": 0.05745385214686394, + "learning_rate": 6.9400000000000005e-06, + "loss": 1.5162, + "step": 347 + }, + { + "epoch": 0.10365047748468885, + "grad_norm": 0.05770609527826309, + "learning_rate": 6.96e-06, + "loss": 1.5224, + "step": 348 + }, + { + "epoch": 0.1039483236843575, + "grad_norm": 0.06014389544725418, + "learning_rate": 6.98e-06, + "loss": 1.5132, + "step": 349 + }, + { + "epoch": 0.10424616988402613, + "grad_norm": 0.058073949068784714, + "learning_rate": 7e-06, + "loss": 1.5185, + "step": 350 + }, + { + "epoch": 0.10454401608369478, + "grad_norm": 0.05801470950245857, + "learning_rate": 7.0200000000000006e-06, + "loss": 1.5085, + "step": 351 + }, + { + "epoch": 0.10484186228336342, + "grad_norm": 0.05637885257601738, + "learning_rate": 7.04e-06, + "loss": 1.5117, + "step": 352 + }, + { + "epoch": 0.10513970848303207, + "grad_norm": 0.05973244085907936, + "learning_rate": 7.06e-06, + "loss": 1.5353, + "step": 353 + }, + { + "epoch": 0.10543755468270072, + "grad_norm": 0.05698588117957115, + "learning_rate": 7.08e-06, + "loss": 1.5167, + "step": 354 + }, + { + "epoch": 0.10573540088236937, + "grad_norm": 0.056220103055238724, + "learning_rate": 7.100000000000001e-06, + "loss": 1.5156, + "step": 355 + }, + { + "epoch": 0.10603324708203801, + "grad_norm": 0.13667906820774078, + "learning_rate": 7.1200000000000004e-06, + "loss": 1.5007, + "step": 356 + }, + { + "epoch": 0.10633109328170666, + "grad_norm": 0.05707328021526337, + "learning_rate": 7.14e-06, + "loss": 1.513, + "step": 357 + }, + { + "epoch": 0.1066289394813753, + "grad_norm": 0.05900765210390091, + "learning_rate": 7.16e-06, + "loss": 1.501, + "step": 358 + }, + { + "epoch": 0.10692678568104395, + "grad_norm": 0.054961275309324265, + "learning_rate": 7.180000000000001e-06, + "loss": 1.5153, + "step": 359 + }, + { + "epoch": 0.1072246318807126, + "grad_norm": 0.055499982088804245, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.5183, + "step": 360 + }, + { + "epoch": 0.10752247808038125, + "grad_norm": 0.057524703443050385, + "learning_rate": 7.22e-06, + "loss": 1.5185, + "step": 361 + }, + { + "epoch": 0.1078203242800499, + "grad_norm": 0.056625694036483765, + "learning_rate": 7.24e-06, + "loss": 1.5234, + "step": 362 + }, + { + "epoch": 0.10811817047971854, + "grad_norm": 0.05682265758514404, + "learning_rate": 7.260000000000001e-06, + "loss": 1.5049, + "step": 363 + }, + { + "epoch": 0.10841601667938718, + "grad_norm": 0.05868418887257576, + "learning_rate": 7.280000000000001e-06, + "loss": 1.5056, + "step": 364 + }, + { + "epoch": 0.10871386287905582, + "grad_norm": 0.06089801341295242, + "learning_rate": 7.3e-06, + "loss": 1.5091, + "step": 365 + }, + { + "epoch": 0.10901170907872447, + "grad_norm": 0.05808059498667717, + "learning_rate": 7.32e-06, + "loss": 1.5009, + "step": 366 + }, + { + "epoch": 0.10930955527839312, + "grad_norm": 0.05970989167690277, + "learning_rate": 7.340000000000001e-06, + "loss": 1.5084, + "step": 367 + }, + { + "epoch": 0.10960740147806176, + "grad_norm": 0.06059359386563301, + "learning_rate": 7.360000000000001e-06, + "loss": 1.5178, + "step": 368 + }, + { + "epoch": 0.10990524767773041, + "grad_norm": 0.05733639374375343, + "learning_rate": 7.3800000000000005e-06, + "loss": 1.4836, + "step": 369 + }, + { + "epoch": 0.11020309387739906, + "grad_norm": 0.0576791875064373, + "learning_rate": 7.4e-06, + "loss": 1.5061, + "step": 370 + }, + { + "epoch": 0.1105009400770677, + "grad_norm": 0.059270892292261124, + "learning_rate": 7.420000000000001e-06, + "loss": 1.5009, + "step": 371 + }, + { + "epoch": 0.11079878627673635, + "grad_norm": 0.05439314246177673, + "learning_rate": 7.440000000000001e-06, + "loss": 1.5037, + "step": 372 + }, + { + "epoch": 0.111096632476405, + "grad_norm": 0.06574559211730957, + "learning_rate": 7.4600000000000006e-06, + "loss": 1.5103, + "step": 373 + }, + { + "epoch": 0.11139447867607365, + "grad_norm": 0.05662696808576584, + "learning_rate": 7.48e-06, + "loss": 1.5117, + "step": 374 + }, + { + "epoch": 0.1116923248757423, + "grad_norm": 0.05377237871289253, + "learning_rate": 7.500000000000001e-06, + "loss": 1.5202, + "step": 375 + }, + { + "epoch": 0.11199017107541094, + "grad_norm": 0.06260445713996887, + "learning_rate": 7.520000000000001e-06, + "loss": 1.4895, + "step": 376 + }, + { + "epoch": 0.11228801727507957, + "grad_norm": 0.05700628459453583, + "learning_rate": 7.540000000000001e-06, + "loss": 1.4807, + "step": 377 + }, + { + "epoch": 0.11258586347474822, + "grad_norm": 0.05728604272007942, + "learning_rate": 7.5600000000000005e-06, + "loss": 1.4928, + "step": 378 + }, + { + "epoch": 0.11288370967441687, + "grad_norm": 0.0564795583486557, + "learning_rate": 7.58e-06, + "loss": 1.5235, + "step": 379 + }, + { + "epoch": 0.11318155587408552, + "grad_norm": 0.05639738589525223, + "learning_rate": 7.600000000000001e-06, + "loss": 1.5046, + "step": 380 + }, + { + "epoch": 0.11347940207375416, + "grad_norm": 0.06486669927835464, + "learning_rate": 7.620000000000001e-06, + "loss": 1.5143, + "step": 381 + }, + { + "epoch": 0.11377724827342281, + "grad_norm": 0.05547960475087166, + "learning_rate": 7.640000000000001e-06, + "loss": 1.5133, + "step": 382 + }, + { + "epoch": 0.11407509447309146, + "grad_norm": 0.05486346781253815, + "learning_rate": 7.660000000000001e-06, + "loss": 1.5082, + "step": 383 + }, + { + "epoch": 0.1143729406727601, + "grad_norm": 0.05648099631071091, + "learning_rate": 7.680000000000001e-06, + "loss": 1.4997, + "step": 384 + }, + { + "epoch": 0.11467078687242875, + "grad_norm": 0.06804478168487549, + "learning_rate": 7.7e-06, + "loss": 1.5092, + "step": 385 + }, + { + "epoch": 0.1149686330720974, + "grad_norm": 0.05766845867037773, + "learning_rate": 7.72e-06, + "loss": 1.4809, + "step": 386 + }, + { + "epoch": 0.11526647927176605, + "grad_norm": 0.056981757283210754, + "learning_rate": 7.74e-06, + "loss": 1.5188, + "step": 387 + }, + { + "epoch": 0.11556432547143469, + "grad_norm": 0.057405441999435425, + "learning_rate": 7.76e-06, + "loss": 1.5039, + "step": 388 + }, + { + "epoch": 0.11586217167110334, + "grad_norm": 0.05664507672190666, + "learning_rate": 7.78e-06, + "loss": 1.4966, + "step": 389 + }, + { + "epoch": 0.11616001787077199, + "grad_norm": 0.054925765842199326, + "learning_rate": 7.800000000000002e-06, + "loss": 1.4999, + "step": 390 + }, + { + "epoch": 0.11645786407044062, + "grad_norm": 0.0627293661236763, + "learning_rate": 7.820000000000001e-06, + "loss": 1.5253, + "step": 391 + }, + { + "epoch": 0.11675571027010927, + "grad_norm": 0.0569860115647316, + "learning_rate": 7.840000000000001e-06, + "loss": 1.501, + "step": 392 + }, + { + "epoch": 0.11705355646977791, + "grad_norm": 0.053533781319856644, + "learning_rate": 7.860000000000001e-06, + "loss": 1.4995, + "step": 393 + }, + { + "epoch": 0.11735140266944656, + "grad_norm": 0.055959541350603104, + "learning_rate": 7.88e-06, + "loss": 1.4732, + "step": 394 + }, + { + "epoch": 0.11764924886911521, + "grad_norm": 0.0623549222946167, + "learning_rate": 7.9e-06, + "loss": 1.4995, + "step": 395 + }, + { + "epoch": 0.11794709506878386, + "grad_norm": 0.055066898465156555, + "learning_rate": 7.92e-06, + "loss": 1.481, + "step": 396 + }, + { + "epoch": 0.1182449412684525, + "grad_norm": 0.053721833974123, + "learning_rate": 7.94e-06, + "loss": 1.5033, + "step": 397 + }, + { + "epoch": 0.11854278746812115, + "grad_norm": 0.05427899211645126, + "learning_rate": 7.960000000000002e-06, + "loss": 1.4925, + "step": 398 + }, + { + "epoch": 0.1188406336677898, + "grad_norm": 0.05966542661190033, + "learning_rate": 7.980000000000002e-06, + "loss": 1.4939, + "step": 399 + }, + { + "epoch": 0.11913847986745844, + "grad_norm": 0.05510552227497101, + "learning_rate": 8.000000000000001e-06, + "loss": 1.4894, + "step": 400 + }, + { + "epoch": 0.11943632606712709, + "grad_norm": 0.05756401643157005, + "learning_rate": 8.020000000000001e-06, + "loss": 1.5015, + "step": 401 + }, + { + "epoch": 0.11973417226679574, + "grad_norm": 0.06447158753871918, + "learning_rate": 8.040000000000001e-06, + "loss": 1.4989, + "step": 402 + }, + { + "epoch": 0.12003201846646439, + "grad_norm": 0.053807564079761505, + "learning_rate": 8.06e-06, + "loss": 1.4997, + "step": 403 + }, + { + "epoch": 0.12032986466613303, + "grad_norm": 0.05765663832426071, + "learning_rate": 8.08e-06, + "loss": 1.5109, + "step": 404 + }, + { + "epoch": 0.12062771086580167, + "grad_norm": 0.05815264210104942, + "learning_rate": 8.1e-06, + "loss": 1.5204, + "step": 405 + }, + { + "epoch": 0.12092555706547031, + "grad_norm": 0.05554782971739769, + "learning_rate": 8.120000000000002e-06, + "loss": 1.5061, + "step": 406 + }, + { + "epoch": 0.12122340326513896, + "grad_norm": 0.05318152904510498, + "learning_rate": 8.14e-06, + "loss": 1.4944, + "step": 407 + }, + { + "epoch": 0.1215212494648076, + "grad_norm": 0.061731383204460144, + "learning_rate": 8.16e-06, + "loss": 1.5216, + "step": 408 + }, + { + "epoch": 0.12181909566447625, + "grad_norm": 0.060279667377471924, + "learning_rate": 8.18e-06, + "loss": 1.5058, + "step": 409 + }, + { + "epoch": 0.1221169418641449, + "grad_norm": 0.05318167433142662, + "learning_rate": 8.2e-06, + "loss": 1.5058, + "step": 410 + }, + { + "epoch": 0.12241478806381355, + "grad_norm": 0.060518402606248856, + "learning_rate": 8.220000000000001e-06, + "loss": 1.5043, + "step": 411 + }, + { + "epoch": 0.1227126342634822, + "grad_norm": 0.0560225285589695, + "learning_rate": 8.24e-06, + "loss": 1.5003, + "step": 412 + }, + { + "epoch": 0.12301048046315084, + "grad_norm": 0.0653400719165802, + "learning_rate": 8.26e-06, + "loss": 1.504, + "step": 413 + }, + { + "epoch": 0.12330832666281949, + "grad_norm": 0.05497625097632408, + "learning_rate": 8.28e-06, + "loss": 1.5043, + "step": 414 + }, + { + "epoch": 0.12360617286248814, + "grad_norm": 0.058193057775497437, + "learning_rate": 8.3e-06, + "loss": 1.5248, + "step": 415 + }, + { + "epoch": 0.12390401906215678, + "grad_norm": 0.06820514798164368, + "learning_rate": 8.32e-06, + "loss": 1.5004, + "step": 416 + }, + { + "epoch": 0.12420186526182543, + "grad_norm": 0.0610477477312088, + "learning_rate": 8.34e-06, + "loss": 1.5011, + "step": 417 + }, + { + "epoch": 0.12449971146149406, + "grad_norm": 0.09810482710599899, + "learning_rate": 8.36e-06, + "loss": 1.5168, + "step": 418 + }, + { + "epoch": 0.12479755766116271, + "grad_norm": 0.05371012166142464, + "learning_rate": 8.380000000000001e-06, + "loss": 1.5134, + "step": 419 + }, + { + "epoch": 0.12509540386083137, + "grad_norm": 0.0640043094754219, + "learning_rate": 8.400000000000001e-06, + "loss": 1.4888, + "step": 420 + }, + { + "epoch": 0.1253932500605, + "grad_norm": 0.08393670618534088, + "learning_rate": 8.42e-06, + "loss": 1.5245, + "step": 421 + }, + { + "epoch": 0.12569109626016867, + "grad_norm": 0.05550488829612732, + "learning_rate": 8.44e-06, + "loss": 1.482, + "step": 422 + }, + { + "epoch": 0.1259889424598373, + "grad_norm": 0.05311019718647003, + "learning_rate": 8.46e-06, + "loss": 1.5023, + "step": 423 + }, + { + "epoch": 0.12628678865950596, + "grad_norm": 0.05454510822892189, + "learning_rate": 8.48e-06, + "loss": 1.5106, + "step": 424 + }, + { + "epoch": 0.1265846348591746, + "grad_norm": 0.056061387062072754, + "learning_rate": 8.5e-06, + "loss": 1.4945, + "step": 425 + }, + { + "epoch": 0.12688248105884323, + "grad_norm": 0.07110252976417542, + "learning_rate": 8.52e-06, + "loss": 1.4938, + "step": 426 + }, + { + "epoch": 0.1271803272585119, + "grad_norm": 0.05501431226730347, + "learning_rate": 8.540000000000001e-06, + "loss": 1.5179, + "step": 427 + }, + { + "epoch": 0.12747817345818052, + "grad_norm": 0.07934402674436569, + "learning_rate": 8.560000000000001e-06, + "loss": 1.4848, + "step": 428 + }, + { + "epoch": 0.12777601965784918, + "grad_norm": 0.054356809705495834, + "learning_rate": 8.580000000000001e-06, + "loss": 1.5047, + "step": 429 + }, + { + "epoch": 0.12807386585751782, + "grad_norm": 0.054320644587278366, + "learning_rate": 8.6e-06, + "loss": 1.4871, + "step": 430 + }, + { + "epoch": 0.12837171205718648, + "grad_norm": 0.057648032903671265, + "learning_rate": 8.62e-06, + "loss": 1.4814, + "step": 431 + }, + { + "epoch": 0.1286695582568551, + "grad_norm": 0.06109480559825897, + "learning_rate": 8.64e-06, + "loss": 1.4859, + "step": 432 + }, + { + "epoch": 0.12896740445652377, + "grad_norm": 0.06258855015039444, + "learning_rate": 8.66e-06, + "loss": 1.4824, + "step": 433 + }, + { + "epoch": 0.1292652506561924, + "grad_norm": 0.0565275102853775, + "learning_rate": 8.68e-06, + "loss": 1.4883, + "step": 434 + }, + { + "epoch": 0.12956309685586107, + "grad_norm": 0.058759719133377075, + "learning_rate": 8.700000000000001e-06, + "loss": 1.4909, + "step": 435 + }, + { + "epoch": 0.1298609430555297, + "grad_norm": 0.06828753650188446, + "learning_rate": 8.720000000000001e-06, + "loss": 1.5012, + "step": 436 + }, + { + "epoch": 0.13015878925519836, + "grad_norm": 0.05682528391480446, + "learning_rate": 8.740000000000001e-06, + "loss": 1.4848, + "step": 437 + }, + { + "epoch": 0.130456635454867, + "grad_norm": 0.06481580436229706, + "learning_rate": 8.76e-06, + "loss": 1.5015, + "step": 438 + }, + { + "epoch": 0.13075448165453563, + "grad_norm": 0.0758729875087738, + "learning_rate": 8.78e-06, + "loss": 1.491, + "step": 439 + }, + { + "epoch": 0.1310523278542043, + "grad_norm": 0.058766938745975494, + "learning_rate": 8.8e-06, + "loss": 1.4808, + "step": 440 + }, + { + "epoch": 0.13135017405387292, + "grad_norm": 0.07881702482700348, + "learning_rate": 8.82e-06, + "loss": 1.4945, + "step": 441 + }, + { + "epoch": 0.13164802025354158, + "grad_norm": 0.06895948201417923, + "learning_rate": 8.84e-06, + "loss": 1.4932, + "step": 442 + }, + { + "epoch": 0.13194586645321021, + "grad_norm": 0.05118221789598465, + "learning_rate": 8.860000000000002e-06, + "loss": 1.4728, + "step": 443 + }, + { + "epoch": 0.13224371265287888, + "grad_norm": 0.07200183719396591, + "learning_rate": 8.880000000000001e-06, + "loss": 1.4938, + "step": 444 + }, + { + "epoch": 0.1325415588525475, + "grad_norm": 0.07630757242441177, + "learning_rate": 8.900000000000001e-06, + "loss": 1.4919, + "step": 445 + }, + { + "epoch": 0.13283940505221617, + "grad_norm": 0.057770371437072754, + "learning_rate": 8.920000000000001e-06, + "loss": 1.4768, + "step": 446 + }, + { + "epoch": 0.1331372512518848, + "grad_norm": 0.0767252966761589, + "learning_rate": 8.94e-06, + "loss": 1.4739, + "step": 447 + }, + { + "epoch": 0.13343509745155346, + "grad_norm": 0.069420225918293, + "learning_rate": 8.96e-06, + "loss": 1.4904, + "step": 448 + }, + { + "epoch": 0.1337329436512221, + "grad_norm": 0.05710853636264801, + "learning_rate": 8.98e-06, + "loss": 1.5044, + "step": 449 + }, + { + "epoch": 0.13403078985089076, + "grad_norm": 0.10228551924228668, + "learning_rate": 9e-06, + "loss": 1.4748, + "step": 450 + }, + { + "epoch": 0.1343286360505594, + "grad_norm": 0.06879130750894547, + "learning_rate": 9.020000000000002e-06, + "loss": 1.4723, + "step": 451 + }, + { + "epoch": 0.13462648225022805, + "grad_norm": 0.06307007372379303, + "learning_rate": 9.040000000000002e-06, + "loss": 1.4817, + "step": 452 + }, + { + "epoch": 0.13492432844989669, + "grad_norm": 0.08950717747211456, + "learning_rate": 9.060000000000001e-06, + "loss": 1.4978, + "step": 453 + }, + { + "epoch": 0.13522217464956532, + "grad_norm": 0.05961303785443306, + "learning_rate": 9.080000000000001e-06, + "loss": 1.4799, + "step": 454 + }, + { + "epoch": 0.13552002084923398, + "grad_norm": 0.053586605936288834, + "learning_rate": 9.100000000000001e-06, + "loss": 1.4979, + "step": 455 + }, + { + "epoch": 0.1358178670489026, + "grad_norm": 0.08085786551237106, + "learning_rate": 9.12e-06, + "loss": 1.498, + "step": 456 + }, + { + "epoch": 0.13611571324857127, + "grad_norm": 0.08234117180109024, + "learning_rate": 9.14e-06, + "loss": 1.4999, + "step": 457 + }, + { + "epoch": 0.1364135594482399, + "grad_norm": 0.06344082951545715, + "learning_rate": 9.16e-06, + "loss": 1.4977, + "step": 458 + }, + { + "epoch": 0.13671140564790857, + "grad_norm": 0.08588889241218567, + "learning_rate": 9.180000000000002e-06, + "loss": 1.5045, + "step": 459 + }, + { + "epoch": 0.1370092518475772, + "grad_norm": 0.09717432409524918, + "learning_rate": 9.200000000000002e-06, + "loss": 1.5038, + "step": 460 + }, + { + "epoch": 0.13730709804724586, + "grad_norm": 0.06904091686010361, + "learning_rate": 9.220000000000002e-06, + "loss": 1.4827, + "step": 461 + }, + { + "epoch": 0.1376049442469145, + "grad_norm": 0.08623939752578735, + "learning_rate": 9.240000000000001e-06, + "loss": 1.473, + "step": 462 + }, + { + "epoch": 0.13790279044658316, + "grad_norm": 0.0879717618227005, + "learning_rate": 9.260000000000001e-06, + "loss": 1.5091, + "step": 463 + }, + { + "epoch": 0.1382006366462518, + "grad_norm": 0.06129393354058266, + "learning_rate": 9.280000000000001e-06, + "loss": 1.4717, + "step": 464 + }, + { + "epoch": 0.13849848284592045, + "grad_norm": 0.06214470416307449, + "learning_rate": 9.3e-06, + "loss": 1.5008, + "step": 465 + }, + { + "epoch": 0.13879632904558908, + "grad_norm": 0.08542024344205856, + "learning_rate": 9.32e-06, + "loss": 1.5034, + "step": 466 + }, + { + "epoch": 0.13909417524525772, + "grad_norm": 0.06799095869064331, + "learning_rate": 9.340000000000002e-06, + "loss": 1.4733, + "step": 467 + }, + { + "epoch": 0.13939202144492638, + "grad_norm": 0.05802611634135246, + "learning_rate": 9.360000000000002e-06, + "loss": 1.4819, + "step": 468 + }, + { + "epoch": 0.139689867644595, + "grad_norm": 0.06662992388010025, + "learning_rate": 9.38e-06, + "loss": 1.492, + "step": 469 + }, + { + "epoch": 0.13998771384426367, + "grad_norm": 0.05898972228169441, + "learning_rate": 9.4e-06, + "loss": 1.493, + "step": 470 + }, + { + "epoch": 0.1402855600439323, + "grad_norm": 0.05767171084880829, + "learning_rate": 9.42e-06, + "loss": 1.466, + "step": 471 + }, + { + "epoch": 0.14058340624360097, + "grad_norm": 0.062400639057159424, + "learning_rate": 9.440000000000001e-06, + "loss": 1.48, + "step": 472 + }, + { + "epoch": 0.1408812524432696, + "grad_norm": 0.0686592310667038, + "learning_rate": 9.460000000000001e-06, + "loss": 1.4856, + "step": 473 + }, + { + "epoch": 0.14117909864293826, + "grad_norm": 0.06079595908522606, + "learning_rate": 9.48e-06, + "loss": 1.4846, + "step": 474 + }, + { + "epoch": 0.1414769448426069, + "grad_norm": 0.06447052955627441, + "learning_rate": 9.5e-06, + "loss": 1.4845, + "step": 475 + }, + { + "epoch": 0.14177479104227556, + "grad_norm": 0.06018751487135887, + "learning_rate": 9.52e-06, + "loss": 1.4913, + "step": 476 + }, + { + "epoch": 0.1420726372419442, + "grad_norm": 0.06974750012159348, + "learning_rate": 9.54e-06, + "loss": 1.4878, + "step": 477 + }, + { + "epoch": 0.14237048344161285, + "grad_norm": 0.06062848120927811, + "learning_rate": 9.56e-06, + "loss": 1.4897, + "step": 478 + }, + { + "epoch": 0.14266832964128148, + "grad_norm": 0.06055650860071182, + "learning_rate": 9.58e-06, + "loss": 1.4746, + "step": 479 + }, + { + "epoch": 0.14296617584095012, + "grad_norm": 0.06422478705644608, + "learning_rate": 9.600000000000001e-06, + "loss": 1.4749, + "step": 480 + }, + { + "epoch": 0.14326402204061878, + "grad_norm": 0.05979803204536438, + "learning_rate": 9.620000000000001e-06, + "loss": 1.4844, + "step": 481 + }, + { + "epoch": 0.1435618682402874, + "grad_norm": 0.0778728798031807, + "learning_rate": 9.640000000000001e-06, + "loss": 1.4919, + "step": 482 + }, + { + "epoch": 0.14385971443995607, + "grad_norm": 0.06429169327020645, + "learning_rate": 9.66e-06, + "loss": 1.4987, + "step": 483 + }, + { + "epoch": 0.1441575606396247, + "grad_norm": 0.05542841926217079, + "learning_rate": 9.68e-06, + "loss": 1.4737, + "step": 484 + }, + { + "epoch": 0.14445540683929337, + "grad_norm": 0.07081244140863419, + "learning_rate": 9.7e-06, + "loss": 1.4816, + "step": 485 + }, + { + "epoch": 0.144753253038962, + "grad_norm": 0.07182405889034271, + "learning_rate": 9.72e-06, + "loss": 1.4958, + "step": 486 + }, + { + "epoch": 0.14505109923863066, + "grad_norm": 0.0578744113445282, + "learning_rate": 9.74e-06, + "loss": 1.4902, + "step": 487 + }, + { + "epoch": 0.1453489454382993, + "grad_norm": 0.07073856890201569, + "learning_rate": 9.760000000000001e-06, + "loss": 1.4954, + "step": 488 + }, + { + "epoch": 0.14564679163796795, + "grad_norm": 0.0563889779150486, + "learning_rate": 9.780000000000001e-06, + "loss": 1.4759, + "step": 489 + }, + { + "epoch": 0.1459446378376366, + "grad_norm": 0.06201820820569992, + "learning_rate": 9.800000000000001e-06, + "loss": 1.4978, + "step": 490 + }, + { + "epoch": 0.14624248403730525, + "grad_norm": 0.058224428445100784, + "learning_rate": 9.820000000000001e-06, + "loss": 1.4554, + "step": 491 + }, + { + "epoch": 0.14654033023697388, + "grad_norm": 0.05769447609782219, + "learning_rate": 9.84e-06, + "loss": 1.4685, + "step": 492 + }, + { + "epoch": 0.14683817643664254, + "grad_norm": 0.06263457983732224, + "learning_rate": 9.86e-06, + "loss": 1.475, + "step": 493 + }, + { + "epoch": 0.14713602263631118, + "grad_norm": 0.05658208206295967, + "learning_rate": 9.88e-06, + "loss": 1.4806, + "step": 494 + }, + { + "epoch": 0.1474338688359798, + "grad_norm": 0.07487098127603531, + "learning_rate": 9.9e-06, + "loss": 1.4962, + "step": 495 + }, + { + "epoch": 0.14773171503564847, + "grad_norm": 0.05984261631965637, + "learning_rate": 9.920000000000002e-06, + "loss": 1.471, + "step": 496 + }, + { + "epoch": 0.1480295612353171, + "grad_norm": 0.061114851385354996, + "learning_rate": 9.940000000000001e-06, + "loss": 1.4734, + "step": 497 + }, + { + "epoch": 0.14832740743498576, + "grad_norm": 0.06528756022453308, + "learning_rate": 9.960000000000001e-06, + "loss": 1.4599, + "step": 498 + }, + { + "epoch": 0.1486252536346544, + "grad_norm": 0.08878826349973679, + "learning_rate": 9.980000000000001e-06, + "loss": 1.4971, + "step": 499 + }, + { + "epoch": 0.14892309983432306, + "grad_norm": 0.09886343032121658, + "learning_rate": 1e-05, + "loss": 1.4828, + "step": 500 + }, + { + "epoch": 0.14892309983432306, + "eval_loss": 1.4305658340454102, + "eval_runtime": 18.358, + "eval_samples_per_second": 94.455, + "eval_steps_per_second": 5.937, + "step": 500 + }, + { + "epoch": 0.1492209460339917, + "grad_norm": 0.06291390210390091, + "learning_rate": 1.002e-05, + "loss": 1.4785, + "step": 501 + }, + { + "epoch": 0.14951879223366035, + "grad_norm": 0.0753321647644043, + "learning_rate": 1.004e-05, + "loss": 1.4701, + "step": 502 + }, + { + "epoch": 0.14981663843332899, + "grad_norm": 0.06809154152870178, + "learning_rate": 1.006e-05, + "loss": 1.4695, + "step": 503 + }, + { + "epoch": 0.15011448463299765, + "grad_norm": 0.08590058237314224, + "learning_rate": 1.008e-05, + "loss": 1.4905, + "step": 504 + }, + { + "epoch": 0.15041233083266628, + "grad_norm": 0.0781983733177185, + "learning_rate": 1.0100000000000002e-05, + "loss": 1.4806, + "step": 505 + }, + { + "epoch": 0.15071017703233494, + "grad_norm": 0.0789913758635521, + "learning_rate": 1.0120000000000001e-05, + "loss": 1.4838, + "step": 506 + }, + { + "epoch": 0.15100802323200357, + "grad_norm": 0.07115836441516876, + "learning_rate": 1.0140000000000001e-05, + "loss": 1.4888, + "step": 507 + }, + { + "epoch": 0.1513058694316722, + "grad_norm": 0.06764430552721024, + "learning_rate": 1.0160000000000001e-05, + "loss": 1.4657, + "step": 508 + }, + { + "epoch": 0.15160371563134087, + "grad_norm": 0.0823146253824234, + "learning_rate": 1.018e-05, + "loss": 1.4973, + "step": 509 + }, + { + "epoch": 0.1519015618310095, + "grad_norm": 0.11073504388332367, + "learning_rate": 1.02e-05, + "loss": 1.4692, + "step": 510 + }, + { + "epoch": 0.15219940803067816, + "grad_norm": 0.07361211627721786, + "learning_rate": 1.022e-05, + "loss": 1.4951, + "step": 511 + }, + { + "epoch": 0.1524972542303468, + "grad_norm": 0.11719920486211777, + "learning_rate": 1.024e-05, + "loss": 1.4888, + "step": 512 + }, + { + "epoch": 0.15279510043001546, + "grad_norm": 0.06927847117185593, + "learning_rate": 1.0260000000000002e-05, + "loss": 1.4728, + "step": 513 + }, + { + "epoch": 0.1530929466296841, + "grad_norm": 0.0775856226682663, + "learning_rate": 1.0280000000000002e-05, + "loss": 1.4768, + "step": 514 + }, + { + "epoch": 0.15339079282935275, + "grad_norm": 0.10022097826004028, + "learning_rate": 1.0300000000000001e-05, + "loss": 1.4891, + "step": 515 + }, + { + "epoch": 0.15368863902902138, + "grad_norm": 0.06402795761823654, + "learning_rate": 1.0320000000000001e-05, + "loss": 1.4642, + "step": 516 + }, + { + "epoch": 0.15398648522869005, + "grad_norm": 0.07679101824760437, + "learning_rate": 1.0340000000000001e-05, + "loss": 1.4729, + "step": 517 + }, + { + "epoch": 0.15428433142835868, + "grad_norm": 0.11688075214624405, + "learning_rate": 1.036e-05, + "loss": 1.488, + "step": 518 + }, + { + "epoch": 0.15458217762802734, + "grad_norm": 0.06289079040288925, + "learning_rate": 1.038e-05, + "loss": 1.4791, + "step": 519 + }, + { + "epoch": 0.15488002382769597, + "grad_norm": 0.07349809259176254, + "learning_rate": 1.04e-05, + "loss": 1.4794, + "step": 520 + }, + { + "epoch": 0.1551778700273646, + "grad_norm": 0.0793452039361, + "learning_rate": 1.0420000000000002e-05, + "loss": 1.5016, + "step": 521 + }, + { + "epoch": 0.15547571622703327, + "grad_norm": 0.0695367380976677, + "learning_rate": 1.0440000000000002e-05, + "loss": 1.4687, + "step": 522 + }, + { + "epoch": 0.1557735624267019, + "grad_norm": 0.0566876195371151, + "learning_rate": 1.0460000000000001e-05, + "loss": 1.4522, + "step": 523 + }, + { + "epoch": 0.15607140862637056, + "grad_norm": 0.06410634517669678, + "learning_rate": 1.0480000000000001e-05, + "loss": 1.5194, + "step": 524 + }, + { + "epoch": 0.1563692548260392, + "grad_norm": 0.09583567082881927, + "learning_rate": 1.0500000000000001e-05, + "loss": 1.4795, + "step": 525 + }, + { + "epoch": 0.15666710102570786, + "grad_norm": 0.0846511498093605, + "learning_rate": 1.0520000000000001e-05, + "loss": 1.4784, + "step": 526 + }, + { + "epoch": 0.1569649472253765, + "grad_norm": 0.07598602026700974, + "learning_rate": 1.054e-05, + "loss": 1.4767, + "step": 527 + }, + { + "epoch": 0.15726279342504515, + "grad_norm": 0.07100868970155716, + "learning_rate": 1.056e-05, + "loss": 1.4854, + "step": 528 + }, + { + "epoch": 0.15756063962471378, + "grad_norm": 0.06840056926012039, + "learning_rate": 1.0580000000000002e-05, + "loss": 1.4487, + "step": 529 + }, + { + "epoch": 0.15785848582438244, + "grad_norm": 0.072611004114151, + "learning_rate": 1.0600000000000002e-05, + "loss": 1.4675, + "step": 530 + }, + { + "epoch": 0.15815633202405108, + "grad_norm": 0.08857429027557373, + "learning_rate": 1.0620000000000002e-05, + "loss": 1.4517, + "step": 531 + }, + { + "epoch": 0.15845417822371974, + "grad_norm": 0.0728512555360794, + "learning_rate": 1.0640000000000001e-05, + "loss": 1.4671, + "step": 532 + }, + { + "epoch": 0.15875202442338837, + "grad_norm": 0.08091485500335693, + "learning_rate": 1.0660000000000001e-05, + "loss": 1.4882, + "step": 533 + }, + { + "epoch": 0.15904987062305703, + "grad_norm": 0.06646449863910675, + "learning_rate": 1.0680000000000001e-05, + "loss": 1.4777, + "step": 534 + }, + { + "epoch": 0.15934771682272567, + "grad_norm": 0.06292653828859329, + "learning_rate": 1.0700000000000001e-05, + "loss": 1.4853, + "step": 535 + }, + { + "epoch": 0.1596455630223943, + "grad_norm": 0.07076392322778702, + "learning_rate": 1.072e-05, + "loss": 1.4711, + "step": 536 + }, + { + "epoch": 0.15994340922206296, + "grad_norm": 0.07309917360544205, + "learning_rate": 1.0740000000000002e-05, + "loss": 1.4629, + "step": 537 + }, + { + "epoch": 0.1602412554217316, + "grad_norm": 0.06675314903259277, + "learning_rate": 1.0760000000000002e-05, + "loss": 1.4567, + "step": 538 + }, + { + "epoch": 0.16053910162140025, + "grad_norm": 0.06333121657371521, + "learning_rate": 1.0780000000000002e-05, + "loss": 1.4799, + "step": 539 + }, + { + "epoch": 0.1608369478210689, + "grad_norm": 0.07681736350059509, + "learning_rate": 1.0800000000000002e-05, + "loss": 1.4699, + "step": 540 + }, + { + "epoch": 0.16113479402073755, + "grad_norm": 0.058470066636800766, + "learning_rate": 1.0820000000000001e-05, + "loss": 1.4713, + "step": 541 + }, + { + "epoch": 0.16143264022040618, + "grad_norm": 0.06542841345071793, + "learning_rate": 1.0840000000000001e-05, + "loss": 1.4826, + "step": 542 + }, + { + "epoch": 0.16173048642007484, + "grad_norm": 0.06384658813476562, + "learning_rate": 1.0860000000000001e-05, + "loss": 1.4763, + "step": 543 + }, + { + "epoch": 0.16202833261974348, + "grad_norm": 0.06615178287029266, + "learning_rate": 1.0880000000000001e-05, + "loss": 1.4606, + "step": 544 + }, + { + "epoch": 0.16232617881941214, + "grad_norm": 0.06471758335828781, + "learning_rate": 1.0900000000000002e-05, + "loss": 1.4896, + "step": 545 + }, + { + "epoch": 0.16262402501908077, + "grad_norm": 0.06672003865242004, + "learning_rate": 1.0920000000000002e-05, + "loss": 1.4744, + "step": 546 + }, + { + "epoch": 0.16292187121874943, + "grad_norm": 0.07130942493677139, + "learning_rate": 1.0940000000000002e-05, + "loss": 1.493, + "step": 547 + }, + { + "epoch": 0.16321971741841806, + "grad_norm": 0.07708992809057236, + "learning_rate": 1.0960000000000002e-05, + "loss": 1.4663, + "step": 548 + }, + { + "epoch": 0.1635175636180867, + "grad_norm": 0.06569571793079376, + "learning_rate": 1.0980000000000002e-05, + "loss": 1.461, + "step": 549 + }, + { + "epoch": 0.16381540981775536, + "grad_norm": 0.06068449467420578, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.462, + "step": 550 + }, + { + "epoch": 0.164113256017424, + "grad_norm": 0.06866385042667389, + "learning_rate": 1.1020000000000001e-05, + "loss": 1.4795, + "step": 551 + }, + { + "epoch": 0.16441110221709265, + "grad_norm": 0.07375936955213547, + "learning_rate": 1.1040000000000001e-05, + "loss": 1.4568, + "step": 552 + }, + { + "epoch": 0.16470894841676129, + "grad_norm": 0.07791747152805328, + "learning_rate": 1.1060000000000003e-05, + "loss": 1.4699, + "step": 553 + }, + { + "epoch": 0.16500679461642995, + "grad_norm": 0.06984464079141617, + "learning_rate": 1.1080000000000002e-05, + "loss": 1.4797, + "step": 554 + }, + { + "epoch": 0.16530464081609858, + "grad_norm": 0.06589505821466446, + "learning_rate": 1.1100000000000002e-05, + "loss": 1.4671, + "step": 555 + }, + { + "epoch": 0.16560248701576724, + "grad_norm": 0.08439430594444275, + "learning_rate": 1.1120000000000002e-05, + "loss": 1.4732, + "step": 556 + }, + { + "epoch": 0.16590033321543587, + "grad_norm": 0.07184866070747375, + "learning_rate": 1.1140000000000002e-05, + "loss": 1.4744, + "step": 557 + }, + { + "epoch": 0.16619817941510454, + "grad_norm": 0.07689396291971207, + "learning_rate": 1.1160000000000002e-05, + "loss": 1.4657, + "step": 558 + }, + { + "epoch": 0.16649602561477317, + "grad_norm": 0.06957592815160751, + "learning_rate": 1.1180000000000001e-05, + "loss": 1.4686, + "step": 559 + }, + { + "epoch": 0.16679387181444183, + "grad_norm": 0.0719662606716156, + "learning_rate": 1.1200000000000001e-05, + "loss": 1.476, + "step": 560 + }, + { + "epoch": 0.16709171801411046, + "grad_norm": 0.08123235404491425, + "learning_rate": 1.1220000000000003e-05, + "loss": 1.4563, + "step": 561 + }, + { + "epoch": 0.1673895642137791, + "grad_norm": 0.08388621360063553, + "learning_rate": 1.1240000000000002e-05, + "loss": 1.469, + "step": 562 + }, + { + "epoch": 0.16768741041344776, + "grad_norm": 0.06538707762956619, + "learning_rate": 1.126e-05, + "loss": 1.4779, + "step": 563 + }, + { + "epoch": 0.1679852566131164, + "grad_norm": 0.0906519740819931, + "learning_rate": 1.128e-05, + "loss": 1.4712, + "step": 564 + }, + { + "epoch": 0.16828310281278505, + "grad_norm": 0.08886244148015976, + "learning_rate": 1.13e-05, + "loss": 1.4566, + "step": 565 + }, + { + "epoch": 0.16858094901245368, + "grad_norm": 0.06940344721078873, + "learning_rate": 1.132e-05, + "loss": 1.4622, + "step": 566 + }, + { + "epoch": 0.16887879521212235, + "grad_norm": 0.07784947752952576, + "learning_rate": 1.134e-05, + "loss": 1.4923, + "step": 567 + }, + { + "epoch": 0.16917664141179098, + "grad_norm": 0.08280647546052933, + "learning_rate": 1.136e-05, + "loss": 1.4729, + "step": 568 + }, + { + "epoch": 0.16947448761145964, + "grad_norm": 0.0870596393942833, + "learning_rate": 1.138e-05, + "loss": 1.471, + "step": 569 + }, + { + "epoch": 0.16977233381112827, + "grad_norm": 0.07927907258272171, + "learning_rate": 1.14e-05, + "loss": 1.444, + "step": 570 + }, + { + "epoch": 0.17007018001079693, + "grad_norm": 0.08484699577093124, + "learning_rate": 1.142e-05, + "loss": 1.4811, + "step": 571 + }, + { + "epoch": 0.17036802621046557, + "grad_norm": 0.07751280069351196, + "learning_rate": 1.144e-05, + "loss": 1.4679, + "step": 572 + }, + { + "epoch": 0.17066587241013423, + "grad_norm": 0.06520108878612518, + "learning_rate": 1.146e-05, + "loss": 1.471, + "step": 573 + }, + { + "epoch": 0.17096371860980286, + "grad_norm": 0.09336327761411667, + "learning_rate": 1.148e-05, + "loss": 1.4566, + "step": 574 + }, + { + "epoch": 0.17126156480947152, + "grad_norm": 0.0706481784582138, + "learning_rate": 1.15e-05, + "loss": 1.4694, + "step": 575 + }, + { + "epoch": 0.17155941100914016, + "grad_norm": 0.11398935317993164, + "learning_rate": 1.152e-05, + "loss": 1.4729, + "step": 576 + }, + { + "epoch": 0.1718572572088088, + "grad_norm": 0.07338732481002808, + "learning_rate": 1.154e-05, + "loss": 1.4707, + "step": 577 + }, + { + "epoch": 0.17215510340847745, + "grad_norm": 0.07571332901716232, + "learning_rate": 1.156e-05, + "loss": 1.4594, + "step": 578 + }, + { + "epoch": 0.17245294960814608, + "grad_norm": 0.07202799618244171, + "learning_rate": 1.1580000000000001e-05, + "loss": 1.4657, + "step": 579 + }, + { + "epoch": 0.17275079580781474, + "grad_norm": 0.07837383449077606, + "learning_rate": 1.16e-05, + "loss": 1.4493, + "step": 580 + }, + { + "epoch": 0.17304864200748338, + "grad_norm": 0.07715293765068054, + "learning_rate": 1.162e-05, + "loss": 1.4824, + "step": 581 + }, + { + "epoch": 0.17334648820715204, + "grad_norm": 0.07711914926767349, + "learning_rate": 1.164e-05, + "loss": 1.4495, + "step": 582 + }, + { + "epoch": 0.17364433440682067, + "grad_norm": 0.07181393355131149, + "learning_rate": 1.166e-05, + "loss": 1.4903, + "step": 583 + }, + { + "epoch": 0.17394218060648933, + "grad_norm": 0.07789280265569687, + "learning_rate": 1.168e-05, + "loss": 1.4734, + "step": 584 + }, + { + "epoch": 0.17424002680615797, + "grad_norm": 0.07323314249515533, + "learning_rate": 1.17e-05, + "loss": 1.4702, + "step": 585 + }, + { + "epoch": 0.17453787300582663, + "grad_norm": 0.06602338701486588, + "learning_rate": 1.172e-05, + "loss": 1.4532, + "step": 586 + }, + { + "epoch": 0.17483571920549526, + "grad_norm": 0.07128585875034332, + "learning_rate": 1.1740000000000001e-05, + "loss": 1.456, + "step": 587 + }, + { + "epoch": 0.17513356540516392, + "grad_norm": 0.06492584943771362, + "learning_rate": 1.1760000000000001e-05, + "loss": 1.4524, + "step": 588 + }, + { + "epoch": 0.17543141160483255, + "grad_norm": 0.06519263237714767, + "learning_rate": 1.178e-05, + "loss": 1.4638, + "step": 589 + }, + { + "epoch": 0.1757292578045012, + "grad_norm": 0.0794047936797142, + "learning_rate": 1.18e-05, + "loss": 1.4688, + "step": 590 + }, + { + "epoch": 0.17602710400416985, + "grad_norm": 0.07813981920480728, + "learning_rate": 1.182e-05, + "loss": 1.4661, + "step": 591 + }, + { + "epoch": 0.17632495020383848, + "grad_norm": 0.06432987749576569, + "learning_rate": 1.184e-05, + "loss": 1.4653, + "step": 592 + }, + { + "epoch": 0.17662279640350714, + "grad_norm": 0.06692880392074585, + "learning_rate": 1.186e-05, + "loss": 1.4577, + "step": 593 + }, + { + "epoch": 0.17692064260317578, + "grad_norm": 0.07275351136922836, + "learning_rate": 1.188e-05, + "loss": 1.4617, + "step": 594 + }, + { + "epoch": 0.17721848880284444, + "grad_norm": 0.07653673738241196, + "learning_rate": 1.1900000000000001e-05, + "loss": 1.4733, + "step": 595 + }, + { + "epoch": 0.17751633500251307, + "grad_norm": 0.07326718419790268, + "learning_rate": 1.1920000000000001e-05, + "loss": 1.4585, + "step": 596 + }, + { + "epoch": 0.17781418120218173, + "grad_norm": 0.07613977789878845, + "learning_rate": 1.1940000000000001e-05, + "loss": 1.4776, + "step": 597 + }, + { + "epoch": 0.17811202740185036, + "grad_norm": 0.07090508937835693, + "learning_rate": 1.196e-05, + "loss": 1.4704, + "step": 598 + }, + { + "epoch": 0.17840987360151903, + "grad_norm": 0.07558804005384445, + "learning_rate": 1.198e-05, + "loss": 1.4618, + "step": 599 + }, + { + "epoch": 0.17870771980118766, + "grad_norm": 0.07647745311260223, + "learning_rate": 1.2e-05, + "loss": 1.4842, + "step": 600 + }, + { + "epoch": 0.17900556600085632, + "grad_norm": 0.08317311108112335, + "learning_rate": 1.202e-05, + "loss": 1.4756, + "step": 601 + }, + { + "epoch": 0.17930341220052495, + "grad_norm": 0.08071677386760712, + "learning_rate": 1.204e-05, + "loss": 1.4592, + "step": 602 + }, + { + "epoch": 0.1796012584001936, + "grad_norm": 0.06859596073627472, + "learning_rate": 1.2060000000000001e-05, + "loss": 1.4583, + "step": 603 + }, + { + "epoch": 0.17989910459986225, + "grad_norm": 0.07272063195705414, + "learning_rate": 1.2080000000000001e-05, + "loss": 1.4582, + "step": 604 + }, + { + "epoch": 0.18019695079953088, + "grad_norm": 0.2326221913099289, + "learning_rate": 1.2100000000000001e-05, + "loss": 1.464, + "step": 605 + }, + { + "epoch": 0.18049479699919954, + "grad_norm": 0.08455421030521393, + "learning_rate": 1.2120000000000001e-05, + "loss": 1.4555, + "step": 606 + }, + { + "epoch": 0.18079264319886817, + "grad_norm": 0.08261154592037201, + "learning_rate": 1.214e-05, + "loss": 1.4779, + "step": 607 + }, + { + "epoch": 0.18109048939853684, + "grad_norm": 0.08473054319620132, + "learning_rate": 1.216e-05, + "loss": 1.4533, + "step": 608 + }, + { + "epoch": 0.18138833559820547, + "grad_norm": 0.10028796643018723, + "learning_rate": 1.218e-05, + "loss": 1.4532, + "step": 609 + }, + { + "epoch": 0.18168618179787413, + "grad_norm": 0.07884956896305084, + "learning_rate": 1.22e-05, + "loss": 1.4874, + "step": 610 + }, + { + "epoch": 0.18198402799754276, + "grad_norm": 0.0739910900592804, + "learning_rate": 1.2220000000000002e-05, + "loss": 1.4523, + "step": 611 + }, + { + "epoch": 0.18228187419721142, + "grad_norm": 0.07816793024539948, + "learning_rate": 1.2240000000000001e-05, + "loss": 1.4814, + "step": 612 + }, + { + "epoch": 0.18257972039688006, + "grad_norm": 0.08463511615991592, + "learning_rate": 1.2260000000000001e-05, + "loss": 1.4639, + "step": 613 + }, + { + "epoch": 0.18287756659654872, + "grad_norm": 0.0768217220902443, + "learning_rate": 1.2280000000000001e-05, + "loss": 1.4461, + "step": 614 + }, + { + "epoch": 0.18317541279621735, + "grad_norm": 0.06891295313835144, + "learning_rate": 1.23e-05, + "loss": 1.4489, + "step": 615 + }, + { + "epoch": 0.183473258995886, + "grad_norm": 0.08129339665174484, + "learning_rate": 1.232e-05, + "loss": 1.4646, + "step": 616 + }, + { + "epoch": 0.18377110519555465, + "grad_norm": 0.09381826967000961, + "learning_rate": 1.234e-05, + "loss": 1.4772, + "step": 617 + }, + { + "epoch": 0.18406895139522328, + "grad_norm": 0.07440833002328873, + "learning_rate": 1.236e-05, + "loss": 1.453, + "step": 618 + }, + { + "epoch": 0.18436679759489194, + "grad_norm": 0.07224272936582565, + "learning_rate": 1.2380000000000002e-05, + "loss": 1.4581, + "step": 619 + }, + { + "epoch": 0.18466464379456057, + "grad_norm": 0.09223474562168121, + "learning_rate": 1.2400000000000002e-05, + "loss": 1.4644, + "step": 620 + }, + { + "epoch": 0.18496248999422923, + "grad_norm": 0.072278693318367, + "learning_rate": 1.2420000000000001e-05, + "loss": 1.4756, + "step": 621 + }, + { + "epoch": 0.18526033619389787, + "grad_norm": 0.07935364544391632, + "learning_rate": 1.2440000000000001e-05, + "loss": 1.4405, + "step": 622 + }, + { + "epoch": 0.18555818239356653, + "grad_norm": 0.08382735401391983, + "learning_rate": 1.2460000000000001e-05, + "loss": 1.4551, + "step": 623 + }, + { + "epoch": 0.18585602859323516, + "grad_norm": 0.09439096599817276, + "learning_rate": 1.248e-05, + "loss": 1.4753, + "step": 624 + }, + { + "epoch": 0.18615387479290382, + "grad_norm": 0.08867479115724564, + "learning_rate": 1.25e-05, + "loss": 1.453, + "step": 625 + }, + { + "epoch": 0.18645172099257246, + "grad_norm": 0.07792381942272186, + "learning_rate": 1.252e-05, + "loss": 1.4379, + "step": 626 + }, + { + "epoch": 0.18674956719224112, + "grad_norm": 0.09601400792598724, + "learning_rate": 1.254e-05, + "loss": 1.47, + "step": 627 + }, + { + "epoch": 0.18704741339190975, + "grad_norm": 0.07906622439622879, + "learning_rate": 1.2560000000000002e-05, + "loss": 1.4668, + "step": 628 + }, + { + "epoch": 0.1873452595915784, + "grad_norm": 0.0777350515127182, + "learning_rate": 1.2580000000000002e-05, + "loss": 1.4527, + "step": 629 + }, + { + "epoch": 0.18764310579124704, + "grad_norm": 0.06617771834135056, + "learning_rate": 1.2600000000000001e-05, + "loss": 1.4496, + "step": 630 + }, + { + "epoch": 0.18794095199091568, + "grad_norm": 0.07273231446743011, + "learning_rate": 1.2620000000000001e-05, + "loss": 1.4607, + "step": 631 + }, + { + "epoch": 0.18823879819058434, + "grad_norm": 0.08392848074436188, + "learning_rate": 1.2640000000000001e-05, + "loss": 1.4503, + "step": 632 + }, + { + "epoch": 0.18853664439025297, + "grad_norm": 0.08314831554889679, + "learning_rate": 1.266e-05, + "loss": 1.4564, + "step": 633 + }, + { + "epoch": 0.18883449058992163, + "grad_norm": 0.07674255967140198, + "learning_rate": 1.268e-05, + "loss": 1.4436, + "step": 634 + }, + { + "epoch": 0.18913233678959027, + "grad_norm": 0.08189515769481659, + "learning_rate": 1.27e-05, + "loss": 1.4528, + "step": 635 + }, + { + "epoch": 0.18943018298925893, + "grad_norm": 0.07610049098730087, + "learning_rate": 1.2720000000000002e-05, + "loss": 1.4644, + "step": 636 + }, + { + "epoch": 0.18972802918892756, + "grad_norm": 0.08168883621692657, + "learning_rate": 1.2740000000000002e-05, + "loss": 1.43, + "step": 637 + }, + { + "epoch": 0.19002587538859622, + "grad_norm": 0.09663128852844238, + "learning_rate": 1.2760000000000001e-05, + "loss": 1.4638, + "step": 638 + }, + { + "epoch": 0.19032372158826485, + "grad_norm": 0.09020671248435974, + "learning_rate": 1.2780000000000001e-05, + "loss": 1.4427, + "step": 639 + }, + { + "epoch": 0.19062156778793352, + "grad_norm": 0.08151830732822418, + "learning_rate": 1.2800000000000001e-05, + "loss": 1.4469, + "step": 640 + }, + { + "epoch": 0.19091941398760215, + "grad_norm": 0.074073925614357, + "learning_rate": 1.2820000000000001e-05, + "loss": 1.4487, + "step": 641 + }, + { + "epoch": 0.1912172601872708, + "grad_norm": 0.08020228892564774, + "learning_rate": 1.284e-05, + "loss": 1.4499, + "step": 642 + }, + { + "epoch": 0.19151510638693944, + "grad_norm": 0.0663752481341362, + "learning_rate": 1.286e-05, + "loss": 1.4403, + "step": 643 + }, + { + "epoch": 0.19181295258660808, + "grad_norm": 0.07641787081956863, + "learning_rate": 1.2880000000000002e-05, + "loss": 1.4646, + "step": 644 + }, + { + "epoch": 0.19211079878627674, + "grad_norm": 0.09084177017211914, + "learning_rate": 1.2900000000000002e-05, + "loss": 1.4533, + "step": 645 + }, + { + "epoch": 0.19240864498594537, + "grad_norm": 0.06982532143592834, + "learning_rate": 1.2920000000000002e-05, + "loss": 1.4484, + "step": 646 + }, + { + "epoch": 0.19270649118561403, + "grad_norm": 0.07324111461639404, + "learning_rate": 1.2940000000000001e-05, + "loss": 1.4629, + "step": 647 + }, + { + "epoch": 0.19300433738528266, + "grad_norm": 0.08842533081769943, + "learning_rate": 1.2960000000000001e-05, + "loss": 1.4495, + "step": 648 + }, + { + "epoch": 0.19330218358495133, + "grad_norm": 0.07014186680316925, + "learning_rate": 1.2980000000000001e-05, + "loss": 1.4647, + "step": 649 + }, + { + "epoch": 0.19360002978461996, + "grad_norm": 0.06982603669166565, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.44, + "step": 650 + }, + { + "epoch": 0.19389787598428862, + "grad_norm": 0.06965212523937225, + "learning_rate": 1.302e-05, + "loss": 1.4537, + "step": 651 + }, + { + "epoch": 0.19419572218395725, + "grad_norm": 0.07888025045394897, + "learning_rate": 1.3040000000000002e-05, + "loss": 1.4553, + "step": 652 + }, + { + "epoch": 0.19449356838362591, + "grad_norm": 0.0851881206035614, + "learning_rate": 1.3060000000000002e-05, + "loss": 1.4534, + "step": 653 + }, + { + "epoch": 0.19479141458329455, + "grad_norm": 0.06838874518871307, + "learning_rate": 1.3080000000000002e-05, + "loss": 1.4632, + "step": 654 + }, + { + "epoch": 0.1950892607829632, + "grad_norm": 0.07981374114751816, + "learning_rate": 1.3100000000000002e-05, + "loss": 1.4357, + "step": 655 + }, + { + "epoch": 0.19538710698263184, + "grad_norm": 0.0882086381316185, + "learning_rate": 1.3120000000000001e-05, + "loss": 1.4601, + "step": 656 + }, + { + "epoch": 0.1956849531823005, + "grad_norm": 0.08774475008249283, + "learning_rate": 1.3140000000000001e-05, + "loss": 1.4629, + "step": 657 + }, + { + "epoch": 0.19598279938196914, + "grad_norm": 0.08365315198898315, + "learning_rate": 1.3160000000000001e-05, + "loss": 1.4605, + "step": 658 + }, + { + "epoch": 0.19628064558163777, + "grad_norm": 0.09570679068565369, + "learning_rate": 1.3180000000000001e-05, + "loss": 1.4503, + "step": 659 + }, + { + "epoch": 0.19657849178130643, + "grad_norm": 0.07877921313047409, + "learning_rate": 1.3200000000000002e-05, + "loss": 1.4424, + "step": 660 + }, + { + "epoch": 0.19687633798097506, + "grad_norm": 0.07286708801984787, + "learning_rate": 1.3220000000000002e-05, + "loss": 1.4396, + "step": 661 + }, + { + "epoch": 0.19717418418064372, + "grad_norm": 0.07777304947376251, + "learning_rate": 1.3240000000000002e-05, + "loss": 1.4584, + "step": 662 + }, + { + "epoch": 0.19747203038031236, + "grad_norm": 0.07005325704813004, + "learning_rate": 1.3260000000000002e-05, + "loss": 1.4481, + "step": 663 + }, + { + "epoch": 0.19776987657998102, + "grad_norm": 0.08193511515855789, + "learning_rate": 1.3280000000000002e-05, + "loss": 1.4664, + "step": 664 + }, + { + "epoch": 0.19806772277964965, + "grad_norm": 0.08347135782241821, + "learning_rate": 1.3300000000000001e-05, + "loss": 1.4421, + "step": 665 + }, + { + "epoch": 0.1983655689793183, + "grad_norm": 0.07640582323074341, + "learning_rate": 1.3320000000000001e-05, + "loss": 1.4498, + "step": 666 + }, + { + "epoch": 0.19866341517898695, + "grad_norm": 0.0817837044596672, + "learning_rate": 1.3340000000000001e-05, + "loss": 1.4395, + "step": 667 + }, + { + "epoch": 0.1989612613786556, + "grad_norm": 0.07509731501340866, + "learning_rate": 1.3360000000000003e-05, + "loss": 1.4529, + "step": 668 + }, + { + "epoch": 0.19925910757832424, + "grad_norm": 0.07815979421138763, + "learning_rate": 1.3380000000000002e-05, + "loss": 1.4491, + "step": 669 + }, + { + "epoch": 0.1995569537779929, + "grad_norm": 0.07622452825307846, + "learning_rate": 1.3400000000000002e-05, + "loss": 1.4553, + "step": 670 + }, + { + "epoch": 0.19985479997766153, + "grad_norm": 0.07803203910589218, + "learning_rate": 1.3420000000000002e-05, + "loss": 1.4638, + "step": 671 + }, + { + "epoch": 0.20015264617733017, + "grad_norm": 0.0738442987203598, + "learning_rate": 1.3440000000000002e-05, + "loss": 1.4448, + "step": 672 + }, + { + "epoch": 0.20045049237699883, + "grad_norm": 0.07957897335290909, + "learning_rate": 1.3460000000000002e-05, + "loss": 1.4426, + "step": 673 + }, + { + "epoch": 0.20074833857666746, + "grad_norm": 0.07637246698141098, + "learning_rate": 1.3480000000000001e-05, + "loss": 1.4368, + "step": 674 + }, + { + "epoch": 0.20104618477633612, + "grad_norm": 0.07760189473628998, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.4445, + "step": 675 + }, + { + "epoch": 0.20134403097600476, + "grad_norm": 0.0710621029138565, + "learning_rate": 1.3520000000000003e-05, + "loss": 1.4443, + "step": 676 + }, + { + "epoch": 0.20164187717567342, + "grad_norm": 0.07569440454244614, + "learning_rate": 1.3540000000000003e-05, + "loss": 1.4498, + "step": 677 + }, + { + "epoch": 0.20193972337534205, + "grad_norm": 0.0767282173037529, + "learning_rate": 1.3560000000000002e-05, + "loss": 1.4422, + "step": 678 + }, + { + "epoch": 0.2022375695750107, + "grad_norm": 0.08722124248743057, + "learning_rate": 1.3580000000000002e-05, + "loss": 1.4596, + "step": 679 + }, + { + "epoch": 0.20253541577467934, + "grad_norm": 0.09099457412958145, + "learning_rate": 1.3600000000000002e-05, + "loss": 1.4469, + "step": 680 + }, + { + "epoch": 0.202833261974348, + "grad_norm": 0.06960950046777725, + "learning_rate": 1.3620000000000002e-05, + "loss": 1.4701, + "step": 681 + }, + { + "epoch": 0.20313110817401664, + "grad_norm": 0.07177898287773132, + "learning_rate": 1.3640000000000002e-05, + "loss": 1.4425, + "step": 682 + }, + { + "epoch": 0.2034289543736853, + "grad_norm": 0.07588344812393188, + "learning_rate": 1.3660000000000001e-05, + "loss": 1.4415, + "step": 683 + }, + { + "epoch": 0.20372680057335393, + "grad_norm": 0.07126078009605408, + "learning_rate": 1.3680000000000003e-05, + "loss": 1.4592, + "step": 684 + }, + { + "epoch": 0.2040246467730226, + "grad_norm": 0.0929664671421051, + "learning_rate": 1.3700000000000003e-05, + "loss": 1.4479, + "step": 685 + }, + { + "epoch": 0.20432249297269123, + "grad_norm": 0.07565732300281525, + "learning_rate": 1.3720000000000002e-05, + "loss": 1.4404, + "step": 686 + }, + { + "epoch": 0.20462033917235986, + "grad_norm": 0.09555564820766449, + "learning_rate": 1.3740000000000002e-05, + "loss": 1.4419, + "step": 687 + }, + { + "epoch": 0.20491818537202852, + "grad_norm": 0.0677831768989563, + "learning_rate": 1.376e-05, + "loss": 1.4457, + "step": 688 + }, + { + "epoch": 0.20521603157169716, + "grad_norm": 0.07221392542123795, + "learning_rate": 1.378e-05, + "loss": 1.4456, + "step": 689 + }, + { + "epoch": 0.20551387777136582, + "grad_norm": 0.07777795195579529, + "learning_rate": 1.38e-05, + "loss": 1.4432, + "step": 690 + }, + { + "epoch": 0.20581172397103445, + "grad_norm": 0.07490269839763641, + "learning_rate": 1.382e-05, + "loss": 1.4417, + "step": 691 + }, + { + "epoch": 0.2061095701707031, + "grad_norm": 0.079764723777771, + "learning_rate": 1.384e-05, + "loss": 1.4455, + "step": 692 + }, + { + "epoch": 0.20640741637037174, + "grad_norm": 0.09166201949119568, + "learning_rate": 1.386e-05, + "loss": 1.446, + "step": 693 + }, + { + "epoch": 0.2067052625700404, + "grad_norm": 0.07424476742744446, + "learning_rate": 1.3880000000000001e-05, + "loss": 1.4458, + "step": 694 + }, + { + "epoch": 0.20700310876970904, + "grad_norm": 0.07070266455411911, + "learning_rate": 1.39e-05, + "loss": 1.4336, + "step": 695 + }, + { + "epoch": 0.2073009549693777, + "grad_norm": 0.07643958926200867, + "learning_rate": 1.392e-05, + "loss": 1.4536, + "step": 696 + }, + { + "epoch": 0.20759880116904633, + "grad_norm": 0.07516030222177505, + "learning_rate": 1.394e-05, + "loss": 1.4408, + "step": 697 + }, + { + "epoch": 0.207896647368715, + "grad_norm": 0.0756191536784172, + "learning_rate": 1.396e-05, + "loss": 1.4337, + "step": 698 + }, + { + "epoch": 0.20819449356838363, + "grad_norm": 0.07343825697898865, + "learning_rate": 1.398e-05, + "loss": 1.4437, + "step": 699 + }, + { + "epoch": 0.20849233976805226, + "grad_norm": 0.10146047174930573, + "learning_rate": 1.4e-05, + "loss": 1.4374, + "step": 700 + }, + { + "epoch": 0.20879018596772092, + "grad_norm": 0.0799114927649498, + "learning_rate": 1.402e-05, + "loss": 1.4517, + "step": 701 + }, + { + "epoch": 0.20908803216738955, + "grad_norm": 0.07850436866283417, + "learning_rate": 1.4040000000000001e-05, + "loss": 1.445, + "step": 702 + }, + { + "epoch": 0.20938587836705821, + "grad_norm": 0.07773782312870026, + "learning_rate": 1.4060000000000001e-05, + "loss": 1.4392, + "step": 703 + }, + { + "epoch": 0.20968372456672685, + "grad_norm": 0.08914069831371307, + "learning_rate": 1.408e-05, + "loss": 1.4579, + "step": 704 + }, + { + "epoch": 0.2099815707663955, + "grad_norm": 0.07327356189489365, + "learning_rate": 1.41e-05, + "loss": 1.4472, + "step": 705 + }, + { + "epoch": 0.21027941696606414, + "grad_norm": 0.0727405697107315, + "learning_rate": 1.412e-05, + "loss": 1.4548, + "step": 706 + }, + { + "epoch": 0.2105772631657328, + "grad_norm": 0.08400865644216537, + "learning_rate": 1.414e-05, + "loss": 1.4488, + "step": 707 + }, + { + "epoch": 0.21087510936540144, + "grad_norm": 0.07869898527860641, + "learning_rate": 1.416e-05, + "loss": 1.4435, + "step": 708 + }, + { + "epoch": 0.2111729555650701, + "grad_norm": 0.07953284680843353, + "learning_rate": 1.418e-05, + "loss": 1.4513, + "step": 709 + }, + { + "epoch": 0.21147080176473873, + "grad_norm": 0.07338771224021912, + "learning_rate": 1.4200000000000001e-05, + "loss": 1.4415, + "step": 710 + }, + { + "epoch": 0.2117686479644074, + "grad_norm": 0.08386941999197006, + "learning_rate": 1.4220000000000001e-05, + "loss": 1.441, + "step": 711 + }, + { + "epoch": 0.21206649416407602, + "grad_norm": 0.08398545533418655, + "learning_rate": 1.4240000000000001e-05, + "loss": 1.4567, + "step": 712 + }, + { + "epoch": 0.21236434036374466, + "grad_norm": 0.0810646340250969, + "learning_rate": 1.426e-05, + "loss": 1.4614, + "step": 713 + }, + { + "epoch": 0.21266218656341332, + "grad_norm": 0.07760893553495407, + "learning_rate": 1.428e-05, + "loss": 1.434, + "step": 714 + }, + { + "epoch": 0.21296003276308195, + "grad_norm": 0.07823329418897629, + "learning_rate": 1.43e-05, + "loss": 1.4318, + "step": 715 + }, + { + "epoch": 0.2132578789627506, + "grad_norm": 0.08283335715532303, + "learning_rate": 1.432e-05, + "loss": 1.4331, + "step": 716 + }, + { + "epoch": 0.21355572516241925, + "grad_norm": 0.07786435633897781, + "learning_rate": 1.434e-05, + "loss": 1.436, + "step": 717 + }, + { + "epoch": 0.2138535713620879, + "grad_norm": 0.0875328779220581, + "learning_rate": 1.4360000000000001e-05, + "loss": 1.4267, + "step": 718 + }, + { + "epoch": 0.21415141756175654, + "grad_norm": 0.08489986509084702, + "learning_rate": 1.4380000000000001e-05, + "loss": 1.442, + "step": 719 + }, + { + "epoch": 0.2144492637614252, + "grad_norm": 0.09320718050003052, + "learning_rate": 1.4400000000000001e-05, + "loss": 1.4436, + "step": 720 + }, + { + "epoch": 0.21474710996109383, + "grad_norm": 0.08342552185058594, + "learning_rate": 1.4420000000000001e-05, + "loss": 1.4347, + "step": 721 + }, + { + "epoch": 0.2150449561607625, + "grad_norm": 0.08096481114625931, + "learning_rate": 1.444e-05, + "loss": 1.4307, + "step": 722 + }, + { + "epoch": 0.21534280236043113, + "grad_norm": 0.0804436206817627, + "learning_rate": 1.446e-05, + "loss": 1.4415, + "step": 723 + }, + { + "epoch": 0.2156406485600998, + "grad_norm": 0.0846792608499527, + "learning_rate": 1.448e-05, + "loss": 1.4508, + "step": 724 + }, + { + "epoch": 0.21593849475976842, + "grad_norm": 0.07710380107164383, + "learning_rate": 1.45e-05, + "loss": 1.441, + "step": 725 + }, + { + "epoch": 0.21623634095943708, + "grad_norm": 0.08490527421236038, + "learning_rate": 1.4520000000000002e-05, + "loss": 1.4427, + "step": 726 + }, + { + "epoch": 0.21653418715910572, + "grad_norm": 0.08044250309467316, + "learning_rate": 1.4540000000000001e-05, + "loss": 1.4409, + "step": 727 + }, + { + "epoch": 0.21683203335877435, + "grad_norm": 0.08590737730264664, + "learning_rate": 1.4560000000000001e-05, + "loss": 1.4366, + "step": 728 + }, + { + "epoch": 0.217129879558443, + "grad_norm": 0.07575635612010956, + "learning_rate": 1.4580000000000001e-05, + "loss": 1.437, + "step": 729 + }, + { + "epoch": 0.21742772575811165, + "grad_norm": 0.07428482174873352, + "learning_rate": 1.46e-05, + "loss": 1.4243, + "step": 730 + }, + { + "epoch": 0.2177255719577803, + "grad_norm": 0.08661162853240967, + "learning_rate": 1.462e-05, + "loss": 1.4346, + "step": 731 + }, + { + "epoch": 0.21802341815744894, + "grad_norm": 0.0743163526058197, + "learning_rate": 1.464e-05, + "loss": 1.4184, + "step": 732 + }, + { + "epoch": 0.2183212643571176, + "grad_norm": 0.15677453577518463, + "learning_rate": 1.466e-05, + "loss": 1.4413, + "step": 733 + }, + { + "epoch": 0.21861911055678623, + "grad_norm": 0.09177551418542862, + "learning_rate": 1.4680000000000002e-05, + "loss": 1.4463, + "step": 734 + }, + { + "epoch": 0.2189169567564549, + "grad_norm": 0.08581575006246567, + "learning_rate": 1.4700000000000002e-05, + "loss": 1.4566, + "step": 735 + }, + { + "epoch": 0.21921480295612353, + "grad_norm": 0.07593221962451935, + "learning_rate": 1.4720000000000001e-05, + "loss": 1.4337, + "step": 736 + }, + { + "epoch": 0.2195126491557922, + "grad_norm": 0.07464537024497986, + "learning_rate": 1.4740000000000001e-05, + "loss": 1.4351, + "step": 737 + }, + { + "epoch": 0.21981049535546082, + "grad_norm": 0.08558041602373123, + "learning_rate": 1.4760000000000001e-05, + "loss": 1.4554, + "step": 738 + }, + { + "epoch": 0.22010834155512948, + "grad_norm": 0.0854952484369278, + "learning_rate": 1.478e-05, + "loss": 1.4363, + "step": 739 + }, + { + "epoch": 0.22040618775479812, + "grad_norm": 0.08115620911121368, + "learning_rate": 1.48e-05, + "loss": 1.4449, + "step": 740 + }, + { + "epoch": 0.22070403395446675, + "grad_norm": 0.09755454212427139, + "learning_rate": 1.482e-05, + "loss": 1.4229, + "step": 741 + }, + { + "epoch": 0.2210018801541354, + "grad_norm": 0.0890473797917366, + "learning_rate": 1.4840000000000002e-05, + "loss": 1.4298, + "step": 742 + }, + { + "epoch": 0.22129972635380404, + "grad_norm": 0.09477613121271133, + "learning_rate": 1.4860000000000002e-05, + "loss": 1.4341, + "step": 743 + }, + { + "epoch": 0.2215975725534727, + "grad_norm": 0.08408848196268082, + "learning_rate": 1.4880000000000002e-05, + "loss": 1.449, + "step": 744 + }, + { + "epoch": 0.22189541875314134, + "grad_norm": 0.09292439371347427, + "learning_rate": 1.4900000000000001e-05, + "loss": 1.4544, + "step": 745 + }, + { + "epoch": 0.22219326495281, + "grad_norm": 0.0929374247789383, + "learning_rate": 1.4920000000000001e-05, + "loss": 1.4257, + "step": 746 + }, + { + "epoch": 0.22249111115247863, + "grad_norm": 0.07992805540561676, + "learning_rate": 1.4940000000000001e-05, + "loss": 1.4216, + "step": 747 + }, + { + "epoch": 0.2227889573521473, + "grad_norm": 0.08281844854354858, + "learning_rate": 1.496e-05, + "loss": 1.4437, + "step": 748 + }, + { + "epoch": 0.22308680355181593, + "grad_norm": 0.08912403136491776, + "learning_rate": 1.498e-05, + "loss": 1.4343, + "step": 749 + }, + { + "epoch": 0.2233846497514846, + "grad_norm": 0.08185693621635437, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.4339, + "step": 750 + }, + { + "epoch": 0.22368249595115322, + "grad_norm": 0.09299539029598236, + "learning_rate": 1.5020000000000002e-05, + "loss": 1.4288, + "step": 751 + }, + { + "epoch": 0.22398034215082188, + "grad_norm": 0.08689764887094498, + "learning_rate": 1.5040000000000002e-05, + "loss": 1.4256, + "step": 752 + }, + { + "epoch": 0.22427818835049051, + "grad_norm": 0.0877898633480072, + "learning_rate": 1.5060000000000001e-05, + "loss": 1.4254, + "step": 753 + }, + { + "epoch": 0.22457603455015915, + "grad_norm": 0.08317238837480545, + "learning_rate": 1.5080000000000001e-05, + "loss": 1.4254, + "step": 754 + }, + { + "epoch": 0.2248738807498278, + "grad_norm": 0.07793273776769638, + "learning_rate": 1.5100000000000001e-05, + "loss": 1.4406, + "step": 755 + }, + { + "epoch": 0.22517172694949644, + "grad_norm": 0.0886562243103981, + "learning_rate": 1.5120000000000001e-05, + "loss": 1.4486, + "step": 756 + }, + { + "epoch": 0.2254695731491651, + "grad_norm": 0.08472087979316711, + "learning_rate": 1.514e-05, + "loss": 1.4171, + "step": 757 + }, + { + "epoch": 0.22576741934883374, + "grad_norm": 0.09509363025426865, + "learning_rate": 1.516e-05, + "loss": 1.4207, + "step": 758 + }, + { + "epoch": 0.2260652655485024, + "grad_norm": 0.09654158353805542, + "learning_rate": 1.5180000000000002e-05, + "loss": 1.424, + "step": 759 + }, + { + "epoch": 0.22636311174817103, + "grad_norm": 0.07955440133810043, + "learning_rate": 1.5200000000000002e-05, + "loss": 1.4335, + "step": 760 + }, + { + "epoch": 0.2266609579478397, + "grad_norm": 0.09881535917520523, + "learning_rate": 1.5220000000000002e-05, + "loss": 1.42, + "step": 761 + }, + { + "epoch": 0.22695880414750833, + "grad_norm": 0.09733070433139801, + "learning_rate": 1.5240000000000001e-05, + "loss": 1.4325, + "step": 762 + }, + { + "epoch": 0.22725665034717699, + "grad_norm": 0.08393367379903793, + "learning_rate": 1.5260000000000003e-05, + "loss": 1.4338, + "step": 763 + }, + { + "epoch": 0.22755449654684562, + "grad_norm": 0.08710719645023346, + "learning_rate": 1.5280000000000003e-05, + "loss": 1.4436, + "step": 764 + }, + { + "epoch": 0.22785234274651428, + "grad_norm": 0.0892837718129158, + "learning_rate": 1.5300000000000003e-05, + "loss": 1.4317, + "step": 765 + }, + { + "epoch": 0.2281501889461829, + "grad_norm": 0.09063079208135605, + "learning_rate": 1.5320000000000002e-05, + "loss": 1.439, + "step": 766 + }, + { + "epoch": 0.22844803514585157, + "grad_norm": 0.0837310254573822, + "learning_rate": 1.5340000000000002e-05, + "loss": 1.4154, + "step": 767 + }, + { + "epoch": 0.2287458813455202, + "grad_norm": 0.0990089401602745, + "learning_rate": 1.5360000000000002e-05, + "loss": 1.4355, + "step": 768 + }, + { + "epoch": 0.22904372754518884, + "grad_norm": 0.08116359263658524, + "learning_rate": 1.5380000000000002e-05, + "loss": 1.4406, + "step": 769 + }, + { + "epoch": 0.2293415737448575, + "grad_norm": 0.09696918725967407, + "learning_rate": 1.54e-05, + "loss": 1.4249, + "step": 770 + }, + { + "epoch": 0.22963941994452614, + "grad_norm": 0.0966629609465599, + "learning_rate": 1.542e-05, + "loss": 1.4278, + "step": 771 + }, + { + "epoch": 0.2299372661441948, + "grad_norm": 0.08971633017063141, + "learning_rate": 1.544e-05, + "loss": 1.4296, + "step": 772 + }, + { + "epoch": 0.23023511234386343, + "grad_norm": 0.0809473842382431, + "learning_rate": 1.546e-05, + "loss": 1.4143, + "step": 773 + }, + { + "epoch": 0.2305329585435321, + "grad_norm": 0.13298514485359192, + "learning_rate": 1.548e-05, + "loss": 1.4372, + "step": 774 + }, + { + "epoch": 0.23083080474320072, + "grad_norm": 0.08953984081745148, + "learning_rate": 1.55e-05, + "loss": 1.4328, + "step": 775 + }, + { + "epoch": 0.23112865094286938, + "grad_norm": 0.08975512534379959, + "learning_rate": 1.552e-05, + "loss": 1.4188, + "step": 776 + }, + { + "epoch": 0.23142649714253802, + "grad_norm": 0.08501532673835754, + "learning_rate": 1.554e-05, + "loss": 1.435, + "step": 777 + }, + { + "epoch": 0.23172434334220668, + "grad_norm": 0.08300330489873886, + "learning_rate": 1.556e-05, + "loss": 1.4346, + "step": 778 + }, + { + "epoch": 0.2320221895418753, + "grad_norm": 0.08758524805307388, + "learning_rate": 1.5580000000000003e-05, + "loss": 1.4267, + "step": 779 + }, + { + "epoch": 0.23232003574154397, + "grad_norm": 0.08660390228033066, + "learning_rate": 1.5600000000000003e-05, + "loss": 1.4428, + "step": 780 + }, + { + "epoch": 0.2326178819412126, + "grad_norm": 0.08623852580785751, + "learning_rate": 1.5620000000000003e-05, + "loss": 1.4178, + "step": 781 + }, + { + "epoch": 0.23291572814088124, + "grad_norm": 0.09258664399385452, + "learning_rate": 1.5640000000000003e-05, + "loss": 1.4408, + "step": 782 + }, + { + "epoch": 0.2332135743405499, + "grad_norm": 0.08016887307167053, + "learning_rate": 1.5660000000000003e-05, + "loss": 1.4232, + "step": 783 + }, + { + "epoch": 0.23351142054021853, + "grad_norm": 0.08588265627622604, + "learning_rate": 1.5680000000000002e-05, + "loss": 1.431, + "step": 784 + }, + { + "epoch": 0.2338092667398872, + "grad_norm": 0.08261680603027344, + "learning_rate": 1.5700000000000002e-05, + "loss": 1.4283, + "step": 785 + }, + { + "epoch": 0.23410711293955583, + "grad_norm": 0.0949244499206543, + "learning_rate": 1.5720000000000002e-05, + "loss": 1.4268, + "step": 786 + }, + { + "epoch": 0.2344049591392245, + "grad_norm": 0.10626151412725449, + "learning_rate": 1.5740000000000002e-05, + "loss": 1.4331, + "step": 787 + }, + { + "epoch": 0.23470280533889312, + "grad_norm": 0.08633959293365479, + "learning_rate": 1.576e-05, + "loss": 1.4458, + "step": 788 + }, + { + "epoch": 0.23500065153856178, + "grad_norm": 0.08385222405195236, + "learning_rate": 1.578e-05, + "loss": 1.4275, + "step": 789 + }, + { + "epoch": 0.23529849773823042, + "grad_norm": 0.0932660847902298, + "learning_rate": 1.58e-05, + "loss": 1.4371, + "step": 790 + }, + { + "epoch": 0.23559634393789908, + "grad_norm": 0.09389449656009674, + "learning_rate": 1.582e-05, + "loss": 1.4452, + "step": 791 + }, + { + "epoch": 0.2358941901375677, + "grad_norm": 0.08671513944864273, + "learning_rate": 1.584e-05, + "loss": 1.4261, + "step": 792 + }, + { + "epoch": 0.23619203633723637, + "grad_norm": 0.10053084790706635, + "learning_rate": 1.586e-05, + "loss": 1.4061, + "step": 793 + }, + { + "epoch": 0.236489882536905, + "grad_norm": 0.10518268495798111, + "learning_rate": 1.588e-05, + "loss": 1.4232, + "step": 794 + }, + { + "epoch": 0.23678772873657364, + "grad_norm": 0.09803315252065659, + "learning_rate": 1.5900000000000004e-05, + "loss": 1.4485, + "step": 795 + }, + { + "epoch": 0.2370855749362423, + "grad_norm": 0.09714682400226593, + "learning_rate": 1.5920000000000003e-05, + "loss": 1.4359, + "step": 796 + }, + { + "epoch": 0.23738342113591093, + "grad_norm": 0.09825300425291061, + "learning_rate": 1.5940000000000003e-05, + "loss": 1.4334, + "step": 797 + }, + { + "epoch": 0.2376812673355796, + "grad_norm": 0.0919112116098404, + "learning_rate": 1.5960000000000003e-05, + "loss": 1.4379, + "step": 798 + }, + { + "epoch": 0.23797911353524823, + "grad_norm": 0.08471221476793289, + "learning_rate": 1.5980000000000003e-05, + "loss": 1.4105, + "step": 799 + }, + { + "epoch": 0.2382769597349169, + "grad_norm": 0.08819548040628433, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.444, + "step": 800 + }, + { + "epoch": 0.23857480593458552, + "grad_norm": 0.09059222787618637, + "learning_rate": 1.6020000000000002e-05, + "loss": 1.4186, + "step": 801 + }, + { + "epoch": 0.23887265213425418, + "grad_norm": 0.09318530559539795, + "learning_rate": 1.6040000000000002e-05, + "loss": 1.4273, + "step": 802 + }, + { + "epoch": 0.23917049833392282, + "grad_norm": 0.0928410068154335, + "learning_rate": 1.6060000000000002e-05, + "loss": 1.4338, + "step": 803 + }, + { + "epoch": 0.23946834453359148, + "grad_norm": 0.09981449693441391, + "learning_rate": 1.6080000000000002e-05, + "loss": 1.4294, + "step": 804 + }, + { + "epoch": 0.2397661907332601, + "grad_norm": 0.08160090446472168, + "learning_rate": 1.6100000000000002e-05, + "loss": 1.4104, + "step": 805 + }, + { + "epoch": 0.24006403693292877, + "grad_norm": 0.09856461733579636, + "learning_rate": 1.612e-05, + "loss": 1.4432, + "step": 806 + }, + { + "epoch": 0.2403618831325974, + "grad_norm": 0.08490412682294846, + "learning_rate": 1.614e-05, + "loss": 1.439, + "step": 807 + }, + { + "epoch": 0.24065972933226606, + "grad_norm": 0.10598944127559662, + "learning_rate": 1.616e-05, + "loss": 1.4408, + "step": 808 + }, + { + "epoch": 0.2409575755319347, + "grad_norm": 0.09258910268545151, + "learning_rate": 1.618e-05, + "loss": 1.4108, + "step": 809 + }, + { + "epoch": 0.24125542173160333, + "grad_norm": 0.08649790287017822, + "learning_rate": 1.62e-05, + "loss": 1.4151, + "step": 810 + }, + { + "epoch": 0.241553267931272, + "grad_norm": 0.08332608640193939, + "learning_rate": 1.6220000000000004e-05, + "loss": 1.4288, + "step": 811 + }, + { + "epoch": 0.24185111413094063, + "grad_norm": 0.09572703391313553, + "learning_rate": 1.6240000000000004e-05, + "loss": 1.42, + "step": 812 + }, + { + "epoch": 0.2421489603306093, + "grad_norm": 0.10475686937570572, + "learning_rate": 1.626e-05, + "loss": 1.4474, + "step": 813 + }, + { + "epoch": 0.24244680653027792, + "grad_norm": 0.11400950700044632, + "learning_rate": 1.628e-05, + "loss": 1.435, + "step": 814 + }, + { + "epoch": 0.24274465272994658, + "grad_norm": 0.08923758566379547, + "learning_rate": 1.63e-05, + "loss": 1.4216, + "step": 815 + }, + { + "epoch": 0.2430424989296152, + "grad_norm": 0.09222014993429184, + "learning_rate": 1.632e-05, + "loss": 1.4204, + "step": 816 + }, + { + "epoch": 0.24334034512928387, + "grad_norm": 0.10609360784292221, + "learning_rate": 1.634e-05, + "loss": 1.4313, + "step": 817 + }, + { + "epoch": 0.2436381913289525, + "grad_norm": 0.09008601307868958, + "learning_rate": 1.636e-05, + "loss": 1.4127, + "step": 818 + }, + { + "epoch": 0.24393603752862117, + "grad_norm": 0.09481731057167053, + "learning_rate": 1.638e-05, + "loss": 1.4234, + "step": 819 + }, + { + "epoch": 0.2442338837282898, + "grad_norm": 0.09240791201591492, + "learning_rate": 1.64e-05, + "loss": 1.4503, + "step": 820 + }, + { + "epoch": 0.24453172992795846, + "grad_norm": 0.09623836725950241, + "learning_rate": 1.6420000000000002e-05, + "loss": 1.4357, + "step": 821 + }, + { + "epoch": 0.2448295761276271, + "grad_norm": 0.09477917104959488, + "learning_rate": 1.6440000000000002e-05, + "loss": 1.3946, + "step": 822 + }, + { + "epoch": 0.24512742232729573, + "grad_norm": 0.08368118852376938, + "learning_rate": 1.646e-05, + "loss": 1.4181, + "step": 823 + }, + { + "epoch": 0.2454252685269644, + "grad_norm": 0.08611281961202621, + "learning_rate": 1.648e-05, + "loss": 1.4196, + "step": 824 + }, + { + "epoch": 0.24572311472663302, + "grad_norm": 0.09466510266065598, + "learning_rate": 1.65e-05, + "loss": 1.409, + "step": 825 + }, + { + "epoch": 0.24602096092630169, + "grad_norm": 0.09006678313016891, + "learning_rate": 1.652e-05, + "loss": 1.4372, + "step": 826 + }, + { + "epoch": 0.24631880712597032, + "grad_norm": 0.0944555401802063, + "learning_rate": 1.654e-05, + "loss": 1.4294, + "step": 827 + }, + { + "epoch": 0.24661665332563898, + "grad_norm": 0.08576387912034988, + "learning_rate": 1.656e-05, + "loss": 1.4145, + "step": 828 + }, + { + "epoch": 0.2469144995253076, + "grad_norm": 0.09978868067264557, + "learning_rate": 1.658e-05, + "loss": 1.4233, + "step": 829 + }, + { + "epoch": 0.24721234572497627, + "grad_norm": 0.0882377177476883, + "learning_rate": 1.66e-05, + "loss": 1.4413, + "step": 830 + }, + { + "epoch": 0.2475101919246449, + "grad_norm": 0.09680736809968948, + "learning_rate": 1.662e-05, + "loss": 1.4082, + "step": 831 + }, + { + "epoch": 0.24780803812431357, + "grad_norm": 0.0893857553601265, + "learning_rate": 1.664e-05, + "loss": 1.4185, + "step": 832 + }, + { + "epoch": 0.2481058843239822, + "grad_norm": 0.09749665856361389, + "learning_rate": 1.666e-05, + "loss": 1.395, + "step": 833 + }, + { + "epoch": 0.24840373052365086, + "grad_norm": 0.08752049505710602, + "learning_rate": 1.668e-05, + "loss": 1.429, + "step": 834 + }, + { + "epoch": 0.2487015767233195, + "grad_norm": 0.10418985784053802, + "learning_rate": 1.67e-05, + "loss": 1.4238, + "step": 835 + }, + { + "epoch": 0.24899942292298813, + "grad_norm": 0.09133674949407578, + "learning_rate": 1.672e-05, + "loss": 1.4041, + "step": 836 + }, + { + "epoch": 0.2492972691226568, + "grad_norm": 0.08586476743221283, + "learning_rate": 1.6740000000000002e-05, + "loss": 1.4163, + "step": 837 + }, + { + "epoch": 0.24959511532232542, + "grad_norm": 0.09824176877737045, + "learning_rate": 1.6760000000000002e-05, + "loss": 1.441, + "step": 838 + }, + { + "epoch": 0.24989296152199408, + "grad_norm": 0.08725030720233917, + "learning_rate": 1.6780000000000002e-05, + "loss": 1.4267, + "step": 839 + }, + { + "epoch": 0.25019080772166274, + "grad_norm": 0.11314549297094345, + "learning_rate": 1.6800000000000002e-05, + "loss": 1.4128, + "step": 840 + }, + { + "epoch": 0.25048865392133135, + "grad_norm": 0.09061966836452484, + "learning_rate": 1.682e-05, + "loss": 1.4415, + "step": 841 + }, + { + "epoch": 0.250786500121, + "grad_norm": 0.11773449182510376, + "learning_rate": 1.684e-05, + "loss": 1.4315, + "step": 842 + }, + { + "epoch": 0.25108434632066867, + "grad_norm": 0.09932917356491089, + "learning_rate": 1.686e-05, + "loss": 1.4214, + "step": 843 + }, + { + "epoch": 0.25138219252033733, + "grad_norm": 0.09133890271186829, + "learning_rate": 1.688e-05, + "loss": 1.4078, + "step": 844 + }, + { + "epoch": 0.25168003872000594, + "grad_norm": 0.12656791508197784, + "learning_rate": 1.69e-05, + "loss": 1.4246, + "step": 845 + }, + { + "epoch": 0.2519778849196746, + "grad_norm": 0.08804042637348175, + "learning_rate": 1.692e-05, + "loss": 1.4184, + "step": 846 + }, + { + "epoch": 0.25227573111934326, + "grad_norm": 0.09051616489887238, + "learning_rate": 1.694e-05, + "loss": 1.4024, + "step": 847 + }, + { + "epoch": 0.2525735773190119, + "grad_norm": 0.11514545232057571, + "learning_rate": 1.696e-05, + "loss": 1.4167, + "step": 848 + }, + { + "epoch": 0.2528714235186805, + "grad_norm": 0.08954603224992752, + "learning_rate": 1.698e-05, + "loss": 1.4114, + "step": 849 + }, + { + "epoch": 0.2531692697183492, + "grad_norm": 0.10602930933237076, + "learning_rate": 1.7e-05, + "loss": 1.4161, + "step": 850 + }, + { + "epoch": 0.25346711591801785, + "grad_norm": 0.09775227308273315, + "learning_rate": 1.702e-05, + "loss": 1.4334, + "step": 851 + }, + { + "epoch": 0.25376496211768645, + "grad_norm": 0.0997774749994278, + "learning_rate": 1.704e-05, + "loss": 1.412, + "step": 852 + }, + { + "epoch": 0.2540628083173551, + "grad_norm": 0.09448961913585663, + "learning_rate": 1.7060000000000003e-05, + "loss": 1.4162, + "step": 853 + }, + { + "epoch": 0.2543606545170238, + "grad_norm": 0.10423950850963593, + "learning_rate": 1.7080000000000002e-05, + "loss": 1.4313, + "step": 854 + }, + { + "epoch": 0.25465850071669244, + "grad_norm": 0.10520417243242264, + "learning_rate": 1.7100000000000002e-05, + "loss": 1.4229, + "step": 855 + }, + { + "epoch": 0.25495634691636104, + "grad_norm": 0.09463100880384445, + "learning_rate": 1.7120000000000002e-05, + "loss": 1.4128, + "step": 856 + }, + { + "epoch": 0.2552541931160297, + "grad_norm": 0.09853941947221756, + "learning_rate": 1.7140000000000002e-05, + "loss": 1.426, + "step": 857 + }, + { + "epoch": 0.25555203931569836, + "grad_norm": 0.09377727657556534, + "learning_rate": 1.7160000000000002e-05, + "loss": 1.4198, + "step": 858 + }, + { + "epoch": 0.255849885515367, + "grad_norm": 0.08863189816474915, + "learning_rate": 1.718e-05, + "loss": 1.4079, + "step": 859 + }, + { + "epoch": 0.25614773171503563, + "grad_norm": 0.08952966332435608, + "learning_rate": 1.72e-05, + "loss": 1.4312, + "step": 860 + }, + { + "epoch": 0.2564455779147043, + "grad_norm": 0.10895556956529617, + "learning_rate": 1.722e-05, + "loss": 1.3928, + "step": 861 + }, + { + "epoch": 0.25674342411437295, + "grad_norm": 0.09976348280906677, + "learning_rate": 1.724e-05, + "loss": 1.4131, + "step": 862 + }, + { + "epoch": 0.2570412703140416, + "grad_norm": 0.10217374563217163, + "learning_rate": 1.726e-05, + "loss": 1.4301, + "step": 863 + }, + { + "epoch": 0.2573391165137102, + "grad_norm": 0.0923185646533966, + "learning_rate": 1.728e-05, + "loss": 1.4379, + "step": 864 + }, + { + "epoch": 0.2576369627133789, + "grad_norm": 0.09548322856426239, + "learning_rate": 1.73e-05, + "loss": 1.4158, + "step": 865 + }, + { + "epoch": 0.25793480891304754, + "grad_norm": 0.09015744179487228, + "learning_rate": 1.732e-05, + "loss": 1.4119, + "step": 866 + }, + { + "epoch": 0.25823265511271615, + "grad_norm": 0.09577369689941406, + "learning_rate": 1.734e-05, + "loss": 1.4046, + "step": 867 + }, + { + "epoch": 0.2585305013123848, + "grad_norm": 0.09768911451101303, + "learning_rate": 1.736e-05, + "loss": 1.4031, + "step": 868 + }, + { + "epoch": 0.25882834751205347, + "grad_norm": 0.10957697778940201, + "learning_rate": 1.7380000000000003e-05, + "loss": 1.4117, + "step": 869 + }, + { + "epoch": 0.25912619371172213, + "grad_norm": 0.0891515463590622, + "learning_rate": 1.7400000000000003e-05, + "loss": 1.4045, + "step": 870 + }, + { + "epoch": 0.25942403991139074, + "grad_norm": 0.09458895772695541, + "learning_rate": 1.7420000000000003e-05, + "loss": 1.4109, + "step": 871 + }, + { + "epoch": 0.2597218861110594, + "grad_norm": 0.0916818156838417, + "learning_rate": 1.7440000000000002e-05, + "loss": 1.4028, + "step": 872 + }, + { + "epoch": 0.26001973231072806, + "grad_norm": 0.10320673137903214, + "learning_rate": 1.7460000000000002e-05, + "loss": 1.4187, + "step": 873 + }, + { + "epoch": 0.2603175785103967, + "grad_norm": 0.099599190056324, + "learning_rate": 1.7480000000000002e-05, + "loss": 1.4265, + "step": 874 + }, + { + "epoch": 0.2606154247100653, + "grad_norm": 0.09808894246816635, + "learning_rate": 1.7500000000000002e-05, + "loss": 1.4232, + "step": 875 + }, + { + "epoch": 0.260913270909734, + "grad_norm": 0.09341083467006683, + "learning_rate": 1.752e-05, + "loss": 1.4233, + "step": 876 + }, + { + "epoch": 0.26121111710940265, + "grad_norm": 0.10195144265890121, + "learning_rate": 1.754e-05, + "loss": 1.4139, + "step": 877 + }, + { + "epoch": 0.26150896330907125, + "grad_norm": 0.09913980960845947, + "learning_rate": 1.756e-05, + "loss": 1.4196, + "step": 878 + }, + { + "epoch": 0.2618068095087399, + "grad_norm": 0.09989370405673981, + "learning_rate": 1.758e-05, + "loss": 1.4288, + "step": 879 + }, + { + "epoch": 0.2621046557084086, + "grad_norm": 0.10181991010904312, + "learning_rate": 1.76e-05, + "loss": 1.4175, + "step": 880 + }, + { + "epoch": 0.26240250190807723, + "grad_norm": 0.09663712233304977, + "learning_rate": 1.762e-05, + "loss": 1.4406, + "step": 881 + }, + { + "epoch": 0.26270034810774584, + "grad_norm": 0.09889239817857742, + "learning_rate": 1.764e-05, + "loss": 1.4101, + "step": 882 + }, + { + "epoch": 0.2629981943074145, + "grad_norm": 0.0971718281507492, + "learning_rate": 1.766e-05, + "loss": 1.4217, + "step": 883 + }, + { + "epoch": 0.26329604050708316, + "grad_norm": 0.095889151096344, + "learning_rate": 1.768e-05, + "loss": 1.4181, + "step": 884 + }, + { + "epoch": 0.2635938867067518, + "grad_norm": 0.09541714191436768, + "learning_rate": 1.77e-05, + "loss": 1.4212, + "step": 885 + }, + { + "epoch": 0.26389173290642043, + "grad_norm": 0.09994743764400482, + "learning_rate": 1.7720000000000003e-05, + "loss": 1.4147, + "step": 886 + }, + { + "epoch": 0.2641895791060891, + "grad_norm": 0.09512241184711456, + "learning_rate": 1.7740000000000003e-05, + "loss": 1.4326, + "step": 887 + }, + { + "epoch": 0.26448742530575775, + "grad_norm": 0.09152122586965561, + "learning_rate": 1.7760000000000003e-05, + "loss": 1.4007, + "step": 888 + }, + { + "epoch": 0.2647852715054264, + "grad_norm": 0.10552559047937393, + "learning_rate": 1.7780000000000003e-05, + "loss": 1.4241, + "step": 889 + }, + { + "epoch": 0.265083117705095, + "grad_norm": 0.0983191430568695, + "learning_rate": 1.7800000000000002e-05, + "loss": 1.3978, + "step": 890 + }, + { + "epoch": 0.2653809639047637, + "grad_norm": 0.09858065098524094, + "learning_rate": 1.7820000000000002e-05, + "loss": 1.4227, + "step": 891 + }, + { + "epoch": 0.26567881010443234, + "grad_norm": 0.0938938781619072, + "learning_rate": 1.7840000000000002e-05, + "loss": 1.3974, + "step": 892 + }, + { + "epoch": 0.26597665630410094, + "grad_norm": 0.09322497993707657, + "learning_rate": 1.7860000000000002e-05, + "loss": 1.408, + "step": 893 + }, + { + "epoch": 0.2662745025037696, + "grad_norm": 0.10351376980543137, + "learning_rate": 1.788e-05, + "loss": 1.4184, + "step": 894 + }, + { + "epoch": 0.26657234870343827, + "grad_norm": 0.1024598702788353, + "learning_rate": 1.79e-05, + "loss": 1.4024, + "step": 895 + }, + { + "epoch": 0.2668701949031069, + "grad_norm": 0.0989329069852829, + "learning_rate": 1.792e-05, + "loss": 1.4221, + "step": 896 + }, + { + "epoch": 0.26716804110277553, + "grad_norm": 0.1000707745552063, + "learning_rate": 1.794e-05, + "loss": 1.4036, + "step": 897 + }, + { + "epoch": 0.2674658873024442, + "grad_norm": 0.0954870656132698, + "learning_rate": 1.796e-05, + "loss": 1.4084, + "step": 898 + }, + { + "epoch": 0.26776373350211286, + "grad_norm": 0.09943860024213791, + "learning_rate": 1.798e-05, + "loss": 1.4081, + "step": 899 + }, + { + "epoch": 0.2680615797017815, + "grad_norm": 0.09641280025243759, + "learning_rate": 1.8e-05, + "loss": 1.4056, + "step": 900 + }, + { + "epoch": 0.2683594259014501, + "grad_norm": 0.09933046251535416, + "learning_rate": 1.802e-05, + "loss": 1.4015, + "step": 901 + }, + { + "epoch": 0.2686572721011188, + "grad_norm": 0.09325650334358215, + "learning_rate": 1.8040000000000003e-05, + "loss": 1.4166, + "step": 902 + }, + { + "epoch": 0.26895511830078744, + "grad_norm": 0.09692834317684174, + "learning_rate": 1.8060000000000003e-05, + "loss": 1.3878, + "step": 903 + }, + { + "epoch": 0.2692529645004561, + "grad_norm": 0.0937558114528656, + "learning_rate": 1.8080000000000003e-05, + "loss": 1.4024, + "step": 904 + }, + { + "epoch": 0.2695508107001247, + "grad_norm": 0.10609590262174606, + "learning_rate": 1.8100000000000003e-05, + "loss": 1.394, + "step": 905 + }, + { + "epoch": 0.26984865689979337, + "grad_norm": 0.10428649187088013, + "learning_rate": 1.8120000000000003e-05, + "loss": 1.409, + "step": 906 + }, + { + "epoch": 0.27014650309946203, + "grad_norm": 0.10072480142116547, + "learning_rate": 1.8140000000000003e-05, + "loss": 1.4273, + "step": 907 + }, + { + "epoch": 0.27044434929913064, + "grad_norm": 0.10124707221984863, + "learning_rate": 1.8160000000000002e-05, + "loss": 1.4101, + "step": 908 + }, + { + "epoch": 0.2707421954987993, + "grad_norm": 0.11809855699539185, + "learning_rate": 1.8180000000000002e-05, + "loss": 1.4181, + "step": 909 + }, + { + "epoch": 0.27104004169846796, + "grad_norm": 0.10658524185419083, + "learning_rate": 1.8200000000000002e-05, + "loss": 1.4127, + "step": 910 + }, + { + "epoch": 0.2713378878981366, + "grad_norm": 0.09508049488067627, + "learning_rate": 1.8220000000000002e-05, + "loss": 1.4078, + "step": 911 + }, + { + "epoch": 0.2716357340978052, + "grad_norm": 0.09699340164661407, + "learning_rate": 1.824e-05, + "loss": 1.4023, + "step": 912 + }, + { + "epoch": 0.2719335802974739, + "grad_norm": 0.10190922766923904, + "learning_rate": 1.826e-05, + "loss": 1.3913, + "step": 913 + }, + { + "epoch": 0.27223142649714255, + "grad_norm": 0.09622354805469513, + "learning_rate": 1.828e-05, + "loss": 1.3964, + "step": 914 + }, + { + "epoch": 0.2725292726968112, + "grad_norm": 0.10256228595972061, + "learning_rate": 1.83e-05, + "loss": 1.4014, + "step": 915 + }, + { + "epoch": 0.2728271188964798, + "grad_norm": 0.10841874033212662, + "learning_rate": 1.832e-05, + "loss": 1.3959, + "step": 916 + }, + { + "epoch": 0.2731249650961485, + "grad_norm": 0.10006251186132431, + "learning_rate": 1.834e-05, + "loss": 1.3897, + "step": 917 + }, + { + "epoch": 0.27342281129581714, + "grad_norm": 0.10497182607650757, + "learning_rate": 1.8360000000000004e-05, + "loss": 1.4088, + "step": 918 + }, + { + "epoch": 0.27372065749548574, + "grad_norm": 0.09609287977218628, + "learning_rate": 1.8380000000000004e-05, + "loss": 1.4052, + "step": 919 + }, + { + "epoch": 0.2740185036951544, + "grad_norm": 0.09894520044326782, + "learning_rate": 1.8400000000000003e-05, + "loss": 1.4145, + "step": 920 + }, + { + "epoch": 0.27431634989482306, + "grad_norm": 0.11151731759309769, + "learning_rate": 1.8420000000000003e-05, + "loss": 1.4094, + "step": 921 + }, + { + "epoch": 0.2746141960944917, + "grad_norm": 0.11131482571363449, + "learning_rate": 1.8440000000000003e-05, + "loss": 1.389, + "step": 922 + }, + { + "epoch": 0.27491204229416033, + "grad_norm": 0.1114390641450882, + "learning_rate": 1.8460000000000003e-05, + "loss": 1.4065, + "step": 923 + }, + { + "epoch": 0.275209888493829, + "grad_norm": 0.10770322382450104, + "learning_rate": 1.8480000000000003e-05, + "loss": 1.4018, + "step": 924 + }, + { + "epoch": 0.27550773469349765, + "grad_norm": 0.11392100900411606, + "learning_rate": 1.8500000000000002e-05, + "loss": 1.4076, + "step": 925 + }, + { + "epoch": 0.2758055808931663, + "grad_norm": 0.11138052493333817, + "learning_rate": 1.8520000000000002e-05, + "loss": 1.4204, + "step": 926 + }, + { + "epoch": 0.2761034270928349, + "grad_norm": 0.09689308702945709, + "learning_rate": 1.8540000000000002e-05, + "loss": 1.4126, + "step": 927 + }, + { + "epoch": 0.2764012732925036, + "grad_norm": 0.10757928341627121, + "learning_rate": 1.8560000000000002e-05, + "loss": 1.4057, + "step": 928 + }, + { + "epoch": 0.27669911949217224, + "grad_norm": 0.10798566043376923, + "learning_rate": 1.858e-05, + "loss": 1.4037, + "step": 929 + }, + { + "epoch": 0.2769969656918409, + "grad_norm": 0.09981115907430649, + "learning_rate": 1.86e-05, + "loss": 1.4028, + "step": 930 + }, + { + "epoch": 0.2772948118915095, + "grad_norm": 0.10636036098003387, + "learning_rate": 1.862e-05, + "loss": 1.4002, + "step": 931 + }, + { + "epoch": 0.27759265809117817, + "grad_norm": 0.10867718607187271, + "learning_rate": 1.864e-05, + "loss": 1.3935, + "step": 932 + }, + { + "epoch": 0.27789050429084683, + "grad_norm": 0.0998225212097168, + "learning_rate": 1.866e-05, + "loss": 1.4056, + "step": 933 + }, + { + "epoch": 0.27818835049051543, + "grad_norm": 0.09779660403728485, + "learning_rate": 1.8680000000000004e-05, + "loss": 1.4096, + "step": 934 + }, + { + "epoch": 0.2784861966901841, + "grad_norm": 0.10754135251045227, + "learning_rate": 1.8700000000000004e-05, + "loss": 1.4028, + "step": 935 + }, + { + "epoch": 0.27878404288985276, + "grad_norm": 0.09656241536140442, + "learning_rate": 1.8720000000000004e-05, + "loss": 1.3919, + "step": 936 + }, + { + "epoch": 0.2790818890895214, + "grad_norm": 0.1022043228149414, + "learning_rate": 1.8740000000000004e-05, + "loss": 1.4003, + "step": 937 + }, + { + "epoch": 0.27937973528919, + "grad_norm": 0.10005227476358414, + "learning_rate": 1.876e-05, + "loss": 1.397, + "step": 938 + }, + { + "epoch": 0.2796775814888587, + "grad_norm": 0.10625877231359482, + "learning_rate": 1.878e-05, + "loss": 1.389, + "step": 939 + }, + { + "epoch": 0.27997542768852735, + "grad_norm": 0.09828570485115051, + "learning_rate": 1.88e-05, + "loss": 1.3926, + "step": 940 + }, + { + "epoch": 0.280273273888196, + "grad_norm": 0.09796032309532166, + "learning_rate": 1.882e-05, + "loss": 1.4054, + "step": 941 + }, + { + "epoch": 0.2805711200878646, + "grad_norm": 0.10021141171455383, + "learning_rate": 1.884e-05, + "loss": 1.3962, + "step": 942 + }, + { + "epoch": 0.2808689662875333, + "grad_norm": 0.10457919538021088, + "learning_rate": 1.886e-05, + "loss": 1.3957, + "step": 943 + }, + { + "epoch": 0.28116681248720193, + "grad_norm": 0.10513138771057129, + "learning_rate": 1.8880000000000002e-05, + "loss": 1.4013, + "step": 944 + }, + { + "epoch": 0.2814646586868706, + "grad_norm": 0.10234611481428146, + "learning_rate": 1.8900000000000002e-05, + "loss": 1.4118, + "step": 945 + }, + { + "epoch": 0.2817625048865392, + "grad_norm": 0.09986097365617752, + "learning_rate": 1.8920000000000002e-05, + "loss": 1.3906, + "step": 946 + }, + { + "epoch": 0.28206035108620786, + "grad_norm": 0.10012020915746689, + "learning_rate": 1.894e-05, + "loss": 1.4008, + "step": 947 + }, + { + "epoch": 0.2823581972858765, + "grad_norm": 0.10664036870002747, + "learning_rate": 1.896e-05, + "loss": 1.4044, + "step": 948 + }, + { + "epoch": 0.2826560434855451, + "grad_norm": 0.10882434248924255, + "learning_rate": 1.898e-05, + "loss": 1.3903, + "step": 949 + }, + { + "epoch": 0.2829538896852138, + "grad_norm": 0.1019265428185463, + "learning_rate": 1.9e-05, + "loss": 1.3986, + "step": 950 + }, + { + "epoch": 0.28325173588488245, + "grad_norm": 0.09696544706821442, + "learning_rate": 1.902e-05, + "loss": 1.3848, + "step": 951 + }, + { + "epoch": 0.2835495820845511, + "grad_norm": 0.10864205658435822, + "learning_rate": 1.904e-05, + "loss": 1.4086, + "step": 952 + }, + { + "epoch": 0.2838474282842197, + "grad_norm": 0.1046675369143486, + "learning_rate": 1.906e-05, + "loss": 1.4066, + "step": 953 + }, + { + "epoch": 0.2841452744838884, + "grad_norm": 0.10185607522726059, + "learning_rate": 1.908e-05, + "loss": 1.4088, + "step": 954 + }, + { + "epoch": 0.28444312068355704, + "grad_norm": 0.10509466379880905, + "learning_rate": 1.91e-05, + "loss": 1.4144, + "step": 955 + }, + { + "epoch": 0.2847409668832257, + "grad_norm": 0.11369527131319046, + "learning_rate": 1.912e-05, + "loss": 1.42, + "step": 956 + }, + { + "epoch": 0.2850388130828943, + "grad_norm": 0.1027270033955574, + "learning_rate": 1.914e-05, + "loss": 1.3971, + "step": 957 + }, + { + "epoch": 0.28533665928256297, + "grad_norm": 0.11005938798189163, + "learning_rate": 1.916e-05, + "loss": 1.3954, + "step": 958 + }, + { + "epoch": 0.2856345054822316, + "grad_norm": 0.10584299266338348, + "learning_rate": 1.918e-05, + "loss": 1.4025, + "step": 959 + }, + { + "epoch": 0.28593235168190023, + "grad_norm": 0.09938495606184006, + "learning_rate": 1.9200000000000003e-05, + "loss": 1.404, + "step": 960 + }, + { + "epoch": 0.2862301978815689, + "grad_norm": 0.10662488639354706, + "learning_rate": 1.9220000000000002e-05, + "loss": 1.3897, + "step": 961 + }, + { + "epoch": 0.28652804408123755, + "grad_norm": 0.11186008900403976, + "learning_rate": 1.9240000000000002e-05, + "loss": 1.3969, + "step": 962 + }, + { + "epoch": 0.2868258902809062, + "grad_norm": 0.11389771103858948, + "learning_rate": 1.9260000000000002e-05, + "loss": 1.4106, + "step": 963 + }, + { + "epoch": 0.2871237364805748, + "grad_norm": 0.10690823197364807, + "learning_rate": 1.9280000000000002e-05, + "loss": 1.4152, + "step": 964 + }, + { + "epoch": 0.2874215826802435, + "grad_norm": 0.10708753764629364, + "learning_rate": 1.93e-05, + "loss": 1.3998, + "step": 965 + }, + { + "epoch": 0.28771942887991214, + "grad_norm": 0.11222704499959946, + "learning_rate": 1.932e-05, + "loss": 1.3986, + "step": 966 + }, + { + "epoch": 0.2880172750795808, + "grad_norm": 0.10319394618272781, + "learning_rate": 1.934e-05, + "loss": 1.3831, + "step": 967 + }, + { + "epoch": 0.2883151212792494, + "grad_norm": 0.10459738224744797, + "learning_rate": 1.936e-05, + "loss": 1.4113, + "step": 968 + }, + { + "epoch": 0.28861296747891807, + "grad_norm": 0.10582360625267029, + "learning_rate": 1.938e-05, + "loss": 1.4131, + "step": 969 + }, + { + "epoch": 0.28891081367858673, + "grad_norm": 0.10391835868358612, + "learning_rate": 1.94e-05, + "loss": 1.4068, + "step": 970 + }, + { + "epoch": 0.2892086598782554, + "grad_norm": 0.1056375801563263, + "learning_rate": 1.942e-05, + "loss": 1.3891, + "step": 971 + }, + { + "epoch": 0.289506506077924, + "grad_norm": 0.10760103911161423, + "learning_rate": 1.944e-05, + "loss": 1.4115, + "step": 972 + }, + { + "epoch": 0.28980435227759266, + "grad_norm": 0.10337383300065994, + "learning_rate": 1.946e-05, + "loss": 1.3781, + "step": 973 + }, + { + "epoch": 0.2901021984772613, + "grad_norm": 0.1072370707988739, + "learning_rate": 1.948e-05, + "loss": 1.3953, + "step": 974 + }, + { + "epoch": 0.2904000446769299, + "grad_norm": 0.10368131101131439, + "learning_rate": 1.95e-05, + "loss": 1.4046, + "step": 975 + }, + { + "epoch": 0.2906978908765986, + "grad_norm": 0.10708404332399368, + "learning_rate": 1.9520000000000003e-05, + "loss": 1.3919, + "step": 976 + }, + { + "epoch": 0.29099573707626725, + "grad_norm": 0.1009281799197197, + "learning_rate": 1.9540000000000003e-05, + "loss": 1.4213, + "step": 977 + }, + { + "epoch": 0.2912935832759359, + "grad_norm": 0.10308690369129181, + "learning_rate": 1.9560000000000002e-05, + "loss": 1.3902, + "step": 978 + }, + { + "epoch": 0.2915914294756045, + "grad_norm": 0.11442205309867859, + "learning_rate": 1.9580000000000002e-05, + "loss": 1.3841, + "step": 979 + }, + { + "epoch": 0.2918892756752732, + "grad_norm": 0.11296455562114716, + "learning_rate": 1.9600000000000002e-05, + "loss": 1.3842, + "step": 980 + }, + { + "epoch": 0.29218712187494184, + "grad_norm": 0.10912510752677917, + "learning_rate": 1.9620000000000002e-05, + "loss": 1.3975, + "step": 981 + }, + { + "epoch": 0.2924849680746105, + "grad_norm": 0.11549566686153412, + "learning_rate": 1.9640000000000002e-05, + "loss": 1.3844, + "step": 982 + }, + { + "epoch": 0.2927828142742791, + "grad_norm": 0.10916072130203247, + "learning_rate": 1.966e-05, + "loss": 1.399, + "step": 983 + }, + { + "epoch": 0.29308066047394776, + "grad_norm": 0.12139784544706345, + "learning_rate": 1.968e-05, + "loss": 1.3983, + "step": 984 + }, + { + "epoch": 0.2933785066736164, + "grad_norm": 0.11861048638820648, + "learning_rate": 1.97e-05, + "loss": 1.4091, + "step": 985 + }, + { + "epoch": 0.2936763528732851, + "grad_norm": 0.11576896905899048, + "learning_rate": 1.972e-05, + "loss": 1.4114, + "step": 986 + }, + { + "epoch": 0.2939741990729537, + "grad_norm": 0.11256488412618637, + "learning_rate": 1.974e-05, + "loss": 1.4046, + "step": 987 + }, + { + "epoch": 0.29427204527262235, + "grad_norm": 0.11060027033090591, + "learning_rate": 1.976e-05, + "loss": 1.3987, + "step": 988 + }, + { + "epoch": 0.294569891472291, + "grad_norm": 0.10744709521532059, + "learning_rate": 1.978e-05, + "loss": 1.3953, + "step": 989 + }, + { + "epoch": 0.2948677376719596, + "grad_norm": 0.109938845038414, + "learning_rate": 1.98e-05, + "loss": 1.3942, + "step": 990 + }, + { + "epoch": 0.2951655838716283, + "grad_norm": 0.12279019504785538, + "learning_rate": 1.982e-05, + "loss": 1.3926, + "step": 991 + }, + { + "epoch": 0.29546343007129694, + "grad_norm": 0.1063704863190651, + "learning_rate": 1.9840000000000003e-05, + "loss": 1.401, + "step": 992 + }, + { + "epoch": 0.2957612762709656, + "grad_norm": 0.11839362978935242, + "learning_rate": 1.9860000000000003e-05, + "loss": 1.401, + "step": 993 + }, + { + "epoch": 0.2960591224706342, + "grad_norm": 0.11602285504341125, + "learning_rate": 1.9880000000000003e-05, + "loss": 1.3978, + "step": 994 + }, + { + "epoch": 0.29635696867030287, + "grad_norm": 0.11820555478334427, + "learning_rate": 1.9900000000000003e-05, + "loss": 1.3963, + "step": 995 + }, + { + "epoch": 0.29665481486997153, + "grad_norm": 0.10548858344554901, + "learning_rate": 1.9920000000000002e-05, + "loss": 1.3883, + "step": 996 + }, + { + "epoch": 0.2969526610696402, + "grad_norm": 0.9518315196037292, + "learning_rate": 1.9940000000000002e-05, + "loss": 1.4016, + "step": 997 + }, + { + "epoch": 0.2972505072693088, + "grad_norm": 0.11482294648885727, + "learning_rate": 1.9960000000000002e-05, + "loss": 1.3948, + "step": 998 + }, + { + "epoch": 0.29754835346897746, + "grad_norm": 0.12062684446573257, + "learning_rate": 1.9980000000000002e-05, + "loss": 1.4016, + "step": 999 + }, + { + "epoch": 0.2978461996686461, + "grad_norm": 0.12081863731145859, + "learning_rate": 2e-05, + "loss": 1.4047, + "step": 1000 + }, + { + "epoch": 0.2978461996686461, + "eval_loss": 1.397951602935791, + "eval_runtime": 18.6121, + "eval_samples_per_second": 93.165, + "eval_steps_per_second": 5.856, + "step": 1000 + }, + { + "epoch": 0.2981440458683147, + "grad_norm": 0.11395775526762009, + "learning_rate": 1.9999999953480586e-05, + "loss": 1.3953, + "step": 1001 + }, + { + "epoch": 0.2984418920679834, + "grad_norm": 0.12259924411773682, + "learning_rate": 1.9999999813922347e-05, + "loss": 1.4078, + "step": 1002 + }, + { + "epoch": 0.29873973826765204, + "grad_norm": 0.12151668965816498, + "learning_rate": 1.999999958132528e-05, + "loss": 1.3956, + "step": 1003 + }, + { + "epoch": 0.2990375844673207, + "grad_norm": 0.11236704140901566, + "learning_rate": 1.9999999255689393e-05, + "loss": 1.3904, + "step": 1004 + }, + { + "epoch": 0.2993354306669893, + "grad_norm": 0.11873602867126465, + "learning_rate": 1.999999883701468e-05, + "loss": 1.384, + "step": 1005 + }, + { + "epoch": 0.29963327686665797, + "grad_norm": 0.12156610190868378, + "learning_rate": 1.9999998325301156e-05, + "loss": 1.3934, + "step": 1006 + }, + { + "epoch": 0.29993112306632663, + "grad_norm": 0.11415664851665497, + "learning_rate": 1.9999997720548817e-05, + "loss": 1.3992, + "step": 1007 + }, + { + "epoch": 0.3002289692659953, + "grad_norm": 0.11592312157154083, + "learning_rate": 1.9999997022757675e-05, + "loss": 1.4118, + "step": 1008 + }, + { + "epoch": 0.3005268154656639, + "grad_norm": 0.12188813090324402, + "learning_rate": 1.999999623192773e-05, + "loss": 1.3994, + "step": 1009 + }, + { + "epoch": 0.30082466166533256, + "grad_norm": 0.11910311132669449, + "learning_rate": 1.9999995348058992e-05, + "loss": 1.4028, + "step": 1010 + }, + { + "epoch": 0.3011225078650012, + "grad_norm": 0.10839038342237473, + "learning_rate": 1.9999994371151472e-05, + "loss": 1.3934, + "step": 1011 + }, + { + "epoch": 0.3014203540646699, + "grad_norm": 0.12016301602125168, + "learning_rate": 1.9999993301205176e-05, + "loss": 1.3969, + "step": 1012 + }, + { + "epoch": 0.3017182002643385, + "grad_norm": 0.1121194139122963, + "learning_rate": 1.9999992138220117e-05, + "loss": 1.4116, + "step": 1013 + }, + { + "epoch": 0.30201604646400715, + "grad_norm": 0.11905040591955185, + "learning_rate": 1.99999908821963e-05, + "loss": 1.4097, + "step": 1014 + }, + { + "epoch": 0.3023138926636758, + "grad_norm": 0.11118368059396744, + "learning_rate": 1.9999989533133743e-05, + "loss": 1.3893, + "step": 1015 + }, + { + "epoch": 0.3026117388633444, + "grad_norm": 0.11388043314218521, + "learning_rate": 1.9999988091032456e-05, + "loss": 1.4, + "step": 1016 + }, + { + "epoch": 0.3029095850630131, + "grad_norm": 0.11436357349157333, + "learning_rate": 1.9999986555892453e-05, + "loss": 1.4029, + "step": 1017 + }, + { + "epoch": 0.30320743126268174, + "grad_norm": 0.10592655092477798, + "learning_rate": 1.9999984927713748e-05, + "loss": 1.4006, + "step": 1018 + }, + { + "epoch": 0.3035052774623504, + "grad_norm": 0.1133023053407669, + "learning_rate": 1.9999983206496355e-05, + "loss": 1.3881, + "step": 1019 + }, + { + "epoch": 0.303803123662019, + "grad_norm": 0.1287492960691452, + "learning_rate": 1.999998139224029e-05, + "loss": 1.3987, + "step": 1020 + }, + { + "epoch": 0.30410096986168766, + "grad_norm": 0.11255930364131927, + "learning_rate": 1.999997948494557e-05, + "loss": 1.3882, + "step": 1021 + }, + { + "epoch": 0.3043988160613563, + "grad_norm": 0.11940372735261917, + "learning_rate": 1.9999977484612217e-05, + "loss": 1.3873, + "step": 1022 + }, + { + "epoch": 0.304696662261025, + "grad_norm": 0.1144757941365242, + "learning_rate": 1.9999975391240242e-05, + "loss": 1.3942, + "step": 1023 + }, + { + "epoch": 0.3049945084606936, + "grad_norm": 0.1100514829158783, + "learning_rate": 1.999997320482967e-05, + "loss": 1.4043, + "step": 1024 + }, + { + "epoch": 0.30529235466036225, + "grad_norm": 0.11846914142370224, + "learning_rate": 1.9999970925380526e-05, + "loss": 1.3971, + "step": 1025 + }, + { + "epoch": 0.3055902008600309, + "grad_norm": 0.12528426945209503, + "learning_rate": 1.9999968552892815e-05, + "loss": 1.3991, + "step": 1026 + }, + { + "epoch": 0.3058880470596996, + "grad_norm": 0.1267407387495041, + "learning_rate": 1.9999966087366575e-05, + "loss": 1.3861, + "step": 1027 + }, + { + "epoch": 0.3061858932593682, + "grad_norm": 0.10988324135541916, + "learning_rate": 1.9999963528801826e-05, + "loss": 1.4037, + "step": 1028 + }, + { + "epoch": 0.30648373945903684, + "grad_norm": 0.12478654831647873, + "learning_rate": 1.9999960877198585e-05, + "loss": 1.3927, + "step": 1029 + }, + { + "epoch": 0.3067815856587055, + "grad_norm": 0.11915039271116257, + "learning_rate": 1.9999958132556882e-05, + "loss": 1.3863, + "step": 1030 + }, + { + "epoch": 0.3070794318583741, + "grad_norm": 0.11628096550703049, + "learning_rate": 1.9999955294876738e-05, + "loss": 1.3958, + "step": 1031 + }, + { + "epoch": 0.30737727805804277, + "grad_norm": 0.11177193373441696, + "learning_rate": 1.9999952364158184e-05, + "loss": 1.3972, + "step": 1032 + }, + { + "epoch": 0.30767512425771143, + "grad_norm": 0.11739183962345123, + "learning_rate": 1.999994934040125e-05, + "loss": 1.3969, + "step": 1033 + }, + { + "epoch": 0.3079729704573801, + "grad_norm": 0.10952276736497879, + "learning_rate": 1.9999946223605955e-05, + "loss": 1.3966, + "step": 1034 + }, + { + "epoch": 0.3082708166570487, + "grad_norm": 0.11301985383033752, + "learning_rate": 1.9999943013772335e-05, + "loss": 1.4064, + "step": 1035 + }, + { + "epoch": 0.30856866285671736, + "grad_norm": 0.12167651951313019, + "learning_rate": 1.999993971090042e-05, + "loss": 1.4039, + "step": 1036 + }, + { + "epoch": 0.308866509056386, + "grad_norm": 0.1168079599738121, + "learning_rate": 1.9999936314990235e-05, + "loss": 1.4044, + "step": 1037 + }, + { + "epoch": 0.3091643552560547, + "grad_norm": 0.11724202334880829, + "learning_rate": 1.9999932826041813e-05, + "loss": 1.3941, + "step": 1038 + }, + { + "epoch": 0.3094622014557233, + "grad_norm": 0.117185078561306, + "learning_rate": 1.9999929244055194e-05, + "loss": 1.394, + "step": 1039 + }, + { + "epoch": 0.30976004765539195, + "grad_norm": 0.1150064691901207, + "learning_rate": 1.9999925569030405e-05, + "loss": 1.3928, + "step": 1040 + }, + { + "epoch": 0.3100578938550606, + "grad_norm": 0.11486277729272842, + "learning_rate": 1.999992180096748e-05, + "loss": 1.3843, + "step": 1041 + }, + { + "epoch": 0.3103557400547292, + "grad_norm": 0.1181897297501564, + "learning_rate": 1.9999917939866455e-05, + "loss": 1.3904, + "step": 1042 + }, + { + "epoch": 0.3106535862543979, + "grad_norm": 0.11689642071723938, + "learning_rate": 1.9999913985727362e-05, + "loss": 1.366, + "step": 1043 + }, + { + "epoch": 0.31095143245406653, + "grad_norm": 0.10985434800386429, + "learning_rate": 1.9999909938550252e-05, + "loss": 1.4004, + "step": 1044 + }, + { + "epoch": 0.3112492786537352, + "grad_norm": 0.11144163459539413, + "learning_rate": 1.9999905798335148e-05, + "loss": 1.3762, + "step": 1045 + }, + { + "epoch": 0.3115471248534038, + "grad_norm": 0.11040697246789932, + "learning_rate": 1.9999901565082087e-05, + "loss": 1.391, + "step": 1046 + }, + { + "epoch": 0.31184497105307246, + "grad_norm": 0.11224711686372757, + "learning_rate": 1.999989723879112e-05, + "loss": 1.4161, + "step": 1047 + }, + { + "epoch": 0.3121428172527411, + "grad_norm": 0.1203780397772789, + "learning_rate": 1.9999892819462282e-05, + "loss": 1.3899, + "step": 1048 + }, + { + "epoch": 0.3124406634524098, + "grad_norm": 0.11567550152540207, + "learning_rate": 1.9999888307095615e-05, + "loss": 1.384, + "step": 1049 + }, + { + "epoch": 0.3127385096520784, + "grad_norm": 0.12175623327493668, + "learning_rate": 1.9999883701691155e-05, + "loss": 1.3764, + "step": 1050 + }, + { + "epoch": 0.31303635585174705, + "grad_norm": 0.11729805171489716, + "learning_rate": 1.9999879003248955e-05, + "loss": 1.3944, + "step": 1051 + }, + { + "epoch": 0.3133342020514157, + "grad_norm": 0.11439957469701767, + "learning_rate": 1.999987421176905e-05, + "loss": 1.3871, + "step": 1052 + }, + { + "epoch": 0.31363204825108437, + "grad_norm": 0.11447083204984665, + "learning_rate": 1.9999869327251487e-05, + "loss": 1.3765, + "step": 1053 + }, + { + "epoch": 0.313929894450753, + "grad_norm": 0.11776264011859894, + "learning_rate": 1.9999864349696315e-05, + "loss": 1.3945, + "step": 1054 + }, + { + "epoch": 0.31422774065042164, + "grad_norm": 0.1123151183128357, + "learning_rate": 1.9999859279103576e-05, + "loss": 1.3948, + "step": 1055 + }, + { + "epoch": 0.3145255868500903, + "grad_norm": 0.11624394357204437, + "learning_rate": 1.9999854115473324e-05, + "loss": 1.4, + "step": 1056 + }, + { + "epoch": 0.3148234330497589, + "grad_norm": 0.11331585049629211, + "learning_rate": 1.9999848858805596e-05, + "loss": 1.4026, + "step": 1057 + }, + { + "epoch": 0.31512127924942757, + "grad_norm": 0.11557549983263016, + "learning_rate": 1.999984350910045e-05, + "loss": 1.3642, + "step": 1058 + }, + { + "epoch": 0.3154191254490962, + "grad_norm": 0.1166551485657692, + "learning_rate": 1.9999838066357932e-05, + "loss": 1.3858, + "step": 1059 + }, + { + "epoch": 0.3157169716487649, + "grad_norm": 0.11316041648387909, + "learning_rate": 1.9999832530578093e-05, + "loss": 1.392, + "step": 1060 + }, + { + "epoch": 0.3160148178484335, + "grad_norm": 0.1141393855214119, + "learning_rate": 1.9999826901760985e-05, + "loss": 1.3801, + "step": 1061 + }, + { + "epoch": 0.31631266404810215, + "grad_norm": 0.11633685976266861, + "learning_rate": 1.999982117990666e-05, + "loss": 1.3999, + "step": 1062 + }, + { + "epoch": 0.3166105102477708, + "grad_norm": 0.11223947256803513, + "learning_rate": 1.999981536501517e-05, + "loss": 1.4016, + "step": 1063 + }, + { + "epoch": 0.3169083564474395, + "grad_norm": 0.11991715431213379, + "learning_rate": 1.999980945708657e-05, + "loss": 1.3807, + "step": 1064 + }, + { + "epoch": 0.3172062026471081, + "grad_norm": 0.11510065943002701, + "learning_rate": 1.999980345612092e-05, + "loss": 1.3888, + "step": 1065 + }, + { + "epoch": 0.31750404884677674, + "grad_norm": 0.11234598606824875, + "learning_rate": 1.9999797362118263e-05, + "loss": 1.3844, + "step": 1066 + }, + { + "epoch": 0.3178018950464454, + "grad_norm": 0.12312228232622147, + "learning_rate": 1.9999791175078674e-05, + "loss": 1.3841, + "step": 1067 + }, + { + "epoch": 0.31809974124611406, + "grad_norm": 0.11295381933450699, + "learning_rate": 1.9999784895002196e-05, + "loss": 1.3809, + "step": 1068 + }, + { + "epoch": 0.31839758744578267, + "grad_norm": 0.11573398858308792, + "learning_rate": 1.9999778521888892e-05, + "loss": 1.3875, + "step": 1069 + }, + { + "epoch": 0.31869543364545133, + "grad_norm": 0.11296920478343964, + "learning_rate": 1.999977205573882e-05, + "loss": 1.3746, + "step": 1070 + }, + { + "epoch": 0.31899327984512, + "grad_norm": 0.11616770923137665, + "learning_rate": 1.999976549655204e-05, + "loss": 1.3809, + "step": 1071 + }, + { + "epoch": 0.3192911260447886, + "grad_norm": 0.10907561331987381, + "learning_rate": 1.9999758844328618e-05, + "loss": 1.3877, + "step": 1072 + }, + { + "epoch": 0.31958897224445726, + "grad_norm": 0.13153153657913208, + "learning_rate": 1.999975209906861e-05, + "loss": 1.3811, + "step": 1073 + }, + { + "epoch": 0.3198868184441259, + "grad_norm": 0.11982592195272446, + "learning_rate": 1.9999745260772087e-05, + "loss": 1.3885, + "step": 1074 + }, + { + "epoch": 0.3201846646437946, + "grad_norm": 0.12647011876106262, + "learning_rate": 1.9999738329439097e-05, + "loss": 1.3951, + "step": 1075 + }, + { + "epoch": 0.3204825108434632, + "grad_norm": 0.11812597513198853, + "learning_rate": 1.9999731305069723e-05, + "loss": 1.3965, + "step": 1076 + }, + { + "epoch": 0.32078035704313185, + "grad_norm": 0.11641401052474976, + "learning_rate": 1.999972418766402e-05, + "loss": 1.4019, + "step": 1077 + }, + { + "epoch": 0.3210782032428005, + "grad_norm": 0.12187743186950684, + "learning_rate": 1.999971697722205e-05, + "loss": 1.4009, + "step": 1078 + }, + { + "epoch": 0.32137604944246917, + "grad_norm": 0.12092571705579758, + "learning_rate": 1.999970967374389e-05, + "loss": 1.3813, + "step": 1079 + }, + { + "epoch": 0.3216738956421378, + "grad_norm": 0.12640085816383362, + "learning_rate": 1.9999702277229604e-05, + "loss": 1.4036, + "step": 1080 + }, + { + "epoch": 0.32197174184180644, + "grad_norm": 0.13022305071353912, + "learning_rate": 1.999969478767926e-05, + "loss": 1.4151, + "step": 1081 + }, + { + "epoch": 0.3222695880414751, + "grad_norm": 0.11454661935567856, + "learning_rate": 1.9999687205092926e-05, + "loss": 1.3837, + "step": 1082 + }, + { + "epoch": 0.3225674342411437, + "grad_norm": 0.12185961753129959, + "learning_rate": 1.999967952947068e-05, + "loss": 1.3805, + "step": 1083 + }, + { + "epoch": 0.32286528044081236, + "grad_norm": 0.12060558050870895, + "learning_rate": 1.9999671760812584e-05, + "loss": 1.3811, + "step": 1084 + }, + { + "epoch": 0.323163126640481, + "grad_norm": 0.12417693436145782, + "learning_rate": 1.9999663899118714e-05, + "loss": 1.3885, + "step": 1085 + }, + { + "epoch": 0.3234609728401497, + "grad_norm": 0.1290057897567749, + "learning_rate": 1.9999655944389147e-05, + "loss": 1.399, + "step": 1086 + }, + { + "epoch": 0.3237588190398183, + "grad_norm": 0.12008559703826904, + "learning_rate": 1.9999647896623954e-05, + "loss": 1.3769, + "step": 1087 + }, + { + "epoch": 0.32405666523948695, + "grad_norm": 0.11745686084032059, + "learning_rate": 1.9999639755823207e-05, + "loss": 1.3899, + "step": 1088 + }, + { + "epoch": 0.3243545114391556, + "grad_norm": 0.1250544935464859, + "learning_rate": 1.9999631521986983e-05, + "loss": 1.3873, + "step": 1089 + }, + { + "epoch": 0.3246523576388243, + "grad_norm": 0.11275584250688553, + "learning_rate": 1.9999623195115364e-05, + "loss": 1.3632, + "step": 1090 + }, + { + "epoch": 0.3249502038384929, + "grad_norm": 0.1159360483288765, + "learning_rate": 1.999961477520842e-05, + "loss": 1.3642, + "step": 1091 + }, + { + "epoch": 0.32524805003816154, + "grad_norm": 0.11819259077310562, + "learning_rate": 1.999960626226624e-05, + "loss": 1.3717, + "step": 1092 + }, + { + "epoch": 0.3255458962378302, + "grad_norm": 0.124155193567276, + "learning_rate": 1.9999597656288886e-05, + "loss": 1.4006, + "step": 1093 + }, + { + "epoch": 0.32584374243749886, + "grad_norm": 0.1236296072602272, + "learning_rate": 1.9999588957276455e-05, + "loss": 1.3903, + "step": 1094 + }, + { + "epoch": 0.32614158863716747, + "grad_norm": 0.11855696886777878, + "learning_rate": 1.999958016522902e-05, + "loss": 1.3944, + "step": 1095 + }, + { + "epoch": 0.32643943483683613, + "grad_norm": 0.11899484694004059, + "learning_rate": 1.999957128014666e-05, + "loss": 1.385, + "step": 1096 + }, + { + "epoch": 0.3267372810365048, + "grad_norm": 0.11923281103372574, + "learning_rate": 1.9999562302029462e-05, + "loss": 1.4006, + "step": 1097 + }, + { + "epoch": 0.3270351272361734, + "grad_norm": 0.1309370994567871, + "learning_rate": 1.999955323087751e-05, + "loss": 1.4025, + "step": 1098 + }, + { + "epoch": 0.32733297343584206, + "grad_norm": 0.11568192392587662, + "learning_rate": 1.999954406669089e-05, + "loss": 1.3911, + "step": 1099 + }, + { + "epoch": 0.3276308196355107, + "grad_norm": 0.12005109339952469, + "learning_rate": 1.999953480946968e-05, + "loss": 1.3857, + "step": 1100 + }, + { + "epoch": 0.3279286658351794, + "grad_norm": 0.12727560102939606, + "learning_rate": 1.9999525459213975e-05, + "loss": 1.3974, + "step": 1101 + }, + { + "epoch": 0.328226512034848, + "grad_norm": 0.12770545482635498, + "learning_rate": 1.9999516015923854e-05, + "loss": 1.3924, + "step": 1102 + }, + { + "epoch": 0.32852435823451664, + "grad_norm": 0.11987710744142532, + "learning_rate": 1.999950647959941e-05, + "loss": 1.3799, + "step": 1103 + }, + { + "epoch": 0.3288222044341853, + "grad_norm": 0.11894366145133972, + "learning_rate": 1.9999496850240732e-05, + "loss": 1.3796, + "step": 1104 + }, + { + "epoch": 0.32912005063385397, + "grad_norm": 0.1279422640800476, + "learning_rate": 1.999948712784791e-05, + "loss": 1.3883, + "step": 1105 + }, + { + "epoch": 0.32941789683352257, + "grad_norm": 0.11908257007598877, + "learning_rate": 1.9999477312421026e-05, + "loss": 1.3911, + "step": 1106 + }, + { + "epoch": 0.32971574303319123, + "grad_norm": 0.1226000040769577, + "learning_rate": 1.999946740396018e-05, + "loss": 1.385, + "step": 1107 + }, + { + "epoch": 0.3300135892328599, + "grad_norm": 0.1315961480140686, + "learning_rate": 1.999945740246546e-05, + "loss": 1.3763, + "step": 1108 + }, + { + "epoch": 0.33031143543252856, + "grad_norm": 0.12968845665454865, + "learning_rate": 1.9999447307936967e-05, + "loss": 1.3814, + "step": 1109 + }, + { + "epoch": 0.33060928163219716, + "grad_norm": 0.14246723055839539, + "learning_rate": 1.9999437120374784e-05, + "loss": 1.3984, + "step": 1110 + }, + { + "epoch": 0.3309071278318658, + "grad_norm": 0.12243639677762985, + "learning_rate": 1.9999426839779013e-05, + "loss": 1.3754, + "step": 1111 + }, + { + "epoch": 0.3312049740315345, + "grad_norm": 0.13210417330265045, + "learning_rate": 1.9999416466149747e-05, + "loss": 1.3841, + "step": 1112 + }, + { + "epoch": 0.3315028202312031, + "grad_norm": 0.12163086235523224, + "learning_rate": 1.999940599948708e-05, + "loss": 1.3716, + "step": 1113 + }, + { + "epoch": 0.33180066643087175, + "grad_norm": 0.12960121035575867, + "learning_rate": 1.9999395439791117e-05, + "loss": 1.3929, + "step": 1114 + }, + { + "epoch": 0.3320985126305404, + "grad_norm": 0.13061104714870453, + "learning_rate": 1.9999384787061947e-05, + "loss": 1.3651, + "step": 1115 + }, + { + "epoch": 0.33239635883020907, + "grad_norm": 0.12969915568828583, + "learning_rate": 1.9999374041299676e-05, + "loss": 1.3643, + "step": 1116 + }, + { + "epoch": 0.3326942050298777, + "grad_norm": 0.12098529189825058, + "learning_rate": 1.99993632025044e-05, + "loss": 1.3715, + "step": 1117 + }, + { + "epoch": 0.33299205122954634, + "grad_norm": 0.1331017166376114, + "learning_rate": 1.999935227067622e-05, + "loss": 1.4062, + "step": 1118 + }, + { + "epoch": 0.333289897429215, + "grad_norm": 0.1369689255952835, + "learning_rate": 1.9999341245815242e-05, + "loss": 1.3818, + "step": 1119 + }, + { + "epoch": 0.33358774362888366, + "grad_norm": 0.12456752359867096, + "learning_rate": 1.9999330127921564e-05, + "loss": 1.3734, + "step": 1120 + }, + { + "epoch": 0.33388558982855226, + "grad_norm": 0.11823807656764984, + "learning_rate": 1.9999318916995293e-05, + "loss": 1.3964, + "step": 1121 + }, + { + "epoch": 0.3341834360282209, + "grad_norm": 0.12144974619150162, + "learning_rate": 1.999930761303653e-05, + "loss": 1.376, + "step": 1122 + }, + { + "epoch": 0.3344812822278896, + "grad_norm": 0.13049054145812988, + "learning_rate": 1.999929621604538e-05, + "loss": 1.3864, + "step": 1123 + }, + { + "epoch": 0.3347791284275582, + "grad_norm": 0.12313073873519897, + "learning_rate": 1.9999284726021954e-05, + "loss": 1.388, + "step": 1124 + }, + { + "epoch": 0.33507697462722685, + "grad_norm": 0.12566570937633514, + "learning_rate": 1.9999273142966354e-05, + "loss": 1.3871, + "step": 1125 + }, + { + "epoch": 0.3353748208268955, + "grad_norm": 0.11997831612825394, + "learning_rate": 1.999926146687869e-05, + "loss": 1.3915, + "step": 1126 + }, + { + "epoch": 0.3356726670265642, + "grad_norm": 0.12298216670751572, + "learning_rate": 1.9999249697759068e-05, + "loss": 1.3803, + "step": 1127 + }, + { + "epoch": 0.3359705132262328, + "grad_norm": 0.1251022070646286, + "learning_rate": 1.99992378356076e-05, + "loss": 1.3808, + "step": 1128 + }, + { + "epoch": 0.33626835942590144, + "grad_norm": 0.12131714075803757, + "learning_rate": 1.9999225880424397e-05, + "loss": 1.3875, + "step": 1129 + }, + { + "epoch": 0.3365662056255701, + "grad_norm": 0.1261511743068695, + "learning_rate": 1.9999213832209568e-05, + "loss": 1.3805, + "step": 1130 + }, + { + "epoch": 0.33686405182523876, + "grad_norm": 0.11724023520946503, + "learning_rate": 1.9999201690963223e-05, + "loss": 1.3816, + "step": 1131 + }, + { + "epoch": 0.33716189802490737, + "grad_norm": 0.12367585301399231, + "learning_rate": 1.999918945668548e-05, + "loss": 1.4037, + "step": 1132 + }, + { + "epoch": 0.33745974422457603, + "grad_norm": 0.12494815140962601, + "learning_rate": 1.9999177129376456e-05, + "loss": 1.3864, + "step": 1133 + }, + { + "epoch": 0.3377575904242447, + "grad_norm": 0.122878797352314, + "learning_rate": 1.9999164709036255e-05, + "loss": 1.3728, + "step": 1134 + }, + { + "epoch": 0.33805543662391335, + "grad_norm": 0.12429952621459961, + "learning_rate": 1.9999152195665e-05, + "loss": 1.362, + "step": 1135 + }, + { + "epoch": 0.33835328282358196, + "grad_norm": 0.12699133157730103, + "learning_rate": 1.9999139589262806e-05, + "loss": 1.3543, + "step": 1136 + }, + { + "epoch": 0.3386511290232506, + "grad_norm": 0.12095256894826889, + "learning_rate": 1.999912688982979e-05, + "loss": 1.3691, + "step": 1137 + }, + { + "epoch": 0.3389489752229193, + "grad_norm": 0.12273938208818436, + "learning_rate": 1.9999114097366066e-05, + "loss": 1.3842, + "step": 1138 + }, + { + "epoch": 0.3392468214225879, + "grad_norm": 0.12688860297203064, + "learning_rate": 1.9999101211871762e-05, + "loss": 1.3776, + "step": 1139 + }, + { + "epoch": 0.33954466762225655, + "grad_norm": 0.11873604357242584, + "learning_rate": 1.999908823334699e-05, + "loss": 1.3854, + "step": 1140 + }, + { + "epoch": 0.3398425138219252, + "grad_norm": 0.12411191314458847, + "learning_rate": 1.9999075161791873e-05, + "loss": 1.3716, + "step": 1141 + }, + { + "epoch": 0.34014036002159387, + "grad_norm": 0.11912590265274048, + "learning_rate": 1.999906199720654e-05, + "loss": 1.3823, + "step": 1142 + }, + { + "epoch": 0.3404382062212625, + "grad_norm": 0.11869439482688904, + "learning_rate": 1.9999048739591094e-05, + "loss": 1.3851, + "step": 1143 + }, + { + "epoch": 0.34073605242093113, + "grad_norm": 0.12020917981863022, + "learning_rate": 1.999903538894568e-05, + "loss": 1.3835, + "step": 1144 + }, + { + "epoch": 0.3410338986205998, + "grad_norm": 0.12506596744060516, + "learning_rate": 1.9999021945270412e-05, + "loss": 1.3816, + "step": 1145 + }, + { + "epoch": 0.34133174482026846, + "grad_norm": 0.13336603343486786, + "learning_rate": 1.9999008408565415e-05, + "loss": 1.3954, + "step": 1146 + }, + { + "epoch": 0.34162959101993706, + "grad_norm": 0.12509402632713318, + "learning_rate": 1.999899477883082e-05, + "loss": 1.3796, + "step": 1147 + }, + { + "epoch": 0.3419274372196057, + "grad_norm": 0.12254216521978378, + "learning_rate": 1.9998981056066745e-05, + "loss": 1.3821, + "step": 1148 + }, + { + "epoch": 0.3422252834192744, + "grad_norm": 0.1316823810338974, + "learning_rate": 1.999896724027333e-05, + "loss": 1.3809, + "step": 1149 + }, + { + "epoch": 0.34252312961894305, + "grad_norm": 0.12617230415344238, + "learning_rate": 1.999895333145069e-05, + "loss": 1.3716, + "step": 1150 + }, + { + "epoch": 0.34282097581861165, + "grad_norm": 0.12473230063915253, + "learning_rate": 1.9998939329598964e-05, + "loss": 1.3745, + "step": 1151 + }, + { + "epoch": 0.3431188220182803, + "grad_norm": 0.12697988748550415, + "learning_rate": 1.9998925234718275e-05, + "loss": 1.3707, + "step": 1152 + }, + { + "epoch": 0.343416668217949, + "grad_norm": 0.12683434784412384, + "learning_rate": 1.999891104680876e-05, + "loss": 1.3791, + "step": 1153 + }, + { + "epoch": 0.3437145144176176, + "grad_norm": 0.13295918703079224, + "learning_rate": 1.999889676587055e-05, + "loss": 1.3924, + "step": 1154 + }, + { + "epoch": 0.34401236061728624, + "grad_norm": 0.12189807742834091, + "learning_rate": 1.9998882391903778e-05, + "loss": 1.3818, + "step": 1155 + }, + { + "epoch": 0.3443102068169549, + "grad_norm": 0.12889567017555237, + "learning_rate": 1.9998867924908576e-05, + "loss": 1.3945, + "step": 1156 + }, + { + "epoch": 0.34460805301662356, + "grad_norm": 0.14350058138370514, + "learning_rate": 1.9998853364885077e-05, + "loss": 1.3862, + "step": 1157 + }, + { + "epoch": 0.34490589921629217, + "grad_norm": 0.13133090734481812, + "learning_rate": 1.999883871183342e-05, + "loss": 1.3731, + "step": 1158 + }, + { + "epoch": 0.3452037454159608, + "grad_norm": 0.12137048691511154, + "learning_rate": 1.999882396575374e-05, + "loss": 1.3666, + "step": 1159 + }, + { + "epoch": 0.3455015916156295, + "grad_norm": 0.12902837991714478, + "learning_rate": 1.9998809126646178e-05, + "loss": 1.3913, + "step": 1160 + }, + { + "epoch": 0.34579943781529815, + "grad_norm": 0.1265643686056137, + "learning_rate": 1.9998794194510863e-05, + "loss": 1.412, + "step": 1161 + }, + { + "epoch": 0.34609728401496676, + "grad_norm": 0.11998944729566574, + "learning_rate": 1.9998779169347942e-05, + "loss": 1.3625, + "step": 1162 + }, + { + "epoch": 0.3463951302146354, + "grad_norm": 0.12442991882562637, + "learning_rate": 1.999876405115755e-05, + "loss": 1.3628, + "step": 1163 + }, + { + "epoch": 0.3466929764143041, + "grad_norm": 0.12390732765197754, + "learning_rate": 1.9998748839939836e-05, + "loss": 1.3678, + "step": 1164 + }, + { + "epoch": 0.3469908226139727, + "grad_norm": 0.1266006976366043, + "learning_rate": 1.999873353569493e-05, + "loss": 1.3724, + "step": 1165 + }, + { + "epoch": 0.34728866881364134, + "grad_norm": 0.12780870497226715, + "learning_rate": 1.9998718138422983e-05, + "loss": 1.3719, + "step": 1166 + }, + { + "epoch": 0.34758651501331, + "grad_norm": 0.12996380031108856, + "learning_rate": 1.999870264812413e-05, + "loss": 1.387, + "step": 1167 + }, + { + "epoch": 0.34788436121297867, + "grad_norm": 0.12336032092571259, + "learning_rate": 1.9998687064798522e-05, + "loss": 1.3861, + "step": 1168 + }, + { + "epoch": 0.34818220741264727, + "grad_norm": 0.13241474330425262, + "learning_rate": 1.9998671388446304e-05, + "loss": 1.3868, + "step": 1169 + }, + { + "epoch": 0.34848005361231593, + "grad_norm": 0.12734943628311157, + "learning_rate": 1.999865561906762e-05, + "loss": 1.3959, + "step": 1170 + }, + { + "epoch": 0.3487778998119846, + "grad_norm": 0.12355831265449524, + "learning_rate": 1.9998639756662614e-05, + "loss": 1.3676, + "step": 1171 + }, + { + "epoch": 0.34907574601165325, + "grad_norm": 0.12056966125965118, + "learning_rate": 1.9998623801231438e-05, + "loss": 1.3849, + "step": 1172 + }, + { + "epoch": 0.34937359221132186, + "grad_norm": 0.1257609874010086, + "learning_rate": 1.9998607752774238e-05, + "loss": 1.3746, + "step": 1173 + }, + { + "epoch": 0.3496714384109905, + "grad_norm": 0.12630680203437805, + "learning_rate": 1.9998591611291166e-05, + "loss": 1.3744, + "step": 1174 + }, + { + "epoch": 0.3499692846106592, + "grad_norm": 0.13128453493118286, + "learning_rate": 1.9998575376782366e-05, + "loss": 1.3711, + "step": 1175 + }, + { + "epoch": 0.35026713081032784, + "grad_norm": 0.12793263792991638, + "learning_rate": 1.9998559049247996e-05, + "loss": 1.366, + "step": 1176 + }, + { + "epoch": 0.35056497700999645, + "grad_norm": 0.12537628412246704, + "learning_rate": 1.9998542628688206e-05, + "loss": 1.3642, + "step": 1177 + }, + { + "epoch": 0.3508628232096651, + "grad_norm": 0.1319340020418167, + "learning_rate": 1.9998526115103144e-05, + "loss": 1.3805, + "step": 1178 + }, + { + "epoch": 0.35116066940933377, + "grad_norm": 0.13178007304668427, + "learning_rate": 1.999850950849297e-05, + "loss": 1.3759, + "step": 1179 + }, + { + "epoch": 0.3514585156090024, + "grad_norm": 0.13513357937335968, + "learning_rate": 1.999849280885784e-05, + "loss": 1.3761, + "step": 1180 + }, + { + "epoch": 0.35175636180867104, + "grad_norm": 0.134556844830513, + "learning_rate": 1.99984760161979e-05, + "loss": 1.3881, + "step": 1181 + }, + { + "epoch": 0.3520542080083397, + "grad_norm": 0.13081508874893188, + "learning_rate": 1.9998459130513313e-05, + "loss": 1.376, + "step": 1182 + }, + { + "epoch": 0.35235205420800836, + "grad_norm": 0.1282830685377121, + "learning_rate": 1.9998442151804235e-05, + "loss": 1.3728, + "step": 1183 + }, + { + "epoch": 0.35264990040767696, + "grad_norm": 0.13131962716579437, + "learning_rate": 1.9998425080070824e-05, + "loss": 1.3728, + "step": 1184 + }, + { + "epoch": 0.3529477466073456, + "grad_norm": 0.12443128228187561, + "learning_rate": 1.9998407915313236e-05, + "loss": 1.3756, + "step": 1185 + }, + { + "epoch": 0.3532455928070143, + "grad_norm": 0.12644220888614655, + "learning_rate": 1.9998390657531637e-05, + "loss": 1.3736, + "step": 1186 + }, + { + "epoch": 0.35354343900668295, + "grad_norm": 0.13326533138751984, + "learning_rate": 1.999837330672618e-05, + "loss": 1.37, + "step": 1187 + }, + { + "epoch": 0.35384128520635155, + "grad_norm": 0.11835305392742157, + "learning_rate": 1.9998355862897032e-05, + "loss": 1.3899, + "step": 1188 + }, + { + "epoch": 0.3541391314060202, + "grad_norm": 0.12689045071601868, + "learning_rate": 1.9998338326044356e-05, + "loss": 1.3734, + "step": 1189 + }, + { + "epoch": 0.3544369776056889, + "grad_norm": 0.12657564878463745, + "learning_rate": 1.9998320696168308e-05, + "loss": 1.3877, + "step": 1190 + }, + { + "epoch": 0.35473482380535754, + "grad_norm": 0.12246771156787872, + "learning_rate": 1.9998302973269063e-05, + "loss": 1.3738, + "step": 1191 + }, + { + "epoch": 0.35503267000502614, + "grad_norm": 0.1292853057384491, + "learning_rate": 1.9998285157346772e-05, + "loss": 1.3936, + "step": 1192 + }, + { + "epoch": 0.3553305162046948, + "grad_norm": 0.13208384811878204, + "learning_rate": 1.999826724840161e-05, + "loss": 1.3598, + "step": 1193 + }, + { + "epoch": 0.35562836240436346, + "grad_norm": 0.13662226498126984, + "learning_rate": 1.9998249246433748e-05, + "loss": 1.3704, + "step": 1194 + }, + { + "epoch": 0.35592620860403207, + "grad_norm": 0.13780462741851807, + "learning_rate": 1.9998231151443344e-05, + "loss": 1.4053, + "step": 1195 + }, + { + "epoch": 0.35622405480370073, + "grad_norm": 0.1379765272140503, + "learning_rate": 1.999821296343057e-05, + "loss": 1.3878, + "step": 1196 + }, + { + "epoch": 0.3565219010033694, + "grad_norm": 0.13109084963798523, + "learning_rate": 1.9998194682395592e-05, + "loss": 1.3607, + "step": 1197 + }, + { + "epoch": 0.35681974720303805, + "grad_norm": 0.13432908058166504, + "learning_rate": 1.9998176308338582e-05, + "loss": 1.3725, + "step": 1198 + }, + { + "epoch": 0.35711759340270666, + "grad_norm": 0.13718818128108978, + "learning_rate": 1.9998157841259716e-05, + "loss": 1.3735, + "step": 1199 + }, + { + "epoch": 0.3574154396023753, + "grad_norm": 0.1316278874874115, + "learning_rate": 1.9998139281159164e-05, + "loss": 1.3601, + "step": 1200 + }, + { + "epoch": 0.357713285802044, + "grad_norm": 0.12941031157970428, + "learning_rate": 1.9998120628037094e-05, + "loss": 1.3809, + "step": 1201 + }, + { + "epoch": 0.35801113200171264, + "grad_norm": 0.13530942797660828, + "learning_rate": 1.9998101881893683e-05, + "loss": 1.3818, + "step": 1202 + }, + { + "epoch": 0.35830897820138125, + "grad_norm": 0.1344297230243683, + "learning_rate": 1.9998083042729103e-05, + "loss": 1.375, + "step": 1203 + }, + { + "epoch": 0.3586068244010499, + "grad_norm": 0.1414794921875, + "learning_rate": 1.999806411054353e-05, + "loss": 1.3912, + "step": 1204 + }, + { + "epoch": 0.35890467060071857, + "grad_norm": 0.14133290946483612, + "learning_rate": 1.9998045085337147e-05, + "loss": 1.3715, + "step": 1205 + }, + { + "epoch": 0.3592025168003872, + "grad_norm": 0.13531997799873352, + "learning_rate": 1.999802596711012e-05, + "loss": 1.3691, + "step": 1206 + }, + { + "epoch": 0.35950036300005583, + "grad_norm": 0.13153564929962158, + "learning_rate": 1.9998006755862628e-05, + "loss": 1.3804, + "step": 1207 + }, + { + "epoch": 0.3597982091997245, + "grad_norm": 0.12766240537166595, + "learning_rate": 1.999798745159486e-05, + "loss": 1.4068, + "step": 1208 + }, + { + "epoch": 0.36009605539939316, + "grad_norm": 0.1314178705215454, + "learning_rate": 1.9997968054306985e-05, + "loss": 1.3712, + "step": 1209 + }, + { + "epoch": 0.36039390159906176, + "grad_norm": 0.14264385402202606, + "learning_rate": 1.9997948563999194e-05, + "loss": 1.3744, + "step": 1210 + }, + { + "epoch": 0.3606917477987304, + "grad_norm": 0.13112768530845642, + "learning_rate": 1.9997928980671653e-05, + "loss": 1.3607, + "step": 1211 + }, + { + "epoch": 0.3609895939983991, + "grad_norm": 0.12162243574857712, + "learning_rate": 1.9997909304324557e-05, + "loss": 1.3697, + "step": 1212 + }, + { + "epoch": 0.36128744019806774, + "grad_norm": 0.1289089173078537, + "learning_rate": 1.9997889534958088e-05, + "loss": 1.3828, + "step": 1213 + }, + { + "epoch": 0.36158528639773635, + "grad_norm": 0.1352340281009674, + "learning_rate": 1.9997869672572426e-05, + "loss": 1.3635, + "step": 1214 + }, + { + "epoch": 0.361883132597405, + "grad_norm": 0.13754944503307343, + "learning_rate": 1.9997849717167757e-05, + "loss": 1.3714, + "step": 1215 + }, + { + "epoch": 0.36218097879707367, + "grad_norm": 0.13485591113567352, + "learning_rate": 1.9997829668744265e-05, + "loss": 1.3694, + "step": 1216 + }, + { + "epoch": 0.36247882499674233, + "grad_norm": 0.13831846415996552, + "learning_rate": 1.9997809527302138e-05, + "loss": 1.3891, + "step": 1217 + }, + { + "epoch": 0.36277667119641094, + "grad_norm": 0.12861719727516174, + "learning_rate": 1.9997789292841564e-05, + "loss": 1.3803, + "step": 1218 + }, + { + "epoch": 0.3630745173960796, + "grad_norm": 0.122126504778862, + "learning_rate": 1.999776896536273e-05, + "loss": 1.3648, + "step": 1219 + }, + { + "epoch": 0.36337236359574826, + "grad_norm": 0.13395242393016815, + "learning_rate": 1.999774854486583e-05, + "loss": 1.3614, + "step": 1220 + }, + { + "epoch": 0.36367020979541687, + "grad_norm": 0.13298609852790833, + "learning_rate": 1.9997728031351044e-05, + "loss": 1.3718, + "step": 1221 + }, + { + "epoch": 0.3639680559950855, + "grad_norm": 0.13225732743740082, + "learning_rate": 1.9997707424818572e-05, + "loss": 1.3664, + "step": 1222 + }, + { + "epoch": 0.3642659021947542, + "grad_norm": 0.13129866123199463, + "learning_rate": 1.9997686725268605e-05, + "loss": 1.382, + "step": 1223 + }, + { + "epoch": 0.36456374839442285, + "grad_norm": 0.1358112394809723, + "learning_rate": 1.999766593270133e-05, + "loss": 1.3902, + "step": 1224 + }, + { + "epoch": 0.36486159459409145, + "grad_norm": 0.13032831251621246, + "learning_rate": 1.9997645047116942e-05, + "loss": 1.3951, + "step": 1225 + }, + { + "epoch": 0.3651594407937601, + "grad_norm": 0.1316758096218109, + "learning_rate": 1.9997624068515643e-05, + "loss": 1.3742, + "step": 1226 + }, + { + "epoch": 0.3654572869934288, + "grad_norm": 0.12895487248897552, + "learning_rate": 1.9997602996897614e-05, + "loss": 1.3722, + "step": 1227 + }, + { + "epoch": 0.36575513319309744, + "grad_norm": 0.13532277941703796, + "learning_rate": 1.9997581832263065e-05, + "loss": 1.3742, + "step": 1228 + }, + { + "epoch": 0.36605297939276604, + "grad_norm": 0.12580768764019012, + "learning_rate": 1.9997560574612186e-05, + "loss": 1.3642, + "step": 1229 + }, + { + "epoch": 0.3663508255924347, + "grad_norm": 0.12978699803352356, + "learning_rate": 1.9997539223945174e-05, + "loss": 1.3675, + "step": 1230 + }, + { + "epoch": 0.36664867179210336, + "grad_norm": 0.1309354305267334, + "learning_rate": 1.999751778026223e-05, + "loss": 1.3815, + "step": 1231 + }, + { + "epoch": 0.366946517991772, + "grad_norm": 0.12747173011302948, + "learning_rate": 1.9997496243563553e-05, + "loss": 1.3836, + "step": 1232 + }, + { + "epoch": 0.36724436419144063, + "grad_norm": 0.13163839280605316, + "learning_rate": 1.9997474613849346e-05, + "loss": 1.368, + "step": 1233 + }, + { + "epoch": 0.3675422103911093, + "grad_norm": 0.13420511782169342, + "learning_rate": 1.9997452891119804e-05, + "loss": 1.3477, + "step": 1234 + }, + { + "epoch": 0.36784005659077795, + "grad_norm": 0.13195165991783142, + "learning_rate": 1.9997431075375132e-05, + "loss": 1.3848, + "step": 1235 + }, + { + "epoch": 0.36813790279044656, + "grad_norm": 0.12061884999275208, + "learning_rate": 1.9997409166615535e-05, + "loss": 1.3767, + "step": 1236 + }, + { + "epoch": 0.3684357489901152, + "grad_norm": 0.1303822249174118, + "learning_rate": 1.9997387164841218e-05, + "loss": 1.3754, + "step": 1237 + }, + { + "epoch": 0.3687335951897839, + "grad_norm": 0.1285220980644226, + "learning_rate": 1.9997365070052383e-05, + "loss": 1.3746, + "step": 1238 + }, + { + "epoch": 0.36903144138945254, + "grad_norm": 0.126163050532341, + "learning_rate": 1.9997342882249234e-05, + "loss": 1.3775, + "step": 1239 + }, + { + "epoch": 0.36932928758912115, + "grad_norm": 0.1282956302165985, + "learning_rate": 1.9997320601431974e-05, + "loss": 1.3844, + "step": 1240 + }, + { + "epoch": 0.3696271337887898, + "grad_norm": 0.12833084166049957, + "learning_rate": 1.9997298227600823e-05, + "loss": 1.3825, + "step": 1241 + }, + { + "epoch": 0.36992497998845847, + "grad_norm": 0.1357254534959793, + "learning_rate": 1.999727576075598e-05, + "loss": 1.3499, + "step": 1242 + }, + { + "epoch": 0.37022282618812713, + "grad_norm": 0.12870509922504425, + "learning_rate": 1.9997253200897652e-05, + "loss": 1.3799, + "step": 1243 + }, + { + "epoch": 0.37052067238779574, + "grad_norm": 0.1316627711057663, + "learning_rate": 1.9997230548026056e-05, + "loss": 1.3887, + "step": 1244 + }, + { + "epoch": 0.3708185185874644, + "grad_norm": 0.12772461771965027, + "learning_rate": 1.9997207802141402e-05, + "loss": 1.3866, + "step": 1245 + }, + { + "epoch": 0.37111636478713306, + "grad_norm": 0.12759855389595032, + "learning_rate": 1.9997184963243894e-05, + "loss": 1.372, + "step": 1246 + }, + { + "epoch": 0.37141421098680166, + "grad_norm": 0.13649018108844757, + "learning_rate": 1.999716203133375e-05, + "loss": 1.3644, + "step": 1247 + }, + { + "epoch": 0.3717120571864703, + "grad_norm": 0.12223384529352188, + "learning_rate": 1.9997139006411184e-05, + "loss": 1.3669, + "step": 1248 + }, + { + "epoch": 0.372009903386139, + "grad_norm": 0.1267387717962265, + "learning_rate": 1.999711588847641e-05, + "loss": 1.3695, + "step": 1249 + }, + { + "epoch": 0.37230774958580765, + "grad_norm": 0.13214010000228882, + "learning_rate": 1.999709267752964e-05, + "loss": 1.3931, + "step": 1250 + }, + { + "epoch": 0.37260559578547625, + "grad_norm": 0.13361282646656036, + "learning_rate": 1.9997069373571092e-05, + "loss": 1.3872, + "step": 1251 + }, + { + "epoch": 0.3729034419851449, + "grad_norm": 0.13205532729625702, + "learning_rate": 1.9997045976600983e-05, + "loss": 1.3791, + "step": 1252 + }, + { + "epoch": 0.3732012881848136, + "grad_norm": 0.1310456097126007, + "learning_rate": 1.9997022486619532e-05, + "loss": 1.3792, + "step": 1253 + }, + { + "epoch": 0.37349913438448223, + "grad_norm": 0.12933965027332306, + "learning_rate": 1.9996998903626955e-05, + "loss": 1.3736, + "step": 1254 + }, + { + "epoch": 0.37379698058415084, + "grad_norm": 0.12627552449703217, + "learning_rate": 1.9996975227623476e-05, + "loss": 1.3714, + "step": 1255 + }, + { + "epoch": 0.3740948267838195, + "grad_norm": 0.13484114408493042, + "learning_rate": 1.9996951458609307e-05, + "loss": 1.382, + "step": 1256 + }, + { + "epoch": 0.37439267298348816, + "grad_norm": 0.1362007111310959, + "learning_rate": 1.9996927596584673e-05, + "loss": 1.3668, + "step": 1257 + }, + { + "epoch": 0.3746905191831568, + "grad_norm": 0.12914560735225677, + "learning_rate": 1.99969036415498e-05, + "loss": 1.3483, + "step": 1258 + }, + { + "epoch": 0.37498836538282543, + "grad_norm": 0.12913164496421814, + "learning_rate": 1.999687959350491e-05, + "loss": 1.3622, + "step": 1259 + }, + { + "epoch": 0.3752862115824941, + "grad_norm": 0.13875743746757507, + "learning_rate": 1.999685545245022e-05, + "loss": 1.3773, + "step": 1260 + }, + { + "epoch": 0.37558405778216275, + "grad_norm": 0.14095179736614227, + "learning_rate": 1.9996831218385964e-05, + "loss": 1.3696, + "step": 1261 + }, + { + "epoch": 0.37588190398183136, + "grad_norm": 0.1370166391134262, + "learning_rate": 1.9996806891312358e-05, + "loss": 1.3834, + "step": 1262 + }, + { + "epoch": 0.3761797501815, + "grad_norm": 0.12707234919071198, + "learning_rate": 1.9996782471229635e-05, + "loss": 1.367, + "step": 1263 + }, + { + "epoch": 0.3764775963811687, + "grad_norm": 0.13285866379737854, + "learning_rate": 1.999675795813802e-05, + "loss": 1.3775, + "step": 1264 + }, + { + "epoch": 0.37677544258083734, + "grad_norm": 0.13654166460037231, + "learning_rate": 1.9996733352037743e-05, + "loss": 1.3714, + "step": 1265 + }, + { + "epoch": 0.37707328878050594, + "grad_norm": 0.1282128542661667, + "learning_rate": 1.9996708652929028e-05, + "loss": 1.375, + "step": 1266 + }, + { + "epoch": 0.3773711349801746, + "grad_norm": 0.13190089166164398, + "learning_rate": 1.9996683860812108e-05, + "loss": 1.3691, + "step": 1267 + }, + { + "epoch": 0.37766898117984327, + "grad_norm": 0.1347484290599823, + "learning_rate": 1.9996658975687216e-05, + "loss": 1.3743, + "step": 1268 + }, + { + "epoch": 0.3779668273795119, + "grad_norm": 0.13643623888492584, + "learning_rate": 1.999663399755458e-05, + "loss": 1.3857, + "step": 1269 + }, + { + "epoch": 0.37826467357918053, + "grad_norm": 0.13558420538902283, + "learning_rate": 1.9996608926414435e-05, + "loss": 1.378, + "step": 1270 + }, + { + "epoch": 0.3785625197788492, + "grad_norm": 0.13643789291381836, + "learning_rate": 1.999658376226701e-05, + "loss": 1.353, + "step": 1271 + }, + { + "epoch": 0.37886036597851785, + "grad_norm": 0.14463697373867035, + "learning_rate": 1.9996558505112543e-05, + "loss": 1.3773, + "step": 1272 + }, + { + "epoch": 0.3791582121781865, + "grad_norm": 0.1375194638967514, + "learning_rate": 1.999653315495127e-05, + "loss": 1.3807, + "step": 1273 + }, + { + "epoch": 0.3794560583778551, + "grad_norm": 0.13336382806301117, + "learning_rate": 1.9996507711783422e-05, + "loss": 1.3595, + "step": 1274 + }, + { + "epoch": 0.3797539045775238, + "grad_norm": 0.14004971086978912, + "learning_rate": 1.9996482175609237e-05, + "loss": 1.364, + "step": 1275 + }, + { + "epoch": 0.38005175077719244, + "grad_norm": 0.1399676501750946, + "learning_rate": 1.9996456546428957e-05, + "loss": 1.3712, + "step": 1276 + }, + { + "epoch": 0.38034959697686105, + "grad_norm": 0.1426711082458496, + "learning_rate": 1.9996430824242817e-05, + "loss": 1.3625, + "step": 1277 + }, + { + "epoch": 0.3806474431765297, + "grad_norm": 0.1441764235496521, + "learning_rate": 1.999640500905106e-05, + "loss": 1.3775, + "step": 1278 + }, + { + "epoch": 0.38094528937619837, + "grad_norm": 0.13327822089195251, + "learning_rate": 1.999637910085392e-05, + "loss": 1.3816, + "step": 1279 + }, + { + "epoch": 0.38124313557586703, + "grad_norm": 0.13619780540466309, + "learning_rate": 1.999635309965164e-05, + "loss": 1.3675, + "step": 1280 + }, + { + "epoch": 0.38154098177553564, + "grad_norm": 0.13389413058757782, + "learning_rate": 1.999632700544446e-05, + "loss": 1.3603, + "step": 1281 + }, + { + "epoch": 0.3818388279752043, + "grad_norm": 0.1472572386264801, + "learning_rate": 1.999630081823263e-05, + "loss": 1.3683, + "step": 1282 + }, + { + "epoch": 0.38213667417487296, + "grad_norm": 0.14208440482616425, + "learning_rate": 1.9996274538016394e-05, + "loss": 1.3722, + "step": 1283 + }, + { + "epoch": 0.3824345203745416, + "grad_norm": 0.1297113448381424, + "learning_rate": 1.9996248164795987e-05, + "loss": 1.3658, + "step": 1284 + }, + { + "epoch": 0.3827323665742102, + "grad_norm": 0.13482092320919037, + "learning_rate": 1.9996221698571657e-05, + "loss": 1.3787, + "step": 1285 + }, + { + "epoch": 0.3830302127738789, + "grad_norm": 0.13355214893817902, + "learning_rate": 1.999619513934366e-05, + "loss": 1.3719, + "step": 1286 + }, + { + "epoch": 0.38332805897354755, + "grad_norm": 0.1377803385257721, + "learning_rate": 1.9996168487112228e-05, + "loss": 1.3664, + "step": 1287 + }, + { + "epoch": 0.38362590517321615, + "grad_norm": 0.13205859065055847, + "learning_rate": 1.999614174187762e-05, + "loss": 1.3554, + "step": 1288 + }, + { + "epoch": 0.3839237513728848, + "grad_norm": 0.14215654134750366, + "learning_rate": 1.999611490364008e-05, + "loss": 1.3671, + "step": 1289 + }, + { + "epoch": 0.3842215975725535, + "grad_norm": 0.13025464117527008, + "learning_rate": 1.999608797239986e-05, + "loss": 1.3813, + "step": 1290 + }, + { + "epoch": 0.38451944377222214, + "grad_norm": 0.1379905492067337, + "learning_rate": 1.9996060948157206e-05, + "loss": 1.3803, + "step": 1291 + }, + { + "epoch": 0.38481728997189074, + "grad_norm": 0.13925138115882874, + "learning_rate": 1.999603383091238e-05, + "loss": 1.3786, + "step": 1292 + }, + { + "epoch": 0.3851151361715594, + "grad_norm": 0.14388136565685272, + "learning_rate": 1.999600662066562e-05, + "loss": 1.3471, + "step": 1293 + }, + { + "epoch": 0.38541298237122806, + "grad_norm": 0.13373485207557678, + "learning_rate": 1.9995979317417197e-05, + "loss": 1.3675, + "step": 1294 + }, + { + "epoch": 0.3857108285708967, + "grad_norm": 0.14883075654506683, + "learning_rate": 1.9995951921167343e-05, + "loss": 1.3864, + "step": 1295 + }, + { + "epoch": 0.38600867477056533, + "grad_norm": 0.13818325102329254, + "learning_rate": 1.9995924431916332e-05, + "loss": 1.385, + "step": 1296 + }, + { + "epoch": 0.386306520970234, + "grad_norm": 0.13905934989452362, + "learning_rate": 1.999589684966441e-05, + "loss": 1.3637, + "step": 1297 + }, + { + "epoch": 0.38660436716990265, + "grad_norm": 0.15382356941699982, + "learning_rate": 1.9995869174411834e-05, + "loss": 1.3975, + "step": 1298 + }, + { + "epoch": 0.3869022133695713, + "grad_norm": 0.13522179424762726, + "learning_rate": 1.9995841406158866e-05, + "loss": 1.3712, + "step": 1299 + }, + { + "epoch": 0.3872000595692399, + "grad_norm": 0.1400614082813263, + "learning_rate": 1.9995813544905763e-05, + "loss": 1.357, + "step": 1300 + }, + { + "epoch": 0.3874979057689086, + "grad_norm": 0.14229221642017365, + "learning_rate": 1.9995785590652783e-05, + "loss": 1.3843, + "step": 1301 + }, + { + "epoch": 0.38779575196857724, + "grad_norm": 0.13116051256656647, + "learning_rate": 1.9995757543400182e-05, + "loss": 1.3606, + "step": 1302 + }, + { + "epoch": 0.38809359816824585, + "grad_norm": 0.13689297437667847, + "learning_rate": 1.9995729403148226e-05, + "loss": 1.373, + "step": 1303 + }, + { + "epoch": 0.3883914443679145, + "grad_norm": 0.16429245471954346, + "learning_rate": 1.999570116989718e-05, + "loss": 1.3704, + "step": 1304 + }, + { + "epoch": 0.38868929056758317, + "grad_norm": 0.14279286563396454, + "learning_rate": 1.9995672843647295e-05, + "loss": 1.3736, + "step": 1305 + }, + { + "epoch": 0.38898713676725183, + "grad_norm": 0.14285866916179657, + "learning_rate": 1.9995644424398847e-05, + "loss": 1.3741, + "step": 1306 + }, + { + "epoch": 0.38928498296692043, + "grad_norm": 0.14350569248199463, + "learning_rate": 1.9995615912152098e-05, + "loss": 1.3896, + "step": 1307 + }, + { + "epoch": 0.3895828291665891, + "grad_norm": 0.14331308007240295, + "learning_rate": 1.9995587306907308e-05, + "loss": 1.3507, + "step": 1308 + }, + { + "epoch": 0.38988067536625776, + "grad_norm": 0.14781762659549713, + "learning_rate": 1.9995558608664744e-05, + "loss": 1.3649, + "step": 1309 + }, + { + "epoch": 0.3901785215659264, + "grad_norm": 0.14205756783485413, + "learning_rate": 1.9995529817424675e-05, + "loss": 1.3748, + "step": 1310 + }, + { + "epoch": 0.390476367765595, + "grad_norm": 0.13681936264038086, + "learning_rate": 1.999550093318737e-05, + "loss": 1.3711, + "step": 1311 + }, + { + "epoch": 0.3907742139652637, + "grad_norm": 0.13977855443954468, + "learning_rate": 1.9995471955953096e-05, + "loss": 1.3681, + "step": 1312 + }, + { + "epoch": 0.39107206016493234, + "grad_norm": 0.14186030626296997, + "learning_rate": 1.9995442885722122e-05, + "loss": 1.3758, + "step": 1313 + }, + { + "epoch": 0.391369906364601, + "grad_norm": 0.14282235503196716, + "learning_rate": 1.999541372249472e-05, + "loss": 1.3824, + "step": 1314 + }, + { + "epoch": 0.3916677525642696, + "grad_norm": 0.13587988913059235, + "learning_rate": 1.9995384466271164e-05, + "loss": 1.3542, + "step": 1315 + }, + { + "epoch": 0.39196559876393827, + "grad_norm": 0.13733258843421936, + "learning_rate": 1.999535511705172e-05, + "loss": 1.3584, + "step": 1316 + }, + { + "epoch": 0.39226344496360693, + "grad_norm": 0.14015381038188934, + "learning_rate": 1.9995325674836665e-05, + "loss": 1.3716, + "step": 1317 + }, + { + "epoch": 0.39256129116327554, + "grad_norm": 0.13651269674301147, + "learning_rate": 1.9995296139626274e-05, + "loss": 1.365, + "step": 1318 + }, + { + "epoch": 0.3928591373629442, + "grad_norm": 0.14151425659656525, + "learning_rate": 1.9995266511420816e-05, + "loss": 1.3638, + "step": 1319 + }, + { + "epoch": 0.39315698356261286, + "grad_norm": 0.13102491199970245, + "learning_rate": 1.9995236790220574e-05, + "loss": 1.3545, + "step": 1320 + }, + { + "epoch": 0.3934548297622815, + "grad_norm": 0.14258791506290436, + "learning_rate": 1.9995206976025817e-05, + "loss": 1.3717, + "step": 1321 + }, + { + "epoch": 0.3937526759619501, + "grad_norm": 0.13457924127578735, + "learning_rate": 1.9995177068836828e-05, + "loss": 1.3766, + "step": 1322 + }, + { + "epoch": 0.3940505221616188, + "grad_norm": 0.14607053995132446, + "learning_rate": 1.9995147068653887e-05, + "loss": 1.3647, + "step": 1323 + }, + { + "epoch": 0.39434836836128745, + "grad_norm": 0.13513265550136566, + "learning_rate": 1.9995116975477265e-05, + "loss": 1.3589, + "step": 1324 + }, + { + "epoch": 0.3946462145609561, + "grad_norm": 0.1353466659784317, + "learning_rate": 1.9995086789307248e-05, + "loss": 1.3713, + "step": 1325 + }, + { + "epoch": 0.3949440607606247, + "grad_norm": 0.15018858015537262, + "learning_rate": 1.9995056510144116e-05, + "loss": 1.3583, + "step": 1326 + }, + { + "epoch": 0.3952419069602934, + "grad_norm": 0.14529645442962646, + "learning_rate": 1.999502613798815e-05, + "loss": 1.3737, + "step": 1327 + }, + { + "epoch": 0.39553975315996204, + "grad_norm": 0.14066046476364136, + "learning_rate": 1.9994995672839636e-05, + "loss": 1.3622, + "step": 1328 + }, + { + "epoch": 0.39583759935963064, + "grad_norm": 0.1439758837223053, + "learning_rate": 1.999496511469885e-05, + "loss": 1.3764, + "step": 1329 + }, + { + "epoch": 0.3961354455592993, + "grad_norm": 0.14791858196258545, + "learning_rate": 1.9994934463566086e-05, + "loss": 1.3917, + "step": 1330 + }, + { + "epoch": 0.39643329175896797, + "grad_norm": 0.1415499597787857, + "learning_rate": 1.999490371944162e-05, + "loss": 1.356, + "step": 1331 + }, + { + "epoch": 0.3967311379586366, + "grad_norm": 0.14524230360984802, + "learning_rate": 1.999487288232574e-05, + "loss": 1.3762, + "step": 1332 + }, + { + "epoch": 0.39702898415830523, + "grad_norm": 0.13859130442142487, + "learning_rate": 1.9994841952218738e-05, + "loss": 1.3627, + "step": 1333 + }, + { + "epoch": 0.3973268303579739, + "grad_norm": 0.13905595242977142, + "learning_rate": 1.9994810929120897e-05, + "loss": 1.372, + "step": 1334 + }, + { + "epoch": 0.39762467655764255, + "grad_norm": 0.14038988947868347, + "learning_rate": 1.999477981303251e-05, + "loss": 1.3674, + "step": 1335 + }, + { + "epoch": 0.3979225227573112, + "grad_norm": 0.1370796263217926, + "learning_rate": 1.999474860395386e-05, + "loss": 1.353, + "step": 1336 + }, + { + "epoch": 0.3982203689569798, + "grad_norm": 0.14049774408340454, + "learning_rate": 1.999471730188524e-05, + "loss": 1.3644, + "step": 1337 + }, + { + "epoch": 0.3985182151566485, + "grad_norm": 0.16187512874603271, + "learning_rate": 1.9994685906826944e-05, + "loss": 1.349, + "step": 1338 + }, + { + "epoch": 0.39881606135631714, + "grad_norm": 0.144514262676239, + "learning_rate": 1.9994654418779263e-05, + "loss": 1.3598, + "step": 1339 + }, + { + "epoch": 0.3991139075559858, + "grad_norm": 0.13683633506298065, + "learning_rate": 1.999462283774249e-05, + "loss": 1.3512, + "step": 1340 + }, + { + "epoch": 0.3994117537556544, + "grad_norm": 0.13460426032543182, + "learning_rate": 1.999459116371692e-05, + "loss": 1.3454, + "step": 1341 + }, + { + "epoch": 0.39970959995532307, + "grad_norm": 0.14671728014945984, + "learning_rate": 1.999455939670284e-05, + "loss": 1.3733, + "step": 1342 + }, + { + "epoch": 0.40000744615499173, + "grad_norm": 0.14138005673885345, + "learning_rate": 1.9994527536700557e-05, + "loss": 1.3702, + "step": 1343 + }, + { + "epoch": 0.40030529235466034, + "grad_norm": 0.1489688754081726, + "learning_rate": 1.999449558371036e-05, + "loss": 1.3739, + "step": 1344 + }, + { + "epoch": 0.400603138554329, + "grad_norm": 0.1427619457244873, + "learning_rate": 1.9994463537732546e-05, + "loss": 1.3839, + "step": 1345 + }, + { + "epoch": 0.40090098475399766, + "grad_norm": 0.1428166627883911, + "learning_rate": 1.999443139876742e-05, + "loss": 1.3648, + "step": 1346 + }, + { + "epoch": 0.4011988309536663, + "grad_norm": 0.15491773188114166, + "learning_rate": 1.9994399166815272e-05, + "loss": 1.368, + "step": 1347 + }, + { + "epoch": 0.4014966771533349, + "grad_norm": 0.14289771020412445, + "learning_rate": 1.9994366841876404e-05, + "loss": 1.3732, + "step": 1348 + }, + { + "epoch": 0.4017945233530036, + "grad_norm": 0.14798198640346527, + "learning_rate": 1.9994334423951122e-05, + "loss": 1.3309, + "step": 1349 + }, + { + "epoch": 0.40209236955267225, + "grad_norm": 0.14131715893745422, + "learning_rate": 1.9994301913039726e-05, + "loss": 1.3637, + "step": 1350 + }, + { + "epoch": 0.4023902157523409, + "grad_norm": 0.13964703679084778, + "learning_rate": 1.9994269309142517e-05, + "loss": 1.3767, + "step": 1351 + }, + { + "epoch": 0.4026880619520095, + "grad_norm": 0.15044206380844116, + "learning_rate": 1.9994236612259796e-05, + "loss": 1.3623, + "step": 1352 + }, + { + "epoch": 0.4029859081516782, + "grad_norm": 0.14952361583709717, + "learning_rate": 1.999420382239187e-05, + "loss": 1.3717, + "step": 1353 + }, + { + "epoch": 0.40328375435134683, + "grad_norm": 0.13733325898647308, + "learning_rate": 1.9994170939539042e-05, + "loss": 1.3471, + "step": 1354 + }, + { + "epoch": 0.4035816005510155, + "grad_norm": 0.13589802384376526, + "learning_rate": 1.9994137963701622e-05, + "loss": 1.3678, + "step": 1355 + }, + { + "epoch": 0.4038794467506841, + "grad_norm": 0.14110149443149567, + "learning_rate": 1.9994104894879914e-05, + "loss": 1.3529, + "step": 1356 + }, + { + "epoch": 0.40417729295035276, + "grad_norm": 0.14588642120361328, + "learning_rate": 1.999407173307423e-05, + "loss": 1.3475, + "step": 1357 + }, + { + "epoch": 0.4044751391500214, + "grad_norm": 0.17346397042274475, + "learning_rate": 1.9994038478284868e-05, + "loss": 1.3598, + "step": 1358 + }, + { + "epoch": 0.40477298534969003, + "grad_norm": 0.13631947338581085, + "learning_rate": 1.9994005130512147e-05, + "loss": 1.3675, + "step": 1359 + }, + { + "epoch": 0.4050708315493587, + "grad_norm": 0.13887470960617065, + "learning_rate": 1.9993971689756374e-05, + "loss": 1.3684, + "step": 1360 + }, + { + "epoch": 0.40536867774902735, + "grad_norm": 0.1530754268169403, + "learning_rate": 1.9993938156017857e-05, + "loss": 1.3761, + "step": 1361 + }, + { + "epoch": 0.405666523948696, + "grad_norm": 0.1420319378376007, + "learning_rate": 1.9993904529296915e-05, + "loss": 1.3866, + "step": 1362 + }, + { + "epoch": 0.4059643701483646, + "grad_norm": 0.14963901042938232, + "learning_rate": 1.9993870809593857e-05, + "loss": 1.3713, + "step": 1363 + }, + { + "epoch": 0.4062622163480333, + "grad_norm": 0.14358863234519958, + "learning_rate": 1.9993836996909e-05, + "loss": 1.3766, + "step": 1364 + }, + { + "epoch": 0.40656006254770194, + "grad_norm": 0.14113590121269226, + "learning_rate": 1.999380309124265e-05, + "loss": 1.3747, + "step": 1365 + }, + { + "epoch": 0.4068579087473706, + "grad_norm": 0.13802053034305573, + "learning_rate": 1.999376909259513e-05, + "loss": 1.3659, + "step": 1366 + }, + { + "epoch": 0.4071557549470392, + "grad_norm": 0.14546042680740356, + "learning_rate": 1.9993735000966756e-05, + "loss": 1.3717, + "step": 1367 + }, + { + "epoch": 0.40745360114670787, + "grad_norm": 0.15038831532001495, + "learning_rate": 1.999370081635784e-05, + "loss": 1.3597, + "step": 1368 + }, + { + "epoch": 0.4077514473463765, + "grad_norm": 0.1376657336950302, + "learning_rate": 1.9993666538768704e-05, + "loss": 1.3722, + "step": 1369 + }, + { + "epoch": 0.4080492935460452, + "grad_norm": 0.14589664340019226, + "learning_rate": 1.9993632168199668e-05, + "loss": 1.3547, + "step": 1370 + }, + { + "epoch": 0.4083471397457138, + "grad_norm": 0.1463681012392044, + "learning_rate": 1.9993597704651052e-05, + "loss": 1.3646, + "step": 1371 + }, + { + "epoch": 0.40864498594538246, + "grad_norm": 0.15309378504753113, + "learning_rate": 1.9993563148123176e-05, + "loss": 1.3631, + "step": 1372 + }, + { + "epoch": 0.4089428321450511, + "grad_norm": 0.14134198427200317, + "learning_rate": 1.9993528498616354e-05, + "loss": 1.3672, + "step": 1373 + }, + { + "epoch": 0.4092406783447197, + "grad_norm": 0.1537083387374878, + "learning_rate": 1.9993493756130918e-05, + "loss": 1.3858, + "step": 1374 + }, + { + "epoch": 0.4095385245443884, + "grad_norm": 0.1451197862625122, + "learning_rate": 1.9993458920667188e-05, + "loss": 1.369, + "step": 1375 + }, + { + "epoch": 0.40983637074405704, + "grad_norm": 0.13950785994529724, + "learning_rate": 1.9993423992225487e-05, + "loss": 1.3688, + "step": 1376 + }, + { + "epoch": 0.4101342169437257, + "grad_norm": 0.14010006189346313, + "learning_rate": 1.9993388970806143e-05, + "loss": 1.3552, + "step": 1377 + }, + { + "epoch": 0.4104320631433943, + "grad_norm": 0.1423448920249939, + "learning_rate": 1.9993353856409482e-05, + "loss": 1.355, + "step": 1378 + }, + { + "epoch": 0.41072990934306297, + "grad_norm": 0.1458849012851715, + "learning_rate": 1.9993318649035824e-05, + "loss": 1.3719, + "step": 1379 + }, + { + "epoch": 0.41102775554273163, + "grad_norm": 0.1420285850763321, + "learning_rate": 1.99932833486855e-05, + "loss": 1.3491, + "step": 1380 + }, + { + "epoch": 0.4113256017424003, + "grad_norm": 0.1413401961326599, + "learning_rate": 1.9993247955358845e-05, + "loss": 1.366, + "step": 1381 + }, + { + "epoch": 0.4116234479420689, + "grad_norm": 0.1456751674413681, + "learning_rate": 1.999321246905618e-05, + "loss": 1.3711, + "step": 1382 + }, + { + "epoch": 0.41192129414173756, + "grad_norm": 0.14318130910396576, + "learning_rate": 1.999317688977784e-05, + "loss": 1.3637, + "step": 1383 + }, + { + "epoch": 0.4122191403414062, + "grad_norm": 0.15225200355052948, + "learning_rate": 1.999314121752415e-05, + "loss": 1.3808, + "step": 1384 + }, + { + "epoch": 0.4125169865410748, + "grad_norm": 0.15400700271129608, + "learning_rate": 1.999310545229545e-05, + "loss": 1.3733, + "step": 1385 + }, + { + "epoch": 0.4128148327407435, + "grad_norm": 0.14465181529521942, + "learning_rate": 1.999306959409207e-05, + "loss": 1.3766, + "step": 1386 + }, + { + "epoch": 0.41311267894041215, + "grad_norm": 0.14450886845588684, + "learning_rate": 1.9993033642914334e-05, + "loss": 1.3659, + "step": 1387 + }, + { + "epoch": 0.4134105251400808, + "grad_norm": 0.14789682626724243, + "learning_rate": 1.999299759876259e-05, + "loss": 1.3402, + "step": 1388 + }, + { + "epoch": 0.4137083713397494, + "grad_norm": 0.13714885711669922, + "learning_rate": 1.999296146163717e-05, + "loss": 1.3617, + "step": 1389 + }, + { + "epoch": 0.4140062175394181, + "grad_norm": 0.14600655436515808, + "learning_rate": 1.9992925231538406e-05, + "loss": 1.3783, + "step": 1390 + }, + { + "epoch": 0.41430406373908674, + "grad_norm": 0.14446376264095306, + "learning_rate": 1.9992888908466638e-05, + "loss": 1.3502, + "step": 1391 + }, + { + "epoch": 0.4146019099387554, + "grad_norm": 0.15243427455425262, + "learning_rate": 1.9992852492422206e-05, + "loss": 1.3511, + "step": 1392 + }, + { + "epoch": 0.414899756138424, + "grad_norm": 0.13800148665905, + "learning_rate": 1.9992815983405442e-05, + "loss": 1.3725, + "step": 1393 + }, + { + "epoch": 0.41519760233809266, + "grad_norm": 0.14049609005451202, + "learning_rate": 1.9992779381416692e-05, + "loss": 1.3671, + "step": 1394 + }, + { + "epoch": 0.4154954485377613, + "grad_norm": 0.14720329642295837, + "learning_rate": 1.9992742686456298e-05, + "loss": 1.3734, + "step": 1395 + }, + { + "epoch": 0.41579329473743, + "grad_norm": 0.14476299285888672, + "learning_rate": 1.999270589852459e-05, + "loss": 1.3795, + "step": 1396 + }, + { + "epoch": 0.4160911409370986, + "grad_norm": 0.14092835783958435, + "learning_rate": 1.9992669017621925e-05, + "loss": 1.3593, + "step": 1397 + }, + { + "epoch": 0.41638898713676725, + "grad_norm": 0.14597156643867493, + "learning_rate": 1.9992632043748635e-05, + "loss": 1.3908, + "step": 1398 + }, + { + "epoch": 0.4166868333364359, + "grad_norm": 0.14053498208522797, + "learning_rate": 1.999259497690507e-05, + "loss": 1.3773, + "step": 1399 + }, + { + "epoch": 0.4169846795361045, + "grad_norm": 0.14102129638195038, + "learning_rate": 1.9992557817091574e-05, + "loss": 1.36, + "step": 1400 + }, + { + "epoch": 0.4172825257357732, + "grad_norm": 0.13804572820663452, + "learning_rate": 1.9992520564308488e-05, + "loss": 1.3773, + "step": 1401 + }, + { + "epoch": 0.41758037193544184, + "grad_norm": 0.14213070273399353, + "learning_rate": 1.9992483218556166e-05, + "loss": 1.3754, + "step": 1402 + }, + { + "epoch": 0.4178782181351105, + "grad_norm": 0.14632445573806763, + "learning_rate": 1.9992445779834952e-05, + "loss": 1.3648, + "step": 1403 + }, + { + "epoch": 0.4181760643347791, + "grad_norm": 0.14518555998802185, + "learning_rate": 1.9992408248145194e-05, + "loss": 1.3487, + "step": 1404 + }, + { + "epoch": 0.41847391053444777, + "grad_norm": 0.1442975252866745, + "learning_rate": 1.999237062348724e-05, + "loss": 1.3634, + "step": 1405 + }, + { + "epoch": 0.41877175673411643, + "grad_norm": 0.15438003838062286, + "learning_rate": 1.9992332905861445e-05, + "loss": 1.363, + "step": 1406 + }, + { + "epoch": 0.4190696029337851, + "grad_norm": 0.138583242893219, + "learning_rate": 1.999229509526815e-05, + "loss": 1.3379, + "step": 1407 + }, + { + "epoch": 0.4193674491334537, + "grad_norm": 0.14776061475276947, + "learning_rate": 1.999225719170772e-05, + "loss": 1.3701, + "step": 1408 + }, + { + "epoch": 0.41966529533312236, + "grad_norm": 0.14421144127845764, + "learning_rate": 1.9992219195180498e-05, + "loss": 1.3762, + "step": 1409 + }, + { + "epoch": 0.419963141532791, + "grad_norm": 0.14404408633708954, + "learning_rate": 1.9992181105686838e-05, + "loss": 1.3715, + "step": 1410 + }, + { + "epoch": 0.4202609877324597, + "grad_norm": 0.14231495559215546, + "learning_rate": 1.99921429232271e-05, + "loss": 1.355, + "step": 1411 + }, + { + "epoch": 0.4205588339321283, + "grad_norm": 0.1412181854248047, + "learning_rate": 1.9992104647801635e-05, + "loss": 1.3506, + "step": 1412 + }, + { + "epoch": 0.42085668013179695, + "grad_norm": 0.1508765071630478, + "learning_rate": 1.9992066279410797e-05, + "loss": 1.3588, + "step": 1413 + }, + { + "epoch": 0.4211545263314656, + "grad_norm": 0.1456536203622818, + "learning_rate": 1.9992027818054952e-05, + "loss": 1.3644, + "step": 1414 + }, + { + "epoch": 0.4214523725311342, + "grad_norm": 0.1506153792142868, + "learning_rate": 1.999198926373445e-05, + "loss": 1.3741, + "step": 1415 + }, + { + "epoch": 0.4217502187308029, + "grad_norm": 0.14759069681167603, + "learning_rate": 1.9991950616449648e-05, + "loss": 1.3582, + "step": 1416 + }, + { + "epoch": 0.42204806493047153, + "grad_norm": 0.13830821216106415, + "learning_rate": 1.9991911876200914e-05, + "loss": 1.3678, + "step": 1417 + }, + { + "epoch": 0.4223459111301402, + "grad_norm": 0.13808178901672363, + "learning_rate": 1.99918730429886e-05, + "loss": 1.3427, + "step": 1418 + }, + { + "epoch": 0.4226437573298088, + "grad_norm": 0.14943626523017883, + "learning_rate": 1.999183411681307e-05, + "loss": 1.3738, + "step": 1419 + }, + { + "epoch": 0.42294160352947746, + "grad_norm": 0.15086951851844788, + "learning_rate": 1.999179509767469e-05, + "loss": 1.3674, + "step": 1420 + }, + { + "epoch": 0.4232394497291461, + "grad_norm": 0.13922718167304993, + "learning_rate": 1.9991755985573823e-05, + "loss": 1.3561, + "step": 1421 + }, + { + "epoch": 0.4235372959288148, + "grad_norm": 0.1464361548423767, + "learning_rate": 1.9991716780510823e-05, + "loss": 1.3572, + "step": 1422 + }, + { + "epoch": 0.4238351421284834, + "grad_norm": 0.1514001041650772, + "learning_rate": 1.9991677482486068e-05, + "loss": 1.3714, + "step": 1423 + }, + { + "epoch": 0.42413298832815205, + "grad_norm": 0.15556089580059052, + "learning_rate": 1.999163809149991e-05, + "loss": 1.3637, + "step": 1424 + }, + { + "epoch": 0.4244308345278207, + "grad_norm": 0.1481410712003708, + "learning_rate": 1.9991598607552733e-05, + "loss": 1.3791, + "step": 1425 + }, + { + "epoch": 0.4247286807274893, + "grad_norm": 0.14641015231609344, + "learning_rate": 1.9991559030644888e-05, + "loss": 1.3479, + "step": 1426 + }, + { + "epoch": 0.425026526927158, + "grad_norm": 0.146641343832016, + "learning_rate": 1.999151936077675e-05, + "loss": 1.3615, + "step": 1427 + }, + { + "epoch": 0.42532437312682664, + "grad_norm": 0.1396789401769638, + "learning_rate": 1.9991479597948685e-05, + "loss": 1.3633, + "step": 1428 + }, + { + "epoch": 0.4256222193264953, + "grad_norm": 0.13987213373184204, + "learning_rate": 1.9991439742161066e-05, + "loss": 1.3631, + "step": 1429 + }, + { + "epoch": 0.4259200655261639, + "grad_norm": 0.14193253219127655, + "learning_rate": 1.9991399793414268e-05, + "loss": 1.3627, + "step": 1430 + }, + { + "epoch": 0.42621791172583257, + "grad_norm": 0.15473538637161255, + "learning_rate": 1.9991359751708653e-05, + "loss": 1.3726, + "step": 1431 + }, + { + "epoch": 0.4265157579255012, + "grad_norm": 0.14363671839237213, + "learning_rate": 1.9991319617044605e-05, + "loss": 1.3517, + "step": 1432 + }, + { + "epoch": 0.4268136041251699, + "grad_norm": 0.13485422730445862, + "learning_rate": 1.9991279389422485e-05, + "loss": 1.3599, + "step": 1433 + }, + { + "epoch": 0.4271114503248385, + "grad_norm": 0.14515508711338043, + "learning_rate": 1.9991239068842673e-05, + "loss": 1.3784, + "step": 1434 + }, + { + "epoch": 0.42740929652450715, + "grad_norm": 0.153084397315979, + "learning_rate": 1.999119865530555e-05, + "loss": 1.361, + "step": 1435 + }, + { + "epoch": 0.4277071427241758, + "grad_norm": 0.15337277948856354, + "learning_rate": 1.999115814881148e-05, + "loss": 1.3643, + "step": 1436 + }, + { + "epoch": 0.4280049889238445, + "grad_norm": 0.15344147384166718, + "learning_rate": 1.999111754936085e-05, + "loss": 1.3746, + "step": 1437 + }, + { + "epoch": 0.4283028351235131, + "grad_norm": 0.1456160843372345, + "learning_rate": 1.9991076856954034e-05, + "loss": 1.3631, + "step": 1438 + }, + { + "epoch": 0.42860068132318174, + "grad_norm": 0.1523975133895874, + "learning_rate": 1.999103607159141e-05, + "loss": 1.3638, + "step": 1439 + }, + { + "epoch": 0.4288985275228504, + "grad_norm": 0.15147021412849426, + "learning_rate": 1.9990995193273354e-05, + "loss": 1.3524, + "step": 1440 + }, + { + "epoch": 0.429196373722519, + "grad_norm": 0.14829808473587036, + "learning_rate": 1.9990954222000254e-05, + "loss": 1.371, + "step": 1441 + }, + { + "epoch": 0.42949421992218767, + "grad_norm": 0.14790543913841248, + "learning_rate": 1.9990913157772488e-05, + "loss": 1.343, + "step": 1442 + }, + { + "epoch": 0.42979206612185633, + "grad_norm": 0.1457212120294571, + "learning_rate": 1.9990872000590435e-05, + "loss": 1.3723, + "step": 1443 + }, + { + "epoch": 0.430089912321525, + "grad_norm": 0.14873450994491577, + "learning_rate": 1.9990830750454485e-05, + "loss": 1.3668, + "step": 1444 + }, + { + "epoch": 0.4303877585211936, + "grad_norm": 0.15360642969608307, + "learning_rate": 1.999078940736501e-05, + "loss": 1.3746, + "step": 1445 + }, + { + "epoch": 0.43068560472086226, + "grad_norm": 0.14420081675052643, + "learning_rate": 1.999074797132241e-05, + "loss": 1.358, + "step": 1446 + }, + { + "epoch": 0.4309834509205309, + "grad_norm": 0.1481042504310608, + "learning_rate": 1.999070644232706e-05, + "loss": 1.3346, + "step": 1447 + }, + { + "epoch": 0.4312812971201996, + "grad_norm": 0.15088361501693726, + "learning_rate": 1.9990664820379348e-05, + "loss": 1.3669, + "step": 1448 + }, + { + "epoch": 0.4315791433198682, + "grad_norm": 0.14640074968338013, + "learning_rate": 1.9990623105479662e-05, + "loss": 1.3495, + "step": 1449 + }, + { + "epoch": 0.43187698951953685, + "grad_norm": 0.1395910680294037, + "learning_rate": 1.999058129762839e-05, + "loss": 1.3567, + "step": 1450 + }, + { + "epoch": 0.4321748357192055, + "grad_norm": 0.14191977679729462, + "learning_rate": 1.9990539396825918e-05, + "loss": 1.3974, + "step": 1451 + }, + { + "epoch": 0.43247268191887417, + "grad_norm": 0.14816097915172577, + "learning_rate": 1.9990497403072645e-05, + "loss": 1.3662, + "step": 1452 + }, + { + "epoch": 0.4327705281185428, + "grad_norm": 0.15005965530872345, + "learning_rate": 1.9990455316368952e-05, + "loss": 1.3592, + "step": 1453 + }, + { + "epoch": 0.43306837431821144, + "grad_norm": 0.14851853251457214, + "learning_rate": 1.9990413136715234e-05, + "loss": 1.3552, + "step": 1454 + }, + { + "epoch": 0.4333662205178801, + "grad_norm": 0.14743919670581818, + "learning_rate": 1.9990370864111885e-05, + "loss": 1.3633, + "step": 1455 + }, + { + "epoch": 0.4336640667175487, + "grad_norm": 0.14935488998889923, + "learning_rate": 1.9990328498559297e-05, + "loss": 1.3852, + "step": 1456 + }, + { + "epoch": 0.43396191291721736, + "grad_norm": 0.14826509356498718, + "learning_rate": 1.9990286040057864e-05, + "loss": 1.3796, + "step": 1457 + }, + { + "epoch": 0.434259759116886, + "grad_norm": 0.14386005699634552, + "learning_rate": 1.9990243488607982e-05, + "loss": 1.3523, + "step": 1458 + }, + { + "epoch": 0.4345576053165547, + "grad_norm": 0.13946856558322906, + "learning_rate": 1.9990200844210044e-05, + "loss": 1.37, + "step": 1459 + }, + { + "epoch": 0.4348554515162233, + "grad_norm": 0.1450900286436081, + "learning_rate": 1.9990158106864454e-05, + "loss": 1.3526, + "step": 1460 + }, + { + "epoch": 0.43515329771589195, + "grad_norm": 0.1451714038848877, + "learning_rate": 1.9990115276571597e-05, + "loss": 1.3489, + "step": 1461 + }, + { + "epoch": 0.4354511439155606, + "grad_norm": 0.13972912728786469, + "learning_rate": 1.9990072353331883e-05, + "loss": 1.3458, + "step": 1462 + }, + { + "epoch": 0.4357489901152293, + "grad_norm": 0.13991695642471313, + "learning_rate": 1.9990029337145706e-05, + "loss": 1.3642, + "step": 1463 + }, + { + "epoch": 0.4360468363148979, + "grad_norm": 0.14446556568145752, + "learning_rate": 1.9989986228013468e-05, + "loss": 1.3601, + "step": 1464 + }, + { + "epoch": 0.43634468251456654, + "grad_norm": 0.14560119807720184, + "learning_rate": 1.9989943025935574e-05, + "loss": 1.3618, + "step": 1465 + }, + { + "epoch": 0.4366425287142352, + "grad_norm": 0.1507396101951599, + "learning_rate": 1.9989899730912415e-05, + "loss": 1.3559, + "step": 1466 + }, + { + "epoch": 0.4369403749139038, + "grad_norm": 0.1439831554889679, + "learning_rate": 1.9989856342944405e-05, + "loss": 1.3437, + "step": 1467 + }, + { + "epoch": 0.43723822111357247, + "grad_norm": 0.14343243837356567, + "learning_rate": 1.9989812862031938e-05, + "loss": 1.3613, + "step": 1468 + }, + { + "epoch": 0.43753606731324113, + "grad_norm": 0.1516478955745697, + "learning_rate": 1.998976928817543e-05, + "loss": 1.374, + "step": 1469 + }, + { + "epoch": 0.4378339135129098, + "grad_norm": 0.13883419334888458, + "learning_rate": 1.9989725621375277e-05, + "loss": 1.3497, + "step": 1470 + }, + { + "epoch": 0.4381317597125784, + "grad_norm": 0.15203416347503662, + "learning_rate": 1.9989681861631886e-05, + "loss": 1.3604, + "step": 1471 + }, + { + "epoch": 0.43842960591224706, + "grad_norm": 0.14967718720436096, + "learning_rate": 1.9989638008945667e-05, + "loss": 1.3672, + "step": 1472 + }, + { + "epoch": 0.4387274521119157, + "grad_norm": 0.1440047323703766, + "learning_rate": 1.998959406331703e-05, + "loss": 1.3554, + "step": 1473 + }, + { + "epoch": 0.4390252983115844, + "grad_norm": 0.15080441534519196, + "learning_rate": 1.9989550024746382e-05, + "loss": 1.3466, + "step": 1474 + }, + { + "epoch": 0.439323144511253, + "grad_norm": 0.15265288949012756, + "learning_rate": 1.9989505893234128e-05, + "loss": 1.3499, + "step": 1475 + }, + { + "epoch": 0.43962099071092164, + "grad_norm": 0.15390194952487946, + "learning_rate": 1.9989461668780687e-05, + "loss": 1.3669, + "step": 1476 + }, + { + "epoch": 0.4399188369105903, + "grad_norm": 0.15198729932308197, + "learning_rate": 1.9989417351386463e-05, + "loss": 1.376, + "step": 1477 + }, + { + "epoch": 0.44021668311025897, + "grad_norm": 0.14844822883605957, + "learning_rate": 1.9989372941051872e-05, + "loss": 1.3474, + "step": 1478 + }, + { + "epoch": 0.44051452930992757, + "grad_norm": 0.15690770745277405, + "learning_rate": 1.9989328437777325e-05, + "loss": 1.336, + "step": 1479 + }, + { + "epoch": 0.44081237550959623, + "grad_norm": 0.15483498573303223, + "learning_rate": 1.998928384156324e-05, + "loss": 1.3439, + "step": 1480 + }, + { + "epoch": 0.4411102217092649, + "grad_norm": 0.15072625875473022, + "learning_rate": 1.9989239152410028e-05, + "loss": 1.369, + "step": 1481 + }, + { + "epoch": 0.4414080679089335, + "grad_norm": 0.14917880296707153, + "learning_rate": 1.9989194370318107e-05, + "loss": 1.3597, + "step": 1482 + }, + { + "epoch": 0.44170591410860216, + "grad_norm": 0.15088708698749542, + "learning_rate": 1.9989149495287895e-05, + "loss": 1.3654, + "step": 1483 + }, + { + "epoch": 0.4420037603082708, + "grad_norm": 0.1527431458234787, + "learning_rate": 1.9989104527319805e-05, + "loss": 1.3437, + "step": 1484 + }, + { + "epoch": 0.4423016065079395, + "grad_norm": 0.1469215601682663, + "learning_rate": 1.9989059466414257e-05, + "loss": 1.3417, + "step": 1485 + }, + { + "epoch": 0.4425994527076081, + "grad_norm": 0.14421643316745758, + "learning_rate": 1.9989014312571674e-05, + "loss": 1.3647, + "step": 1486 + }, + { + "epoch": 0.44289729890727675, + "grad_norm": 0.1480475813150406, + "learning_rate": 1.998896906579247e-05, + "loss": 1.3574, + "step": 1487 + }, + { + "epoch": 0.4431951451069454, + "grad_norm": 0.16325557231903076, + "learning_rate": 1.9988923726077073e-05, + "loss": 1.3474, + "step": 1488 + }, + { + "epoch": 0.44349299130661407, + "grad_norm": 0.15464672446250916, + "learning_rate": 1.99888782934259e-05, + "loss": 1.3545, + "step": 1489 + }, + { + "epoch": 0.4437908375062827, + "grad_norm": 0.1462455838918686, + "learning_rate": 1.998883276783937e-05, + "loss": 1.3742, + "step": 1490 + }, + { + "epoch": 0.44408868370595134, + "grad_norm": 0.1596900075674057, + "learning_rate": 1.9988787149317918e-05, + "loss": 1.3426, + "step": 1491 + }, + { + "epoch": 0.44438652990562, + "grad_norm": 0.14827901124954224, + "learning_rate": 1.998874143786196e-05, + "loss": 1.3593, + "step": 1492 + }, + { + "epoch": 0.44468437610528866, + "grad_norm": 0.16149313747882843, + "learning_rate": 1.9988695633471916e-05, + "loss": 1.3695, + "step": 1493 + }, + { + "epoch": 0.44498222230495726, + "grad_norm": 0.14973697066307068, + "learning_rate": 1.9988649736148228e-05, + "loss": 1.374, + "step": 1494 + }, + { + "epoch": 0.4452800685046259, + "grad_norm": 0.15943573415279388, + "learning_rate": 1.998860374589131e-05, + "loss": 1.3639, + "step": 1495 + }, + { + "epoch": 0.4455779147042946, + "grad_norm": 0.16365402936935425, + "learning_rate": 1.9988557662701596e-05, + "loss": 1.3777, + "step": 1496 + }, + { + "epoch": 0.4458757609039632, + "grad_norm": 0.14562122523784637, + "learning_rate": 1.998851148657951e-05, + "loss": 1.3708, + "step": 1497 + }, + { + "epoch": 0.44617360710363185, + "grad_norm": 0.15102730691432953, + "learning_rate": 1.998846521752549e-05, + "loss": 1.3398, + "step": 1498 + }, + { + "epoch": 0.4464714533033005, + "grad_norm": 0.15463490784168243, + "learning_rate": 1.9988418855539956e-05, + "loss": 1.3496, + "step": 1499 + }, + { + "epoch": 0.4467692995029692, + "grad_norm": 0.1538606882095337, + "learning_rate": 1.9988372400623345e-05, + "loss": 1.3611, + "step": 1500 + }, + { + "epoch": 0.4467692995029692, + "eval_loss": 1.3834556341171265, + "eval_runtime": 19.0173, + "eval_samples_per_second": 91.18, + "eval_steps_per_second": 5.732, + "step": 1500 + }, + { + "epoch": 0.4470671457026378, + "grad_norm": 0.16407231986522675, + "learning_rate": 1.998832585277609e-05, + "loss": 1.3598, + "step": 1501 + }, + { + "epoch": 0.44736499190230644, + "grad_norm": 0.15346083045005798, + "learning_rate": 1.998827921199862e-05, + "loss": 1.3405, + "step": 1502 + }, + { + "epoch": 0.4476628381019751, + "grad_norm": 0.15303270518779755, + "learning_rate": 1.998823247829137e-05, + "loss": 1.3554, + "step": 1503 + }, + { + "epoch": 0.44796068430164376, + "grad_norm": 0.14479559659957886, + "learning_rate": 1.998818565165478e-05, + "loss": 1.3571, + "step": 1504 + }, + { + "epoch": 0.44825853050131237, + "grad_norm": 0.15231306850910187, + "learning_rate": 1.9988138732089285e-05, + "loss": 1.3483, + "step": 1505 + }, + { + "epoch": 0.44855637670098103, + "grad_norm": 0.14821556210517883, + "learning_rate": 1.9988091719595314e-05, + "loss": 1.3757, + "step": 1506 + }, + { + "epoch": 0.4488542229006497, + "grad_norm": 0.15764357149600983, + "learning_rate": 1.998804461417331e-05, + "loss": 1.3378, + "step": 1507 + }, + { + "epoch": 0.4491520691003183, + "grad_norm": 0.1531432718038559, + "learning_rate": 1.9987997415823708e-05, + "loss": 1.346, + "step": 1508 + }, + { + "epoch": 0.44944991529998696, + "grad_norm": 0.15966816246509552, + "learning_rate": 1.998795012454695e-05, + "loss": 1.3664, + "step": 1509 + }, + { + "epoch": 0.4497477614996556, + "grad_norm": 0.16142988204956055, + "learning_rate": 1.998790274034348e-05, + "loss": 1.3496, + "step": 1510 + }, + { + "epoch": 0.4500456076993243, + "grad_norm": 0.14158722758293152, + "learning_rate": 1.998785526321373e-05, + "loss": 1.3501, + "step": 1511 + }, + { + "epoch": 0.4503434538989929, + "grad_norm": 0.14258314669132233, + "learning_rate": 1.9987807693158145e-05, + "loss": 1.3612, + "step": 1512 + }, + { + "epoch": 0.45064130009866155, + "grad_norm": 0.15757381916046143, + "learning_rate": 1.998776003017717e-05, + "loss": 1.3424, + "step": 1513 + }, + { + "epoch": 0.4509391462983302, + "grad_norm": 0.14767710864543915, + "learning_rate": 1.9987712274271248e-05, + "loss": 1.3634, + "step": 1514 + }, + { + "epoch": 0.45123699249799887, + "grad_norm": 0.15256808698177338, + "learning_rate": 1.998766442544082e-05, + "loss": 1.3755, + "step": 1515 + }, + { + "epoch": 0.4515348386976675, + "grad_norm": 0.15053477883338928, + "learning_rate": 1.9987616483686335e-05, + "loss": 1.3552, + "step": 1516 + }, + { + "epoch": 0.45183268489733613, + "grad_norm": 0.1493692547082901, + "learning_rate": 1.9987568449008236e-05, + "loss": 1.3474, + "step": 1517 + }, + { + "epoch": 0.4521305310970048, + "grad_norm": 0.1480308622121811, + "learning_rate": 1.9987520321406973e-05, + "loss": 1.359, + "step": 1518 + }, + { + "epoch": 0.45242837729667346, + "grad_norm": 0.16877669095993042, + "learning_rate": 1.998747210088299e-05, + "loss": 1.349, + "step": 1519 + }, + { + "epoch": 0.45272622349634206, + "grad_norm": 0.1567610800266266, + "learning_rate": 1.998742378743674e-05, + "loss": 1.3587, + "step": 1520 + }, + { + "epoch": 0.4530240696960107, + "grad_norm": 0.15030761063098907, + "learning_rate": 1.998737538106867e-05, + "loss": 1.357, + "step": 1521 + }, + { + "epoch": 0.4533219158956794, + "grad_norm": 0.15131089091300964, + "learning_rate": 1.998732688177923e-05, + "loss": 1.3751, + "step": 1522 + }, + { + "epoch": 0.453619762095348, + "grad_norm": 0.15185332298278809, + "learning_rate": 1.9987278289568873e-05, + "loss": 1.3489, + "step": 1523 + }, + { + "epoch": 0.45391760829501665, + "grad_norm": 0.15380892157554626, + "learning_rate": 1.998722960443805e-05, + "loss": 1.3646, + "step": 1524 + }, + { + "epoch": 0.4542154544946853, + "grad_norm": 0.15530753135681152, + "learning_rate": 1.9987180826387213e-05, + "loss": 1.366, + "step": 1525 + }, + { + "epoch": 0.45451330069435397, + "grad_norm": 0.15113243460655212, + "learning_rate": 1.9987131955416815e-05, + "loss": 1.359, + "step": 1526 + }, + { + "epoch": 0.4548111468940226, + "grad_norm": 0.14856329560279846, + "learning_rate": 1.9987082991527313e-05, + "loss": 1.3648, + "step": 1527 + }, + { + "epoch": 0.45510899309369124, + "grad_norm": 0.15362197160720825, + "learning_rate": 1.9987033934719164e-05, + "loss": 1.3724, + "step": 1528 + }, + { + "epoch": 0.4554068392933599, + "grad_norm": 0.1537696123123169, + "learning_rate": 1.998698478499282e-05, + "loss": 1.3298, + "step": 1529 + }, + { + "epoch": 0.45570468549302856, + "grad_norm": 0.15299934148788452, + "learning_rate": 1.998693554234874e-05, + "loss": 1.3636, + "step": 1530 + }, + { + "epoch": 0.45600253169269717, + "grad_norm": 0.1576281040906906, + "learning_rate": 1.998688620678739e-05, + "loss": 1.3609, + "step": 1531 + }, + { + "epoch": 0.4563003778923658, + "grad_norm": 0.1615464836359024, + "learning_rate": 1.9986836778309215e-05, + "loss": 1.3523, + "step": 1532 + }, + { + "epoch": 0.4565982240920345, + "grad_norm": 0.1629544198513031, + "learning_rate": 1.9986787256914682e-05, + "loss": 1.3559, + "step": 1533 + }, + { + "epoch": 0.45689607029170315, + "grad_norm": 0.1634722799062729, + "learning_rate": 1.9986737642604253e-05, + "loss": 1.3517, + "step": 1534 + }, + { + "epoch": 0.45719391649137175, + "grad_norm": 0.15739208459854126, + "learning_rate": 1.998668793537839e-05, + "loss": 1.3381, + "step": 1535 + }, + { + "epoch": 0.4574917626910404, + "grad_norm": 0.16441817581653595, + "learning_rate": 1.9986638135237548e-05, + "loss": 1.3547, + "step": 1536 + }, + { + "epoch": 0.4577896088907091, + "grad_norm": 0.15827372670173645, + "learning_rate": 1.99865882421822e-05, + "loss": 1.3589, + "step": 1537 + }, + { + "epoch": 0.4580874550903777, + "grad_norm": 0.14872604608535767, + "learning_rate": 1.9986538256212806e-05, + "loss": 1.3533, + "step": 1538 + }, + { + "epoch": 0.45838530129004634, + "grad_norm": 0.16894614696502686, + "learning_rate": 1.998648817732983e-05, + "loss": 1.3485, + "step": 1539 + }, + { + "epoch": 0.458683147489715, + "grad_norm": 0.16135190427303314, + "learning_rate": 1.998643800553374e-05, + "loss": 1.3518, + "step": 1540 + }, + { + "epoch": 0.45898099368938367, + "grad_norm": 0.1732345074415207, + "learning_rate": 1.9986387740825e-05, + "loss": 1.3657, + "step": 1541 + }, + { + "epoch": 0.45927883988905227, + "grad_norm": 0.16166165471076965, + "learning_rate": 1.9986337383204085e-05, + "loss": 1.3607, + "step": 1542 + }, + { + "epoch": 0.45957668608872093, + "grad_norm": 0.1520800143480301, + "learning_rate": 1.998628693267145e-05, + "loss": 1.3508, + "step": 1543 + }, + { + "epoch": 0.4598745322883896, + "grad_norm": 0.15748022496700287, + "learning_rate": 1.9986236389227577e-05, + "loss": 1.3645, + "step": 1544 + }, + { + "epoch": 0.46017237848805825, + "grad_norm": 0.1648527979850769, + "learning_rate": 1.9986185752872934e-05, + "loss": 1.3708, + "step": 1545 + }, + { + "epoch": 0.46047022468772686, + "grad_norm": 0.16140109300613403, + "learning_rate": 1.9986135023607988e-05, + "loss": 1.3443, + "step": 1546 + }, + { + "epoch": 0.4607680708873955, + "grad_norm": 0.1539575457572937, + "learning_rate": 1.998608420143321e-05, + "loss": 1.3521, + "step": 1547 + }, + { + "epoch": 0.4610659170870642, + "grad_norm": 0.1520809382200241, + "learning_rate": 1.9986033286349078e-05, + "loss": 1.3566, + "step": 1548 + }, + { + "epoch": 0.4613637632867328, + "grad_norm": 0.16655495762825012, + "learning_rate": 1.9985982278356066e-05, + "loss": 1.3389, + "step": 1549 + }, + { + "epoch": 0.46166160948640145, + "grad_norm": 0.15567821264266968, + "learning_rate": 1.998593117745464e-05, + "loss": 1.3493, + "step": 1550 + }, + { + "epoch": 0.4619594556860701, + "grad_norm": 0.15215620398521423, + "learning_rate": 1.998587998364528e-05, + "loss": 1.3566, + "step": 1551 + }, + { + "epoch": 0.46225730188573877, + "grad_norm": 0.15496288239955902, + "learning_rate": 1.998582869692847e-05, + "loss": 1.356, + "step": 1552 + }, + { + "epoch": 0.4625551480854074, + "grad_norm": 0.15681160986423492, + "learning_rate": 1.998577731730468e-05, + "loss": 1.3512, + "step": 1553 + }, + { + "epoch": 0.46285299428507604, + "grad_norm": 0.16336871683597565, + "learning_rate": 1.9985725844774387e-05, + "loss": 1.3429, + "step": 1554 + }, + { + "epoch": 0.4631508404847447, + "grad_norm": 0.15553857386112213, + "learning_rate": 1.9985674279338072e-05, + "loss": 1.3623, + "step": 1555 + }, + { + "epoch": 0.46344868668441336, + "grad_norm": 0.15605011582374573, + "learning_rate": 1.9985622620996216e-05, + "loss": 1.3604, + "step": 1556 + }, + { + "epoch": 0.46374653288408196, + "grad_norm": 0.15912525355815887, + "learning_rate": 1.9985570869749295e-05, + "loss": 1.3574, + "step": 1557 + }, + { + "epoch": 0.4640443790837506, + "grad_norm": 0.1530112624168396, + "learning_rate": 1.9985519025597795e-05, + "loss": 1.3608, + "step": 1558 + }, + { + "epoch": 0.4643422252834193, + "grad_norm": 0.14379249513149261, + "learning_rate": 1.9985467088542197e-05, + "loss": 1.3519, + "step": 1559 + }, + { + "epoch": 0.46464007148308795, + "grad_norm": 0.17771276831626892, + "learning_rate": 1.9985415058582985e-05, + "loss": 1.3595, + "step": 1560 + }, + { + "epoch": 0.46493791768275655, + "grad_norm": 0.15996088087558746, + "learning_rate": 1.9985362935720644e-05, + "loss": 1.3653, + "step": 1561 + }, + { + "epoch": 0.4652357638824252, + "grad_norm": 0.1668889820575714, + "learning_rate": 1.998531071995565e-05, + "loss": 1.3688, + "step": 1562 + }, + { + "epoch": 0.4655336100820939, + "grad_norm": 0.1608288586139679, + "learning_rate": 1.99852584112885e-05, + "loss": 1.3708, + "step": 1563 + }, + { + "epoch": 0.4658314562817625, + "grad_norm": 0.15631313621997833, + "learning_rate": 1.998520600971968e-05, + "loss": 1.3625, + "step": 1564 + }, + { + "epoch": 0.46612930248143114, + "grad_norm": 0.15062110126018524, + "learning_rate": 1.9985153515249672e-05, + "loss": 1.3699, + "step": 1565 + }, + { + "epoch": 0.4664271486810998, + "grad_norm": 0.16072940826416016, + "learning_rate": 1.9985100927878965e-05, + "loss": 1.3485, + "step": 1566 + }, + { + "epoch": 0.46672499488076846, + "grad_norm": 0.15695033967494965, + "learning_rate": 1.9985048247608053e-05, + "loss": 1.3491, + "step": 1567 + }, + { + "epoch": 0.46702284108043707, + "grad_norm": 0.15807615220546722, + "learning_rate": 1.998499547443742e-05, + "loss": 1.3585, + "step": 1568 + }, + { + "epoch": 0.46732068728010573, + "grad_norm": 0.15113434195518494, + "learning_rate": 1.998494260836756e-05, + "loss": 1.3623, + "step": 1569 + }, + { + "epoch": 0.4676185334797744, + "grad_norm": 0.14953960478305817, + "learning_rate": 1.9984889649398967e-05, + "loss": 1.361, + "step": 1570 + }, + { + "epoch": 0.46791637967944305, + "grad_norm": 0.1541774868965149, + "learning_rate": 1.998483659753213e-05, + "loss": 1.3371, + "step": 1571 + }, + { + "epoch": 0.46821422587911166, + "grad_norm": 0.153075709939003, + "learning_rate": 1.9984783452767546e-05, + "loss": 1.3617, + "step": 1572 + }, + { + "epoch": 0.4685120720787803, + "grad_norm": 0.15750396251678467, + "learning_rate": 1.9984730215105705e-05, + "loss": 1.3467, + "step": 1573 + }, + { + "epoch": 0.468809918278449, + "grad_norm": 0.15796124935150146, + "learning_rate": 1.9984676884547104e-05, + "loss": 1.3348, + "step": 1574 + }, + { + "epoch": 0.46910776447811764, + "grad_norm": 0.15773655474185944, + "learning_rate": 1.9984623461092246e-05, + "loss": 1.3519, + "step": 1575 + }, + { + "epoch": 0.46940561067778624, + "grad_norm": 0.15742747485637665, + "learning_rate": 1.998456994474162e-05, + "loss": 1.3791, + "step": 1576 + }, + { + "epoch": 0.4697034568774549, + "grad_norm": 0.15541964769363403, + "learning_rate": 1.9984516335495722e-05, + "loss": 1.3568, + "step": 1577 + }, + { + "epoch": 0.47000130307712357, + "grad_norm": 0.15354672074317932, + "learning_rate": 1.998446263335506e-05, + "loss": 1.3552, + "step": 1578 + }, + { + "epoch": 0.47029914927679217, + "grad_norm": 0.15648815035820007, + "learning_rate": 1.9984408838320126e-05, + "loss": 1.3757, + "step": 1579 + }, + { + "epoch": 0.47059699547646083, + "grad_norm": 0.16400554776191711, + "learning_rate": 1.998435495039142e-05, + "loss": 1.3417, + "step": 1580 + }, + { + "epoch": 0.4708948416761295, + "grad_norm": 0.16394999623298645, + "learning_rate": 1.998430096956945e-05, + "loss": 1.3446, + "step": 1581 + }, + { + "epoch": 0.47119268787579816, + "grad_norm": 0.1599092334508896, + "learning_rate": 1.9984246895854717e-05, + "loss": 1.3448, + "step": 1582 + }, + { + "epoch": 0.47149053407546676, + "grad_norm": 0.16573455929756165, + "learning_rate": 1.9984192729247716e-05, + "loss": 1.3461, + "step": 1583 + }, + { + "epoch": 0.4717883802751354, + "grad_norm": 0.16031022369861603, + "learning_rate": 1.998413846974896e-05, + "loss": 1.3618, + "step": 1584 + }, + { + "epoch": 0.4720862264748041, + "grad_norm": 0.16845066845417023, + "learning_rate": 1.998408411735895e-05, + "loss": 1.3644, + "step": 1585 + }, + { + "epoch": 0.47238407267447274, + "grad_norm": 0.16577422618865967, + "learning_rate": 1.998402967207819e-05, + "loss": 1.3449, + "step": 1586 + }, + { + "epoch": 0.47268191887414135, + "grad_norm": 0.14588119089603424, + "learning_rate": 1.9983975133907193e-05, + "loss": 1.3308, + "step": 1587 + }, + { + "epoch": 0.47297976507381, + "grad_norm": 0.16787150502204895, + "learning_rate": 1.9983920502846457e-05, + "loss": 1.3691, + "step": 1588 + }, + { + "epoch": 0.47327761127347867, + "grad_norm": 0.1683722883462906, + "learning_rate": 1.99838657788965e-05, + "loss": 1.3627, + "step": 1589 + }, + { + "epoch": 0.4735754574731473, + "grad_norm": 0.16757291555404663, + "learning_rate": 1.9983810962057823e-05, + "loss": 1.3325, + "step": 1590 + }, + { + "epoch": 0.47387330367281594, + "grad_norm": 0.15515659749507904, + "learning_rate": 1.9983756052330942e-05, + "loss": 1.3419, + "step": 1591 + }, + { + "epoch": 0.4741711498724846, + "grad_norm": 0.14991584420204163, + "learning_rate": 1.9983701049716368e-05, + "loss": 1.3557, + "step": 1592 + }, + { + "epoch": 0.47446899607215326, + "grad_norm": 0.15937559306621552, + "learning_rate": 1.9983645954214602e-05, + "loss": 1.3488, + "step": 1593 + }, + { + "epoch": 0.47476684227182187, + "grad_norm": 0.15821953117847443, + "learning_rate": 1.9983590765826173e-05, + "loss": 1.358, + "step": 1594 + }, + { + "epoch": 0.4750646884714905, + "grad_norm": 0.15610694885253906, + "learning_rate": 1.9983535484551583e-05, + "loss": 1.35, + "step": 1595 + }, + { + "epoch": 0.4753625346711592, + "grad_norm": 0.15480051934719086, + "learning_rate": 1.998348011039135e-05, + "loss": 1.3296, + "step": 1596 + }, + { + "epoch": 0.47566038087082785, + "grad_norm": 0.1662609577178955, + "learning_rate": 1.9983424643345987e-05, + "loss": 1.3611, + "step": 1597 + }, + { + "epoch": 0.47595822707049645, + "grad_norm": 0.17669756710529327, + "learning_rate": 1.9983369083416012e-05, + "loss": 1.3781, + "step": 1598 + }, + { + "epoch": 0.4762560732701651, + "grad_norm": 0.15191276371479034, + "learning_rate": 1.9983313430601946e-05, + "loss": 1.3572, + "step": 1599 + }, + { + "epoch": 0.4765539194698338, + "grad_norm": 0.15627487003803253, + "learning_rate": 1.9983257684904297e-05, + "loss": 1.3702, + "step": 1600 + }, + { + "epoch": 0.47685176566950244, + "grad_norm": 0.16343528032302856, + "learning_rate": 1.9983201846323596e-05, + "loss": 1.3497, + "step": 1601 + }, + { + "epoch": 0.47714961186917104, + "grad_norm": 0.16408991813659668, + "learning_rate": 1.9983145914860345e-05, + "loss": 1.3576, + "step": 1602 + }, + { + "epoch": 0.4774474580688397, + "grad_norm": 0.15406526625156403, + "learning_rate": 1.9983089890515087e-05, + "loss": 1.3517, + "step": 1603 + }, + { + "epoch": 0.47774530426850836, + "grad_norm": 0.1557612121105194, + "learning_rate": 1.9983033773288323e-05, + "loss": 1.3496, + "step": 1604 + }, + { + "epoch": 0.47804315046817697, + "grad_norm": 0.16335482895374298, + "learning_rate": 1.9982977563180584e-05, + "loss": 1.3506, + "step": 1605 + }, + { + "epoch": 0.47834099666784563, + "grad_norm": 0.16537539660930634, + "learning_rate": 1.99829212601924e-05, + "loss": 1.3678, + "step": 1606 + }, + { + "epoch": 0.4786388428675143, + "grad_norm": 0.16301892697811127, + "learning_rate": 1.998286486432428e-05, + "loss": 1.348, + "step": 1607 + }, + { + "epoch": 0.47893668906718295, + "grad_norm": 0.16790518164634705, + "learning_rate": 1.9982808375576757e-05, + "loss": 1.3533, + "step": 1608 + }, + { + "epoch": 0.47923453526685156, + "grad_norm": 0.16025929152965546, + "learning_rate": 1.9982751793950355e-05, + "loss": 1.3579, + "step": 1609 + }, + { + "epoch": 0.4795323814665202, + "grad_norm": 0.1598828136920929, + "learning_rate": 1.9982695119445602e-05, + "loss": 1.3269, + "step": 1610 + }, + { + "epoch": 0.4798302276661889, + "grad_norm": 0.1701895296573639, + "learning_rate": 1.9982638352063025e-05, + "loss": 1.3498, + "step": 1611 + }, + { + "epoch": 0.48012807386585754, + "grad_norm": 0.15481679141521454, + "learning_rate": 1.9982581491803154e-05, + "loss": 1.345, + "step": 1612 + }, + { + "epoch": 0.48042592006552615, + "grad_norm": 0.1521386206150055, + "learning_rate": 1.998252453866651e-05, + "loss": 1.3385, + "step": 1613 + }, + { + "epoch": 0.4807237662651948, + "grad_norm": 0.15778110921382904, + "learning_rate": 1.998246749265363e-05, + "loss": 1.3452, + "step": 1614 + }, + { + "epoch": 0.48102161246486347, + "grad_norm": 0.16336236894130707, + "learning_rate": 1.9982410353765046e-05, + "loss": 1.3647, + "step": 1615 + }, + { + "epoch": 0.48131945866453213, + "grad_norm": 0.17148883640766144, + "learning_rate": 1.9982353122001284e-05, + "loss": 1.3661, + "step": 1616 + }, + { + "epoch": 0.48161730486420073, + "grad_norm": 0.1563539057970047, + "learning_rate": 1.9982295797362877e-05, + "loss": 1.338, + "step": 1617 + }, + { + "epoch": 0.4819151510638694, + "grad_norm": 0.1565932333469391, + "learning_rate": 1.9982238379850364e-05, + "loss": 1.3648, + "step": 1618 + }, + { + "epoch": 0.48221299726353806, + "grad_norm": 0.16150546073913574, + "learning_rate": 1.9982180869464276e-05, + "loss": 1.3603, + "step": 1619 + }, + { + "epoch": 0.48251084346320666, + "grad_norm": 0.16295665502548218, + "learning_rate": 1.9982123266205148e-05, + "loss": 1.3571, + "step": 1620 + }, + { + "epoch": 0.4828086896628753, + "grad_norm": 0.16042910516262054, + "learning_rate": 1.9982065570073513e-05, + "loss": 1.3641, + "step": 1621 + }, + { + "epoch": 0.483106535862544, + "grad_norm": 0.15912720561027527, + "learning_rate": 1.9982007781069913e-05, + "loss": 1.354, + "step": 1622 + }, + { + "epoch": 0.48340438206221265, + "grad_norm": 0.16279810667037964, + "learning_rate": 1.9981949899194883e-05, + "loss": 1.3634, + "step": 1623 + }, + { + "epoch": 0.48370222826188125, + "grad_norm": 0.15667667984962463, + "learning_rate": 1.9981891924448958e-05, + "loss": 1.3501, + "step": 1624 + }, + { + "epoch": 0.4840000744615499, + "grad_norm": 0.16223295032978058, + "learning_rate": 1.9981833856832684e-05, + "loss": 1.3638, + "step": 1625 + }, + { + "epoch": 0.4842979206612186, + "grad_norm": 0.16240368783473969, + "learning_rate": 1.9981775696346596e-05, + "loss": 1.3476, + "step": 1626 + }, + { + "epoch": 0.48459576686088723, + "grad_norm": 0.15941420197486877, + "learning_rate": 1.998171744299124e-05, + "loss": 1.3294, + "step": 1627 + }, + { + "epoch": 0.48489361306055584, + "grad_norm": 0.1619134098291397, + "learning_rate": 1.9981659096767158e-05, + "loss": 1.3429, + "step": 1628 + }, + { + "epoch": 0.4851914592602245, + "grad_norm": 0.14640536904335022, + "learning_rate": 1.9981600657674885e-05, + "loss": 1.3552, + "step": 1629 + }, + { + "epoch": 0.48548930545989316, + "grad_norm": 0.16520731151103973, + "learning_rate": 1.9981542125714973e-05, + "loss": 1.3441, + "step": 1630 + }, + { + "epoch": 0.48578715165956177, + "grad_norm": 0.1630891114473343, + "learning_rate": 1.998148350088796e-05, + "loss": 1.3558, + "step": 1631 + }, + { + "epoch": 0.4860849978592304, + "grad_norm": 0.15566229820251465, + "learning_rate": 1.99814247831944e-05, + "loss": 1.3575, + "step": 1632 + }, + { + "epoch": 0.4863828440588991, + "grad_norm": 0.15086857974529266, + "learning_rate": 1.998136597263483e-05, + "loss": 1.3428, + "step": 1633 + }, + { + "epoch": 0.48668069025856775, + "grad_norm": 0.1621299684047699, + "learning_rate": 1.9981307069209802e-05, + "loss": 1.359, + "step": 1634 + }, + { + "epoch": 0.48697853645823636, + "grad_norm": 0.15317213535308838, + "learning_rate": 1.9981248072919866e-05, + "loss": 1.3695, + "step": 1635 + }, + { + "epoch": 0.487276382657905, + "grad_norm": 0.1480894833803177, + "learning_rate": 1.9981188983765568e-05, + "loss": 1.3489, + "step": 1636 + }, + { + "epoch": 0.4875742288575737, + "grad_norm": 0.15267395973205566, + "learning_rate": 1.998112980174746e-05, + "loss": 1.3418, + "step": 1637 + }, + { + "epoch": 0.48787207505724234, + "grad_norm": 0.16101063787937164, + "learning_rate": 1.9981070526866086e-05, + "loss": 1.3553, + "step": 1638 + }, + { + "epoch": 0.48816992125691094, + "grad_norm": 0.16287663578987122, + "learning_rate": 1.9981011159122004e-05, + "loss": 1.3473, + "step": 1639 + }, + { + "epoch": 0.4884677674565796, + "grad_norm": 0.15994007885456085, + "learning_rate": 1.9980951698515766e-05, + "loss": 1.3457, + "step": 1640 + }, + { + "epoch": 0.48876561365624827, + "grad_norm": 0.15128056704998016, + "learning_rate": 1.998089214504792e-05, + "loss": 1.3421, + "step": 1641 + }, + { + "epoch": 0.4890634598559169, + "grad_norm": 0.15700973570346832, + "learning_rate": 1.9980832498719028e-05, + "loss": 1.3512, + "step": 1642 + }, + { + "epoch": 0.48936130605558553, + "grad_norm": 0.1566929966211319, + "learning_rate": 1.998077275952964e-05, + "loss": 1.3372, + "step": 1643 + }, + { + "epoch": 0.4896591522552542, + "grad_norm": 0.1569521278142929, + "learning_rate": 1.998071292748031e-05, + "loss": 1.3453, + "step": 1644 + }, + { + "epoch": 0.48995699845492285, + "grad_norm": 0.15177418291568756, + "learning_rate": 1.99806530025716e-05, + "loss": 1.3414, + "step": 1645 + }, + { + "epoch": 0.49025484465459146, + "grad_norm": 0.14923198521137238, + "learning_rate": 1.9980592984804067e-05, + "loss": 1.3467, + "step": 1646 + }, + { + "epoch": 0.4905526908542601, + "grad_norm": 0.1549576073884964, + "learning_rate": 1.9980532874178263e-05, + "loss": 1.3529, + "step": 1647 + }, + { + "epoch": 0.4908505370539288, + "grad_norm": 0.14973151683807373, + "learning_rate": 1.998047267069475e-05, + "loss": 1.3483, + "step": 1648 + }, + { + "epoch": 0.49114838325359744, + "grad_norm": 0.14889411628246307, + "learning_rate": 1.9980412374354097e-05, + "loss": 1.3513, + "step": 1649 + }, + { + "epoch": 0.49144622945326605, + "grad_norm": 0.15207715332508087, + "learning_rate": 1.9980351985156854e-05, + "loss": 1.3647, + "step": 1650 + }, + { + "epoch": 0.4917440756529347, + "grad_norm": 0.1659306436777115, + "learning_rate": 1.9980291503103582e-05, + "loss": 1.3519, + "step": 1651 + }, + { + "epoch": 0.49204192185260337, + "grad_norm": 0.15857510268688202, + "learning_rate": 1.9980230928194856e-05, + "loss": 1.352, + "step": 1652 + }, + { + "epoch": 0.49233976805227203, + "grad_norm": 0.15699228644371033, + "learning_rate": 1.998017026043123e-05, + "loss": 1.3537, + "step": 1653 + }, + { + "epoch": 0.49263761425194064, + "grad_norm": 0.16018475592136383, + "learning_rate": 1.9980109499813264e-05, + "loss": 1.3615, + "step": 1654 + }, + { + "epoch": 0.4929354604516093, + "grad_norm": 0.14704987406730652, + "learning_rate": 1.9980048646341538e-05, + "loss": 1.3527, + "step": 1655 + }, + { + "epoch": 0.49323330665127796, + "grad_norm": 0.15027841925621033, + "learning_rate": 1.9979987700016603e-05, + "loss": 1.354, + "step": 1656 + }, + { + "epoch": 0.4935311528509466, + "grad_norm": 0.16247746348381042, + "learning_rate": 1.997992666083903e-05, + "loss": 1.3538, + "step": 1657 + }, + { + "epoch": 0.4938289990506152, + "grad_norm": 0.15666894614696503, + "learning_rate": 1.99798655288094e-05, + "loss": 1.3289, + "step": 1658 + }, + { + "epoch": 0.4941268452502839, + "grad_norm": 0.16850797832012177, + "learning_rate": 1.9979804303928265e-05, + "loss": 1.368, + "step": 1659 + }, + { + "epoch": 0.49442469144995255, + "grad_norm": 0.16110515594482422, + "learning_rate": 1.9979742986196204e-05, + "loss": 1.3601, + "step": 1660 + }, + { + "epoch": 0.49472253764962115, + "grad_norm": 0.15504297614097595, + "learning_rate": 1.997968157561378e-05, + "loss": 1.369, + "step": 1661 + }, + { + "epoch": 0.4950203838492898, + "grad_norm": 0.16364294290542603, + "learning_rate": 1.9979620072181575e-05, + "loss": 1.3712, + "step": 1662 + }, + { + "epoch": 0.4953182300489585, + "grad_norm": 0.1554390788078308, + "learning_rate": 1.9979558475900148e-05, + "loss": 1.3637, + "step": 1663 + }, + { + "epoch": 0.49561607624862714, + "grad_norm": 0.16341114044189453, + "learning_rate": 1.9979496786770087e-05, + "loss": 1.3444, + "step": 1664 + }, + { + "epoch": 0.49591392244829574, + "grad_norm": 0.1561581939458847, + "learning_rate": 1.9979435004791953e-05, + "loss": 1.3548, + "step": 1665 + }, + { + "epoch": 0.4962117686479644, + "grad_norm": 0.15423549711704254, + "learning_rate": 1.9979373129966326e-05, + "loss": 1.3446, + "step": 1666 + }, + { + "epoch": 0.49650961484763306, + "grad_norm": 0.15586353838443756, + "learning_rate": 1.9979311162293783e-05, + "loss": 1.3403, + "step": 1667 + }, + { + "epoch": 0.4968074610473017, + "grad_norm": 0.16967377066612244, + "learning_rate": 1.99792491017749e-05, + "loss": 1.3516, + "step": 1668 + }, + { + "epoch": 0.49710530724697033, + "grad_norm": 0.16152755916118622, + "learning_rate": 1.9979186948410253e-05, + "loss": 1.3511, + "step": 1669 + }, + { + "epoch": 0.497403153446639, + "grad_norm": 0.17606371641159058, + "learning_rate": 1.997912470220042e-05, + "loss": 1.3558, + "step": 1670 + }, + { + "epoch": 0.49770099964630765, + "grad_norm": 0.15710416436195374, + "learning_rate": 1.9979062363145982e-05, + "loss": 1.3468, + "step": 1671 + }, + { + "epoch": 0.49799884584597626, + "grad_norm": 0.15560346841812134, + "learning_rate": 1.9978999931247514e-05, + "loss": 1.3348, + "step": 1672 + }, + { + "epoch": 0.4982966920456449, + "grad_norm": 0.16255123913288116, + "learning_rate": 1.9978937406505606e-05, + "loss": 1.3595, + "step": 1673 + }, + { + "epoch": 0.4985945382453136, + "grad_norm": 0.15652534365653992, + "learning_rate": 1.9978874788920834e-05, + "loss": 1.3554, + "step": 1674 + }, + { + "epoch": 0.49889238444498224, + "grad_norm": 0.16340097784996033, + "learning_rate": 1.997881207849378e-05, + "loss": 1.3469, + "step": 1675 + }, + { + "epoch": 0.49919023064465085, + "grad_norm": 0.1603468805551529, + "learning_rate": 1.9978749275225026e-05, + "loss": 1.3467, + "step": 1676 + }, + { + "epoch": 0.4994880768443195, + "grad_norm": 0.1565481275320053, + "learning_rate": 1.9978686379115163e-05, + "loss": 1.3547, + "step": 1677 + }, + { + "epoch": 0.49978592304398817, + "grad_norm": 0.15879780054092407, + "learning_rate": 1.997862339016477e-05, + "loss": 1.323, + "step": 1678 + }, + { + "epoch": 0.5000837692436568, + "grad_norm": 0.1717914640903473, + "learning_rate": 1.9978560308374436e-05, + "loss": 1.3532, + "step": 1679 + }, + { + "epoch": 0.5003816154433255, + "grad_norm": 0.15800116956233978, + "learning_rate": 1.9978497133744744e-05, + "loss": 1.3479, + "step": 1680 + }, + { + "epoch": 0.5006794616429942, + "grad_norm": 0.16885773837566376, + "learning_rate": 1.9978433866276288e-05, + "loss": 1.3539, + "step": 1681 + }, + { + "epoch": 0.5009773078426627, + "grad_norm": 0.16035017371177673, + "learning_rate": 1.9978370505969655e-05, + "loss": 1.3442, + "step": 1682 + }, + { + "epoch": 0.5012751540423314, + "grad_norm": 0.1676723062992096, + "learning_rate": 1.997830705282543e-05, + "loss": 1.3344, + "step": 1683 + }, + { + "epoch": 0.501573000242, + "grad_norm": 0.16242334246635437, + "learning_rate": 1.997824350684421e-05, + "loss": 1.3613, + "step": 1684 + }, + { + "epoch": 0.5018708464416687, + "grad_norm": 0.1588880866765976, + "learning_rate": 1.9978179868026575e-05, + "loss": 1.3472, + "step": 1685 + }, + { + "epoch": 0.5021686926413373, + "grad_norm": 0.1659652143716812, + "learning_rate": 1.9978116136373133e-05, + "loss": 1.3659, + "step": 1686 + }, + { + "epoch": 0.502466538841006, + "grad_norm": 0.17029404640197754, + "learning_rate": 1.9978052311884463e-05, + "loss": 1.3457, + "step": 1687 + }, + { + "epoch": 0.5027643850406747, + "grad_norm": 0.16028033196926117, + "learning_rate": 1.9977988394561165e-05, + "loss": 1.375, + "step": 1688 + }, + { + "epoch": 0.5030622312403432, + "grad_norm": 0.15996789932250977, + "learning_rate": 1.9977924384403836e-05, + "loss": 1.3547, + "step": 1689 + }, + { + "epoch": 0.5033600774400119, + "grad_norm": 0.15632674098014832, + "learning_rate": 1.9977860281413064e-05, + "loss": 1.3592, + "step": 1690 + }, + { + "epoch": 0.5036579236396805, + "grad_norm": 0.15774105489253998, + "learning_rate": 1.9977796085589453e-05, + "loss": 1.3592, + "step": 1691 + }, + { + "epoch": 0.5039557698393492, + "grad_norm": 0.16142427921295166, + "learning_rate": 1.99777317969336e-05, + "loss": 1.3548, + "step": 1692 + }, + { + "epoch": 0.5042536160390179, + "grad_norm": 0.15816092491149902, + "learning_rate": 1.9977667415446096e-05, + "loss": 1.3401, + "step": 1693 + }, + { + "epoch": 0.5045514622386865, + "grad_norm": 0.15842948853969574, + "learning_rate": 1.9977602941127546e-05, + "loss": 1.3418, + "step": 1694 + }, + { + "epoch": 0.5048493084383552, + "grad_norm": 0.16587843000888824, + "learning_rate": 1.9977538373978548e-05, + "loss": 1.3317, + "step": 1695 + }, + { + "epoch": 0.5051471546380238, + "grad_norm": 0.15935342013835907, + "learning_rate": 1.9977473713999703e-05, + "loss": 1.341, + "step": 1696 + }, + { + "epoch": 0.5054450008376924, + "grad_norm": 0.16304226219654083, + "learning_rate": 1.997740896119161e-05, + "loss": 1.3679, + "step": 1697 + }, + { + "epoch": 0.505742847037361, + "grad_norm": 0.1632426381111145, + "learning_rate": 1.997734411555488e-05, + "loss": 1.3499, + "step": 1698 + }, + { + "epoch": 0.5060406932370297, + "grad_norm": 0.1575668901205063, + "learning_rate": 1.9977279177090105e-05, + "loss": 1.3439, + "step": 1699 + }, + { + "epoch": 0.5063385394366984, + "grad_norm": 0.1599178910255432, + "learning_rate": 1.9977214145797898e-05, + "loss": 1.3397, + "step": 1700 + }, + { + "epoch": 0.506636385636367, + "grad_norm": 0.16731704771518707, + "learning_rate": 1.997714902167886e-05, + "loss": 1.3654, + "step": 1701 + }, + { + "epoch": 0.5069342318360357, + "grad_norm": 0.16202615201473236, + "learning_rate": 1.9977083804733595e-05, + "loss": 1.3339, + "step": 1702 + }, + { + "epoch": 0.5072320780357044, + "grad_norm": 0.16238966584205627, + "learning_rate": 1.9977018494962715e-05, + "loss": 1.3503, + "step": 1703 + }, + { + "epoch": 0.5075299242353729, + "grad_norm": 0.16439203917980194, + "learning_rate": 1.9976953092366825e-05, + "loss": 1.3343, + "step": 1704 + }, + { + "epoch": 0.5078277704350416, + "grad_norm": 0.15659132599830627, + "learning_rate": 1.9976887596946533e-05, + "loss": 1.3342, + "step": 1705 + }, + { + "epoch": 0.5081256166347102, + "grad_norm": 0.15232855081558228, + "learning_rate": 1.9976822008702445e-05, + "loss": 1.3394, + "step": 1706 + }, + { + "epoch": 0.5084234628343789, + "grad_norm": 0.16024251282215118, + "learning_rate": 1.9976756327635178e-05, + "loss": 1.3604, + "step": 1707 + }, + { + "epoch": 0.5087213090340476, + "grad_norm": 0.16114826500415802, + "learning_rate": 1.997669055374534e-05, + "loss": 1.3595, + "step": 1708 + }, + { + "epoch": 0.5090191552337162, + "grad_norm": 0.15960678458213806, + "learning_rate": 1.9976624687033543e-05, + "loss": 1.3651, + "step": 1709 + }, + { + "epoch": 0.5093170014333849, + "grad_norm": 0.16880513727664948, + "learning_rate": 1.99765587275004e-05, + "loss": 1.3491, + "step": 1710 + }, + { + "epoch": 0.5096148476330535, + "grad_norm": 0.16581284999847412, + "learning_rate": 1.9976492675146522e-05, + "loss": 1.3592, + "step": 1711 + }, + { + "epoch": 0.5099126938327221, + "grad_norm": 0.16696377098560333, + "learning_rate": 1.9976426529972532e-05, + "loss": 1.3508, + "step": 1712 + }, + { + "epoch": 0.5102105400323907, + "grad_norm": 0.16760198771953583, + "learning_rate": 1.9976360291979034e-05, + "loss": 1.3533, + "step": 1713 + }, + { + "epoch": 0.5105083862320594, + "grad_norm": 0.16133247315883636, + "learning_rate": 1.9976293961166654e-05, + "loss": 1.3476, + "step": 1714 + }, + { + "epoch": 0.5108062324317281, + "grad_norm": 0.1651097983121872, + "learning_rate": 1.9976227537536003e-05, + "loss": 1.3574, + "step": 1715 + }, + { + "epoch": 0.5111040786313967, + "grad_norm": 0.16226840019226074, + "learning_rate": 1.99761610210877e-05, + "loss": 1.3441, + "step": 1716 + }, + { + "epoch": 0.5114019248310654, + "grad_norm": 0.1548374593257904, + "learning_rate": 1.9976094411822364e-05, + "loss": 1.3421, + "step": 1717 + }, + { + "epoch": 0.511699771030734, + "grad_norm": 0.1616935282945633, + "learning_rate": 1.9976027709740616e-05, + "loss": 1.353, + "step": 1718 + }, + { + "epoch": 0.5119976172304026, + "grad_norm": 0.1775294840335846, + "learning_rate": 1.9975960914843075e-05, + "loss": 1.3583, + "step": 1719 + }, + { + "epoch": 0.5122954634300713, + "grad_norm": 0.17687584459781647, + "learning_rate": 1.9975894027130367e-05, + "loss": 1.3575, + "step": 1720 + }, + { + "epoch": 0.5125933096297399, + "grad_norm": 0.1619470715522766, + "learning_rate": 1.997582704660311e-05, + "loss": 1.3461, + "step": 1721 + }, + { + "epoch": 0.5128911558294086, + "grad_norm": 0.1800096482038498, + "learning_rate": 1.9975759973261924e-05, + "loss": 1.3674, + "step": 1722 + }, + { + "epoch": 0.5131890020290772, + "grad_norm": 0.178078293800354, + "learning_rate": 1.997569280710744e-05, + "loss": 1.3309, + "step": 1723 + }, + { + "epoch": 0.5134868482287459, + "grad_norm": 0.16881874203681946, + "learning_rate": 1.997562554814028e-05, + "loss": 1.3411, + "step": 1724 + }, + { + "epoch": 0.5137846944284146, + "grad_norm": 0.1666990965604782, + "learning_rate": 1.997555819636107e-05, + "loss": 1.3465, + "step": 1725 + }, + { + "epoch": 0.5140825406280832, + "grad_norm": 0.17623895406723022, + "learning_rate": 1.9975490751770436e-05, + "loss": 1.3122, + "step": 1726 + }, + { + "epoch": 0.5143803868277518, + "grad_norm": 0.17145301401615143, + "learning_rate": 1.9975423214369004e-05, + "loss": 1.3348, + "step": 1727 + }, + { + "epoch": 0.5146782330274204, + "grad_norm": 0.15384571254253387, + "learning_rate": 1.997535558415741e-05, + "loss": 1.3451, + "step": 1728 + }, + { + "epoch": 0.5149760792270891, + "grad_norm": 0.17020882666110992, + "learning_rate": 1.9975287861136272e-05, + "loss": 1.3414, + "step": 1729 + }, + { + "epoch": 0.5152739254267578, + "grad_norm": 0.17712000012397766, + "learning_rate": 1.9975220045306227e-05, + "loss": 1.3489, + "step": 1730 + }, + { + "epoch": 0.5155717716264264, + "grad_norm": 0.16047564148902893, + "learning_rate": 1.9975152136667902e-05, + "loss": 1.348, + "step": 1731 + }, + { + "epoch": 0.5158696178260951, + "grad_norm": 0.15981508791446686, + "learning_rate": 1.9975084135221933e-05, + "loss": 1.3523, + "step": 1732 + }, + { + "epoch": 0.5161674640257637, + "grad_norm": 0.1657506823539734, + "learning_rate": 1.9975016040968952e-05, + "loss": 1.3432, + "step": 1733 + }, + { + "epoch": 0.5164653102254323, + "grad_norm": 0.17202956974506378, + "learning_rate": 1.997494785390959e-05, + "loss": 1.3542, + "step": 1734 + }, + { + "epoch": 0.516763156425101, + "grad_norm": 0.16407586634159088, + "learning_rate": 1.9974879574044484e-05, + "loss": 1.3394, + "step": 1735 + }, + { + "epoch": 0.5170610026247696, + "grad_norm": 0.16045711934566498, + "learning_rate": 1.9974811201374267e-05, + "loss": 1.3498, + "step": 1736 + }, + { + "epoch": 0.5173588488244383, + "grad_norm": 0.17223873734474182, + "learning_rate": 1.997474273589958e-05, + "loss": 1.3409, + "step": 1737 + }, + { + "epoch": 0.5176566950241069, + "grad_norm": 0.15774434804916382, + "learning_rate": 1.9974674177621053e-05, + "loss": 1.3427, + "step": 1738 + }, + { + "epoch": 0.5179545412237756, + "grad_norm": 0.16195274889469147, + "learning_rate": 1.9974605526539326e-05, + "loss": 1.347, + "step": 1739 + }, + { + "epoch": 0.5182523874234443, + "grad_norm": 0.16589990258216858, + "learning_rate": 1.997453678265504e-05, + "loss": 1.3386, + "step": 1740 + }, + { + "epoch": 0.5185502336231128, + "grad_norm": 0.15620534121990204, + "learning_rate": 1.9974467945968835e-05, + "loss": 1.3648, + "step": 1741 + }, + { + "epoch": 0.5188480798227815, + "grad_norm": 0.16260677576065063, + "learning_rate": 1.997439901648135e-05, + "loss": 1.3383, + "step": 1742 + }, + { + "epoch": 0.5191459260224501, + "grad_norm": 0.16320344805717468, + "learning_rate": 1.9974329994193225e-05, + "loss": 1.3355, + "step": 1743 + }, + { + "epoch": 0.5194437722221188, + "grad_norm": 0.15730075538158417, + "learning_rate": 1.9974260879105104e-05, + "loss": 1.3454, + "step": 1744 + }, + { + "epoch": 0.5197416184217875, + "grad_norm": 0.17475290596485138, + "learning_rate": 1.9974191671217627e-05, + "loss": 1.3376, + "step": 1745 + }, + { + "epoch": 0.5200394646214561, + "grad_norm": 0.16563285887241364, + "learning_rate": 1.997412237053144e-05, + "loss": 1.3473, + "step": 1746 + }, + { + "epoch": 0.5203373108211248, + "grad_norm": 0.16842181980609894, + "learning_rate": 1.9974052977047193e-05, + "loss": 1.3772, + "step": 1747 + }, + { + "epoch": 0.5206351570207934, + "grad_norm": 0.17073404788970947, + "learning_rate": 1.9973983490765522e-05, + "loss": 1.3426, + "step": 1748 + }, + { + "epoch": 0.520933003220462, + "grad_norm": 0.1694171130657196, + "learning_rate": 1.997391391168708e-05, + "loss": 1.3568, + "step": 1749 + }, + { + "epoch": 0.5212308494201306, + "grad_norm": 0.16218890249729156, + "learning_rate": 1.9973844239812516e-05, + "loss": 1.3251, + "step": 1750 + }, + { + "epoch": 0.5215286956197993, + "grad_norm": 0.1597883403301239, + "learning_rate": 1.997377447514247e-05, + "loss": 1.3427, + "step": 1751 + }, + { + "epoch": 0.521826541819468, + "grad_norm": 0.16062845289707184, + "learning_rate": 1.9973704617677597e-05, + "loss": 1.3403, + "step": 1752 + }, + { + "epoch": 0.5221243880191366, + "grad_norm": 0.1634770780801773, + "learning_rate": 1.9973634667418548e-05, + "loss": 1.3456, + "step": 1753 + }, + { + "epoch": 0.5224222342188053, + "grad_norm": 0.16171061992645264, + "learning_rate": 1.997356462436597e-05, + "loss": 1.3569, + "step": 1754 + }, + { + "epoch": 0.522720080418474, + "grad_norm": 0.16133779287338257, + "learning_rate": 1.9973494488520514e-05, + "loss": 1.3392, + "step": 1755 + }, + { + "epoch": 0.5230179266181425, + "grad_norm": 0.15806788206100464, + "learning_rate": 1.9973424259882837e-05, + "loss": 1.3471, + "step": 1756 + }, + { + "epoch": 0.5233157728178112, + "grad_norm": 0.16078951954841614, + "learning_rate": 1.9973353938453592e-05, + "loss": 1.3389, + "step": 1757 + }, + { + "epoch": 0.5236136190174798, + "grad_norm": 0.16487789154052734, + "learning_rate": 1.9973283524233425e-05, + "loss": 1.3473, + "step": 1758 + }, + { + "epoch": 0.5239114652171485, + "grad_norm": 0.16977691650390625, + "learning_rate": 1.9973213017223005e-05, + "loss": 1.3508, + "step": 1759 + }, + { + "epoch": 0.5242093114168171, + "grad_norm": 0.15756413340568542, + "learning_rate": 1.9973142417422978e-05, + "loss": 1.3423, + "step": 1760 + }, + { + "epoch": 0.5245071576164858, + "grad_norm": 0.1605219691991806, + "learning_rate": 1.9973071724834002e-05, + "loss": 1.3485, + "step": 1761 + }, + { + "epoch": 0.5248050038161545, + "grad_norm": 0.15755310654640198, + "learning_rate": 1.997300093945674e-05, + "loss": 1.351, + "step": 1762 + }, + { + "epoch": 0.5251028500158231, + "grad_norm": 0.15491753816604614, + "learning_rate": 1.9972930061291845e-05, + "loss": 1.3274, + "step": 1763 + }, + { + "epoch": 0.5254006962154917, + "grad_norm": 0.16542217135429382, + "learning_rate": 1.9972859090339975e-05, + "loss": 1.3576, + "step": 1764 + }, + { + "epoch": 0.5256985424151603, + "grad_norm": 0.15959101915359497, + "learning_rate": 1.9972788026601798e-05, + "loss": 1.3443, + "step": 1765 + }, + { + "epoch": 0.525996388614829, + "grad_norm": 0.16338451206684113, + "learning_rate": 1.9972716870077966e-05, + "loss": 1.3387, + "step": 1766 + }, + { + "epoch": 0.5262942348144977, + "grad_norm": 0.15877531468868256, + "learning_rate": 1.9972645620769148e-05, + "loss": 1.3399, + "step": 1767 + }, + { + "epoch": 0.5265920810141663, + "grad_norm": 0.1649436354637146, + "learning_rate": 1.9972574278676006e-05, + "loss": 1.3414, + "step": 1768 + }, + { + "epoch": 0.526889927213835, + "grad_norm": 0.15953277051448822, + "learning_rate": 1.9972502843799204e-05, + "loss": 1.3607, + "step": 1769 + }, + { + "epoch": 0.5271877734135036, + "grad_norm": 0.16934038698673248, + "learning_rate": 1.99724313161394e-05, + "loss": 1.3285, + "step": 1770 + }, + { + "epoch": 0.5274856196131722, + "grad_norm": 0.16570153832435608, + "learning_rate": 1.9972359695697267e-05, + "loss": 1.3334, + "step": 1771 + }, + { + "epoch": 0.5277834658128409, + "grad_norm": 0.16746912896633148, + "learning_rate": 1.9972287982473468e-05, + "loss": 1.3472, + "step": 1772 + }, + { + "epoch": 0.5280813120125095, + "grad_norm": 0.15530602633953094, + "learning_rate": 1.997221617646867e-05, + "loss": 1.3438, + "step": 1773 + }, + { + "epoch": 0.5283791582121782, + "grad_norm": 0.15870961546897888, + "learning_rate": 1.9972144277683545e-05, + "loss": 1.3346, + "step": 1774 + }, + { + "epoch": 0.5286770044118468, + "grad_norm": 0.1621575653553009, + "learning_rate": 1.997207228611876e-05, + "loss": 1.3416, + "step": 1775 + }, + { + "epoch": 0.5289748506115155, + "grad_norm": 0.16383720934391022, + "learning_rate": 1.997200020177498e-05, + "loss": 1.3502, + "step": 1776 + }, + { + "epoch": 0.5292726968111842, + "grad_norm": 0.17416946589946747, + "learning_rate": 1.997192802465288e-05, + "loss": 1.3438, + "step": 1777 + }, + { + "epoch": 0.5295705430108528, + "grad_norm": 0.16202545166015625, + "learning_rate": 1.9971855754753134e-05, + "loss": 1.37, + "step": 1778 + }, + { + "epoch": 0.5298683892105214, + "grad_norm": 0.17150531709194183, + "learning_rate": 1.9971783392076407e-05, + "loss": 1.3343, + "step": 1779 + }, + { + "epoch": 0.53016623541019, + "grad_norm": 0.16482296586036682, + "learning_rate": 1.997171093662338e-05, + "loss": 1.3461, + "step": 1780 + }, + { + "epoch": 0.5304640816098587, + "grad_norm": 0.1628810316324234, + "learning_rate": 1.9971638388394724e-05, + "loss": 1.3454, + "step": 1781 + }, + { + "epoch": 0.5307619278095274, + "grad_norm": 0.17356710135936737, + "learning_rate": 1.997156574739111e-05, + "loss": 1.3525, + "step": 1782 + }, + { + "epoch": 0.531059774009196, + "grad_norm": 0.17725849151611328, + "learning_rate": 1.997149301361322e-05, + "loss": 1.3641, + "step": 1783 + }, + { + "epoch": 0.5313576202088647, + "grad_norm": 0.15778744220733643, + "learning_rate": 1.997142018706173e-05, + "loss": 1.3506, + "step": 1784 + }, + { + "epoch": 0.5316554664085333, + "grad_norm": 0.17386850714683533, + "learning_rate": 1.9971347267737314e-05, + "loss": 1.3454, + "step": 1785 + }, + { + "epoch": 0.5319533126082019, + "grad_norm": 0.1833541989326477, + "learning_rate": 1.9971274255640654e-05, + "loss": 1.3333, + "step": 1786 + }, + { + "epoch": 0.5322511588078706, + "grad_norm": 0.16736625134944916, + "learning_rate": 1.9971201150772426e-05, + "loss": 1.345, + "step": 1787 + }, + { + "epoch": 0.5325490050075392, + "grad_norm": 0.16242630779743195, + "learning_rate": 1.9971127953133314e-05, + "loss": 1.3425, + "step": 1788 + }, + { + "epoch": 0.5328468512072079, + "grad_norm": 0.16493670642375946, + "learning_rate": 1.9971054662723996e-05, + "loss": 1.3522, + "step": 1789 + }, + { + "epoch": 0.5331446974068765, + "grad_norm": 0.17540118098258972, + "learning_rate": 1.997098127954515e-05, + "loss": 1.3214, + "step": 1790 + }, + { + "epoch": 0.5334425436065452, + "grad_norm": 0.19233962893486023, + "learning_rate": 1.997090780359747e-05, + "loss": 1.3451, + "step": 1791 + }, + { + "epoch": 0.5337403898062139, + "grad_norm": 0.16943323612213135, + "learning_rate": 1.9970834234881628e-05, + "loss": 1.3303, + "step": 1792 + }, + { + "epoch": 0.5340382360058825, + "grad_norm": 0.15818580985069275, + "learning_rate": 1.9970760573398316e-05, + "loss": 1.3337, + "step": 1793 + }, + { + "epoch": 0.5343360822055511, + "grad_norm": 0.17102812230587006, + "learning_rate": 1.9970686819148216e-05, + "loss": 1.3577, + "step": 1794 + }, + { + "epoch": 0.5346339284052197, + "grad_norm": 0.17434453964233398, + "learning_rate": 1.9970612972132017e-05, + "loss": 1.3541, + "step": 1795 + }, + { + "epoch": 0.5349317746048884, + "grad_norm": 0.16895638406276703, + "learning_rate": 1.99705390323504e-05, + "loss": 1.3363, + "step": 1796 + }, + { + "epoch": 0.535229620804557, + "grad_norm": 0.1675790697336197, + "learning_rate": 1.9970464999804063e-05, + "loss": 1.3452, + "step": 1797 + }, + { + "epoch": 0.5355274670042257, + "grad_norm": 0.1745259016752243, + "learning_rate": 1.9970390874493685e-05, + "loss": 1.3344, + "step": 1798 + }, + { + "epoch": 0.5358253132038944, + "grad_norm": 0.16959160566329956, + "learning_rate": 1.9970316656419958e-05, + "loss": 1.3404, + "step": 1799 + }, + { + "epoch": 0.536123159403563, + "grad_norm": 0.1652490347623825, + "learning_rate": 1.9970242345583573e-05, + "loss": 1.3326, + "step": 1800 + }, + { + "epoch": 0.5364210056032316, + "grad_norm": 0.15833565592765808, + "learning_rate": 1.997016794198523e-05, + "loss": 1.3373, + "step": 1801 + }, + { + "epoch": 0.5367188518029002, + "grad_norm": 0.1646917462348938, + "learning_rate": 1.9970093445625607e-05, + "loss": 1.3328, + "step": 1802 + }, + { + "epoch": 0.5370166980025689, + "grad_norm": 0.17969022691249847, + "learning_rate": 1.9970018856505408e-05, + "loss": 1.3521, + "step": 1803 + }, + { + "epoch": 0.5373145442022376, + "grad_norm": 0.16079260408878326, + "learning_rate": 1.996994417462532e-05, + "loss": 1.3534, + "step": 1804 + }, + { + "epoch": 0.5376123904019062, + "grad_norm": 0.16265247762203217, + "learning_rate": 1.9969869399986043e-05, + "loss": 1.3398, + "step": 1805 + }, + { + "epoch": 0.5379102366015749, + "grad_norm": 0.1635185331106186, + "learning_rate": 1.9969794532588266e-05, + "loss": 1.3457, + "step": 1806 + }, + { + "epoch": 0.5382080828012435, + "grad_norm": 0.16827929019927979, + "learning_rate": 1.996971957243269e-05, + "loss": 1.3486, + "step": 1807 + }, + { + "epoch": 0.5385059290009122, + "grad_norm": 0.15943558514118195, + "learning_rate": 1.9969644519520014e-05, + "loss": 1.3529, + "step": 1808 + }, + { + "epoch": 0.5388037752005808, + "grad_norm": 0.15358304977416992, + "learning_rate": 1.9969569373850935e-05, + "loss": 1.3428, + "step": 1809 + }, + { + "epoch": 0.5391016214002494, + "grad_norm": 0.1591339260339737, + "learning_rate": 1.9969494135426155e-05, + "loss": 1.3291, + "step": 1810 + }, + { + "epoch": 0.5393994675999181, + "grad_norm": 0.16745908558368683, + "learning_rate": 1.9969418804246367e-05, + "loss": 1.3446, + "step": 1811 + }, + { + "epoch": 0.5396973137995867, + "grad_norm": 0.17378199100494385, + "learning_rate": 1.996934338031227e-05, + "loss": 1.3346, + "step": 1812 + }, + { + "epoch": 0.5399951599992554, + "grad_norm": 0.16124950349330902, + "learning_rate": 1.996926786362458e-05, + "loss": 1.3564, + "step": 1813 + }, + { + "epoch": 0.5402930061989241, + "grad_norm": 0.17586320638656616, + "learning_rate": 1.996919225418399e-05, + "loss": 1.3647, + "step": 1814 + }, + { + "epoch": 0.5405908523985927, + "grad_norm": 0.1691247671842575, + "learning_rate": 1.9969116551991197e-05, + "loss": 1.3437, + "step": 1815 + }, + { + "epoch": 0.5408886985982613, + "grad_norm": 0.1737504005432129, + "learning_rate": 1.996904075704692e-05, + "loss": 1.3238, + "step": 1816 + }, + { + "epoch": 0.5411865447979299, + "grad_norm": 0.15709450840950012, + "learning_rate": 1.9968964869351855e-05, + "loss": 1.3419, + "step": 1817 + }, + { + "epoch": 0.5414843909975986, + "grad_norm": 0.16622452437877655, + "learning_rate": 1.9968888888906707e-05, + "loss": 1.3457, + "step": 1818 + }, + { + "epoch": 0.5417822371972673, + "grad_norm": 0.16880926489830017, + "learning_rate": 1.996881281571219e-05, + "loss": 1.3461, + "step": 1819 + }, + { + "epoch": 0.5420800833969359, + "grad_norm": 0.17418015003204346, + "learning_rate": 1.9968736649769005e-05, + "loss": 1.3623, + "step": 1820 + }, + { + "epoch": 0.5423779295966046, + "grad_norm": 0.17431408166885376, + "learning_rate": 1.9968660391077864e-05, + "loss": 1.3654, + "step": 1821 + }, + { + "epoch": 0.5426757757962732, + "grad_norm": 0.16442149877548218, + "learning_rate": 1.9968584039639475e-05, + "loss": 1.3441, + "step": 1822 + }, + { + "epoch": 0.5429736219959418, + "grad_norm": 0.17375442385673523, + "learning_rate": 1.996850759545455e-05, + "loss": 1.3502, + "step": 1823 + }, + { + "epoch": 0.5432714681956105, + "grad_norm": 0.1641933023929596, + "learning_rate": 1.99684310585238e-05, + "loss": 1.3461, + "step": 1824 + }, + { + "epoch": 0.5435693143952791, + "grad_norm": 0.1642332673072815, + "learning_rate": 1.9968354428847934e-05, + "loss": 1.3395, + "step": 1825 + }, + { + "epoch": 0.5438671605949478, + "grad_norm": 0.179438978433609, + "learning_rate": 1.9968277706427667e-05, + "loss": 1.3522, + "step": 1826 + }, + { + "epoch": 0.5441650067946164, + "grad_norm": 0.16452936828136444, + "learning_rate": 1.9968200891263717e-05, + "loss": 1.3381, + "step": 1827 + }, + { + "epoch": 0.5444628529942851, + "grad_norm": 0.16768696904182434, + "learning_rate": 1.9968123983356794e-05, + "loss": 1.3439, + "step": 1828 + }, + { + "epoch": 0.5447606991939538, + "grad_norm": 0.17807155847549438, + "learning_rate": 1.996804698270761e-05, + "loss": 1.3341, + "step": 1829 + }, + { + "epoch": 0.5450585453936224, + "grad_norm": 0.17662352323532104, + "learning_rate": 1.996796988931689e-05, + "loss": 1.3335, + "step": 1830 + }, + { + "epoch": 0.545356391593291, + "grad_norm": 0.16839559376239777, + "learning_rate": 1.9967892703185344e-05, + "loss": 1.3476, + "step": 1831 + }, + { + "epoch": 0.5456542377929596, + "grad_norm": 0.1740187257528305, + "learning_rate": 1.9967815424313697e-05, + "loss": 1.3451, + "step": 1832 + }, + { + "epoch": 0.5459520839926283, + "grad_norm": 0.1738300770521164, + "learning_rate": 1.9967738052702664e-05, + "loss": 1.3345, + "step": 1833 + }, + { + "epoch": 0.546249930192297, + "grad_norm": 0.17655032873153687, + "learning_rate": 1.996766058835296e-05, + "loss": 1.3256, + "step": 1834 + }, + { + "epoch": 0.5465477763919656, + "grad_norm": 0.16478820145130157, + "learning_rate": 1.9967583031265313e-05, + "loss": 1.348, + "step": 1835 + }, + { + "epoch": 0.5468456225916343, + "grad_norm": 0.16392803192138672, + "learning_rate": 1.9967505381440446e-05, + "loss": 1.3548, + "step": 1836 + }, + { + "epoch": 0.5471434687913029, + "grad_norm": 0.17150482535362244, + "learning_rate": 1.9967427638879075e-05, + "loss": 1.3404, + "step": 1837 + }, + { + "epoch": 0.5474413149909715, + "grad_norm": 0.17070432007312775, + "learning_rate": 1.9967349803581924e-05, + "loss": 1.3376, + "step": 1838 + }, + { + "epoch": 0.5477391611906401, + "grad_norm": 0.16423381865024567, + "learning_rate": 1.9967271875549726e-05, + "loss": 1.3388, + "step": 1839 + }, + { + "epoch": 0.5480370073903088, + "grad_norm": 0.16895762085914612, + "learning_rate": 1.9967193854783192e-05, + "loss": 1.3328, + "step": 1840 + }, + { + "epoch": 0.5483348535899775, + "grad_norm": 0.16752618551254272, + "learning_rate": 1.9967115741283057e-05, + "loss": 1.347, + "step": 1841 + }, + { + "epoch": 0.5486326997896461, + "grad_norm": 0.16780757904052734, + "learning_rate": 1.996703753505005e-05, + "loss": 1.329, + "step": 1842 + }, + { + "epoch": 0.5489305459893148, + "grad_norm": 0.16428565979003906, + "learning_rate": 1.996695923608489e-05, + "loss": 1.346, + "step": 1843 + }, + { + "epoch": 0.5492283921889834, + "grad_norm": 0.172488272190094, + "learning_rate": 1.996688084438831e-05, + "loss": 1.3474, + "step": 1844 + }, + { + "epoch": 0.5495262383886521, + "grad_norm": 0.16258852183818817, + "learning_rate": 1.9966802359961042e-05, + "loss": 1.3384, + "step": 1845 + }, + { + "epoch": 0.5498240845883207, + "grad_norm": 0.16376498341560364, + "learning_rate": 1.996672378280381e-05, + "loss": 1.3475, + "step": 1846 + }, + { + "epoch": 0.5501219307879893, + "grad_norm": 0.1541081815958023, + "learning_rate": 1.996664511291735e-05, + "loss": 1.35, + "step": 1847 + }, + { + "epoch": 0.550419776987658, + "grad_norm": 0.16525718569755554, + "learning_rate": 1.9966566350302398e-05, + "loss": 1.3377, + "step": 1848 + }, + { + "epoch": 0.5507176231873266, + "grad_norm": 0.16358032822608948, + "learning_rate": 1.9966487494959678e-05, + "loss": 1.3246, + "step": 1849 + }, + { + "epoch": 0.5510154693869953, + "grad_norm": 0.15843936800956726, + "learning_rate": 1.9966408546889924e-05, + "loss": 1.3415, + "step": 1850 + }, + { + "epoch": 0.551313315586664, + "grad_norm": 0.1719946563243866, + "learning_rate": 1.9966329506093876e-05, + "loss": 1.3264, + "step": 1851 + }, + { + "epoch": 0.5516111617863326, + "grad_norm": 0.1743927001953125, + "learning_rate": 1.9966250372572265e-05, + "loss": 1.3387, + "step": 1852 + }, + { + "epoch": 0.5519090079860012, + "grad_norm": 0.17628605663776398, + "learning_rate": 1.9966171146325832e-05, + "loss": 1.3414, + "step": 1853 + }, + { + "epoch": 0.5522068541856698, + "grad_norm": 0.16940726339817047, + "learning_rate": 1.9966091827355312e-05, + "loss": 1.3334, + "step": 1854 + }, + { + "epoch": 0.5525047003853385, + "grad_norm": 0.16338200867176056, + "learning_rate": 1.9966012415661444e-05, + "loss": 1.3202, + "step": 1855 + }, + { + "epoch": 0.5528025465850072, + "grad_norm": 0.15881620347499847, + "learning_rate": 1.996593291124496e-05, + "loss": 1.342, + "step": 1856 + }, + { + "epoch": 0.5531003927846758, + "grad_norm": 0.16228406131267548, + "learning_rate": 1.9965853314106606e-05, + "loss": 1.3443, + "step": 1857 + }, + { + "epoch": 0.5533982389843445, + "grad_norm": 0.16766801476478577, + "learning_rate": 1.996577362424712e-05, + "loss": 1.3513, + "step": 1858 + }, + { + "epoch": 0.5536960851840131, + "grad_norm": 0.16810661554336548, + "learning_rate": 1.996569384166725e-05, + "loss": 1.3309, + "step": 1859 + }, + { + "epoch": 0.5539939313836818, + "grad_norm": 0.1693679839372635, + "learning_rate": 1.9965613966367726e-05, + "loss": 1.3379, + "step": 1860 + }, + { + "epoch": 0.5542917775833504, + "grad_norm": 0.15733188390731812, + "learning_rate": 1.9965533998349306e-05, + "loss": 1.3275, + "step": 1861 + }, + { + "epoch": 0.554589623783019, + "grad_norm": 0.16854235529899597, + "learning_rate": 1.996545393761272e-05, + "loss": 1.351, + "step": 1862 + }, + { + "epoch": 0.5548874699826877, + "grad_norm": 0.16890855133533478, + "learning_rate": 1.9965373784158725e-05, + "loss": 1.3444, + "step": 1863 + }, + { + "epoch": 0.5551853161823563, + "grad_norm": 0.17377708852291107, + "learning_rate": 1.9965293537988058e-05, + "loss": 1.3455, + "step": 1864 + }, + { + "epoch": 0.555483162382025, + "grad_norm": 0.17160733044147491, + "learning_rate": 1.9965213199101466e-05, + "loss": 1.3326, + "step": 1865 + }, + { + "epoch": 0.5557810085816937, + "grad_norm": 0.17008385062217712, + "learning_rate": 1.99651327674997e-05, + "loss": 1.3565, + "step": 1866 + }, + { + "epoch": 0.5560788547813623, + "grad_norm": 0.16323962807655334, + "learning_rate": 1.996505224318351e-05, + "loss": 1.3564, + "step": 1867 + }, + { + "epoch": 0.5563767009810309, + "grad_norm": 0.16453659534454346, + "learning_rate": 1.9964971626153643e-05, + "loss": 1.3408, + "step": 1868 + }, + { + "epoch": 0.5566745471806995, + "grad_norm": 0.17312097549438477, + "learning_rate": 1.9964890916410846e-05, + "loss": 1.3462, + "step": 1869 + }, + { + "epoch": 0.5569723933803682, + "grad_norm": 0.17335082590579987, + "learning_rate": 1.9964810113955872e-05, + "loss": 1.3358, + "step": 1870 + }, + { + "epoch": 0.5572702395800369, + "grad_norm": 0.17735369503498077, + "learning_rate": 1.9964729218789472e-05, + "loss": 1.3436, + "step": 1871 + }, + { + "epoch": 0.5575680857797055, + "grad_norm": 0.17080743610858917, + "learning_rate": 1.9964648230912406e-05, + "loss": 1.3452, + "step": 1872 + }, + { + "epoch": 0.5578659319793742, + "grad_norm": 0.17747481167316437, + "learning_rate": 1.9964567150325416e-05, + "loss": 1.3336, + "step": 1873 + }, + { + "epoch": 0.5581637781790428, + "grad_norm": 0.1720564216375351, + "learning_rate": 1.9964485977029263e-05, + "loss": 1.33, + "step": 1874 + }, + { + "epoch": 0.5584616243787115, + "grad_norm": 0.16504305601119995, + "learning_rate": 1.9964404711024703e-05, + "loss": 1.348, + "step": 1875 + }, + { + "epoch": 0.55875947057838, + "grad_norm": 0.1772458851337433, + "learning_rate": 1.9964323352312486e-05, + "loss": 1.3474, + "step": 1876 + }, + { + "epoch": 0.5590573167780487, + "grad_norm": 0.1816224902868271, + "learning_rate": 1.9964241900893377e-05, + "loss": 1.3217, + "step": 1877 + }, + { + "epoch": 0.5593551629777174, + "grad_norm": 0.1705823540687561, + "learning_rate": 1.9964160356768128e-05, + "loss": 1.3346, + "step": 1878 + }, + { + "epoch": 0.559653009177386, + "grad_norm": 0.1707306206226349, + "learning_rate": 1.9964078719937497e-05, + "loss": 1.3369, + "step": 1879 + }, + { + "epoch": 0.5599508553770547, + "grad_norm": 0.17467527091503143, + "learning_rate": 1.9963996990402252e-05, + "loss": 1.3468, + "step": 1880 + }, + { + "epoch": 0.5602487015767234, + "grad_norm": 0.17778559029102325, + "learning_rate": 1.9963915168163143e-05, + "loss": 1.3525, + "step": 1881 + }, + { + "epoch": 0.560546547776392, + "grad_norm": 0.17405925691127777, + "learning_rate": 1.9963833253220937e-05, + "loss": 1.3235, + "step": 1882 + }, + { + "epoch": 0.5608443939760606, + "grad_norm": 0.1665666401386261, + "learning_rate": 1.9963751245576396e-05, + "loss": 1.3342, + "step": 1883 + }, + { + "epoch": 0.5611422401757292, + "grad_norm": 0.16782276332378387, + "learning_rate": 1.996366914523028e-05, + "loss": 1.3436, + "step": 1884 + }, + { + "epoch": 0.5614400863753979, + "grad_norm": 0.16133250296115875, + "learning_rate": 1.9963586952183355e-05, + "loss": 1.3572, + "step": 1885 + }, + { + "epoch": 0.5617379325750665, + "grad_norm": 0.17694924771785736, + "learning_rate": 1.9963504666436386e-05, + "loss": 1.3491, + "step": 1886 + }, + { + "epoch": 0.5620357787747352, + "grad_norm": 0.17527422308921814, + "learning_rate": 1.9963422287990134e-05, + "loss": 1.3376, + "step": 1887 + }, + { + "epoch": 0.5623336249744039, + "grad_norm": 0.17014721035957336, + "learning_rate": 1.9963339816845377e-05, + "loss": 1.3289, + "step": 1888 + }, + { + "epoch": 0.5626314711740725, + "grad_norm": 0.164009228348732, + "learning_rate": 1.9963257253002868e-05, + "loss": 1.3403, + "step": 1889 + }, + { + "epoch": 0.5629293173737412, + "grad_norm": 0.2767203450202942, + "learning_rate": 1.9963174596463387e-05, + "loss": 1.3266, + "step": 1890 + }, + { + "epoch": 0.5632271635734097, + "grad_norm": 0.17292539775371552, + "learning_rate": 1.9963091847227694e-05, + "loss": 1.3648, + "step": 1891 + }, + { + "epoch": 0.5635250097730784, + "grad_norm": 0.17129047214984894, + "learning_rate": 1.9963009005296565e-05, + "loss": 1.335, + "step": 1892 + }, + { + "epoch": 0.5638228559727471, + "grad_norm": 0.16882653534412384, + "learning_rate": 1.9962926070670767e-05, + "loss": 1.3381, + "step": 1893 + }, + { + "epoch": 0.5641207021724157, + "grad_norm": 0.17744049429893494, + "learning_rate": 1.996284304335107e-05, + "loss": 1.3374, + "step": 1894 + }, + { + "epoch": 0.5644185483720844, + "grad_norm": 0.17141692340373993, + "learning_rate": 1.996275992333826e-05, + "loss": 1.3348, + "step": 1895 + }, + { + "epoch": 0.564716394571753, + "grad_norm": 0.16697026789188385, + "learning_rate": 1.9962676710633093e-05, + "loss": 1.332, + "step": 1896 + }, + { + "epoch": 0.5650142407714217, + "grad_norm": 0.1682598888874054, + "learning_rate": 1.996259340523635e-05, + "loss": 1.3302, + "step": 1897 + }, + { + "epoch": 0.5653120869710903, + "grad_norm": 0.1695362627506256, + "learning_rate": 1.9962510007148807e-05, + "loss": 1.3294, + "step": 1898 + }, + { + "epoch": 0.5656099331707589, + "grad_norm": 0.16629473865032196, + "learning_rate": 1.9962426516371236e-05, + "loss": 1.3422, + "step": 1899 + }, + { + "epoch": 0.5659077793704276, + "grad_norm": 0.16506071388721466, + "learning_rate": 1.996234293290442e-05, + "loss": 1.3388, + "step": 1900 + }, + { + "epoch": 0.5662056255700962, + "grad_norm": 0.17205072939395905, + "learning_rate": 1.9962259256749135e-05, + "loss": 1.3316, + "step": 1901 + }, + { + "epoch": 0.5665034717697649, + "grad_norm": 0.16647745668888092, + "learning_rate": 1.9962175487906155e-05, + "loss": 1.3259, + "step": 1902 + }, + { + "epoch": 0.5668013179694336, + "grad_norm": 0.16938139498233795, + "learning_rate": 1.9962091626376265e-05, + "loss": 1.3318, + "step": 1903 + }, + { + "epoch": 0.5670991641691022, + "grad_norm": 0.17617148160934448, + "learning_rate": 1.9962007672160243e-05, + "loss": 1.3444, + "step": 1904 + }, + { + "epoch": 0.5673970103687708, + "grad_norm": 0.1727476865053177, + "learning_rate": 1.9961923625258867e-05, + "loss": 1.35, + "step": 1905 + }, + { + "epoch": 0.5676948565684394, + "grad_norm": 0.16883544623851776, + "learning_rate": 1.9961839485672923e-05, + "loss": 1.342, + "step": 1906 + }, + { + "epoch": 0.5679927027681081, + "grad_norm": 0.17220336198806763, + "learning_rate": 1.9961755253403194e-05, + "loss": 1.334, + "step": 1907 + }, + { + "epoch": 0.5682905489677768, + "grad_norm": 0.1599874645471573, + "learning_rate": 1.996167092845046e-05, + "loss": 1.339, + "step": 1908 + }, + { + "epoch": 0.5685883951674454, + "grad_norm": 0.16740910708904266, + "learning_rate": 1.9961586510815508e-05, + "loss": 1.3516, + "step": 1909 + }, + { + "epoch": 0.5688862413671141, + "grad_norm": 0.181361585855484, + "learning_rate": 1.9961502000499127e-05, + "loss": 1.3496, + "step": 1910 + }, + { + "epoch": 0.5691840875667827, + "grad_norm": 0.17191778123378754, + "learning_rate": 1.9961417397502098e-05, + "loss": 1.3273, + "step": 1911 + }, + { + "epoch": 0.5694819337664514, + "grad_norm": 0.1787191778421402, + "learning_rate": 1.9961332701825207e-05, + "loss": 1.3398, + "step": 1912 + }, + { + "epoch": 0.56977977996612, + "grad_norm": 0.1790206879377365, + "learning_rate": 1.9961247913469244e-05, + "loss": 1.343, + "step": 1913 + }, + { + "epoch": 0.5700776261657886, + "grad_norm": 0.17674620449543, + "learning_rate": 1.9961163032435006e-05, + "loss": 1.331, + "step": 1914 + }, + { + "epoch": 0.5703754723654573, + "grad_norm": 0.17626053094863892, + "learning_rate": 1.9961078058723267e-05, + "loss": 1.336, + "step": 1915 + }, + { + "epoch": 0.5706733185651259, + "grad_norm": 0.16902464628219604, + "learning_rate": 1.9960992992334828e-05, + "loss": 1.3333, + "step": 1916 + }, + { + "epoch": 0.5709711647647946, + "grad_norm": 0.16353672742843628, + "learning_rate": 1.996090783327048e-05, + "loss": 1.3385, + "step": 1917 + }, + { + "epoch": 0.5712690109644633, + "grad_norm": 0.16979794204235077, + "learning_rate": 1.9960822581531013e-05, + "loss": 1.3418, + "step": 1918 + }, + { + "epoch": 0.5715668571641319, + "grad_norm": 0.17273689806461334, + "learning_rate": 1.996073723711722e-05, + "loss": 1.34, + "step": 1919 + }, + { + "epoch": 0.5718647033638005, + "grad_norm": 0.17751668393611908, + "learning_rate": 1.99606518000299e-05, + "loss": 1.3294, + "step": 1920 + }, + { + "epoch": 0.5721625495634691, + "grad_norm": 0.1674806773662567, + "learning_rate": 1.9960566270269837e-05, + "loss": 1.3532, + "step": 1921 + }, + { + "epoch": 0.5724603957631378, + "grad_norm": 0.17393875122070312, + "learning_rate": 1.9960480647837837e-05, + "loss": 1.3477, + "step": 1922 + }, + { + "epoch": 0.5727582419628064, + "grad_norm": 0.17875970900058746, + "learning_rate": 1.9960394932734694e-05, + "loss": 1.3456, + "step": 1923 + }, + { + "epoch": 0.5730560881624751, + "grad_norm": 0.17763720452785492, + "learning_rate": 1.9960309124961203e-05, + "loss": 1.3415, + "step": 1924 + }, + { + "epoch": 0.5733539343621438, + "grad_norm": 0.17421069741249084, + "learning_rate": 1.9960223224518163e-05, + "loss": 1.3462, + "step": 1925 + }, + { + "epoch": 0.5736517805618124, + "grad_norm": 0.17774200439453125, + "learning_rate": 1.9960137231406372e-05, + "loss": 1.3272, + "step": 1926 + }, + { + "epoch": 0.5739496267614811, + "grad_norm": 0.1704302579164505, + "learning_rate": 1.996005114562664e-05, + "loss": 1.3304, + "step": 1927 + }, + { + "epoch": 0.5742474729611496, + "grad_norm": 0.17656168341636658, + "learning_rate": 1.9959964967179753e-05, + "loss": 1.3452, + "step": 1928 + }, + { + "epoch": 0.5745453191608183, + "grad_norm": 0.1732204705476761, + "learning_rate": 1.995987869606652e-05, + "loss": 1.3266, + "step": 1929 + }, + { + "epoch": 0.574843165360487, + "grad_norm": 0.17743700742721558, + "learning_rate": 1.995979233228775e-05, + "loss": 1.3331, + "step": 1930 + }, + { + "epoch": 0.5751410115601556, + "grad_norm": 0.18953180313110352, + "learning_rate": 1.9959705875844233e-05, + "loss": 1.3649, + "step": 1931 + }, + { + "epoch": 0.5754388577598243, + "grad_norm": 0.161458820104599, + "learning_rate": 1.9959619326736786e-05, + "loss": 1.339, + "step": 1932 + }, + { + "epoch": 0.575736703959493, + "grad_norm": 0.16367121040821075, + "learning_rate": 1.9959532684966205e-05, + "loss": 1.3535, + "step": 1933 + }, + { + "epoch": 0.5760345501591616, + "grad_norm": 0.1716615855693817, + "learning_rate": 1.99594459505333e-05, + "loss": 1.3387, + "step": 1934 + }, + { + "epoch": 0.5763323963588302, + "grad_norm": 0.16995373368263245, + "learning_rate": 1.995935912343888e-05, + "loss": 1.3326, + "step": 1935 + }, + { + "epoch": 0.5766302425584988, + "grad_norm": 0.1661783903837204, + "learning_rate": 1.9959272203683747e-05, + "loss": 1.3244, + "step": 1936 + }, + { + "epoch": 0.5769280887581675, + "grad_norm": 0.17309847474098206, + "learning_rate": 1.995918519126872e-05, + "loss": 1.3429, + "step": 1937 + }, + { + "epoch": 0.5772259349578361, + "grad_norm": 0.18650607764720917, + "learning_rate": 1.9959098086194596e-05, + "loss": 1.349, + "step": 1938 + }, + { + "epoch": 0.5775237811575048, + "grad_norm": 0.1782013326883316, + "learning_rate": 1.9959010888462193e-05, + "loss": 1.3412, + "step": 1939 + }, + { + "epoch": 0.5778216273571735, + "grad_norm": 0.17299634218215942, + "learning_rate": 1.9958923598072318e-05, + "loss": 1.3142, + "step": 1940 + }, + { + "epoch": 0.5781194735568421, + "grad_norm": 0.17703306674957275, + "learning_rate": 1.995883621502579e-05, + "loss": 1.3403, + "step": 1941 + }, + { + "epoch": 0.5784173197565108, + "grad_norm": 0.16884054243564606, + "learning_rate": 1.9958748739323415e-05, + "loss": 1.3194, + "step": 1942 + }, + { + "epoch": 0.5787151659561793, + "grad_norm": 0.18963994085788727, + "learning_rate": 1.995866117096601e-05, + "loss": 1.346, + "step": 1943 + }, + { + "epoch": 0.579013012155848, + "grad_norm": 0.18116572499275208, + "learning_rate": 1.9958573509954392e-05, + "loss": 1.3631, + "step": 1944 + }, + { + "epoch": 0.5793108583555167, + "grad_norm": 0.17018160223960876, + "learning_rate": 1.995848575628937e-05, + "loss": 1.342, + "step": 1945 + }, + { + "epoch": 0.5796087045551853, + "grad_norm": 0.1706002652645111, + "learning_rate": 1.9958397909971765e-05, + "loss": 1.3306, + "step": 1946 + }, + { + "epoch": 0.579906550754854, + "grad_norm": 0.16390784084796906, + "learning_rate": 1.9958309971002395e-05, + "loss": 1.3362, + "step": 1947 + }, + { + "epoch": 0.5802043969545226, + "grad_norm": 0.16863805055618286, + "learning_rate": 1.9958221939382075e-05, + "loss": 1.3433, + "step": 1948 + }, + { + "epoch": 0.5805022431541913, + "grad_norm": 0.16780760884284973, + "learning_rate": 1.9958133815111628e-05, + "loss": 1.3519, + "step": 1949 + }, + { + "epoch": 0.5808000893538598, + "grad_norm": 0.180782750248909, + "learning_rate": 1.995804559819187e-05, + "loss": 1.3356, + "step": 1950 + }, + { + "epoch": 0.5810979355535285, + "grad_norm": 0.1618662178516388, + "learning_rate": 1.9957957288623624e-05, + "loss": 1.3366, + "step": 1951 + }, + { + "epoch": 0.5813957817531972, + "grad_norm": 0.16837403178215027, + "learning_rate": 1.995786888640771e-05, + "loss": 1.3397, + "step": 1952 + }, + { + "epoch": 0.5816936279528658, + "grad_norm": 0.19062912464141846, + "learning_rate": 1.9957780391544953e-05, + "loss": 1.3337, + "step": 1953 + }, + { + "epoch": 0.5819914741525345, + "grad_norm": 0.174716517329216, + "learning_rate": 1.995769180403618e-05, + "loss": 1.3388, + "step": 1954 + }, + { + "epoch": 0.5822893203522032, + "grad_norm": 0.1641959249973297, + "learning_rate": 1.9957603123882202e-05, + "loss": 1.3308, + "step": 1955 + }, + { + "epoch": 0.5825871665518718, + "grad_norm": 0.16572526097297668, + "learning_rate": 1.9957514351083855e-05, + "loss": 1.3372, + "step": 1956 + }, + { + "epoch": 0.5828850127515405, + "grad_norm": 0.16768459975719452, + "learning_rate": 1.9957425485641964e-05, + "loss": 1.3277, + "step": 1957 + }, + { + "epoch": 0.583182858951209, + "grad_norm": 0.17460903525352478, + "learning_rate": 1.995733652755735e-05, + "loss": 1.3393, + "step": 1958 + }, + { + "epoch": 0.5834807051508777, + "grad_norm": 0.1763729304075241, + "learning_rate": 1.9957247476830846e-05, + "loss": 1.3447, + "step": 1959 + }, + { + "epoch": 0.5837785513505463, + "grad_norm": 0.1680125743150711, + "learning_rate": 1.9957158333463283e-05, + "loss": 1.3445, + "step": 1960 + }, + { + "epoch": 0.584076397550215, + "grad_norm": 0.1667768508195877, + "learning_rate": 1.9957069097455482e-05, + "loss": 1.3216, + "step": 1961 + }, + { + "epoch": 0.5843742437498837, + "grad_norm": 0.17902809381484985, + "learning_rate": 1.995697976880828e-05, + "loss": 1.3481, + "step": 1962 + }, + { + "epoch": 0.5846720899495523, + "grad_norm": 0.17711710929870605, + "learning_rate": 1.9956890347522505e-05, + "loss": 1.3173, + "step": 1963 + }, + { + "epoch": 0.584969936149221, + "grad_norm": 0.17662787437438965, + "learning_rate": 1.995680083359899e-05, + "loss": 1.3389, + "step": 1964 + }, + { + "epoch": 0.5852677823488895, + "grad_norm": 0.16423983871936798, + "learning_rate": 1.9956711227038567e-05, + "loss": 1.3275, + "step": 1965 + }, + { + "epoch": 0.5855656285485582, + "grad_norm": 0.17329730093479156, + "learning_rate": 1.995662152784207e-05, + "loss": 1.3334, + "step": 1966 + }, + { + "epoch": 0.5858634747482269, + "grad_norm": 0.18660931289196014, + "learning_rate": 1.9956531736010336e-05, + "loss": 1.3476, + "step": 1967 + }, + { + "epoch": 0.5861613209478955, + "grad_norm": 0.18202678859233856, + "learning_rate": 1.9956441851544197e-05, + "loss": 1.3495, + "step": 1968 + }, + { + "epoch": 0.5864591671475642, + "grad_norm": 0.17551197111606598, + "learning_rate": 1.9956351874444492e-05, + "loss": 1.3353, + "step": 1969 + }, + { + "epoch": 0.5867570133472328, + "grad_norm": 0.1738758683204651, + "learning_rate": 1.9956261804712055e-05, + "loss": 1.3453, + "step": 1970 + }, + { + "epoch": 0.5870548595469015, + "grad_norm": 0.1726599484682083, + "learning_rate": 1.9956171642347725e-05, + "loss": 1.325, + "step": 1971 + }, + { + "epoch": 0.5873527057465702, + "grad_norm": 0.1872359812259674, + "learning_rate": 1.9956081387352343e-05, + "loss": 1.3345, + "step": 1972 + }, + { + "epoch": 0.5876505519462387, + "grad_norm": 0.1731291115283966, + "learning_rate": 1.9955991039726745e-05, + "loss": 1.3277, + "step": 1973 + }, + { + "epoch": 0.5879483981459074, + "grad_norm": 0.16684368252754211, + "learning_rate": 1.9955900599471776e-05, + "loss": 1.3357, + "step": 1974 + }, + { + "epoch": 0.588246244345576, + "grad_norm": 0.17079827189445496, + "learning_rate": 1.9955810066588276e-05, + "loss": 1.3239, + "step": 1975 + }, + { + "epoch": 0.5885440905452447, + "grad_norm": 0.16931898891925812, + "learning_rate": 1.9955719441077088e-05, + "loss": 1.316, + "step": 1976 + }, + { + "epoch": 0.5888419367449134, + "grad_norm": 0.16645090281963348, + "learning_rate": 1.995562872293905e-05, + "loss": 1.3357, + "step": 1977 + }, + { + "epoch": 0.589139782944582, + "grad_norm": 0.18683874607086182, + "learning_rate": 1.9955537912175012e-05, + "loss": 1.3299, + "step": 1978 + }, + { + "epoch": 0.5894376291442507, + "grad_norm": 0.1716485619544983, + "learning_rate": 1.9955447008785813e-05, + "loss": 1.3384, + "step": 1979 + }, + { + "epoch": 0.5897354753439192, + "grad_norm": 0.17715637385845184, + "learning_rate": 1.9955356012772307e-05, + "loss": 1.3377, + "step": 1980 + }, + { + "epoch": 0.5900333215435879, + "grad_norm": 0.1782515048980713, + "learning_rate": 1.9955264924135334e-05, + "loss": 1.3405, + "step": 1981 + }, + { + "epoch": 0.5903311677432566, + "grad_norm": 0.1820431649684906, + "learning_rate": 1.9955173742875743e-05, + "loss": 1.3211, + "step": 1982 + }, + { + "epoch": 0.5906290139429252, + "grad_norm": 0.18172216415405273, + "learning_rate": 1.9955082468994383e-05, + "loss": 1.3298, + "step": 1983 + }, + { + "epoch": 0.5909268601425939, + "grad_norm": 0.17631405591964722, + "learning_rate": 1.9954991102492108e-05, + "loss": 1.322, + "step": 1984 + }, + { + "epoch": 0.5912247063422625, + "grad_norm": 0.17888231575489044, + "learning_rate": 1.995489964336976e-05, + "loss": 1.322, + "step": 1985 + }, + { + "epoch": 0.5915225525419312, + "grad_norm": 0.16786961257457733, + "learning_rate": 1.995480809162819e-05, + "loss": 1.3271, + "step": 1986 + }, + { + "epoch": 0.5918203987415998, + "grad_norm": 0.18634092807769775, + "learning_rate": 1.9954716447268258e-05, + "loss": 1.3532, + "step": 1987 + }, + { + "epoch": 0.5921182449412684, + "grad_norm": 0.1904766708612442, + "learning_rate": 1.9954624710290807e-05, + "loss": 1.339, + "step": 1988 + }, + { + "epoch": 0.5924160911409371, + "grad_norm": 0.17535746097564697, + "learning_rate": 1.9954532880696694e-05, + "loss": 1.3245, + "step": 1989 + }, + { + "epoch": 0.5927139373406057, + "grad_norm": 0.17284177243709564, + "learning_rate": 1.9954440958486782e-05, + "loss": 1.3425, + "step": 1990 + }, + { + "epoch": 0.5930117835402744, + "grad_norm": 0.1806136667728424, + "learning_rate": 1.995434894366191e-05, + "loss": 1.3235, + "step": 1991 + }, + { + "epoch": 0.5933096297399431, + "grad_norm": 0.18574415147304535, + "learning_rate": 1.9954256836222946e-05, + "loss": 1.3452, + "step": 1992 + }, + { + "epoch": 0.5936074759396117, + "grad_norm": 0.17696474492549896, + "learning_rate": 1.9954164636170747e-05, + "loss": 1.336, + "step": 1993 + }, + { + "epoch": 0.5939053221392804, + "grad_norm": 0.18362903594970703, + "learning_rate": 1.9954072343506164e-05, + "loss": 1.349, + "step": 1994 + }, + { + "epoch": 0.5942031683389489, + "grad_norm": 0.166317418217659, + "learning_rate": 1.9953979958230062e-05, + "loss": 1.3317, + "step": 1995 + }, + { + "epoch": 0.5945010145386176, + "grad_norm": 0.17155015468597412, + "learning_rate": 1.9953887480343294e-05, + "loss": 1.3411, + "step": 1996 + }, + { + "epoch": 0.5947988607382863, + "grad_norm": 0.1783314049243927, + "learning_rate": 1.9953794909846724e-05, + "loss": 1.3349, + "step": 1997 + }, + { + "epoch": 0.5950967069379549, + "grad_norm": 0.17705361545085907, + "learning_rate": 1.9953702246741216e-05, + "loss": 1.3563, + "step": 1998 + }, + { + "epoch": 0.5953945531376236, + "grad_norm": 0.16366109251976013, + "learning_rate": 1.995360949102763e-05, + "loss": 1.3435, + "step": 1999 + }, + { + "epoch": 0.5956923993372922, + "grad_norm": 0.1686832308769226, + "learning_rate": 1.9953516642706827e-05, + "loss": 1.3653, + "step": 2000 + }, + { + "epoch": 0.5956923993372922, + "eval_loss": 1.3709481954574585, + "eval_runtime": 20.4305, + "eval_samples_per_second": 84.873, + "eval_steps_per_second": 5.335, + "step": 2000 + }, + { + "epoch": 0.5959902455369609, + "grad_norm": 0.17259104549884796, + "learning_rate": 1.995342370177967e-05, + "loss": 1.3281, + "step": 2001 + }, + { + "epoch": 0.5962880917366294, + "grad_norm": 0.17897668480873108, + "learning_rate": 1.995333066824703e-05, + "loss": 1.3326, + "step": 2002 + }, + { + "epoch": 0.5965859379362981, + "grad_norm": 0.1779920607805252, + "learning_rate": 1.995323754210977e-05, + "loss": 1.3484, + "step": 2003 + }, + { + "epoch": 0.5968837841359668, + "grad_norm": 0.16980735957622528, + "learning_rate": 1.995314432336875e-05, + "loss": 1.3208, + "step": 2004 + }, + { + "epoch": 0.5971816303356354, + "grad_norm": 0.16974031925201416, + "learning_rate": 1.9953051012024845e-05, + "loss": 1.3133, + "step": 2005 + }, + { + "epoch": 0.5974794765353041, + "grad_norm": 0.17339594662189484, + "learning_rate": 1.995295760807892e-05, + "loss": 1.3286, + "step": 2006 + }, + { + "epoch": 0.5977773227349727, + "grad_norm": 0.17947392165660858, + "learning_rate": 1.9952864111531845e-05, + "loss": 1.3467, + "step": 2007 + }, + { + "epoch": 0.5980751689346414, + "grad_norm": 0.1726466864347458, + "learning_rate": 1.995277052238449e-05, + "loss": 1.3257, + "step": 2008 + }, + { + "epoch": 0.5983730151343101, + "grad_norm": 0.17698267102241516, + "learning_rate": 1.9952676840637724e-05, + "loss": 1.3587, + "step": 2009 + }, + { + "epoch": 0.5986708613339786, + "grad_norm": 0.17208711802959442, + "learning_rate": 1.995258306629242e-05, + "loss": 1.3321, + "step": 2010 + }, + { + "epoch": 0.5989687075336473, + "grad_norm": 0.16823822259902954, + "learning_rate": 1.995248919934945e-05, + "loss": 1.3332, + "step": 2011 + }, + { + "epoch": 0.5992665537333159, + "grad_norm": 0.1744266003370285, + "learning_rate": 1.9952395239809686e-05, + "loss": 1.3384, + "step": 2012 + }, + { + "epoch": 0.5995643999329846, + "grad_norm": 0.1792793571949005, + "learning_rate": 1.995230118767401e-05, + "loss": 1.34, + "step": 2013 + }, + { + "epoch": 0.5998622461326533, + "grad_norm": 0.17216219007968903, + "learning_rate": 1.9952207042943287e-05, + "loss": 1.318, + "step": 2014 + }, + { + "epoch": 0.6001600923323219, + "grad_norm": 0.1686355471611023, + "learning_rate": 1.9952112805618394e-05, + "loss": 1.3177, + "step": 2015 + }, + { + "epoch": 0.6004579385319906, + "grad_norm": 0.17162348330020905, + "learning_rate": 1.9952018475700212e-05, + "loss": 1.3229, + "step": 2016 + }, + { + "epoch": 0.6007557847316591, + "grad_norm": 0.17360536754131317, + "learning_rate": 1.995192405318962e-05, + "loss": 1.3429, + "step": 2017 + }, + { + "epoch": 0.6010536309313278, + "grad_norm": 0.18095779418945312, + "learning_rate": 1.9951829538087492e-05, + "loss": 1.3338, + "step": 2018 + }, + { + "epoch": 0.6013514771309965, + "grad_norm": 0.17286114394664764, + "learning_rate": 1.9951734930394705e-05, + "loss": 1.3404, + "step": 2019 + }, + { + "epoch": 0.6016493233306651, + "grad_norm": 0.16724084317684174, + "learning_rate": 1.9951640230112146e-05, + "loss": 1.3318, + "step": 2020 + }, + { + "epoch": 0.6019471695303338, + "grad_norm": 0.1806597113609314, + "learning_rate": 1.9951545437240698e-05, + "loss": 1.3367, + "step": 2021 + }, + { + "epoch": 0.6022450157300024, + "grad_norm": 0.18451228737831116, + "learning_rate": 1.9951450551781236e-05, + "loss": 1.3536, + "step": 2022 + }, + { + "epoch": 0.6025428619296711, + "grad_norm": 0.1788858026266098, + "learning_rate": 1.9951355573734643e-05, + "loss": 1.3365, + "step": 2023 + }, + { + "epoch": 0.6028407081293398, + "grad_norm": 0.1803123652935028, + "learning_rate": 1.9951260503101803e-05, + "loss": 1.3414, + "step": 2024 + }, + { + "epoch": 0.6031385543290083, + "grad_norm": 0.18506979942321777, + "learning_rate": 1.9951165339883606e-05, + "loss": 1.3226, + "step": 2025 + }, + { + "epoch": 0.603436400528677, + "grad_norm": 0.17891237139701843, + "learning_rate": 1.995107008408093e-05, + "loss": 1.3415, + "step": 2026 + }, + { + "epoch": 0.6037342467283456, + "grad_norm": 0.18189607560634613, + "learning_rate": 1.995097473569467e-05, + "loss": 1.3446, + "step": 2027 + }, + { + "epoch": 0.6040320929280143, + "grad_norm": 0.1696876734495163, + "learning_rate": 1.9950879294725702e-05, + "loss": 1.3488, + "step": 2028 + }, + { + "epoch": 0.604329939127683, + "grad_norm": 0.18890951573848724, + "learning_rate": 1.9950783761174922e-05, + "loss": 1.3356, + "step": 2029 + }, + { + "epoch": 0.6046277853273516, + "grad_norm": 0.18418404459953308, + "learning_rate": 1.9950688135043217e-05, + "loss": 1.3331, + "step": 2030 + }, + { + "epoch": 0.6049256315270203, + "grad_norm": 0.17907729744911194, + "learning_rate": 1.995059241633148e-05, + "loss": 1.3351, + "step": 2031 + }, + { + "epoch": 0.6052234777266888, + "grad_norm": 0.18905779719352722, + "learning_rate": 1.9950496605040595e-05, + "loss": 1.3254, + "step": 2032 + }, + { + "epoch": 0.6055213239263575, + "grad_norm": 0.17531952261924744, + "learning_rate": 1.9950400701171456e-05, + "loss": 1.3402, + "step": 2033 + }, + { + "epoch": 0.6058191701260262, + "grad_norm": 0.1793741136789322, + "learning_rate": 1.9950304704724956e-05, + "loss": 1.3354, + "step": 2034 + }, + { + "epoch": 0.6061170163256948, + "grad_norm": 0.18081824481487274, + "learning_rate": 1.9950208615701987e-05, + "loss": 1.335, + "step": 2035 + }, + { + "epoch": 0.6064148625253635, + "grad_norm": 0.17112943530082703, + "learning_rate": 1.9950112434103444e-05, + "loss": 1.3324, + "step": 2036 + }, + { + "epoch": 0.6067127087250321, + "grad_norm": 0.17233715951442719, + "learning_rate": 1.9950016159930223e-05, + "loss": 1.3292, + "step": 2037 + }, + { + "epoch": 0.6070105549247008, + "grad_norm": 0.16468045115470886, + "learning_rate": 1.994991979318322e-05, + "loss": 1.3278, + "step": 2038 + }, + { + "epoch": 0.6073084011243695, + "grad_norm": 0.1741722971200943, + "learning_rate": 1.9949823333863328e-05, + "loss": 1.3488, + "step": 2039 + }, + { + "epoch": 0.607606247324038, + "grad_norm": 0.18325020372867584, + "learning_rate": 1.9949726781971446e-05, + "loss": 1.3446, + "step": 2040 + }, + { + "epoch": 0.6079040935237067, + "grad_norm": 0.18267396092414856, + "learning_rate": 1.9949630137508472e-05, + "loss": 1.3421, + "step": 2041 + }, + { + "epoch": 0.6082019397233753, + "grad_norm": 0.17630638182163239, + "learning_rate": 1.9949533400475307e-05, + "loss": 1.3342, + "step": 2042 + }, + { + "epoch": 0.608499785923044, + "grad_norm": 0.167982280254364, + "learning_rate": 1.994943657087285e-05, + "loss": 1.3507, + "step": 2043 + }, + { + "epoch": 0.6087976321227127, + "grad_norm": 0.1815871000289917, + "learning_rate": 1.9949339648702004e-05, + "loss": 1.3226, + "step": 2044 + }, + { + "epoch": 0.6090954783223813, + "grad_norm": 0.17143020033836365, + "learning_rate": 1.9949242633963666e-05, + "loss": 1.335, + "step": 2045 + }, + { + "epoch": 0.60939332452205, + "grad_norm": 0.17347553372383118, + "learning_rate": 1.9949145526658742e-05, + "loss": 1.3327, + "step": 2046 + }, + { + "epoch": 0.6096911707217185, + "grad_norm": 0.17974741756916046, + "learning_rate": 1.9949048326788133e-05, + "loss": 1.3419, + "step": 2047 + }, + { + "epoch": 0.6099890169213872, + "grad_norm": 0.18071913719177246, + "learning_rate": 1.9948951034352747e-05, + "loss": 1.345, + "step": 2048 + }, + { + "epoch": 0.6102868631210558, + "grad_norm": 0.18039347231388092, + "learning_rate": 1.994885364935349e-05, + "loss": 1.3373, + "step": 2049 + }, + { + "epoch": 0.6105847093207245, + "grad_norm": 0.18226660788059235, + "learning_rate": 1.9948756171791262e-05, + "loss": 1.3433, + "step": 2050 + }, + { + "epoch": 0.6108825555203932, + "grad_norm": 0.18637697398662567, + "learning_rate": 1.994865860166697e-05, + "loss": 1.3352, + "step": 2051 + }, + { + "epoch": 0.6111804017200618, + "grad_norm": 0.18535496294498444, + "learning_rate": 1.994856093898153e-05, + "loss": 1.316, + "step": 2052 + }, + { + "epoch": 0.6114782479197305, + "grad_norm": 0.1768663376569748, + "learning_rate": 1.9948463183735845e-05, + "loss": 1.3341, + "step": 2053 + }, + { + "epoch": 0.6117760941193991, + "grad_norm": 0.17285248637199402, + "learning_rate": 1.9948365335930825e-05, + "loss": 1.3266, + "step": 2054 + }, + { + "epoch": 0.6120739403190677, + "grad_norm": 0.1809185892343521, + "learning_rate": 1.9948267395567378e-05, + "loss": 1.3343, + "step": 2055 + }, + { + "epoch": 0.6123717865187364, + "grad_norm": 0.18210750818252563, + "learning_rate": 1.994816936264642e-05, + "loss": 1.3495, + "step": 2056 + }, + { + "epoch": 0.612669632718405, + "grad_norm": 0.17945021390914917, + "learning_rate": 1.994807123716886e-05, + "loss": 1.3395, + "step": 2057 + }, + { + "epoch": 0.6129674789180737, + "grad_norm": 0.16708803176879883, + "learning_rate": 1.994797301913561e-05, + "loss": 1.3273, + "step": 2058 + }, + { + "epoch": 0.6132653251177423, + "grad_norm": 0.17374387383460999, + "learning_rate": 1.994787470854759e-05, + "loss": 1.3347, + "step": 2059 + }, + { + "epoch": 0.613563171317411, + "grad_norm": 0.18136759102344513, + "learning_rate": 1.9947776305405708e-05, + "loss": 1.3313, + "step": 2060 + }, + { + "epoch": 0.6138610175170797, + "grad_norm": 0.18151119351387024, + "learning_rate": 1.9947677809710882e-05, + "loss": 1.3469, + "step": 2061 + }, + { + "epoch": 0.6141588637167482, + "grad_norm": 0.17824167013168335, + "learning_rate": 1.994757922146403e-05, + "loss": 1.3299, + "step": 2062 + }, + { + "epoch": 0.6144567099164169, + "grad_norm": 0.17063666880130768, + "learning_rate": 1.9947480540666064e-05, + "loss": 1.3083, + "step": 2063 + }, + { + "epoch": 0.6147545561160855, + "grad_norm": 0.17322927713394165, + "learning_rate": 1.9947381767317907e-05, + "loss": 1.3258, + "step": 2064 + }, + { + "epoch": 0.6150524023157542, + "grad_norm": 0.18017977476119995, + "learning_rate": 1.9947282901420477e-05, + "loss": 1.3336, + "step": 2065 + }, + { + "epoch": 0.6153502485154229, + "grad_norm": 0.17630837857723236, + "learning_rate": 1.9947183942974693e-05, + "loss": 1.3229, + "step": 2066 + }, + { + "epoch": 0.6156480947150915, + "grad_norm": 0.1698862910270691, + "learning_rate": 1.9947084891981476e-05, + "loss": 1.3391, + "step": 2067 + }, + { + "epoch": 0.6159459409147602, + "grad_norm": 0.16928812861442566, + "learning_rate": 1.9946985748441747e-05, + "loss": 1.3545, + "step": 2068 + }, + { + "epoch": 0.6162437871144287, + "grad_norm": 0.17287898063659668, + "learning_rate": 1.994688651235643e-05, + "loss": 1.3445, + "step": 2069 + }, + { + "epoch": 0.6165416333140974, + "grad_norm": 0.1712704598903656, + "learning_rate": 1.9946787183726445e-05, + "loss": 1.3156, + "step": 2070 + }, + { + "epoch": 0.616839479513766, + "grad_norm": 0.18089932203292847, + "learning_rate": 1.994668776255272e-05, + "loss": 1.3223, + "step": 2071 + }, + { + "epoch": 0.6171373257134347, + "grad_norm": 0.16370098292827606, + "learning_rate": 1.994658824883618e-05, + "loss": 1.3522, + "step": 2072 + }, + { + "epoch": 0.6174351719131034, + "grad_norm": 0.18175539374351501, + "learning_rate": 1.9946488642577747e-05, + "loss": 1.3253, + "step": 2073 + }, + { + "epoch": 0.617733018112772, + "grad_norm": 0.16670680046081543, + "learning_rate": 1.9946388943778353e-05, + "loss": 1.3271, + "step": 2074 + }, + { + "epoch": 0.6180308643124407, + "grad_norm": 0.16849905252456665, + "learning_rate": 1.994628915243892e-05, + "loss": 1.3338, + "step": 2075 + }, + { + "epoch": 0.6183287105121094, + "grad_norm": 0.1773916482925415, + "learning_rate": 1.994618926856038e-05, + "loss": 1.3301, + "step": 2076 + }, + { + "epoch": 0.6186265567117779, + "grad_norm": 0.167177215218544, + "learning_rate": 1.994608929214366e-05, + "loss": 1.3293, + "step": 2077 + }, + { + "epoch": 0.6189244029114466, + "grad_norm": 0.17059874534606934, + "learning_rate": 1.9945989223189694e-05, + "loss": 1.3206, + "step": 2078 + }, + { + "epoch": 0.6192222491111152, + "grad_norm": 0.17021021246910095, + "learning_rate": 1.994588906169941e-05, + "loss": 1.3296, + "step": 2079 + }, + { + "epoch": 0.6195200953107839, + "grad_norm": 0.17559467256069183, + "learning_rate": 1.994578880767374e-05, + "loss": 1.3289, + "step": 2080 + }, + { + "epoch": 0.6198179415104526, + "grad_norm": 0.17063923180103302, + "learning_rate": 1.994568846111362e-05, + "loss": 1.3201, + "step": 2081 + }, + { + "epoch": 0.6201157877101212, + "grad_norm": 0.17661602795124054, + "learning_rate": 1.9945588022019975e-05, + "loss": 1.326, + "step": 2082 + }, + { + "epoch": 0.6204136339097899, + "grad_norm": 0.17022280395030975, + "learning_rate": 1.994548749039375e-05, + "loss": 1.3284, + "step": 2083 + }, + { + "epoch": 0.6207114801094584, + "grad_norm": 0.18009376525878906, + "learning_rate": 1.9945386866235874e-05, + "loss": 1.3187, + "step": 2084 + }, + { + "epoch": 0.6210093263091271, + "grad_norm": 0.167107954621315, + "learning_rate": 1.9945286149547284e-05, + "loss": 1.3351, + "step": 2085 + }, + { + "epoch": 0.6213071725087957, + "grad_norm": 0.1685510128736496, + "learning_rate": 1.994518534032892e-05, + "loss": 1.3186, + "step": 2086 + }, + { + "epoch": 0.6216050187084644, + "grad_norm": 0.179209366440773, + "learning_rate": 1.9945084438581713e-05, + "loss": 1.3394, + "step": 2087 + }, + { + "epoch": 0.6219028649081331, + "grad_norm": 0.18145005404949188, + "learning_rate": 1.9944983444306613e-05, + "loss": 1.3336, + "step": 2088 + }, + { + "epoch": 0.6222007111078017, + "grad_norm": 0.17497749626636505, + "learning_rate": 1.994488235750455e-05, + "loss": 1.3488, + "step": 2089 + }, + { + "epoch": 0.6224985573074704, + "grad_norm": 0.1645064353942871, + "learning_rate": 1.9944781178176468e-05, + "loss": 1.3264, + "step": 2090 + }, + { + "epoch": 0.622796403507139, + "grad_norm": 0.17959046363830566, + "learning_rate": 1.9944679906323307e-05, + "loss": 1.3192, + "step": 2091 + }, + { + "epoch": 0.6230942497068076, + "grad_norm": 0.17209501564502716, + "learning_rate": 1.994457854194601e-05, + "loss": 1.3409, + "step": 2092 + }, + { + "epoch": 0.6233920959064763, + "grad_norm": 0.16878920793533325, + "learning_rate": 1.9944477085045525e-05, + "loss": 1.3499, + "step": 2093 + }, + { + "epoch": 0.6236899421061449, + "grad_norm": 0.17653900384902954, + "learning_rate": 1.994437553562279e-05, + "loss": 1.3289, + "step": 2094 + }, + { + "epoch": 0.6239877883058136, + "grad_norm": 0.1805211454629898, + "learning_rate": 1.9944273893678748e-05, + "loss": 1.3096, + "step": 2095 + }, + { + "epoch": 0.6242856345054822, + "grad_norm": 0.18547116219997406, + "learning_rate": 1.994417215921435e-05, + "loss": 1.3339, + "step": 2096 + }, + { + "epoch": 0.6245834807051509, + "grad_norm": 0.17186765372753143, + "learning_rate": 1.994407033223054e-05, + "loss": 1.3263, + "step": 2097 + }, + { + "epoch": 0.6248813269048196, + "grad_norm": 0.18666228652000427, + "learning_rate": 1.9943968412728262e-05, + "loss": 1.3403, + "step": 2098 + }, + { + "epoch": 0.6251791731044881, + "grad_norm": 0.1880766749382019, + "learning_rate": 1.9943866400708473e-05, + "loss": 1.3348, + "step": 2099 + }, + { + "epoch": 0.6254770193041568, + "grad_norm": 0.17504388093948364, + "learning_rate": 1.9943764296172116e-05, + "loss": 1.3265, + "step": 2100 + }, + { + "epoch": 0.6257748655038254, + "grad_norm": 0.18056386709213257, + "learning_rate": 1.994366209912014e-05, + "loss": 1.3333, + "step": 2101 + }, + { + "epoch": 0.6260727117034941, + "grad_norm": 0.18163588643074036, + "learning_rate": 1.9943559809553502e-05, + "loss": 1.327, + "step": 2102 + }, + { + "epoch": 0.6263705579031628, + "grad_norm": 0.1701747626066208, + "learning_rate": 1.9943457427473146e-05, + "loss": 1.3363, + "step": 2103 + }, + { + "epoch": 0.6266684041028314, + "grad_norm": 0.1721438467502594, + "learning_rate": 1.9943354952880028e-05, + "loss": 1.3382, + "step": 2104 + }, + { + "epoch": 0.6269662503025001, + "grad_norm": 0.16902436316013336, + "learning_rate": 1.9943252385775103e-05, + "loss": 1.3363, + "step": 2105 + }, + { + "epoch": 0.6272640965021687, + "grad_norm": 0.17550300061702728, + "learning_rate": 1.9943149726159326e-05, + "loss": 1.3274, + "step": 2106 + }, + { + "epoch": 0.6275619427018373, + "grad_norm": 0.17495177686214447, + "learning_rate": 1.9943046974033647e-05, + "loss": 1.3417, + "step": 2107 + }, + { + "epoch": 0.627859788901506, + "grad_norm": 0.17364050447940826, + "learning_rate": 1.9942944129399023e-05, + "loss": 1.3178, + "step": 2108 + }, + { + "epoch": 0.6281576351011746, + "grad_norm": 0.16875362396240234, + "learning_rate": 1.9942841192256415e-05, + "loss": 1.3265, + "step": 2109 + }, + { + "epoch": 0.6284554813008433, + "grad_norm": 0.17429284751415253, + "learning_rate": 1.994273816260678e-05, + "loss": 1.32, + "step": 2110 + }, + { + "epoch": 0.6287533275005119, + "grad_norm": 0.17145192623138428, + "learning_rate": 1.994263504045107e-05, + "loss": 1.3319, + "step": 2111 + }, + { + "epoch": 0.6290511737001806, + "grad_norm": 0.16923145949840546, + "learning_rate": 1.9942531825790254e-05, + "loss": 1.3371, + "step": 2112 + }, + { + "epoch": 0.6293490198998493, + "grad_norm": 0.17293810844421387, + "learning_rate": 1.994242851862529e-05, + "loss": 1.3186, + "step": 2113 + }, + { + "epoch": 0.6296468660995178, + "grad_norm": 0.1745597869157791, + "learning_rate": 1.9942325118957133e-05, + "loss": 1.3202, + "step": 2114 + }, + { + "epoch": 0.6299447122991865, + "grad_norm": 0.17113369703292847, + "learning_rate": 1.994222162678675e-05, + "loss": 1.3371, + "step": 2115 + }, + { + "epoch": 0.6302425584988551, + "grad_norm": 0.17446771264076233, + "learning_rate": 1.9942118042115104e-05, + "loss": 1.3452, + "step": 2116 + }, + { + "epoch": 0.6305404046985238, + "grad_norm": 0.178619846701622, + "learning_rate": 1.9942014364943154e-05, + "loss": 1.3223, + "step": 2117 + }, + { + "epoch": 0.6308382508981925, + "grad_norm": 0.18409490585327148, + "learning_rate": 1.9941910595271872e-05, + "loss": 1.3224, + "step": 2118 + }, + { + "epoch": 0.6311360970978611, + "grad_norm": 0.1795678585767746, + "learning_rate": 1.9941806733102217e-05, + "loss": 1.3296, + "step": 2119 + }, + { + "epoch": 0.6314339432975298, + "grad_norm": 0.1839311569929123, + "learning_rate": 1.994170277843516e-05, + "loss": 1.3274, + "step": 2120 + }, + { + "epoch": 0.6317317894971984, + "grad_norm": 0.17433764040470123, + "learning_rate": 1.9941598731271665e-05, + "loss": 1.3118, + "step": 2121 + }, + { + "epoch": 0.632029635696867, + "grad_norm": 0.18274016678333282, + "learning_rate": 1.9941494591612702e-05, + "loss": 1.3092, + "step": 2122 + }, + { + "epoch": 0.6323274818965356, + "grad_norm": 0.16890452802181244, + "learning_rate": 1.994139035945924e-05, + "loss": 1.3396, + "step": 2123 + }, + { + "epoch": 0.6326253280962043, + "grad_norm": 0.18805302679538727, + "learning_rate": 1.9941286034812248e-05, + "loss": 1.3313, + "step": 2124 + }, + { + "epoch": 0.632923174295873, + "grad_norm": 0.17017240822315216, + "learning_rate": 1.9941181617672694e-05, + "loss": 1.328, + "step": 2125 + }, + { + "epoch": 0.6332210204955416, + "grad_norm": 0.18405862152576447, + "learning_rate": 1.9941077108041555e-05, + "loss": 1.3443, + "step": 2126 + }, + { + "epoch": 0.6335188666952103, + "grad_norm": 0.16606247425079346, + "learning_rate": 1.9940972505919797e-05, + "loss": 1.3268, + "step": 2127 + }, + { + "epoch": 0.633816712894879, + "grad_norm": 0.17940346896648407, + "learning_rate": 1.9940867811308398e-05, + "loss": 1.334, + "step": 2128 + }, + { + "epoch": 0.6341145590945475, + "grad_norm": 0.1704801768064499, + "learning_rate": 1.994076302420833e-05, + "loss": 1.329, + "step": 2129 + }, + { + "epoch": 0.6344124052942162, + "grad_norm": 0.1802673041820526, + "learning_rate": 1.994065814462057e-05, + "loss": 1.3335, + "step": 2130 + }, + { + "epoch": 0.6347102514938848, + "grad_norm": 0.17566320300102234, + "learning_rate": 1.9940553172546088e-05, + "loss": 1.3195, + "step": 2131 + }, + { + "epoch": 0.6350080976935535, + "grad_norm": 0.177176833152771, + "learning_rate": 1.9940448107985873e-05, + "loss": 1.3275, + "step": 2132 + }, + { + "epoch": 0.6353059438932221, + "grad_norm": 0.17318475246429443, + "learning_rate": 1.994034295094089e-05, + "loss": 1.3281, + "step": 2133 + }, + { + "epoch": 0.6356037900928908, + "grad_norm": 0.17585478723049164, + "learning_rate": 1.994023770141212e-05, + "loss": 1.3368, + "step": 2134 + }, + { + "epoch": 0.6359016362925595, + "grad_norm": 0.1724289357662201, + "learning_rate": 1.9940132359400545e-05, + "loss": 1.3273, + "step": 2135 + }, + { + "epoch": 0.6361994824922281, + "grad_norm": 0.17516650259494781, + "learning_rate": 1.9940026924907144e-05, + "loss": 1.3307, + "step": 2136 + }, + { + "epoch": 0.6364973286918967, + "grad_norm": 0.17116110026836395, + "learning_rate": 1.99399213979329e-05, + "loss": 1.338, + "step": 2137 + }, + { + "epoch": 0.6367951748915653, + "grad_norm": 0.1668197512626648, + "learning_rate": 1.9939815778478792e-05, + "loss": 1.3455, + "step": 2138 + }, + { + "epoch": 0.637093021091234, + "grad_norm": 0.17479157447814941, + "learning_rate": 1.9939710066545804e-05, + "loss": 1.3205, + "step": 2139 + }, + { + "epoch": 0.6373908672909027, + "grad_norm": 0.17087820172309875, + "learning_rate": 1.993960426213492e-05, + "loss": 1.3041, + "step": 2140 + }, + { + "epoch": 0.6376887134905713, + "grad_norm": 0.18054068088531494, + "learning_rate": 1.993949836524712e-05, + "loss": 1.3279, + "step": 2141 + }, + { + "epoch": 0.63798655969024, + "grad_norm": 0.17079319059848785, + "learning_rate": 1.99393923758834e-05, + "loss": 1.3237, + "step": 2142 + }, + { + "epoch": 0.6382844058899086, + "grad_norm": 0.171514630317688, + "learning_rate": 1.9939286294044732e-05, + "loss": 1.3254, + "step": 2143 + }, + { + "epoch": 0.6385822520895772, + "grad_norm": 0.16941453516483307, + "learning_rate": 1.993918011973211e-05, + "loss": 1.3338, + "step": 2144 + }, + { + "epoch": 0.6388800982892459, + "grad_norm": 0.16755352914333344, + "learning_rate": 1.9939073852946524e-05, + "loss": 1.3251, + "step": 2145 + }, + { + "epoch": 0.6391779444889145, + "grad_norm": 0.17181557416915894, + "learning_rate": 1.993896749368896e-05, + "loss": 1.3394, + "step": 2146 + }, + { + "epoch": 0.6394757906885832, + "grad_norm": 0.18224985897541046, + "learning_rate": 1.9938861041960406e-05, + "loss": 1.3363, + "step": 2147 + }, + { + "epoch": 0.6397736368882518, + "grad_norm": 0.1785338670015335, + "learning_rate": 1.993875449776186e-05, + "loss": 1.3164, + "step": 2148 + }, + { + "epoch": 0.6400714830879205, + "grad_norm": 0.17881037294864655, + "learning_rate": 1.9938647861094302e-05, + "loss": 1.3178, + "step": 2149 + }, + { + "epoch": 0.6403693292875892, + "grad_norm": 0.17853541672229767, + "learning_rate": 1.9938541131958732e-05, + "loss": 1.3294, + "step": 2150 + }, + { + "epoch": 0.6406671754872577, + "grad_norm": 0.16955041885375977, + "learning_rate": 1.993843431035614e-05, + "loss": 1.337, + "step": 2151 + }, + { + "epoch": 0.6409650216869264, + "grad_norm": 0.5713686943054199, + "learning_rate": 1.993832739628752e-05, + "loss": 1.3335, + "step": 2152 + }, + { + "epoch": 0.641262867886595, + "grad_norm": 0.18174128234386444, + "learning_rate": 1.993822038975387e-05, + "loss": 1.3162, + "step": 2153 + }, + { + "epoch": 0.6415607140862637, + "grad_norm": 0.1834786981344223, + "learning_rate": 1.993811329075618e-05, + "loss": 1.3347, + "step": 2154 + }, + { + "epoch": 0.6418585602859324, + "grad_norm": 0.18102896213531494, + "learning_rate": 1.993800609929545e-05, + "loss": 1.3424, + "step": 2155 + }, + { + "epoch": 0.642156406485601, + "grad_norm": 0.17838533222675323, + "learning_rate": 1.9937898815372677e-05, + "loss": 1.3353, + "step": 2156 + }, + { + "epoch": 0.6424542526852697, + "grad_norm": 0.18635553121566772, + "learning_rate": 1.993779143898886e-05, + "loss": 1.338, + "step": 2157 + }, + { + "epoch": 0.6427520988849383, + "grad_norm": 0.19375599920749664, + "learning_rate": 1.9937683970144996e-05, + "loss": 1.357, + "step": 2158 + }, + { + "epoch": 0.6430499450846069, + "grad_norm": 0.16988781094551086, + "learning_rate": 1.9937576408842087e-05, + "loss": 1.3301, + "step": 2159 + }, + { + "epoch": 0.6433477912842755, + "grad_norm": 0.184706449508667, + "learning_rate": 1.993746875508113e-05, + "loss": 1.3288, + "step": 2160 + }, + { + "epoch": 0.6436456374839442, + "grad_norm": 0.1785503625869751, + "learning_rate": 1.993736100886313e-05, + "loss": 1.3381, + "step": 2161 + }, + { + "epoch": 0.6439434836836129, + "grad_norm": 0.1770181804895401, + "learning_rate": 1.993725317018909e-05, + "loss": 1.3259, + "step": 2162 + }, + { + "epoch": 0.6442413298832815, + "grad_norm": 0.1877453774213791, + "learning_rate": 1.993714523906001e-05, + "loss": 1.337, + "step": 2163 + }, + { + "epoch": 0.6445391760829502, + "grad_norm": 0.18047162890434265, + "learning_rate": 1.9937037215476895e-05, + "loss": 1.334, + "step": 2164 + }, + { + "epoch": 0.6448370222826189, + "grad_norm": 0.21972842514514923, + "learning_rate": 1.993692909944075e-05, + "loss": 1.3383, + "step": 2165 + }, + { + "epoch": 0.6451348684822874, + "grad_norm": 0.17394456267356873, + "learning_rate": 1.9936820890952585e-05, + "loss": 1.3304, + "step": 2166 + }, + { + "epoch": 0.6454327146819561, + "grad_norm": 0.18222405016422272, + "learning_rate": 1.9936712590013404e-05, + "loss": 1.3265, + "step": 2167 + }, + { + "epoch": 0.6457305608816247, + "grad_norm": 0.18246474862098694, + "learning_rate": 1.9936604196624214e-05, + "loss": 1.3251, + "step": 2168 + }, + { + "epoch": 0.6460284070812934, + "grad_norm": 0.17890970408916473, + "learning_rate": 1.9936495710786025e-05, + "loss": 1.3271, + "step": 2169 + }, + { + "epoch": 0.646326253280962, + "grad_norm": 0.17982813715934753, + "learning_rate": 1.9936387132499838e-05, + "loss": 1.3299, + "step": 2170 + }, + { + "epoch": 0.6466240994806307, + "grad_norm": 0.18518179655075073, + "learning_rate": 1.993627846176668e-05, + "loss": 1.3258, + "step": 2171 + }, + { + "epoch": 0.6469219456802994, + "grad_norm": 0.17826440930366516, + "learning_rate": 1.9936169698587546e-05, + "loss": 1.3174, + "step": 2172 + }, + { + "epoch": 0.647219791879968, + "grad_norm": 0.17979653179645538, + "learning_rate": 1.9936060842963456e-05, + "loss": 1.3278, + "step": 2173 + }, + { + "epoch": 0.6475176380796366, + "grad_norm": 0.17953205108642578, + "learning_rate": 1.993595189489542e-05, + "loss": 1.3514, + "step": 2174 + }, + { + "epoch": 0.6478154842793052, + "grad_norm": 0.18435202538967133, + "learning_rate": 1.9935842854384456e-05, + "loss": 1.3157, + "step": 2175 + }, + { + "epoch": 0.6481133304789739, + "grad_norm": 0.17402073740959167, + "learning_rate": 1.993573372143157e-05, + "loss": 1.3356, + "step": 2176 + }, + { + "epoch": 0.6484111766786426, + "grad_norm": 0.17155030369758606, + "learning_rate": 1.9935624496037786e-05, + "loss": 1.317, + "step": 2177 + }, + { + "epoch": 0.6487090228783112, + "grad_norm": 0.17052295804023743, + "learning_rate": 1.9935515178204118e-05, + "loss": 1.3164, + "step": 2178 + }, + { + "epoch": 0.6490068690779799, + "grad_norm": 0.1768856942653656, + "learning_rate": 1.9935405767931582e-05, + "loss": 1.3451, + "step": 2179 + }, + { + "epoch": 0.6493047152776485, + "grad_norm": 0.18772312998771667, + "learning_rate": 1.9935296265221192e-05, + "loss": 1.3215, + "step": 2180 + }, + { + "epoch": 0.6496025614773171, + "grad_norm": 0.1783144325017929, + "learning_rate": 1.9935186670073975e-05, + "loss": 1.3476, + "step": 2181 + }, + { + "epoch": 0.6499004076769858, + "grad_norm": 0.17431962490081787, + "learning_rate": 1.9935076982490943e-05, + "loss": 1.3408, + "step": 2182 + }, + { + "epoch": 0.6501982538766544, + "grad_norm": 0.18071846663951874, + "learning_rate": 1.993496720247312e-05, + "loss": 1.337, + "step": 2183 + }, + { + "epoch": 0.6504961000763231, + "grad_norm": 0.17916467785835266, + "learning_rate": 1.993485733002153e-05, + "loss": 1.344, + "step": 2184 + }, + { + "epoch": 0.6507939462759917, + "grad_norm": 0.181587815284729, + "learning_rate": 1.993474736513719e-05, + "loss": 1.3243, + "step": 2185 + }, + { + "epoch": 0.6510917924756604, + "grad_norm": 0.16496814787387848, + "learning_rate": 1.9934637307821126e-05, + "loss": 1.3287, + "step": 2186 + }, + { + "epoch": 0.6513896386753291, + "grad_norm": 0.18435817956924438, + "learning_rate": 1.993452715807436e-05, + "loss": 1.3095, + "step": 2187 + }, + { + "epoch": 0.6516874848749977, + "grad_norm": 0.17921589314937592, + "learning_rate": 1.993441691589792e-05, + "loss": 1.3267, + "step": 2188 + }, + { + "epoch": 0.6519853310746663, + "grad_norm": 0.17948828637599945, + "learning_rate": 1.993430658129283e-05, + "loss": 1.3222, + "step": 2189 + }, + { + "epoch": 0.6522831772743349, + "grad_norm": 0.18039970099925995, + "learning_rate": 1.9934196154260114e-05, + "loss": 1.3252, + "step": 2190 + }, + { + "epoch": 0.6525810234740036, + "grad_norm": 0.16772134602069855, + "learning_rate": 1.9934085634800807e-05, + "loss": 1.3329, + "step": 2191 + }, + { + "epoch": 0.6528788696736723, + "grad_norm": 0.17711152136325836, + "learning_rate": 1.993397502291593e-05, + "loss": 1.3336, + "step": 2192 + }, + { + "epoch": 0.6531767158733409, + "grad_norm": 0.1912129521369934, + "learning_rate": 1.9933864318606514e-05, + "loss": 1.3189, + "step": 2193 + }, + { + "epoch": 0.6534745620730096, + "grad_norm": 0.18480518460273743, + "learning_rate": 1.9933753521873587e-05, + "loss": 1.3287, + "step": 2194 + }, + { + "epoch": 0.6537724082726782, + "grad_norm": 0.17623695731163025, + "learning_rate": 1.9933642632718185e-05, + "loss": 1.3193, + "step": 2195 + }, + { + "epoch": 0.6540702544723468, + "grad_norm": 0.1776251345872879, + "learning_rate": 1.9933531651141335e-05, + "loss": 1.3404, + "step": 2196 + }, + { + "epoch": 0.6543681006720155, + "grad_norm": 0.18505097925662994, + "learning_rate": 1.9933420577144075e-05, + "loss": 1.3214, + "step": 2197 + }, + { + "epoch": 0.6546659468716841, + "grad_norm": 0.19477857649326324, + "learning_rate": 1.9933309410727427e-05, + "loss": 1.3391, + "step": 2198 + }, + { + "epoch": 0.6549637930713528, + "grad_norm": 0.1819305121898651, + "learning_rate": 1.993319815189244e-05, + "loss": 1.3302, + "step": 2199 + }, + { + "epoch": 0.6552616392710214, + "grad_norm": 0.18122519552707672, + "learning_rate": 1.993308680064014e-05, + "loss": 1.3501, + "step": 2200 + }, + { + "epoch": 0.6555594854706901, + "grad_norm": 0.1732597053050995, + "learning_rate": 1.993297535697157e-05, + "loss": 1.3256, + "step": 2201 + }, + { + "epoch": 0.6558573316703588, + "grad_norm": 0.18337799608707428, + "learning_rate": 1.9932863820887753e-05, + "loss": 1.3349, + "step": 2202 + }, + { + "epoch": 0.6561551778700274, + "grad_norm": 0.18442195653915405, + "learning_rate": 1.9932752192389743e-05, + "loss": 1.3359, + "step": 2203 + }, + { + "epoch": 0.656453024069696, + "grad_norm": 0.17236138880252838, + "learning_rate": 1.9932640471478568e-05, + "loss": 1.3318, + "step": 2204 + }, + { + "epoch": 0.6567508702693646, + "grad_norm": 0.18082675337791443, + "learning_rate": 1.993252865815527e-05, + "loss": 1.3377, + "step": 2205 + }, + { + "epoch": 0.6570487164690333, + "grad_norm": 0.18028658628463745, + "learning_rate": 1.9932416752420895e-05, + "loss": 1.3268, + "step": 2206 + }, + { + "epoch": 0.657346562668702, + "grad_norm": 0.18233519792556763, + "learning_rate": 1.9932304754276473e-05, + "loss": 1.3501, + "step": 2207 + }, + { + "epoch": 0.6576444088683706, + "grad_norm": 0.1773044466972351, + "learning_rate": 1.9932192663723054e-05, + "loss": 1.3387, + "step": 2208 + }, + { + "epoch": 0.6579422550680393, + "grad_norm": 0.17857183516025543, + "learning_rate": 1.993208048076168e-05, + "loss": 1.327, + "step": 2209 + }, + { + "epoch": 0.6582401012677079, + "grad_norm": 0.18105275928974152, + "learning_rate": 1.9931968205393398e-05, + "loss": 1.3268, + "step": 2210 + }, + { + "epoch": 0.6585379474673765, + "grad_norm": 0.17181296646595, + "learning_rate": 1.993185583761924e-05, + "loss": 1.3236, + "step": 2211 + }, + { + "epoch": 0.6588357936670451, + "grad_norm": 0.1748933494091034, + "learning_rate": 1.993174337744027e-05, + "loss": 1.3407, + "step": 2212 + }, + { + "epoch": 0.6591336398667138, + "grad_norm": 0.18497586250305176, + "learning_rate": 1.993163082485752e-05, + "loss": 1.3093, + "step": 2213 + }, + { + "epoch": 0.6594314860663825, + "grad_norm": 0.17400768399238586, + "learning_rate": 1.993151817987204e-05, + "loss": 1.3278, + "step": 2214 + }, + { + "epoch": 0.6597293322660511, + "grad_norm": 0.18220269680023193, + "learning_rate": 1.993140544248488e-05, + "loss": 1.3406, + "step": 2215 + }, + { + "epoch": 0.6600271784657198, + "grad_norm": 0.17420974373817444, + "learning_rate": 1.993129261269709e-05, + "loss": 1.336, + "step": 2216 + }, + { + "epoch": 0.6603250246653884, + "grad_norm": 0.17776700854301453, + "learning_rate": 1.9931179690509714e-05, + "loss": 1.3396, + "step": 2217 + }, + { + "epoch": 0.6606228708650571, + "grad_norm": 0.16965252161026, + "learning_rate": 1.9931066675923808e-05, + "loss": 1.3432, + "step": 2218 + }, + { + "epoch": 0.6609207170647257, + "grad_norm": 0.19087284803390503, + "learning_rate": 1.9930953568940424e-05, + "loss": 1.3202, + "step": 2219 + }, + { + "epoch": 0.6612185632643943, + "grad_norm": 0.18144509196281433, + "learning_rate": 1.9930840369560612e-05, + "loss": 1.3085, + "step": 2220 + }, + { + "epoch": 0.661516409464063, + "grad_norm": 0.22314777970314026, + "learning_rate": 1.9930727077785427e-05, + "loss": 1.3226, + "step": 2221 + }, + { + "epoch": 0.6618142556637316, + "grad_norm": 0.18070575594902039, + "learning_rate": 1.993061369361592e-05, + "loss": 1.3152, + "step": 2222 + }, + { + "epoch": 0.6621121018634003, + "grad_norm": 0.18086224794387817, + "learning_rate": 1.9930500217053147e-05, + "loss": 1.3316, + "step": 2223 + }, + { + "epoch": 0.662409948063069, + "grad_norm": 0.18728743493556976, + "learning_rate": 1.993038664809817e-05, + "loss": 1.3261, + "step": 2224 + }, + { + "epoch": 0.6627077942627376, + "grad_norm": 0.17994599044322968, + "learning_rate": 1.9930272986752036e-05, + "loss": 1.3346, + "step": 2225 + }, + { + "epoch": 0.6630056404624062, + "grad_norm": 0.1903691291809082, + "learning_rate": 1.9930159233015805e-05, + "loss": 1.3491, + "step": 2226 + }, + { + "epoch": 0.6633034866620748, + "grad_norm": 0.18117789924144745, + "learning_rate": 1.993004538689054e-05, + "loss": 1.3057, + "step": 2227 + }, + { + "epoch": 0.6636013328617435, + "grad_norm": 0.17975282669067383, + "learning_rate": 1.9929931448377292e-05, + "loss": 1.3228, + "step": 2228 + }, + { + "epoch": 0.6638991790614122, + "grad_norm": 0.18567296862602234, + "learning_rate": 1.9929817417477132e-05, + "loss": 1.3328, + "step": 2229 + }, + { + "epoch": 0.6641970252610808, + "grad_norm": 0.18861307203769684, + "learning_rate": 1.9929703294191115e-05, + "loss": 1.3223, + "step": 2230 + }, + { + "epoch": 0.6644948714607495, + "grad_norm": 0.18120045959949493, + "learning_rate": 1.99295890785203e-05, + "loss": 1.346, + "step": 2231 + }, + { + "epoch": 0.6647927176604181, + "grad_norm": 0.17772220075130463, + "learning_rate": 1.9929474770465755e-05, + "loss": 1.3255, + "step": 2232 + }, + { + "epoch": 0.6650905638600868, + "grad_norm": 0.189261332154274, + "learning_rate": 1.9929360370028535e-05, + "loss": 1.3306, + "step": 2233 + }, + { + "epoch": 0.6653884100597554, + "grad_norm": 0.19222469627857208, + "learning_rate": 1.9929245877209718e-05, + "loss": 1.3327, + "step": 2234 + }, + { + "epoch": 0.665686256259424, + "grad_norm": 0.17099244892597198, + "learning_rate": 1.992913129201036e-05, + "loss": 1.3198, + "step": 2235 + }, + { + "epoch": 0.6659841024590927, + "grad_norm": 0.1890147626399994, + "learning_rate": 1.9929016614431526e-05, + "loss": 1.3253, + "step": 2236 + }, + { + "epoch": 0.6662819486587613, + "grad_norm": 0.18433354794979095, + "learning_rate": 1.9928901844474285e-05, + "loss": 1.315, + "step": 2237 + }, + { + "epoch": 0.66657979485843, + "grad_norm": 0.18210284411907196, + "learning_rate": 1.992878698213971e-05, + "loss": 1.3342, + "step": 2238 + }, + { + "epoch": 0.6668776410580987, + "grad_norm": 0.17864404618740082, + "learning_rate": 1.9928672027428866e-05, + "loss": 1.3399, + "step": 2239 + }, + { + "epoch": 0.6671754872577673, + "grad_norm": 0.18134282529354095, + "learning_rate": 1.9928556980342818e-05, + "loss": 1.3254, + "step": 2240 + }, + { + "epoch": 0.6674733334574359, + "grad_norm": 0.19404278695583344, + "learning_rate": 1.992844184088264e-05, + "loss": 1.3467, + "step": 2241 + }, + { + "epoch": 0.6677711796571045, + "grad_norm": 0.18934600055217743, + "learning_rate": 1.9928326609049406e-05, + "loss": 1.3338, + "step": 2242 + }, + { + "epoch": 0.6680690258567732, + "grad_norm": 0.17520056664943695, + "learning_rate": 1.9928211284844183e-05, + "loss": 1.3353, + "step": 2243 + }, + { + "epoch": 0.6683668720564419, + "grad_norm": 0.18218885362148285, + "learning_rate": 1.992809586826805e-05, + "loss": 1.3143, + "step": 2244 + }, + { + "epoch": 0.6686647182561105, + "grad_norm": 0.1865578293800354, + "learning_rate": 1.992798035932207e-05, + "loss": 1.3276, + "step": 2245 + }, + { + "epoch": 0.6689625644557792, + "grad_norm": 0.18215002119541168, + "learning_rate": 1.9927864758007332e-05, + "loss": 1.3176, + "step": 2246 + }, + { + "epoch": 0.6692604106554478, + "grad_norm": 0.18074733018875122, + "learning_rate": 1.9927749064324905e-05, + "loss": 1.3295, + "step": 2247 + }, + { + "epoch": 0.6695582568551164, + "grad_norm": 0.18050310015678406, + "learning_rate": 1.9927633278275862e-05, + "loss": 1.312, + "step": 2248 + }, + { + "epoch": 0.669856103054785, + "grad_norm": 0.17129181325435638, + "learning_rate": 1.992751739986128e-05, + "loss": 1.3518, + "step": 2249 + }, + { + "epoch": 0.6701539492544537, + "grad_norm": 0.19418247044086456, + "learning_rate": 1.9927401429082244e-05, + "loss": 1.3273, + "step": 2250 + }, + { + "epoch": 0.6704517954541224, + "grad_norm": 0.18332301080226898, + "learning_rate": 1.992728536593983e-05, + "loss": 1.3223, + "step": 2251 + }, + { + "epoch": 0.670749641653791, + "grad_norm": 0.17829930782318115, + "learning_rate": 1.9927169210435117e-05, + "loss": 1.3381, + "step": 2252 + }, + { + "epoch": 0.6710474878534597, + "grad_norm": 0.1867094486951828, + "learning_rate": 1.9927052962569183e-05, + "loss": 1.341, + "step": 2253 + }, + { + "epoch": 0.6713453340531284, + "grad_norm": 0.18619227409362793, + "learning_rate": 1.9926936622343115e-05, + "loss": 1.3242, + "step": 2254 + }, + { + "epoch": 0.671643180252797, + "grad_norm": 0.1722981333732605, + "learning_rate": 1.992682018975799e-05, + "loss": 1.3277, + "step": 2255 + }, + { + "epoch": 0.6719410264524656, + "grad_norm": 0.17711478471755981, + "learning_rate": 1.9926703664814898e-05, + "loss": 1.3135, + "step": 2256 + }, + { + "epoch": 0.6722388726521342, + "grad_norm": 0.18896667659282684, + "learning_rate": 1.9926587047514917e-05, + "loss": 1.3303, + "step": 2257 + }, + { + "epoch": 0.6725367188518029, + "grad_norm": 0.17343689501285553, + "learning_rate": 1.9926470337859133e-05, + "loss": 1.3349, + "step": 2258 + }, + { + "epoch": 0.6728345650514715, + "grad_norm": 0.1764567643404007, + "learning_rate": 1.9926353535848636e-05, + "loss": 1.333, + "step": 2259 + }, + { + "epoch": 0.6731324112511402, + "grad_norm": 0.17717604339122772, + "learning_rate": 1.9926236641484506e-05, + "loss": 1.3227, + "step": 2260 + }, + { + "epoch": 0.6734302574508089, + "grad_norm": 0.1780940145254135, + "learning_rate": 1.9926119654767836e-05, + "loss": 1.3304, + "step": 2261 + }, + { + "epoch": 0.6737281036504775, + "grad_norm": 0.182095006108284, + "learning_rate": 1.9926002575699713e-05, + "loss": 1.3158, + "step": 2262 + }, + { + "epoch": 0.6740259498501461, + "grad_norm": 0.17723587155342102, + "learning_rate": 1.992588540428123e-05, + "loss": 1.3241, + "step": 2263 + }, + { + "epoch": 0.6743237960498147, + "grad_norm": 0.1783917248249054, + "learning_rate": 1.9925768140513466e-05, + "loss": 1.3216, + "step": 2264 + }, + { + "epoch": 0.6746216422494834, + "grad_norm": 0.1807514876127243, + "learning_rate": 1.9925650784397522e-05, + "loss": 1.3329, + "step": 2265 + }, + { + "epoch": 0.6749194884491521, + "grad_norm": 0.18023428320884705, + "learning_rate": 1.9925533335934488e-05, + "loss": 1.3192, + "step": 2266 + }, + { + "epoch": 0.6752173346488207, + "grad_norm": 0.18253910541534424, + "learning_rate": 1.9925415795125455e-05, + "loss": 1.3182, + "step": 2267 + }, + { + "epoch": 0.6755151808484894, + "grad_norm": 0.19268609583377838, + "learning_rate": 1.992529816197152e-05, + "loss": 1.3322, + "step": 2268 + }, + { + "epoch": 0.675813027048158, + "grad_norm": 0.18364225327968597, + "learning_rate": 1.992518043647377e-05, + "loss": 1.3308, + "step": 2269 + }, + { + "epoch": 0.6761108732478267, + "grad_norm": 0.17027676105499268, + "learning_rate": 1.9925062618633308e-05, + "loss": 1.3514, + "step": 2270 + }, + { + "epoch": 0.6764087194474953, + "grad_norm": 0.18000012636184692, + "learning_rate": 1.9924944708451225e-05, + "loss": 1.3038, + "step": 2271 + }, + { + "epoch": 0.6767065656471639, + "grad_norm": 0.19108423590660095, + "learning_rate": 1.9924826705928623e-05, + "loss": 1.3204, + "step": 2272 + }, + { + "epoch": 0.6770044118468326, + "grad_norm": 0.1780695766210556, + "learning_rate": 1.9924708611066595e-05, + "loss": 1.3524, + "step": 2273 + }, + { + "epoch": 0.6773022580465012, + "grad_norm": 0.1744093894958496, + "learning_rate": 1.9924590423866242e-05, + "loss": 1.3183, + "step": 2274 + }, + { + "epoch": 0.6776001042461699, + "grad_norm": 0.1840371936559677, + "learning_rate": 1.9924472144328667e-05, + "loss": 1.3395, + "step": 2275 + }, + { + "epoch": 0.6778979504458386, + "grad_norm": 0.18936192989349365, + "learning_rate": 1.9924353772454962e-05, + "loss": 1.3308, + "step": 2276 + }, + { + "epoch": 0.6781957966455072, + "grad_norm": 0.1820136308670044, + "learning_rate": 1.9924235308246237e-05, + "loss": 1.3396, + "step": 2277 + }, + { + "epoch": 0.6784936428451758, + "grad_norm": 0.18547116219997406, + "learning_rate": 1.9924116751703592e-05, + "loss": 1.3116, + "step": 2278 + }, + { + "epoch": 0.6787914890448444, + "grad_norm": 0.18258768320083618, + "learning_rate": 1.9923998102828125e-05, + "loss": 1.3307, + "step": 2279 + }, + { + "epoch": 0.6790893352445131, + "grad_norm": 0.1860014945268631, + "learning_rate": 1.992387936162094e-05, + "loss": 1.3164, + "step": 2280 + }, + { + "epoch": 0.6793871814441818, + "grad_norm": 0.1784329116344452, + "learning_rate": 1.9923760528083153e-05, + "loss": 1.3217, + "step": 2281 + }, + { + "epoch": 0.6796850276438504, + "grad_norm": 0.18610075116157532, + "learning_rate": 1.9923641602215857e-05, + "loss": 1.3416, + "step": 2282 + }, + { + "epoch": 0.6799828738435191, + "grad_norm": 0.1862253099679947, + "learning_rate": 1.9923522584020164e-05, + "loss": 1.3276, + "step": 2283 + }, + { + "epoch": 0.6802807200431877, + "grad_norm": 0.18593938648700714, + "learning_rate": 1.9923403473497182e-05, + "loss": 1.3112, + "step": 2284 + }, + { + "epoch": 0.6805785662428564, + "grad_norm": 0.1905030757188797, + "learning_rate": 1.9923284270648015e-05, + "loss": 1.3392, + "step": 2285 + }, + { + "epoch": 0.680876412442525, + "grad_norm": 0.18495959043502808, + "learning_rate": 1.992316497547378e-05, + "loss": 1.3148, + "step": 2286 + }, + { + "epoch": 0.6811742586421936, + "grad_norm": 0.18764932453632355, + "learning_rate": 1.9923045587975576e-05, + "loss": 1.3226, + "step": 2287 + }, + { + "epoch": 0.6814721048418623, + "grad_norm": 0.1735365390777588, + "learning_rate": 1.992292610815452e-05, + "loss": 1.3311, + "step": 2288 + }, + { + "epoch": 0.6817699510415309, + "grad_norm": 0.18366490304470062, + "learning_rate": 1.9922806536011724e-05, + "loss": 1.3356, + "step": 2289 + }, + { + "epoch": 0.6820677972411996, + "grad_norm": 0.17725488543510437, + "learning_rate": 1.9922686871548303e-05, + "loss": 1.3312, + "step": 2290 + }, + { + "epoch": 0.6823656434408683, + "grad_norm": 0.17958669364452362, + "learning_rate": 1.9922567114765362e-05, + "loss": 1.3238, + "step": 2291 + }, + { + "epoch": 0.6826634896405369, + "grad_norm": 0.18054373562335968, + "learning_rate": 1.9922447265664023e-05, + "loss": 1.3357, + "step": 2292 + }, + { + "epoch": 0.6829613358402055, + "grad_norm": 0.1845768392086029, + "learning_rate": 1.99223273242454e-05, + "loss": 1.336, + "step": 2293 + }, + { + "epoch": 0.6832591820398741, + "grad_norm": 0.16891595721244812, + "learning_rate": 1.9922207290510603e-05, + "loss": 1.3212, + "step": 2294 + }, + { + "epoch": 0.6835570282395428, + "grad_norm": 0.1755804568529129, + "learning_rate": 1.9922087164460755e-05, + "loss": 1.3164, + "step": 2295 + }, + { + "epoch": 0.6838548744392114, + "grad_norm": 0.18519924581050873, + "learning_rate": 1.992196694609697e-05, + "loss": 1.3306, + "step": 2296 + }, + { + "epoch": 0.6841527206388801, + "grad_norm": 0.18089953064918518, + "learning_rate": 1.992184663542037e-05, + "loss": 1.3381, + "step": 2297 + }, + { + "epoch": 0.6844505668385488, + "grad_norm": 0.18031135201454163, + "learning_rate": 1.9921726232432072e-05, + "loss": 1.3244, + "step": 2298 + }, + { + "epoch": 0.6847484130382174, + "grad_norm": 0.1838495433330536, + "learning_rate": 1.9921605737133197e-05, + "loss": 1.3282, + "step": 2299 + }, + { + "epoch": 0.6850462592378861, + "grad_norm": 0.17975088953971863, + "learning_rate": 1.9921485149524864e-05, + "loss": 1.3302, + "step": 2300 + }, + { + "epoch": 0.6853441054375546, + "grad_norm": 0.17442457377910614, + "learning_rate": 1.99213644696082e-05, + "loss": 1.3328, + "step": 2301 + }, + { + "epoch": 0.6856419516372233, + "grad_norm": 0.17792700231075287, + "learning_rate": 1.992124369738432e-05, + "loss": 1.3198, + "step": 2302 + }, + { + "epoch": 0.685939797836892, + "grad_norm": 0.17103061079978943, + "learning_rate": 1.9921122832854353e-05, + "loss": 1.3293, + "step": 2303 + }, + { + "epoch": 0.6862376440365606, + "grad_norm": 0.18080931901931763, + "learning_rate": 1.9921001876019425e-05, + "loss": 1.3322, + "step": 2304 + }, + { + "epoch": 0.6865354902362293, + "grad_norm": 0.17673060297966003, + "learning_rate": 1.9920880826880657e-05, + "loss": 1.3155, + "step": 2305 + }, + { + "epoch": 0.686833336435898, + "grad_norm": 0.18672189116477966, + "learning_rate": 1.9920759685439178e-05, + "loss": 1.3056, + "step": 2306 + }, + { + "epoch": 0.6871311826355666, + "grad_norm": 0.17934873700141907, + "learning_rate": 1.9920638451696112e-05, + "loss": 1.3379, + "step": 2307 + }, + { + "epoch": 0.6874290288352352, + "grad_norm": 0.1873992532491684, + "learning_rate": 1.9920517125652594e-05, + "loss": 1.337, + "step": 2308 + }, + { + "epoch": 0.6877268750349038, + "grad_norm": 0.18944714963436127, + "learning_rate": 1.9920395707309743e-05, + "loss": 1.3169, + "step": 2309 + }, + { + "epoch": 0.6880247212345725, + "grad_norm": 0.18874067068099976, + "learning_rate": 1.9920274196668696e-05, + "loss": 1.327, + "step": 2310 + }, + { + "epoch": 0.6883225674342411, + "grad_norm": 0.17803321778774261, + "learning_rate": 1.9920152593730582e-05, + "loss": 1.3236, + "step": 2311 + }, + { + "epoch": 0.6886204136339098, + "grad_norm": 0.17799226939678192, + "learning_rate": 1.9920030898496532e-05, + "loss": 1.323, + "step": 2312 + }, + { + "epoch": 0.6889182598335785, + "grad_norm": 0.18605384230613708, + "learning_rate": 1.9919909110967676e-05, + "loss": 1.3214, + "step": 2313 + }, + { + "epoch": 0.6892161060332471, + "grad_norm": 0.1801423281431198, + "learning_rate": 1.9919787231145147e-05, + "loss": 1.3154, + "step": 2314 + }, + { + "epoch": 0.6895139522329158, + "grad_norm": 0.1903238594532013, + "learning_rate": 1.9919665259030084e-05, + "loss": 1.3088, + "step": 2315 + }, + { + "epoch": 0.6898117984325843, + "grad_norm": 0.18387939035892487, + "learning_rate": 1.991954319462362e-05, + "loss": 1.3234, + "step": 2316 + }, + { + "epoch": 0.690109644632253, + "grad_norm": 0.18068253993988037, + "learning_rate": 1.9919421037926885e-05, + "loss": 1.3444, + "step": 2317 + }, + { + "epoch": 0.6904074908319217, + "grad_norm": 0.18537044525146484, + "learning_rate": 1.991929878894102e-05, + "loss": 1.3246, + "step": 2318 + }, + { + "epoch": 0.6907053370315903, + "grad_norm": 0.19145667552947998, + "learning_rate": 1.9919176447667167e-05, + "loss": 1.3397, + "step": 2319 + }, + { + "epoch": 0.691003183231259, + "grad_norm": 0.18210239708423615, + "learning_rate": 1.9919054014106457e-05, + "loss": 1.3469, + "step": 2320 + }, + { + "epoch": 0.6913010294309276, + "grad_norm": 0.1801193505525589, + "learning_rate": 1.9918931488260028e-05, + "loss": 1.3235, + "step": 2321 + }, + { + "epoch": 0.6915988756305963, + "grad_norm": 0.18036174774169922, + "learning_rate": 1.991880887012903e-05, + "loss": 1.3377, + "step": 2322 + }, + { + "epoch": 0.6918967218302648, + "grad_norm": 0.1823994517326355, + "learning_rate": 1.9918686159714596e-05, + "loss": 1.3342, + "step": 2323 + }, + { + "epoch": 0.6921945680299335, + "grad_norm": 0.1893562525510788, + "learning_rate": 1.9918563357017863e-05, + "loss": 1.3241, + "step": 2324 + }, + { + "epoch": 0.6924924142296022, + "grad_norm": 0.17952671647071838, + "learning_rate": 1.9918440462039984e-05, + "loss": 1.3393, + "step": 2325 + }, + { + "epoch": 0.6927902604292708, + "grad_norm": 0.1820048838853836, + "learning_rate": 1.99183174747821e-05, + "loss": 1.3284, + "step": 2326 + }, + { + "epoch": 0.6930881066289395, + "grad_norm": 0.18531034886837006, + "learning_rate": 1.9918194395245352e-05, + "loss": 1.3036, + "step": 2327 + }, + { + "epoch": 0.6933859528286082, + "grad_norm": 0.1763806790113449, + "learning_rate": 1.9918071223430885e-05, + "loss": 1.3334, + "step": 2328 + }, + { + "epoch": 0.6936837990282768, + "grad_norm": 0.17299970984458923, + "learning_rate": 1.9917947959339846e-05, + "loss": 1.3233, + "step": 2329 + }, + { + "epoch": 0.6939816452279454, + "grad_norm": 0.18826264142990112, + "learning_rate": 1.9917824602973387e-05, + "loss": 1.3555, + "step": 2330 + }, + { + "epoch": 0.694279491427614, + "grad_norm": 0.17757001519203186, + "learning_rate": 1.9917701154332646e-05, + "loss": 1.3157, + "step": 2331 + }, + { + "epoch": 0.6945773376272827, + "grad_norm": 0.18596325814723969, + "learning_rate": 1.9917577613418778e-05, + "loss": 1.3406, + "step": 2332 + }, + { + "epoch": 0.6948751838269513, + "grad_norm": 0.18018612265586853, + "learning_rate": 1.991745398023293e-05, + "loss": 1.3295, + "step": 2333 + }, + { + "epoch": 0.69517303002662, + "grad_norm": 0.1862306296825409, + "learning_rate": 1.9917330254776254e-05, + "loss": 1.3308, + "step": 2334 + }, + { + "epoch": 0.6954708762262887, + "grad_norm": 0.18988408148288727, + "learning_rate": 1.99172064370499e-05, + "loss": 1.3511, + "step": 2335 + }, + { + "epoch": 0.6957687224259573, + "grad_norm": 0.18056082725524902, + "learning_rate": 1.991708252705502e-05, + "loss": 1.3219, + "step": 2336 + }, + { + "epoch": 0.696066568625626, + "grad_norm": 0.19533643126487732, + "learning_rate": 1.991695852479277e-05, + "loss": 1.325, + "step": 2337 + }, + { + "epoch": 0.6963644148252945, + "grad_norm": 0.1862207055091858, + "learning_rate": 1.9916834430264296e-05, + "loss": 1.3157, + "step": 2338 + }, + { + "epoch": 0.6966622610249632, + "grad_norm": 0.18456688523292542, + "learning_rate": 1.991671024347076e-05, + "loss": 1.3158, + "step": 2339 + }, + { + "epoch": 0.6969601072246319, + "grad_norm": 0.18564829230308533, + "learning_rate": 1.9916585964413317e-05, + "loss": 1.3379, + "step": 2340 + }, + { + "epoch": 0.6972579534243005, + "grad_norm": 0.18239466845989227, + "learning_rate": 1.9916461593093115e-05, + "loss": 1.3352, + "step": 2341 + }, + { + "epoch": 0.6975557996239692, + "grad_norm": 0.18152117729187012, + "learning_rate": 1.9916337129511323e-05, + "loss": 1.3309, + "step": 2342 + }, + { + "epoch": 0.6978536458236378, + "grad_norm": 0.17583444714546204, + "learning_rate": 1.9916212573669093e-05, + "loss": 1.321, + "step": 2343 + }, + { + "epoch": 0.6981514920233065, + "grad_norm": 0.19174323976039886, + "learning_rate": 1.991608792556758e-05, + "loss": 1.3311, + "step": 2344 + }, + { + "epoch": 0.6984493382229751, + "grad_norm": 0.20540690422058105, + "learning_rate": 1.9915963185207948e-05, + "loss": 1.3446, + "step": 2345 + }, + { + "epoch": 0.6987471844226437, + "grad_norm": 0.18162190914154053, + "learning_rate": 1.991583835259136e-05, + "loss": 1.3302, + "step": 2346 + }, + { + "epoch": 0.6990450306223124, + "grad_norm": 0.18038925528526306, + "learning_rate": 1.9915713427718972e-05, + "loss": 1.319, + "step": 2347 + }, + { + "epoch": 0.699342876821981, + "grad_norm": 0.18839231133460999, + "learning_rate": 1.9915588410591954e-05, + "loss": 1.3175, + "step": 2348 + }, + { + "epoch": 0.6996407230216497, + "grad_norm": 0.19137445092201233, + "learning_rate": 1.991546330121146e-05, + "loss": 1.3184, + "step": 2349 + }, + { + "epoch": 0.6999385692213184, + "grad_norm": 0.19656787812709808, + "learning_rate": 1.991533809957866e-05, + "loss": 1.3182, + "step": 2350 + }, + { + "epoch": 0.700236415420987, + "grad_norm": 0.1719386726617813, + "learning_rate": 1.9915212805694715e-05, + "loss": 1.3273, + "step": 2351 + }, + { + "epoch": 0.7005342616206557, + "grad_norm": 0.18107326328754425, + "learning_rate": 1.9915087419560795e-05, + "loss": 1.326, + "step": 2352 + }, + { + "epoch": 0.7008321078203242, + "grad_norm": 0.18605652451515198, + "learning_rate": 1.9914961941178062e-05, + "loss": 1.3163, + "step": 2353 + }, + { + "epoch": 0.7011299540199929, + "grad_norm": 0.18013206124305725, + "learning_rate": 1.9914836370547688e-05, + "loss": 1.3263, + "step": 2354 + }, + { + "epoch": 0.7014278002196616, + "grad_norm": 0.18930692970752716, + "learning_rate": 1.991471070767084e-05, + "loss": 1.3471, + "step": 2355 + }, + { + "epoch": 0.7017256464193302, + "grad_norm": 0.17976365983486176, + "learning_rate": 1.9914584952548682e-05, + "loss": 1.3229, + "step": 2356 + }, + { + "epoch": 0.7020234926189989, + "grad_norm": 0.18191921710968018, + "learning_rate": 1.991445910518239e-05, + "loss": 1.3168, + "step": 2357 + }, + { + "epoch": 0.7023213388186675, + "grad_norm": 0.19520500302314758, + "learning_rate": 1.9914333165573136e-05, + "loss": 1.3367, + "step": 2358 + }, + { + "epoch": 0.7026191850183362, + "grad_norm": 0.1778630167245865, + "learning_rate": 1.9914207133722086e-05, + "loss": 1.3095, + "step": 2359 + }, + { + "epoch": 0.7029170312180048, + "grad_norm": 0.1767389327287674, + "learning_rate": 1.9914081009630413e-05, + "loss": 1.307, + "step": 2360 + }, + { + "epoch": 0.7032148774176734, + "grad_norm": 0.17485411465168, + "learning_rate": 1.99139547932993e-05, + "loss": 1.3238, + "step": 2361 + }, + { + "epoch": 0.7035127236173421, + "grad_norm": 0.17325006425380707, + "learning_rate": 1.991382848472991e-05, + "loss": 1.3139, + "step": 2362 + }, + { + "epoch": 0.7038105698170107, + "grad_norm": 0.18308599293231964, + "learning_rate": 1.9913702083923422e-05, + "loss": 1.3335, + "step": 2363 + }, + { + "epoch": 0.7041084160166794, + "grad_norm": 0.1821424812078476, + "learning_rate": 1.9913575590881013e-05, + "loss": 1.3053, + "step": 2364 + }, + { + "epoch": 0.7044062622163481, + "grad_norm": 0.18681998550891876, + "learning_rate": 1.9913449005603857e-05, + "loss": 1.3132, + "step": 2365 + }, + { + "epoch": 0.7047041084160167, + "grad_norm": 0.18637306988239288, + "learning_rate": 1.9913322328093138e-05, + "loss": 1.2985, + "step": 2366 + }, + { + "epoch": 0.7050019546156854, + "grad_norm": 0.19341403245925903, + "learning_rate": 1.9913195558350028e-05, + "loss": 1.3215, + "step": 2367 + }, + { + "epoch": 0.7052998008153539, + "grad_norm": 0.1809716671705246, + "learning_rate": 1.9913068696375706e-05, + "loss": 1.3319, + "step": 2368 + }, + { + "epoch": 0.7055976470150226, + "grad_norm": 0.18493227660655975, + "learning_rate": 1.9912941742171362e-05, + "loss": 1.3293, + "step": 2369 + }, + { + "epoch": 0.7058954932146912, + "grad_norm": 0.195916086435318, + "learning_rate": 1.9912814695738167e-05, + "loss": 1.3296, + "step": 2370 + }, + { + "epoch": 0.7061933394143599, + "grad_norm": 0.1875908225774765, + "learning_rate": 1.9912687557077307e-05, + "loss": 1.321, + "step": 2371 + }, + { + "epoch": 0.7064911856140286, + "grad_norm": 0.20055951178073883, + "learning_rate": 1.9912560326189966e-05, + "loss": 1.318, + "step": 2372 + }, + { + "epoch": 0.7067890318136972, + "grad_norm": 0.1841721534729004, + "learning_rate": 1.9912433003077324e-05, + "loss": 1.3118, + "step": 2373 + }, + { + "epoch": 0.7070868780133659, + "grad_norm": 0.18234992027282715, + "learning_rate": 1.991230558774057e-05, + "loss": 1.3244, + "step": 2374 + }, + { + "epoch": 0.7073847242130344, + "grad_norm": 0.1869121789932251, + "learning_rate": 1.9912178080180883e-05, + "loss": 1.313, + "step": 2375 + }, + { + "epoch": 0.7076825704127031, + "grad_norm": 0.19375371932983398, + "learning_rate": 1.9912050480399458e-05, + "loss": 1.3361, + "step": 2376 + }, + { + "epoch": 0.7079804166123718, + "grad_norm": 0.18816982209682465, + "learning_rate": 1.9911922788397473e-05, + "loss": 1.343, + "step": 2377 + }, + { + "epoch": 0.7082782628120404, + "grad_norm": 0.18643729388713837, + "learning_rate": 1.991179500417612e-05, + "loss": 1.3338, + "step": 2378 + }, + { + "epoch": 0.7085761090117091, + "grad_norm": 0.18068727850914001, + "learning_rate": 1.9911667127736594e-05, + "loss": 1.3202, + "step": 2379 + }, + { + "epoch": 0.7088739552113777, + "grad_norm": 0.1867077499628067, + "learning_rate": 1.991153915908008e-05, + "loss": 1.3307, + "step": 2380 + }, + { + "epoch": 0.7091718014110464, + "grad_norm": 0.1902526617050171, + "learning_rate": 1.9911411098207765e-05, + "loss": 1.3224, + "step": 2381 + }, + { + "epoch": 0.7094696476107151, + "grad_norm": 0.18281622231006622, + "learning_rate": 1.991128294512084e-05, + "loss": 1.3156, + "step": 2382 + }, + { + "epoch": 0.7097674938103836, + "grad_norm": 0.17584648728370667, + "learning_rate": 1.9911154699820506e-05, + "loss": 1.3338, + "step": 2383 + }, + { + "epoch": 0.7100653400100523, + "grad_norm": 0.17479106783866882, + "learning_rate": 1.991102636230795e-05, + "loss": 1.3211, + "step": 2384 + }, + { + "epoch": 0.7103631862097209, + "grad_norm": 0.18454335629940033, + "learning_rate": 1.9910897932584367e-05, + "loss": 1.3318, + "step": 2385 + }, + { + "epoch": 0.7106610324093896, + "grad_norm": 0.18156906962394714, + "learning_rate": 1.991076941065095e-05, + "loss": 1.3362, + "step": 2386 + }, + { + "epoch": 0.7109588786090583, + "grad_norm": 0.18315784633159637, + "learning_rate": 1.9910640796508897e-05, + "loss": 1.3266, + "step": 2387 + }, + { + "epoch": 0.7112567248087269, + "grad_norm": 0.17966997623443604, + "learning_rate": 1.9910512090159405e-05, + "loss": 1.3111, + "step": 2388 + }, + { + "epoch": 0.7115545710083956, + "grad_norm": 0.19016844034194946, + "learning_rate": 1.991038329160367e-05, + "loss": 1.3338, + "step": 2389 + }, + { + "epoch": 0.7118524172080641, + "grad_norm": 0.17686402797698975, + "learning_rate": 1.991025440084289e-05, + "loss": 1.3293, + "step": 2390 + }, + { + "epoch": 0.7121502634077328, + "grad_norm": 0.17823560535907745, + "learning_rate": 1.991012541787827e-05, + "loss": 1.319, + "step": 2391 + }, + { + "epoch": 0.7124481096074015, + "grad_norm": 0.18323731422424316, + "learning_rate": 1.9909996342711e-05, + "loss": 1.3212, + "step": 2392 + }, + { + "epoch": 0.7127459558070701, + "grad_norm": 0.19072403013706207, + "learning_rate": 1.990986717534229e-05, + "loss": 1.3262, + "step": 2393 + }, + { + "epoch": 0.7130438020067388, + "grad_norm": 0.18610836565494537, + "learning_rate": 1.9909737915773335e-05, + "loss": 1.3136, + "step": 2394 + }, + { + "epoch": 0.7133416482064074, + "grad_norm": 0.18933738768100739, + "learning_rate": 1.990960856400534e-05, + "loss": 1.3163, + "step": 2395 + }, + { + "epoch": 0.7136394944060761, + "grad_norm": 0.1870865523815155, + "learning_rate": 1.990947912003951e-05, + "loss": 1.2943, + "step": 2396 + }, + { + "epoch": 0.7139373406057448, + "grad_norm": 0.18337379395961761, + "learning_rate": 1.9909349583877053e-05, + "loss": 1.3244, + "step": 2397 + }, + { + "epoch": 0.7142351868054133, + "grad_norm": 0.19369252026081085, + "learning_rate": 1.990921995551916e-05, + "loss": 1.3224, + "step": 2398 + }, + { + "epoch": 0.714533033005082, + "grad_norm": 0.19757018983364105, + "learning_rate": 1.9909090234967053e-05, + "loss": 1.3196, + "step": 2399 + }, + { + "epoch": 0.7148308792047506, + "grad_norm": 0.17689107358455658, + "learning_rate": 1.9908960422221932e-05, + "loss": 1.3209, + "step": 2400 + }, + { + "epoch": 0.7151287254044193, + "grad_norm": 0.18062490224838257, + "learning_rate": 1.9908830517285007e-05, + "loss": 1.317, + "step": 2401 + }, + { + "epoch": 0.715426571604088, + "grad_norm": 0.1801805943250656, + "learning_rate": 1.990870052015748e-05, + "loss": 1.3143, + "step": 2402 + }, + { + "epoch": 0.7157244178037566, + "grad_norm": 0.18121488392353058, + "learning_rate": 1.9908570430840567e-05, + "loss": 1.3135, + "step": 2403 + }, + { + "epoch": 0.7160222640034253, + "grad_norm": 0.19454193115234375, + "learning_rate": 1.9908440249335478e-05, + "loss": 1.3282, + "step": 2404 + }, + { + "epoch": 0.7163201102030938, + "grad_norm": 0.1717558354139328, + "learning_rate": 1.990830997564342e-05, + "loss": 1.318, + "step": 2405 + }, + { + "epoch": 0.7166179564027625, + "grad_norm": 0.18550734221935272, + "learning_rate": 1.990817960976561e-05, + "loss": 1.3045, + "step": 2406 + }, + { + "epoch": 0.7169158026024312, + "grad_norm": 0.17859478294849396, + "learning_rate": 1.990804915170326e-05, + "loss": 1.324, + "step": 2407 + }, + { + "epoch": 0.7172136488020998, + "grad_norm": 0.1804829239845276, + "learning_rate": 1.990791860145758e-05, + "loss": 1.3131, + "step": 2408 + }, + { + "epoch": 0.7175114950017685, + "grad_norm": 0.19894477725028992, + "learning_rate": 1.990778795902979e-05, + "loss": 1.3088, + "step": 2409 + }, + { + "epoch": 0.7178093412014371, + "grad_norm": 0.1904131919145584, + "learning_rate": 1.99076572244211e-05, + "loss": 1.3081, + "step": 2410 + }, + { + "epoch": 0.7181071874011058, + "grad_norm": 0.18772949278354645, + "learning_rate": 1.990752639763273e-05, + "loss": 1.3228, + "step": 2411 + }, + { + "epoch": 0.7184050336007743, + "grad_norm": 0.1856239140033722, + "learning_rate": 1.9907395478665896e-05, + "loss": 1.3304, + "step": 2412 + }, + { + "epoch": 0.718702879800443, + "grad_norm": 0.1815764456987381, + "learning_rate": 1.9907264467521817e-05, + "loss": 1.3251, + "step": 2413 + }, + { + "epoch": 0.7190007260001117, + "grad_norm": 0.18634894490242004, + "learning_rate": 1.9907133364201712e-05, + "loss": 1.302, + "step": 2414 + }, + { + "epoch": 0.7192985721997803, + "grad_norm": 0.18666182458400726, + "learning_rate": 1.9907002168706798e-05, + "loss": 1.3212, + "step": 2415 + }, + { + "epoch": 0.719596418399449, + "grad_norm": 0.18581783771514893, + "learning_rate": 1.9906870881038297e-05, + "loss": 1.3433, + "step": 2416 + }, + { + "epoch": 0.7198942645991177, + "grad_norm": 0.19481654465198517, + "learning_rate": 1.9906739501197435e-05, + "loss": 1.311, + "step": 2417 + }, + { + "epoch": 0.7201921107987863, + "grad_norm": 0.1938018798828125, + "learning_rate": 1.9906608029185427e-05, + "loss": 1.32, + "step": 2418 + }, + { + "epoch": 0.720489956998455, + "grad_norm": 0.18101400136947632, + "learning_rate": 1.9906476465003498e-05, + "loss": 1.3321, + "step": 2419 + }, + { + "epoch": 0.7207878031981235, + "grad_norm": 0.17936542630195618, + "learning_rate": 1.9906344808652878e-05, + "loss": 1.3215, + "step": 2420 + }, + { + "epoch": 0.7210856493977922, + "grad_norm": 0.20500177145004272, + "learning_rate": 1.9906213060134787e-05, + "loss": 1.3109, + "step": 2421 + }, + { + "epoch": 0.7213834955974608, + "grad_norm": 0.21439610421657562, + "learning_rate": 1.990608121945045e-05, + "loss": 1.3226, + "step": 2422 + }, + { + "epoch": 0.7216813417971295, + "grad_norm": 0.18862423300743103, + "learning_rate": 1.9905949286601095e-05, + "loss": 1.3144, + "step": 2423 + }, + { + "epoch": 0.7219791879967982, + "grad_norm": 0.19281642138957977, + "learning_rate": 1.9905817261587947e-05, + "loss": 1.3357, + "step": 2424 + }, + { + "epoch": 0.7222770341964668, + "grad_norm": 0.19567199051380157, + "learning_rate": 1.990568514441224e-05, + "loss": 1.3178, + "step": 2425 + }, + { + "epoch": 0.7225748803961355, + "grad_norm": 0.19655363261699677, + "learning_rate": 1.99055529350752e-05, + "loss": 1.3181, + "step": 2426 + }, + { + "epoch": 0.722872726595804, + "grad_norm": 0.19821792840957642, + "learning_rate": 1.9905420633578055e-05, + "loss": 1.3031, + "step": 2427 + }, + { + "epoch": 0.7231705727954727, + "grad_norm": 0.1845242828130722, + "learning_rate": 1.990528823992204e-05, + "loss": 1.3362, + "step": 2428 + }, + { + "epoch": 0.7234684189951414, + "grad_norm": 0.20102176070213318, + "learning_rate": 1.9905155754108385e-05, + "loss": 1.3204, + "step": 2429 + }, + { + "epoch": 0.72376626519481, + "grad_norm": 0.19908630847930908, + "learning_rate": 1.990502317613832e-05, + "loss": 1.3232, + "step": 2430 + }, + { + "epoch": 0.7240641113944787, + "grad_norm": 0.19704653322696686, + "learning_rate": 1.9904890506013084e-05, + "loss": 1.3116, + "step": 2431 + }, + { + "epoch": 0.7243619575941473, + "grad_norm": 0.1876095086336136, + "learning_rate": 1.9904757743733903e-05, + "loss": 1.3309, + "step": 2432 + }, + { + "epoch": 0.724659803793816, + "grad_norm": 0.20279152691364288, + "learning_rate": 1.9904624889302024e-05, + "loss": 1.329, + "step": 2433 + }, + { + "epoch": 0.7249576499934847, + "grad_norm": 0.18088439106941223, + "learning_rate": 1.9904491942718672e-05, + "loss": 1.3312, + "step": 2434 + }, + { + "epoch": 0.7252554961931532, + "grad_norm": 0.18198047578334808, + "learning_rate": 1.9904358903985092e-05, + "loss": 1.3251, + "step": 2435 + }, + { + "epoch": 0.7255533423928219, + "grad_norm": 0.18865373730659485, + "learning_rate": 1.9904225773102516e-05, + "loss": 1.3314, + "step": 2436 + }, + { + "epoch": 0.7258511885924905, + "grad_norm": 0.19915615022182465, + "learning_rate": 1.9904092550072184e-05, + "loss": 1.3215, + "step": 2437 + }, + { + "epoch": 0.7261490347921592, + "grad_norm": 0.18113912642002106, + "learning_rate": 1.9903959234895337e-05, + "loss": 1.3152, + "step": 2438 + }, + { + "epoch": 0.7264468809918279, + "grad_norm": 0.19505468010902405, + "learning_rate": 1.9903825827573215e-05, + "loss": 1.3097, + "step": 2439 + }, + { + "epoch": 0.7267447271914965, + "grad_norm": 0.18470172584056854, + "learning_rate": 1.990369232810706e-05, + "loss": 1.3206, + "step": 2440 + }, + { + "epoch": 0.7270425733911652, + "grad_norm": 0.19797997176647186, + "learning_rate": 1.990355873649811e-05, + "loss": 1.3269, + "step": 2441 + }, + { + "epoch": 0.7273404195908337, + "grad_norm": 0.18900243937969208, + "learning_rate": 1.9903425052747613e-05, + "loss": 1.3422, + "step": 2442 + }, + { + "epoch": 0.7276382657905024, + "grad_norm": 0.1780720353126526, + "learning_rate": 1.9903291276856813e-05, + "loss": 1.3131, + "step": 2443 + }, + { + "epoch": 0.727936111990171, + "grad_norm": 0.18734505772590637, + "learning_rate": 1.990315740882695e-05, + "loss": 1.3211, + "step": 2444 + }, + { + "epoch": 0.7282339581898397, + "grad_norm": 0.19354982674121857, + "learning_rate": 1.990302344865927e-05, + "loss": 1.3187, + "step": 2445 + }, + { + "epoch": 0.7285318043895084, + "grad_norm": 0.18596869707107544, + "learning_rate": 1.9902889396355023e-05, + "loss": 1.3262, + "step": 2446 + }, + { + "epoch": 0.728829650589177, + "grad_norm": 0.18094055354595184, + "learning_rate": 1.9902755251915454e-05, + "loss": 1.3255, + "step": 2447 + }, + { + "epoch": 0.7291274967888457, + "grad_norm": 0.1835775524377823, + "learning_rate": 1.9902621015341812e-05, + "loss": 1.3135, + "step": 2448 + }, + { + "epoch": 0.7294253429885144, + "grad_norm": 0.18673677742481232, + "learning_rate": 1.9902486686635342e-05, + "loss": 1.3205, + "step": 2449 + }, + { + "epoch": 0.7297231891881829, + "grad_norm": 0.18784306943416595, + "learning_rate": 1.99023522657973e-05, + "loss": 1.3132, + "step": 2450 + }, + { + "epoch": 0.7300210353878516, + "grad_norm": 0.19159583747386932, + "learning_rate": 1.9902217752828935e-05, + "loss": 1.3469, + "step": 2451 + }, + { + "epoch": 0.7303188815875202, + "grad_norm": 0.1930502951145172, + "learning_rate": 1.9902083147731495e-05, + "loss": 1.3247, + "step": 2452 + }, + { + "epoch": 0.7306167277871889, + "grad_norm": 0.18336597084999084, + "learning_rate": 1.9901948450506238e-05, + "loss": 1.3292, + "step": 2453 + }, + { + "epoch": 0.7309145739868576, + "grad_norm": 0.20361313223838806, + "learning_rate": 1.9901813661154408e-05, + "loss": 1.3202, + "step": 2454 + }, + { + "epoch": 0.7312124201865262, + "grad_norm": 0.20657707750797272, + "learning_rate": 1.9901678779677266e-05, + "loss": 1.3271, + "step": 2455 + }, + { + "epoch": 0.7315102663861949, + "grad_norm": 0.19663913547992706, + "learning_rate": 1.990154380607607e-05, + "loss": 1.3254, + "step": 2456 + }, + { + "epoch": 0.7318081125858634, + "grad_norm": 0.19350913166999817, + "learning_rate": 1.990140874035207e-05, + "loss": 1.317, + "step": 2457 + }, + { + "epoch": 0.7321059587855321, + "grad_norm": 0.1959448605775833, + "learning_rate": 1.990127358250652e-05, + "loss": 1.3388, + "step": 2458 + }, + { + "epoch": 0.7324038049852007, + "grad_norm": 0.1914556324481964, + "learning_rate": 1.9901138332540685e-05, + "loss": 1.3193, + "step": 2459 + }, + { + "epoch": 0.7327016511848694, + "grad_norm": 0.19220644235610962, + "learning_rate": 1.9901002990455822e-05, + "loss": 1.3096, + "step": 2460 + }, + { + "epoch": 0.7329994973845381, + "grad_norm": 0.17991133034229279, + "learning_rate": 1.9900867556253188e-05, + "loss": 1.3168, + "step": 2461 + }, + { + "epoch": 0.7332973435842067, + "grad_norm": 0.18685564398765564, + "learning_rate": 1.9900732029934038e-05, + "loss": 1.3407, + "step": 2462 + }, + { + "epoch": 0.7335951897838754, + "grad_norm": 0.1979990303516388, + "learning_rate": 1.9900596411499644e-05, + "loss": 1.3124, + "step": 2463 + }, + { + "epoch": 0.733893035983544, + "grad_norm": 0.22801139950752258, + "learning_rate": 1.9900460700951257e-05, + "loss": 1.3229, + "step": 2464 + }, + { + "epoch": 0.7341908821832126, + "grad_norm": 0.19433832168579102, + "learning_rate": 1.990032489829015e-05, + "loss": 1.331, + "step": 2465 + }, + { + "epoch": 0.7344887283828813, + "grad_norm": 0.18098841607570648, + "learning_rate": 1.9900189003517578e-05, + "loss": 1.3282, + "step": 2466 + }, + { + "epoch": 0.7347865745825499, + "grad_norm": 0.18603920936584473, + "learning_rate": 1.990005301663481e-05, + "loss": 1.3258, + "step": 2467 + }, + { + "epoch": 0.7350844207822186, + "grad_norm": 0.19751660525798798, + "learning_rate": 1.9899916937643107e-05, + "loss": 1.3203, + "step": 2468 + }, + { + "epoch": 0.7353822669818872, + "grad_norm": 0.1913176327943802, + "learning_rate": 1.989978076654374e-05, + "loss": 1.3297, + "step": 2469 + }, + { + "epoch": 0.7356801131815559, + "grad_norm": 0.19215697050094604, + "learning_rate": 1.989964450333797e-05, + "loss": 1.3196, + "step": 2470 + }, + { + "epoch": 0.7359779593812246, + "grad_norm": 0.19319573044776917, + "learning_rate": 1.9899508148027075e-05, + "loss": 1.3402, + "step": 2471 + }, + { + "epoch": 0.7362758055808931, + "grad_norm": 0.19183918833732605, + "learning_rate": 1.9899371700612313e-05, + "loss": 1.3425, + "step": 2472 + }, + { + "epoch": 0.7365736517805618, + "grad_norm": 0.17503371834754944, + "learning_rate": 1.989923516109496e-05, + "loss": 1.3207, + "step": 2473 + }, + { + "epoch": 0.7368714979802304, + "grad_norm": 0.1884661465883255, + "learning_rate": 1.989909852947628e-05, + "loss": 1.329, + "step": 2474 + }, + { + "epoch": 0.7371693441798991, + "grad_norm": 0.19076471030712128, + "learning_rate": 1.9898961805757547e-05, + "loss": 1.3061, + "step": 2475 + }, + { + "epoch": 0.7374671903795678, + "grad_norm": 0.1796417236328125, + "learning_rate": 1.989882498994004e-05, + "loss": 1.3201, + "step": 2476 + }, + { + "epoch": 0.7377650365792364, + "grad_norm": 0.18640348315238953, + "learning_rate": 1.9898688082025024e-05, + "loss": 1.3346, + "step": 2477 + }, + { + "epoch": 0.7380628827789051, + "grad_norm": 0.20385372638702393, + "learning_rate": 1.9898551082013774e-05, + "loss": 1.3182, + "step": 2478 + }, + { + "epoch": 0.7383607289785737, + "grad_norm": 0.17771016061306, + "learning_rate": 1.9898413989907563e-05, + "loss": 1.3293, + "step": 2479 + }, + { + "epoch": 0.7386585751782423, + "grad_norm": 0.17758183181285858, + "learning_rate": 1.9898276805707673e-05, + "loss": 1.3371, + "step": 2480 + }, + { + "epoch": 0.738956421377911, + "grad_norm": 0.18944403529167175, + "learning_rate": 1.9898139529415374e-05, + "loss": 1.3006, + "step": 2481 + }, + { + "epoch": 0.7392542675775796, + "grad_norm": 0.18202091753482819, + "learning_rate": 1.9898002161031948e-05, + "loss": 1.3255, + "step": 2482 + }, + { + "epoch": 0.7395521137772483, + "grad_norm": 0.1849079728126526, + "learning_rate": 1.989786470055867e-05, + "loss": 1.3168, + "step": 2483 + }, + { + "epoch": 0.7398499599769169, + "grad_norm": 0.19304360449314117, + "learning_rate": 1.9897727147996817e-05, + "loss": 1.3296, + "step": 2484 + }, + { + "epoch": 0.7401478061765856, + "grad_norm": 0.18227414786815643, + "learning_rate": 1.9897589503347673e-05, + "loss": 1.3093, + "step": 2485 + }, + { + "epoch": 0.7404456523762543, + "grad_norm": 0.2001018077135086, + "learning_rate": 1.9897451766612515e-05, + "loss": 1.3346, + "step": 2486 + }, + { + "epoch": 0.7407434985759228, + "grad_norm": 0.18903778493404388, + "learning_rate": 1.989731393779263e-05, + "loss": 1.3431, + "step": 2487 + }, + { + "epoch": 0.7410413447755915, + "grad_norm": 0.19867846369743347, + "learning_rate": 1.9897176016889296e-05, + "loss": 1.3341, + "step": 2488 + }, + { + "epoch": 0.7413391909752601, + "grad_norm": 0.19320452213287354, + "learning_rate": 1.9897038003903795e-05, + "loss": 1.3235, + "step": 2489 + }, + { + "epoch": 0.7416370371749288, + "grad_norm": 0.18547265231609344, + "learning_rate": 1.989689989883741e-05, + "loss": 1.3359, + "step": 2490 + }, + { + "epoch": 0.7419348833745975, + "grad_norm": 0.18041522800922394, + "learning_rate": 1.9896761701691437e-05, + "loss": 1.303, + "step": 2491 + }, + { + "epoch": 0.7422327295742661, + "grad_norm": 0.1961670070886612, + "learning_rate": 1.9896623412467147e-05, + "loss": 1.3224, + "step": 2492 + }, + { + "epoch": 0.7425305757739348, + "grad_norm": 0.19458021223545074, + "learning_rate": 1.9896485031165836e-05, + "loss": 1.3336, + "step": 2493 + }, + { + "epoch": 0.7428284219736033, + "grad_norm": 0.1804085075855255, + "learning_rate": 1.989634655778879e-05, + "loss": 1.3176, + "step": 2494 + }, + { + "epoch": 0.743126268173272, + "grad_norm": 0.20877377688884735, + "learning_rate": 1.9896207992337296e-05, + "loss": 1.3213, + "step": 2495 + }, + { + "epoch": 0.7434241143729406, + "grad_norm": 0.20411789417266846, + "learning_rate": 1.9896069334812638e-05, + "loss": 1.3265, + "step": 2496 + }, + { + "epoch": 0.7437219605726093, + "grad_norm": 0.18133004009723663, + "learning_rate": 1.9895930585216116e-05, + "loss": 1.3338, + "step": 2497 + }, + { + "epoch": 0.744019806772278, + "grad_norm": 0.18506641685962677, + "learning_rate": 1.9895791743549017e-05, + "loss": 1.3357, + "step": 2498 + }, + { + "epoch": 0.7443176529719466, + "grad_norm": 0.18748866021633148, + "learning_rate": 1.989565280981263e-05, + "loss": 1.3157, + "step": 2499 + }, + { + "epoch": 0.7446154991716153, + "grad_norm": 0.19544926285743713, + "learning_rate": 1.989551378400825e-05, + "loss": 1.3171, + "step": 2500 + }, + { + "epoch": 0.7446154991716153, + "eval_loss": 1.3665268421173096, + "eval_runtime": 19.5702, + "eval_samples_per_second": 88.604, + "eval_steps_per_second": 5.57, + "step": 2500 + }, + { + "epoch": 0.744913345371284, + "grad_norm": 0.18737247586250305, + "learning_rate": 1.989537466613717e-05, + "loss": 1.3311, + "step": 2501 + }, + { + "epoch": 0.7452111915709525, + "grad_norm": 0.183830127120018, + "learning_rate": 1.9895235456200685e-05, + "loss": 1.3178, + "step": 2502 + }, + { + "epoch": 0.7455090377706212, + "grad_norm": 0.18429724872112274, + "learning_rate": 1.989509615420009e-05, + "loss": 1.3013, + "step": 2503 + }, + { + "epoch": 0.7458068839702898, + "grad_norm": 0.1926550567150116, + "learning_rate": 1.9894956760136682e-05, + "loss": 1.3067, + "step": 2504 + }, + { + "epoch": 0.7461047301699585, + "grad_norm": 0.18947188556194305, + "learning_rate": 1.9894817274011755e-05, + "loss": 1.3108, + "step": 2505 + }, + { + "epoch": 0.7464025763696271, + "grad_norm": 0.1820349395275116, + "learning_rate": 1.9894677695826607e-05, + "loss": 1.3147, + "step": 2506 + }, + { + "epoch": 0.7467004225692958, + "grad_norm": 0.19136416912078857, + "learning_rate": 1.989453802558254e-05, + "loss": 1.3381, + "step": 2507 + }, + { + "epoch": 0.7469982687689645, + "grad_norm": 0.1774381846189499, + "learning_rate": 1.989439826328085e-05, + "loss": 1.313, + "step": 2508 + }, + { + "epoch": 0.747296114968633, + "grad_norm": 0.1879688948392868, + "learning_rate": 1.9894258408922843e-05, + "loss": 1.3352, + "step": 2509 + }, + { + "epoch": 0.7475939611683017, + "grad_norm": 0.18918554484844208, + "learning_rate": 1.989411846250981e-05, + "loss": 1.317, + "step": 2510 + }, + { + "epoch": 0.7478918073679703, + "grad_norm": 0.1828010529279709, + "learning_rate": 1.9893978424043063e-05, + "loss": 1.3183, + "step": 2511 + }, + { + "epoch": 0.748189653567639, + "grad_norm": 0.18194228410720825, + "learning_rate": 1.9893838293523902e-05, + "loss": 1.3129, + "step": 2512 + }, + { + "epoch": 0.7484874997673077, + "grad_norm": 0.18059831857681274, + "learning_rate": 1.9893698070953626e-05, + "loss": 1.3143, + "step": 2513 + }, + { + "epoch": 0.7487853459669763, + "grad_norm": 0.19225727021694183, + "learning_rate": 1.9893557756333545e-05, + "loss": 1.3316, + "step": 2514 + }, + { + "epoch": 0.749083192166645, + "grad_norm": 0.18320530652999878, + "learning_rate": 1.9893417349664963e-05, + "loss": 1.3095, + "step": 2515 + }, + { + "epoch": 0.7493810383663136, + "grad_norm": 0.1820412576198578, + "learning_rate": 1.9893276850949186e-05, + "loss": 1.3282, + "step": 2516 + }, + { + "epoch": 0.7496788845659822, + "grad_norm": 0.19816304743289948, + "learning_rate": 1.989313626018752e-05, + "loss": 1.3158, + "step": 2517 + }, + { + "epoch": 0.7499767307656509, + "grad_norm": 0.18489590287208557, + "learning_rate": 1.9892995577381276e-05, + "loss": 1.3158, + "step": 2518 + }, + { + "epoch": 0.7502745769653195, + "grad_norm": 0.18483327329158783, + "learning_rate": 1.9892854802531762e-05, + "loss": 1.3257, + "step": 2519 + }, + { + "epoch": 0.7505724231649882, + "grad_norm": 0.18682824075222015, + "learning_rate": 1.9892713935640287e-05, + "loss": 1.3091, + "step": 2520 + }, + { + "epoch": 0.7508702693646568, + "grad_norm": 0.19897626340389252, + "learning_rate": 1.989257297670816e-05, + "loss": 1.3275, + "step": 2521 + }, + { + "epoch": 0.7511681155643255, + "grad_norm": 0.1884569376707077, + "learning_rate": 1.989243192573669e-05, + "loss": 1.3043, + "step": 2522 + }, + { + "epoch": 0.7514659617639942, + "grad_norm": 0.18143467605113983, + "learning_rate": 1.98922907827272e-05, + "loss": 1.3071, + "step": 2523 + }, + { + "epoch": 0.7517638079636627, + "grad_norm": 0.18571607768535614, + "learning_rate": 1.9892149547680993e-05, + "loss": 1.3112, + "step": 2524 + }, + { + "epoch": 0.7520616541633314, + "grad_norm": 0.19137336313724518, + "learning_rate": 1.989200822059939e-05, + "loss": 1.3139, + "step": 2525 + }, + { + "epoch": 0.752359500363, + "grad_norm": 0.19988438487052917, + "learning_rate": 1.9891866801483702e-05, + "loss": 1.3171, + "step": 2526 + }, + { + "epoch": 0.7526573465626687, + "grad_norm": 0.18807357549667358, + "learning_rate": 1.9891725290335243e-05, + "loss": 1.338, + "step": 2527 + }, + { + "epoch": 0.7529551927623374, + "grad_norm": 0.1797240525484085, + "learning_rate": 1.9891583687155334e-05, + "loss": 1.3159, + "step": 2528 + }, + { + "epoch": 0.753253038962006, + "grad_norm": 0.18882443010807037, + "learning_rate": 1.9891441991945288e-05, + "loss": 1.3189, + "step": 2529 + }, + { + "epoch": 0.7535508851616747, + "grad_norm": 0.18117474019527435, + "learning_rate": 1.9891300204706427e-05, + "loss": 1.3311, + "step": 2530 + }, + { + "epoch": 0.7538487313613433, + "grad_norm": 0.18642142415046692, + "learning_rate": 1.989115832544007e-05, + "loss": 1.3146, + "step": 2531 + }, + { + "epoch": 0.7541465775610119, + "grad_norm": 0.19051580131053925, + "learning_rate": 1.989101635414753e-05, + "loss": 1.3134, + "step": 2532 + }, + { + "epoch": 0.7544444237606805, + "grad_norm": 0.19228899478912354, + "learning_rate": 1.9890874290830143e-05, + "loss": 1.3095, + "step": 2533 + }, + { + "epoch": 0.7547422699603492, + "grad_norm": 0.17889706790447235, + "learning_rate": 1.9890732135489215e-05, + "loss": 1.2917, + "step": 2534 + }, + { + "epoch": 0.7550401161600179, + "grad_norm": 0.18818412721157074, + "learning_rate": 1.9890589888126075e-05, + "loss": 1.3177, + "step": 2535 + }, + { + "epoch": 0.7553379623596865, + "grad_norm": 0.18684431910514832, + "learning_rate": 1.989044754874205e-05, + "loss": 1.3361, + "step": 2536 + }, + { + "epoch": 0.7556358085593552, + "grad_norm": 0.18594640493392944, + "learning_rate": 1.9890305117338456e-05, + "loss": 1.3065, + "step": 2537 + }, + { + "epoch": 0.7559336547590239, + "grad_norm": 0.18711869418621063, + "learning_rate": 1.989016259391663e-05, + "loss": 1.3345, + "step": 2538 + }, + { + "epoch": 0.7562315009586924, + "grad_norm": 0.19249029457569122, + "learning_rate": 1.9890019978477885e-05, + "loss": 1.3336, + "step": 2539 + }, + { + "epoch": 0.7565293471583611, + "grad_norm": 0.1920575052499771, + "learning_rate": 1.9889877271023558e-05, + "loss": 1.3376, + "step": 2540 + }, + { + "epoch": 0.7568271933580297, + "grad_norm": 0.187772735953331, + "learning_rate": 1.9889734471554968e-05, + "loss": 1.3191, + "step": 2541 + }, + { + "epoch": 0.7571250395576984, + "grad_norm": 0.1912943422794342, + "learning_rate": 1.988959158007345e-05, + "loss": 1.3289, + "step": 2542 + }, + { + "epoch": 0.757422885757367, + "grad_norm": 0.20400895178318024, + "learning_rate": 1.9889448596580336e-05, + "loss": 1.3235, + "step": 2543 + }, + { + "epoch": 0.7577207319570357, + "grad_norm": 0.18285918235778809, + "learning_rate": 1.9889305521076946e-05, + "loss": 1.325, + "step": 2544 + }, + { + "epoch": 0.7580185781567044, + "grad_norm": 0.18557345867156982, + "learning_rate": 1.988916235356462e-05, + "loss": 1.3148, + "step": 2545 + }, + { + "epoch": 0.758316424356373, + "grad_norm": 0.18313251435756683, + "learning_rate": 1.9889019094044687e-05, + "loss": 1.334, + "step": 2546 + }, + { + "epoch": 0.7586142705560416, + "grad_norm": 0.18324284255504608, + "learning_rate": 1.9888875742518477e-05, + "loss": 1.3135, + "step": 2547 + }, + { + "epoch": 0.7589121167557102, + "grad_norm": 0.20431822538375854, + "learning_rate": 1.988873229898733e-05, + "loss": 1.3366, + "step": 2548 + }, + { + "epoch": 0.7592099629553789, + "grad_norm": 0.20846553146839142, + "learning_rate": 1.988858876345258e-05, + "loss": 1.3369, + "step": 2549 + }, + { + "epoch": 0.7595078091550476, + "grad_norm": 0.19721843302249908, + "learning_rate": 1.9888445135915554e-05, + "loss": 1.3243, + "step": 2550 + }, + { + "epoch": 0.7598056553547162, + "grad_norm": 0.18610846996307373, + "learning_rate": 1.9888301416377595e-05, + "loss": 1.3156, + "step": 2551 + }, + { + "epoch": 0.7601035015543849, + "grad_norm": 0.18554642796516418, + "learning_rate": 1.988815760484004e-05, + "loss": 1.3479, + "step": 2552 + }, + { + "epoch": 0.7604013477540535, + "grad_norm": 0.19714638590812683, + "learning_rate": 1.988801370130423e-05, + "loss": 1.3221, + "step": 2553 + }, + { + "epoch": 0.7606991939537221, + "grad_norm": 0.20119118690490723, + "learning_rate": 1.9887869705771496e-05, + "loss": 1.3248, + "step": 2554 + }, + { + "epoch": 0.7609970401533908, + "grad_norm": 0.18889166414737701, + "learning_rate": 1.988772561824318e-05, + "loss": 1.3217, + "step": 2555 + }, + { + "epoch": 0.7612948863530594, + "grad_norm": 0.18209540843963623, + "learning_rate": 1.9887581438720627e-05, + "loss": 1.3183, + "step": 2556 + }, + { + "epoch": 0.7615927325527281, + "grad_norm": 0.2016037553548813, + "learning_rate": 1.9887437167205178e-05, + "loss": 1.3337, + "step": 2557 + }, + { + "epoch": 0.7618905787523967, + "grad_norm": 0.1964045912027359, + "learning_rate": 1.988729280369817e-05, + "loss": 1.3191, + "step": 2558 + }, + { + "epoch": 0.7621884249520654, + "grad_norm": 0.20772108435630798, + "learning_rate": 1.988714834820095e-05, + "loss": 1.3273, + "step": 2559 + }, + { + "epoch": 0.7624862711517341, + "grad_norm": 0.18894939124584198, + "learning_rate": 1.988700380071486e-05, + "loss": 1.3096, + "step": 2560 + }, + { + "epoch": 0.7627841173514027, + "grad_norm": 0.18265798687934875, + "learning_rate": 1.9886859161241248e-05, + "loss": 1.3145, + "step": 2561 + }, + { + "epoch": 0.7630819635510713, + "grad_norm": 0.19556808471679688, + "learning_rate": 1.9886714429781457e-05, + "loss": 1.3332, + "step": 2562 + }, + { + "epoch": 0.7633798097507399, + "grad_norm": 0.20578935742378235, + "learning_rate": 1.988656960633683e-05, + "loss": 1.3277, + "step": 2563 + }, + { + "epoch": 0.7636776559504086, + "grad_norm": 0.19405904412269592, + "learning_rate": 1.9886424690908724e-05, + "loss": 1.3194, + "step": 2564 + }, + { + "epoch": 0.7639755021500773, + "grad_norm": 0.19134913384914398, + "learning_rate": 1.988627968349848e-05, + "loss": 1.3268, + "step": 2565 + }, + { + "epoch": 0.7642733483497459, + "grad_norm": 0.1856839954853058, + "learning_rate": 1.988613458410745e-05, + "loss": 1.3263, + "step": 2566 + }, + { + "epoch": 0.7645711945494146, + "grad_norm": 0.20191329717636108, + "learning_rate": 1.9885989392736986e-05, + "loss": 1.3044, + "step": 2567 + }, + { + "epoch": 0.7648690407490832, + "grad_norm": 0.19773228466510773, + "learning_rate": 1.9885844109388436e-05, + "loss": 1.3145, + "step": 2568 + }, + { + "epoch": 0.7651668869487518, + "grad_norm": 0.18773703277111053, + "learning_rate": 1.9885698734063146e-05, + "loss": 1.3246, + "step": 2569 + }, + { + "epoch": 0.7654647331484205, + "grad_norm": 0.1798093169927597, + "learning_rate": 1.988555326676248e-05, + "loss": 1.3189, + "step": 2570 + }, + { + "epoch": 0.7657625793480891, + "grad_norm": 0.20348991453647614, + "learning_rate": 1.988540770748778e-05, + "loss": 1.323, + "step": 2571 + }, + { + "epoch": 0.7660604255477578, + "grad_norm": 0.19539345800876617, + "learning_rate": 1.988526205624041e-05, + "loss": 1.3106, + "step": 2572 + }, + { + "epoch": 0.7663582717474264, + "grad_norm": 0.18025873601436615, + "learning_rate": 1.988511631302172e-05, + "loss": 1.3169, + "step": 2573 + }, + { + "epoch": 0.7666561179470951, + "grad_norm": 0.18473367393016815, + "learning_rate": 1.9884970477833066e-05, + "loss": 1.3124, + "step": 2574 + }, + { + "epoch": 0.7669539641467638, + "grad_norm": 0.19651813805103302, + "learning_rate": 1.9884824550675806e-05, + "loss": 1.3288, + "step": 2575 + }, + { + "epoch": 0.7672518103464323, + "grad_norm": 0.18388725817203522, + "learning_rate": 1.9884678531551297e-05, + "loss": 1.3271, + "step": 2576 + }, + { + "epoch": 0.767549656546101, + "grad_norm": 0.17565305531024933, + "learning_rate": 1.98845324204609e-05, + "loss": 1.3173, + "step": 2577 + }, + { + "epoch": 0.7678475027457696, + "grad_norm": 0.1815018653869629, + "learning_rate": 1.9884386217405972e-05, + "loss": 1.3181, + "step": 2578 + }, + { + "epoch": 0.7681453489454383, + "grad_norm": 0.19392378628253937, + "learning_rate": 1.9884239922387873e-05, + "loss": 1.3379, + "step": 2579 + }, + { + "epoch": 0.768443195145107, + "grad_norm": 0.18461914360523224, + "learning_rate": 1.9884093535407963e-05, + "loss": 1.3227, + "step": 2580 + }, + { + "epoch": 0.7687410413447756, + "grad_norm": 0.3589487671852112, + "learning_rate": 1.988394705646761e-05, + "loss": 1.3266, + "step": 2581 + }, + { + "epoch": 0.7690388875444443, + "grad_norm": 0.1992337703704834, + "learning_rate": 1.9883800485568174e-05, + "loss": 1.3151, + "step": 2582 + }, + { + "epoch": 0.7693367337441129, + "grad_norm": 0.18829941749572754, + "learning_rate": 1.988365382271101e-05, + "loss": 1.3257, + "step": 2583 + }, + { + "epoch": 0.7696345799437815, + "grad_norm": 0.1826237291097641, + "learning_rate": 1.9883507067897493e-05, + "loss": 1.3114, + "step": 2584 + }, + { + "epoch": 0.7699324261434501, + "grad_norm": 0.1839599758386612, + "learning_rate": 1.9883360221128987e-05, + "loss": 1.3176, + "step": 2585 + }, + { + "epoch": 0.7702302723431188, + "grad_norm": 0.17493994534015656, + "learning_rate": 1.9883213282406855e-05, + "loss": 1.3207, + "step": 2586 + }, + { + "epoch": 0.7705281185427875, + "grad_norm": 0.17947088181972504, + "learning_rate": 1.988306625173247e-05, + "loss": 1.3061, + "step": 2587 + }, + { + "epoch": 0.7708259647424561, + "grad_norm": 0.1852508932352066, + "learning_rate": 1.988291912910719e-05, + "loss": 1.3236, + "step": 2588 + }, + { + "epoch": 0.7711238109421248, + "grad_norm": 0.18166646361351013, + "learning_rate": 1.988277191453239e-05, + "loss": 1.3069, + "step": 2589 + }, + { + "epoch": 0.7714216571417934, + "grad_norm": 0.1864398717880249, + "learning_rate": 1.988262460800944e-05, + "loss": 1.3407, + "step": 2590 + }, + { + "epoch": 0.771719503341462, + "grad_norm": 0.1851319670677185, + "learning_rate": 1.9882477209539707e-05, + "loss": 1.3223, + "step": 2591 + }, + { + "epoch": 0.7720173495411307, + "grad_norm": 0.19353334605693817, + "learning_rate": 1.988232971912457e-05, + "loss": 1.3085, + "step": 2592 + }, + { + "epoch": 0.7723151957407993, + "grad_norm": 0.19049708545207977, + "learning_rate": 1.9882182136765394e-05, + "loss": 1.3389, + "step": 2593 + }, + { + "epoch": 0.772613041940468, + "grad_norm": 0.1989065259695053, + "learning_rate": 1.9882034462463553e-05, + "loss": 1.3245, + "step": 2594 + }, + { + "epoch": 0.7729108881401366, + "grad_norm": 0.1943540871143341, + "learning_rate": 1.9881886696220424e-05, + "loss": 1.31, + "step": 2595 + }, + { + "epoch": 0.7732087343398053, + "grad_norm": 0.18225571513175964, + "learning_rate": 1.9881738838037376e-05, + "loss": 1.3101, + "step": 2596 + }, + { + "epoch": 0.773506580539474, + "grad_norm": 0.18928158283233643, + "learning_rate": 1.9881590887915794e-05, + "loss": 1.3065, + "step": 2597 + }, + { + "epoch": 0.7738044267391426, + "grad_norm": 0.19873449206352234, + "learning_rate": 1.9881442845857046e-05, + "loss": 1.3214, + "step": 2598 + }, + { + "epoch": 0.7741022729388112, + "grad_norm": 0.18662762641906738, + "learning_rate": 1.9881294711862514e-05, + "loss": 1.3086, + "step": 2599 + }, + { + "epoch": 0.7744001191384798, + "grad_norm": 0.18702490627765656, + "learning_rate": 1.9881146485933574e-05, + "loss": 1.3127, + "step": 2600 + }, + { + "epoch": 0.7746979653381485, + "grad_norm": 0.19949471950531006, + "learning_rate": 1.9880998168071607e-05, + "loss": 1.3108, + "step": 2601 + }, + { + "epoch": 0.7749958115378172, + "grad_norm": 0.19867639243602753, + "learning_rate": 1.9880849758277987e-05, + "loss": 1.315, + "step": 2602 + }, + { + "epoch": 0.7752936577374858, + "grad_norm": 0.18987229466438293, + "learning_rate": 1.9880701256554106e-05, + "loss": 1.3311, + "step": 2603 + }, + { + "epoch": 0.7755915039371545, + "grad_norm": 0.19967243075370789, + "learning_rate": 1.9880552662901337e-05, + "loss": 1.3342, + "step": 2604 + }, + { + "epoch": 0.7758893501368231, + "grad_norm": 0.1925693154335022, + "learning_rate": 1.9880403977321063e-05, + "loss": 1.3222, + "step": 2605 + }, + { + "epoch": 0.7761871963364917, + "grad_norm": 0.19117453694343567, + "learning_rate": 1.988025519981467e-05, + "loss": 1.3056, + "step": 2606 + }, + { + "epoch": 0.7764850425361604, + "grad_norm": 0.19443592429161072, + "learning_rate": 1.988010633038354e-05, + "loss": 1.3319, + "step": 2607 + }, + { + "epoch": 0.776782888735829, + "grad_norm": 0.19276925921440125, + "learning_rate": 1.9879957369029062e-05, + "loss": 1.2891, + "step": 2608 + }, + { + "epoch": 0.7770807349354977, + "grad_norm": 0.1844119131565094, + "learning_rate": 1.9879808315752616e-05, + "loss": 1.3221, + "step": 2609 + }, + { + "epoch": 0.7773785811351663, + "grad_norm": 0.1958608776330948, + "learning_rate": 1.9879659170555596e-05, + "loss": 1.3192, + "step": 2610 + }, + { + "epoch": 0.777676427334835, + "grad_norm": 0.18317051231861115, + "learning_rate": 1.9879509933439384e-05, + "loss": 1.3238, + "step": 2611 + }, + { + "epoch": 0.7779742735345037, + "grad_norm": 0.20573605597019196, + "learning_rate": 1.987936060440537e-05, + "loss": 1.2924, + "step": 2612 + }, + { + "epoch": 0.7782721197341723, + "grad_norm": 0.19641467928886414, + "learning_rate": 1.987921118345494e-05, + "loss": 1.319, + "step": 2613 + }, + { + "epoch": 0.7785699659338409, + "grad_norm": 0.18709130585193634, + "learning_rate": 1.9879061670589493e-05, + "loss": 1.3208, + "step": 2614 + }, + { + "epoch": 0.7788678121335095, + "grad_norm": 0.2033838927745819, + "learning_rate": 1.987891206581041e-05, + "loss": 1.3148, + "step": 2615 + }, + { + "epoch": 0.7791656583331782, + "grad_norm": 0.1878487765789032, + "learning_rate": 1.9878762369119092e-05, + "loss": 1.3306, + "step": 2616 + }, + { + "epoch": 0.7794635045328469, + "grad_norm": 0.18077583611011505, + "learning_rate": 1.9878612580516926e-05, + "loss": 1.3171, + "step": 2617 + }, + { + "epoch": 0.7797613507325155, + "grad_norm": 0.18070384860038757, + "learning_rate": 1.9878462700005306e-05, + "loss": 1.3016, + "step": 2618 + }, + { + "epoch": 0.7800591969321842, + "grad_norm": 0.1827581375837326, + "learning_rate": 1.9878312727585627e-05, + "loss": 1.327, + "step": 2619 + }, + { + "epoch": 0.7803570431318528, + "grad_norm": 0.19255481660366058, + "learning_rate": 1.9878162663259285e-05, + "loss": 1.3174, + "step": 2620 + }, + { + "epoch": 0.7806548893315214, + "grad_norm": 0.18368345499038696, + "learning_rate": 1.987801250702768e-05, + "loss": 1.3263, + "step": 2621 + }, + { + "epoch": 0.78095273553119, + "grad_norm": 0.18737348914146423, + "learning_rate": 1.98778622588922e-05, + "loss": 1.3132, + "step": 2622 + }, + { + "epoch": 0.7812505817308587, + "grad_norm": 0.1827528327703476, + "learning_rate": 1.9877711918854248e-05, + "loss": 1.3205, + "step": 2623 + }, + { + "epoch": 0.7815484279305274, + "grad_norm": 0.18387190997600555, + "learning_rate": 1.9877561486915224e-05, + "loss": 1.3216, + "step": 2624 + }, + { + "epoch": 0.781846274130196, + "grad_norm": 0.18497104942798615, + "learning_rate": 1.9877410963076523e-05, + "loss": 1.3116, + "step": 2625 + }, + { + "epoch": 0.7821441203298647, + "grad_norm": 0.18254296481609344, + "learning_rate": 1.9877260347339552e-05, + "loss": 1.317, + "step": 2626 + }, + { + "epoch": 0.7824419665295334, + "grad_norm": 0.1875661015510559, + "learning_rate": 1.987710963970571e-05, + "loss": 1.3146, + "step": 2627 + }, + { + "epoch": 0.782739812729202, + "grad_norm": 0.19719748198986053, + "learning_rate": 1.9876958840176397e-05, + "loss": 1.3283, + "step": 2628 + }, + { + "epoch": 0.7830376589288706, + "grad_norm": 0.18464429676532745, + "learning_rate": 1.9876807948753017e-05, + "loss": 1.3177, + "step": 2629 + }, + { + "epoch": 0.7833355051285392, + "grad_norm": 0.18928779661655426, + "learning_rate": 1.9876656965436974e-05, + "loss": 1.3387, + "step": 2630 + }, + { + "epoch": 0.7836333513282079, + "grad_norm": 0.18596485257148743, + "learning_rate": 1.9876505890229675e-05, + "loss": 1.3074, + "step": 2631 + }, + { + "epoch": 0.7839311975278765, + "grad_norm": 0.18939976394176483, + "learning_rate": 1.987635472313252e-05, + "loss": 1.3088, + "step": 2632 + }, + { + "epoch": 0.7842290437275452, + "grad_norm": 0.18326114118099213, + "learning_rate": 1.9876203464146922e-05, + "loss": 1.3119, + "step": 2633 + }, + { + "epoch": 0.7845268899272139, + "grad_norm": 0.19060494005680084, + "learning_rate": 1.987605211327428e-05, + "loss": 1.3008, + "step": 2634 + }, + { + "epoch": 0.7848247361268825, + "grad_norm": 0.19197452068328857, + "learning_rate": 1.987590067051601e-05, + "loss": 1.325, + "step": 2635 + }, + { + "epoch": 0.7851225823265511, + "grad_norm": 0.18107475340366364, + "learning_rate": 1.987574913587352e-05, + "loss": 1.3189, + "step": 2636 + }, + { + "epoch": 0.7854204285262197, + "grad_norm": 0.18983641266822815, + "learning_rate": 1.9875597509348218e-05, + "loss": 1.3172, + "step": 2637 + }, + { + "epoch": 0.7857182747258884, + "grad_norm": 0.18232080340385437, + "learning_rate": 1.9875445790941513e-05, + "loss": 1.3015, + "step": 2638 + }, + { + "epoch": 0.7860161209255571, + "grad_norm": 0.18595845997333527, + "learning_rate": 1.987529398065482e-05, + "loss": 1.319, + "step": 2639 + }, + { + "epoch": 0.7863139671252257, + "grad_norm": 0.1903022974729538, + "learning_rate": 1.987514207848955e-05, + "loss": 1.3259, + "step": 2640 + }, + { + "epoch": 0.7866118133248944, + "grad_norm": 0.19145415723323822, + "learning_rate": 1.9874990084447116e-05, + "loss": 1.3201, + "step": 2641 + }, + { + "epoch": 0.786909659524563, + "grad_norm": 0.19813406467437744, + "learning_rate": 1.987483799852893e-05, + "loss": 1.321, + "step": 2642 + }, + { + "epoch": 0.7872075057242317, + "grad_norm": 0.18630708754062653, + "learning_rate": 1.9874685820736413e-05, + "loss": 1.3232, + "step": 2643 + }, + { + "epoch": 0.7875053519239003, + "grad_norm": 0.20730362832546234, + "learning_rate": 1.987453355107097e-05, + "loss": 1.3476, + "step": 2644 + }, + { + "epoch": 0.7878031981235689, + "grad_norm": 0.19203825294971466, + "learning_rate": 1.9874381189534032e-05, + "loss": 1.3195, + "step": 2645 + }, + { + "epoch": 0.7881010443232376, + "grad_norm": 0.18337178230285645, + "learning_rate": 1.9874228736127007e-05, + "loss": 1.3294, + "step": 2646 + }, + { + "epoch": 0.7883988905229062, + "grad_norm": 0.19438254833221436, + "learning_rate": 1.9874076190851313e-05, + "loss": 1.3328, + "step": 2647 + }, + { + "epoch": 0.7886967367225749, + "grad_norm": 0.1868901252746582, + "learning_rate": 1.9873923553708378e-05, + "loss": 1.3236, + "step": 2648 + }, + { + "epoch": 0.7889945829222436, + "grad_norm": 0.19647769629955292, + "learning_rate": 1.9873770824699613e-05, + "loss": 1.3171, + "step": 2649 + }, + { + "epoch": 0.7892924291219122, + "grad_norm": 0.19201479852199554, + "learning_rate": 1.9873618003826442e-05, + "loss": 1.3166, + "step": 2650 + }, + { + "epoch": 0.7895902753215808, + "grad_norm": 0.1863786280155182, + "learning_rate": 1.9873465091090285e-05, + "loss": 1.3071, + "step": 2651 + }, + { + "epoch": 0.7898881215212494, + "grad_norm": 0.21051806211471558, + "learning_rate": 1.9873312086492568e-05, + "loss": 1.3264, + "step": 2652 + }, + { + "epoch": 0.7901859677209181, + "grad_norm": 0.22071000933647156, + "learning_rate": 1.9873158990034714e-05, + "loss": 1.3259, + "step": 2653 + }, + { + "epoch": 0.7904838139205868, + "grad_norm": 0.18861712515354156, + "learning_rate": 1.9873005801718146e-05, + "loss": 1.3192, + "step": 2654 + }, + { + "epoch": 0.7907816601202554, + "grad_norm": 0.19382211565971375, + "learning_rate": 1.987285252154429e-05, + "loss": 1.3157, + "step": 2655 + }, + { + "epoch": 0.7910795063199241, + "grad_norm": 0.20809771120548248, + "learning_rate": 1.9872699149514574e-05, + "loss": 1.3077, + "step": 2656 + }, + { + "epoch": 0.7913773525195927, + "grad_norm": 0.19337840378284454, + "learning_rate": 1.987254568563042e-05, + "loss": 1.331, + "step": 2657 + }, + { + "epoch": 0.7916751987192613, + "grad_norm": 0.19075921177864075, + "learning_rate": 1.987239212989326e-05, + "loss": 1.3267, + "step": 2658 + }, + { + "epoch": 0.79197304491893, + "grad_norm": 0.1955624371767044, + "learning_rate": 1.987223848230452e-05, + "loss": 1.3068, + "step": 2659 + }, + { + "epoch": 0.7922708911185986, + "grad_norm": 0.19948722422122955, + "learning_rate": 1.987208474286563e-05, + "loss": 1.3018, + "step": 2660 + }, + { + "epoch": 0.7925687373182673, + "grad_norm": 0.19672907888889313, + "learning_rate": 1.9871930911578024e-05, + "loss": 1.3015, + "step": 2661 + }, + { + "epoch": 0.7928665835179359, + "grad_norm": 0.19176773726940155, + "learning_rate": 1.987177698844313e-05, + "loss": 1.3108, + "step": 2662 + }, + { + "epoch": 0.7931644297176046, + "grad_norm": 0.1909971386194229, + "learning_rate": 1.9871622973462377e-05, + "loss": 1.3083, + "step": 2663 + }, + { + "epoch": 0.7934622759172733, + "grad_norm": 0.20255598425865173, + "learning_rate": 1.9871468866637203e-05, + "loss": 1.3245, + "step": 2664 + }, + { + "epoch": 0.7937601221169419, + "grad_norm": 0.1945829540491104, + "learning_rate": 1.9871314667969043e-05, + "loss": 1.3201, + "step": 2665 + }, + { + "epoch": 0.7940579683166105, + "grad_norm": 0.1856970489025116, + "learning_rate": 1.9871160377459326e-05, + "loss": 1.3255, + "step": 2666 + }, + { + "epoch": 0.7943558145162791, + "grad_norm": 0.19815580546855927, + "learning_rate": 1.9871005995109492e-05, + "loss": 1.3197, + "step": 2667 + }, + { + "epoch": 0.7946536607159478, + "grad_norm": 0.19439074397087097, + "learning_rate": 1.987085152092098e-05, + "loss": 1.3085, + "step": 2668 + }, + { + "epoch": 0.7949515069156164, + "grad_norm": 0.19563210010528564, + "learning_rate": 1.9870696954895216e-05, + "loss": 1.3176, + "step": 2669 + }, + { + "epoch": 0.7952493531152851, + "grad_norm": 0.1940533071756363, + "learning_rate": 1.9870542297033645e-05, + "loss": 1.3157, + "step": 2670 + }, + { + "epoch": 0.7955471993149538, + "grad_norm": 0.20374523103237152, + "learning_rate": 1.9870387547337708e-05, + "loss": 1.3346, + "step": 2671 + }, + { + "epoch": 0.7958450455146224, + "grad_norm": 0.19792142510414124, + "learning_rate": 1.9870232705808844e-05, + "loss": 1.3141, + "step": 2672 + }, + { + "epoch": 0.796142891714291, + "grad_norm": 0.1836538463830948, + "learning_rate": 1.9870077772448493e-05, + "loss": 1.3147, + "step": 2673 + }, + { + "epoch": 0.7964407379139596, + "grad_norm": 0.19210876524448395, + "learning_rate": 1.986992274725809e-05, + "loss": 1.3252, + "step": 2674 + }, + { + "epoch": 0.7967385841136283, + "grad_norm": 0.2051551789045334, + "learning_rate": 1.986976763023909e-05, + "loss": 1.3092, + "step": 2675 + }, + { + "epoch": 0.797036430313297, + "grad_norm": 0.19050179421901703, + "learning_rate": 1.9869612421392928e-05, + "loss": 1.3185, + "step": 2676 + }, + { + "epoch": 0.7973342765129656, + "grad_norm": 0.18543951213359833, + "learning_rate": 1.986945712072105e-05, + "loss": 1.3068, + "step": 2677 + }, + { + "epoch": 0.7976321227126343, + "grad_norm": 0.18150204420089722, + "learning_rate": 1.9869301728224893e-05, + "loss": 1.3045, + "step": 2678 + }, + { + "epoch": 0.797929968912303, + "grad_norm": 0.18430055677890778, + "learning_rate": 1.986914624390592e-05, + "loss": 1.3197, + "step": 2679 + }, + { + "epoch": 0.7982278151119716, + "grad_norm": 0.19682268798351288, + "learning_rate": 1.986899066776556e-05, + "loss": 1.3075, + "step": 2680 + }, + { + "epoch": 0.7985256613116402, + "grad_norm": 0.1920955628156662, + "learning_rate": 1.986883499980527e-05, + "loss": 1.3059, + "step": 2681 + }, + { + "epoch": 0.7988235075113088, + "grad_norm": 0.18305283784866333, + "learning_rate": 1.9868679240026502e-05, + "loss": 1.297, + "step": 2682 + }, + { + "epoch": 0.7991213537109775, + "grad_norm": 0.19209906458854675, + "learning_rate": 1.9868523388430693e-05, + "loss": 1.3048, + "step": 2683 + }, + { + "epoch": 0.7994191999106461, + "grad_norm": 0.19421148300170898, + "learning_rate": 1.9868367445019304e-05, + "loss": 1.3234, + "step": 2684 + }, + { + "epoch": 0.7997170461103148, + "grad_norm": 0.19669722020626068, + "learning_rate": 1.9868211409793778e-05, + "loss": 1.3239, + "step": 2685 + }, + { + "epoch": 0.8000148923099835, + "grad_norm": 0.19060830771923065, + "learning_rate": 1.986805528275557e-05, + "loss": 1.3148, + "step": 2686 + }, + { + "epoch": 0.8003127385096521, + "grad_norm": 0.19015301764011383, + "learning_rate": 1.9867899063906136e-05, + "loss": 1.3139, + "step": 2687 + }, + { + "epoch": 0.8006105847093207, + "grad_norm": 0.18294614553451538, + "learning_rate": 1.9867742753246926e-05, + "loss": 1.325, + "step": 2688 + }, + { + "epoch": 0.8009084309089893, + "grad_norm": 0.18956173956394196, + "learning_rate": 1.9867586350779394e-05, + "loss": 1.3257, + "step": 2689 + }, + { + "epoch": 0.801206277108658, + "grad_norm": 0.19346562027931213, + "learning_rate": 1.9867429856504993e-05, + "loss": 1.3058, + "step": 2690 + }, + { + "epoch": 0.8015041233083267, + "grad_norm": 0.18454086780548096, + "learning_rate": 1.9867273270425184e-05, + "loss": 1.3087, + "step": 2691 + }, + { + "epoch": 0.8018019695079953, + "grad_norm": 0.19533635675907135, + "learning_rate": 1.9867116592541423e-05, + "loss": 1.3062, + "step": 2692 + }, + { + "epoch": 0.802099815707664, + "grad_norm": 0.2025240808725357, + "learning_rate": 1.9866959822855163e-05, + "loss": 1.3281, + "step": 2693 + }, + { + "epoch": 0.8023976619073326, + "grad_norm": 0.19497162103652954, + "learning_rate": 1.9866802961367867e-05, + "loss": 1.3307, + "step": 2694 + }, + { + "epoch": 0.8026955081070013, + "grad_norm": 0.18541958928108215, + "learning_rate": 1.9866646008080996e-05, + "loss": 1.3159, + "step": 2695 + }, + { + "epoch": 0.8029933543066698, + "grad_norm": 0.18936875462532043, + "learning_rate": 1.9866488962996004e-05, + "loss": 1.3178, + "step": 2696 + }, + { + "epoch": 0.8032912005063385, + "grad_norm": 0.21259909868240356, + "learning_rate": 1.986633182611436e-05, + "loss": 1.3445, + "step": 2697 + }, + { + "epoch": 0.8035890467060072, + "grad_norm": 0.18928909301757812, + "learning_rate": 1.9866174597437517e-05, + "loss": 1.3102, + "step": 2698 + }, + { + "epoch": 0.8038868929056758, + "grad_norm": 0.18981429934501648, + "learning_rate": 1.9866017276966945e-05, + "loss": 1.3098, + "step": 2699 + }, + { + "epoch": 0.8041847391053445, + "grad_norm": 0.19316232204437256, + "learning_rate": 1.98658598647041e-05, + "loss": 1.3146, + "step": 2700 + }, + { + "epoch": 0.8044825853050132, + "grad_norm": 0.2001708298921585, + "learning_rate": 1.9865702360650458e-05, + "loss": 1.3193, + "step": 2701 + }, + { + "epoch": 0.8047804315046818, + "grad_norm": 0.19563239812850952, + "learning_rate": 1.986554476480748e-05, + "loss": 1.3055, + "step": 2702 + }, + { + "epoch": 0.8050782777043504, + "grad_norm": 0.20106148719787598, + "learning_rate": 1.9865387077176623e-05, + "loss": 1.3182, + "step": 2703 + }, + { + "epoch": 0.805376123904019, + "grad_norm": 0.18671630322933197, + "learning_rate": 1.986522929775937e-05, + "loss": 1.2984, + "step": 2704 + }, + { + "epoch": 0.8056739701036877, + "grad_norm": 0.18762362003326416, + "learning_rate": 1.986507142655717e-05, + "loss": 1.319, + "step": 2705 + }, + { + "epoch": 0.8059718163033563, + "grad_norm": 0.20593728125095367, + "learning_rate": 1.9864913463571512e-05, + "loss": 1.2955, + "step": 2706 + }, + { + "epoch": 0.806269662503025, + "grad_norm": 0.18869160115718842, + "learning_rate": 1.986475540880385e-05, + "loss": 1.3055, + "step": 2707 + }, + { + "epoch": 0.8065675087026937, + "grad_norm": 0.20767731964588165, + "learning_rate": 1.9864597262255666e-05, + "loss": 1.3087, + "step": 2708 + }, + { + "epoch": 0.8068653549023623, + "grad_norm": 0.18076202273368835, + "learning_rate": 1.986443902392842e-05, + "loss": 1.305, + "step": 2709 + }, + { + "epoch": 0.807163201102031, + "grad_norm": 0.19493411481380463, + "learning_rate": 1.9864280693823594e-05, + "loss": 1.2989, + "step": 2710 + }, + { + "epoch": 0.8074610473016995, + "grad_norm": 0.21097835898399353, + "learning_rate": 1.9864122271942654e-05, + "loss": 1.3312, + "step": 2711 + }, + { + "epoch": 0.8077588935013682, + "grad_norm": 0.20255888998508453, + "learning_rate": 1.986396375828708e-05, + "loss": 1.3153, + "step": 2712 + }, + { + "epoch": 0.8080567397010369, + "grad_norm": 0.18912790715694427, + "learning_rate": 1.9863805152858342e-05, + "loss": 1.3159, + "step": 2713 + }, + { + "epoch": 0.8083545859007055, + "grad_norm": 0.18855468928813934, + "learning_rate": 1.9863646455657918e-05, + "loss": 1.3227, + "step": 2714 + }, + { + "epoch": 0.8086524321003742, + "grad_norm": 0.20826329290866852, + "learning_rate": 1.986348766668728e-05, + "loss": 1.3123, + "step": 2715 + }, + { + "epoch": 0.8089502783000428, + "grad_norm": 0.19729100167751312, + "learning_rate": 1.9863328785947916e-05, + "loss": 1.3168, + "step": 2716 + }, + { + "epoch": 0.8092481244997115, + "grad_norm": 0.19067421555519104, + "learning_rate": 1.9863169813441296e-05, + "loss": 1.3018, + "step": 2717 + }, + { + "epoch": 0.8095459706993801, + "grad_norm": 0.19457784295082092, + "learning_rate": 1.98630107491689e-05, + "loss": 1.3201, + "step": 2718 + }, + { + "epoch": 0.8098438168990487, + "grad_norm": 0.1942594051361084, + "learning_rate": 1.9862851593132208e-05, + "loss": 1.3234, + "step": 2719 + }, + { + "epoch": 0.8101416630987174, + "grad_norm": 0.20054592192173004, + "learning_rate": 1.9862692345332704e-05, + "loss": 1.3224, + "step": 2720 + }, + { + "epoch": 0.810439509298386, + "grad_norm": 0.20668689906597137, + "learning_rate": 1.9862533005771864e-05, + "loss": 1.32, + "step": 2721 + }, + { + "epoch": 0.8107373554980547, + "grad_norm": 0.1965094655752182, + "learning_rate": 1.9862373574451173e-05, + "loss": 1.3066, + "step": 2722 + }, + { + "epoch": 0.8110352016977234, + "grad_norm": 0.1899128556251526, + "learning_rate": 1.9862214051372114e-05, + "loss": 1.3015, + "step": 2723 + }, + { + "epoch": 0.811333047897392, + "grad_norm": 0.20184451341629028, + "learning_rate": 1.9862054436536175e-05, + "loss": 1.309, + "step": 2724 + }, + { + "epoch": 0.8116308940970607, + "grad_norm": 0.19189730286598206, + "learning_rate": 1.9861894729944836e-05, + "loss": 1.3255, + "step": 2725 + }, + { + "epoch": 0.8119287402967292, + "grad_norm": 0.19494622945785522, + "learning_rate": 1.9861734931599588e-05, + "loss": 1.2981, + "step": 2726 + }, + { + "epoch": 0.8122265864963979, + "grad_norm": 0.21060998737812042, + "learning_rate": 1.9861575041501912e-05, + "loss": 1.3195, + "step": 2727 + }, + { + "epoch": 0.8125244326960666, + "grad_norm": 0.19490574300289154, + "learning_rate": 1.98614150596533e-05, + "loss": 1.3309, + "step": 2728 + }, + { + "epoch": 0.8128222788957352, + "grad_norm": 0.1967184990644455, + "learning_rate": 1.9861254986055235e-05, + "loss": 1.3219, + "step": 2729 + }, + { + "epoch": 0.8131201250954039, + "grad_norm": 0.2028256356716156, + "learning_rate": 1.9861094820709215e-05, + "loss": 1.3335, + "step": 2730 + }, + { + "epoch": 0.8134179712950725, + "grad_norm": 0.21276399493217468, + "learning_rate": 1.986093456361672e-05, + "loss": 1.3126, + "step": 2731 + }, + { + "epoch": 0.8137158174947412, + "grad_norm": 0.20708300173282623, + "learning_rate": 1.986077421477925e-05, + "loss": 1.3266, + "step": 2732 + }, + { + "epoch": 0.8140136636944098, + "grad_norm": 0.20489421486854553, + "learning_rate": 1.986061377419829e-05, + "loss": 1.3226, + "step": 2733 + }, + { + "epoch": 0.8143115098940784, + "grad_norm": 0.19015038013458252, + "learning_rate": 1.9860453241875342e-05, + "loss": 1.3064, + "step": 2734 + }, + { + "epoch": 0.8146093560937471, + "grad_norm": 0.18883351981639862, + "learning_rate": 1.9860292617811888e-05, + "loss": 1.2963, + "step": 2735 + }, + { + "epoch": 0.8149072022934157, + "grad_norm": 0.18830153346061707, + "learning_rate": 1.986013190200943e-05, + "loss": 1.3319, + "step": 2736 + }, + { + "epoch": 0.8152050484930844, + "grad_norm": 0.1849372833967209, + "learning_rate": 1.9859971094469456e-05, + "loss": 1.3048, + "step": 2737 + }, + { + "epoch": 0.815502894692753, + "grad_norm": 0.20077911019325256, + "learning_rate": 1.985981019519347e-05, + "loss": 1.3198, + "step": 2738 + }, + { + "epoch": 0.8158007408924217, + "grad_norm": 0.19239689409732819, + "learning_rate": 1.985964920418297e-05, + "loss": 1.3265, + "step": 2739 + }, + { + "epoch": 0.8160985870920904, + "grad_norm": 0.19686836004257202, + "learning_rate": 1.9859488121439448e-05, + "loss": 1.305, + "step": 2740 + }, + { + "epoch": 0.8163964332917589, + "grad_norm": 0.20230987668037415, + "learning_rate": 1.9859326946964403e-05, + "loss": 1.3075, + "step": 2741 + }, + { + "epoch": 0.8166942794914276, + "grad_norm": 0.1813904494047165, + "learning_rate": 1.9859165680759335e-05, + "loss": 1.3003, + "step": 2742 + }, + { + "epoch": 0.8169921256910962, + "grad_norm": 0.18585096299648285, + "learning_rate": 1.985900432282575e-05, + "loss": 1.3285, + "step": 2743 + }, + { + "epoch": 0.8172899718907649, + "grad_norm": 0.19540949165821075, + "learning_rate": 1.9858842873165142e-05, + "loss": 1.3283, + "step": 2744 + }, + { + "epoch": 0.8175878180904336, + "grad_norm": 0.1910727322101593, + "learning_rate": 1.9858681331779016e-05, + "loss": 1.3188, + "step": 2745 + }, + { + "epoch": 0.8178856642901022, + "grad_norm": 0.17849615216255188, + "learning_rate": 1.9858519698668877e-05, + "loss": 1.312, + "step": 2746 + }, + { + "epoch": 0.8181835104897709, + "grad_norm": 0.1925978809595108, + "learning_rate": 1.985835797383622e-05, + "loss": 1.2988, + "step": 2747 + }, + { + "epoch": 0.8184813566894394, + "grad_norm": 0.18562570214271545, + "learning_rate": 1.9858196157282564e-05, + "loss": 1.3114, + "step": 2748 + }, + { + "epoch": 0.8187792028891081, + "grad_norm": 0.1920994520187378, + "learning_rate": 1.9858034249009406e-05, + "loss": 1.3191, + "step": 2749 + }, + { + "epoch": 0.8190770490887768, + "grad_norm": 0.18559814989566803, + "learning_rate": 1.9857872249018252e-05, + "loss": 1.3246, + "step": 2750 + }, + { + "epoch": 0.8193748952884454, + "grad_norm": 0.19468210637569427, + "learning_rate": 1.9857710157310612e-05, + "loss": 1.3098, + "step": 2751 + }, + { + "epoch": 0.8196727414881141, + "grad_norm": 0.19151291251182556, + "learning_rate": 1.985754797388799e-05, + "loss": 1.2972, + "step": 2752 + }, + { + "epoch": 0.8199705876877827, + "grad_norm": 0.19110466539859772, + "learning_rate": 1.9857385698751898e-05, + "loss": 1.3182, + "step": 2753 + }, + { + "epoch": 0.8202684338874514, + "grad_norm": 0.194067120552063, + "learning_rate": 1.9857223331903846e-05, + "loss": 1.3141, + "step": 2754 + }, + { + "epoch": 0.82056628008712, + "grad_norm": 0.19519226253032684, + "learning_rate": 1.9857060873345345e-05, + "loss": 1.2999, + "step": 2755 + }, + { + "epoch": 0.8208641262867886, + "grad_norm": 0.1942768692970276, + "learning_rate": 1.9856898323077906e-05, + "loss": 1.3266, + "step": 2756 + }, + { + "epoch": 0.8211619724864573, + "grad_norm": 0.1909717470407486, + "learning_rate": 1.985673568110304e-05, + "loss": 1.2988, + "step": 2757 + }, + { + "epoch": 0.8214598186861259, + "grad_norm": 0.19233834743499756, + "learning_rate": 1.985657294742226e-05, + "loss": 1.3027, + "step": 2758 + }, + { + "epoch": 0.8217576648857946, + "grad_norm": 0.18716426193714142, + "learning_rate": 1.985641012203708e-05, + "loss": 1.3098, + "step": 2759 + }, + { + "epoch": 0.8220555110854633, + "grad_norm": 0.20037902891635895, + "learning_rate": 1.985624720494902e-05, + "loss": 1.3118, + "step": 2760 + }, + { + "epoch": 0.8223533572851319, + "grad_norm": 0.20361924171447754, + "learning_rate": 1.985608419615959e-05, + "loss": 1.3013, + "step": 2761 + }, + { + "epoch": 0.8226512034848006, + "grad_norm": 0.19471381604671478, + "learning_rate": 1.9855921095670306e-05, + "loss": 1.3139, + "step": 2762 + }, + { + "epoch": 0.8229490496844691, + "grad_norm": 0.19743210077285767, + "learning_rate": 1.9855757903482692e-05, + "loss": 1.309, + "step": 2763 + }, + { + "epoch": 0.8232468958841378, + "grad_norm": 0.19530196487903595, + "learning_rate": 1.9855594619598262e-05, + "loss": 1.3305, + "step": 2764 + }, + { + "epoch": 0.8235447420838065, + "grad_norm": 0.19224487245082855, + "learning_rate": 1.985543124401853e-05, + "loss": 1.309, + "step": 2765 + }, + { + "epoch": 0.8238425882834751, + "grad_norm": 0.3356058895587921, + "learning_rate": 1.9855267776745028e-05, + "loss": 1.3086, + "step": 2766 + }, + { + "epoch": 0.8241404344831438, + "grad_norm": 0.19616787135601044, + "learning_rate": 1.9855104217779265e-05, + "loss": 1.3214, + "step": 2767 + }, + { + "epoch": 0.8244382806828124, + "grad_norm": 0.19699224829673767, + "learning_rate": 1.9854940567122773e-05, + "loss": 1.3096, + "step": 2768 + }, + { + "epoch": 0.8247361268824811, + "grad_norm": 0.1945800483226776, + "learning_rate": 1.9854776824777068e-05, + "loss": 1.3194, + "step": 2769 + }, + { + "epoch": 0.8250339730821497, + "grad_norm": 0.18592716753482819, + "learning_rate": 1.9854612990743675e-05, + "loss": 1.3042, + "step": 2770 + }, + { + "epoch": 0.8253318192818183, + "grad_norm": 0.1943686157464981, + "learning_rate": 1.985444906502412e-05, + "loss": 1.3264, + "step": 2771 + }, + { + "epoch": 0.825629665481487, + "grad_norm": 0.2716086506843567, + "learning_rate": 1.985428504761992e-05, + "loss": 1.3085, + "step": 2772 + }, + { + "epoch": 0.8259275116811556, + "grad_norm": 0.20023038983345032, + "learning_rate": 1.985412093853261e-05, + "loss": 1.311, + "step": 2773 + }, + { + "epoch": 0.8262253578808243, + "grad_norm": 0.1959606558084488, + "learning_rate": 1.9853956737763718e-05, + "loss": 1.3389, + "step": 2774 + }, + { + "epoch": 0.826523204080493, + "grad_norm": 0.19522209465503693, + "learning_rate": 1.9853792445314766e-05, + "loss": 1.3306, + "step": 2775 + }, + { + "epoch": 0.8268210502801616, + "grad_norm": 0.19641757011413574, + "learning_rate": 1.985362806118728e-05, + "loss": 1.3073, + "step": 2776 + }, + { + "epoch": 0.8271188964798303, + "grad_norm": 0.19511444866657257, + "learning_rate": 1.9853463585382804e-05, + "loss": 1.3078, + "step": 2777 + }, + { + "epoch": 0.8274167426794988, + "grad_norm": 0.186085045337677, + "learning_rate": 1.985329901790285e-05, + "loss": 1.3221, + "step": 2778 + }, + { + "epoch": 0.8277145888791675, + "grad_norm": 0.1986667364835739, + "learning_rate": 1.985313435874896e-05, + "loss": 1.3094, + "step": 2779 + }, + { + "epoch": 0.8280124350788362, + "grad_norm": 0.1856551170349121, + "learning_rate": 1.9852969607922664e-05, + "loss": 1.2961, + "step": 2780 + }, + { + "epoch": 0.8283102812785048, + "grad_norm": 0.1947954148054123, + "learning_rate": 1.9852804765425495e-05, + "loss": 1.3067, + "step": 2781 + }, + { + "epoch": 0.8286081274781735, + "grad_norm": 0.19135957956314087, + "learning_rate": 1.9852639831258985e-05, + "loss": 1.3059, + "step": 2782 + }, + { + "epoch": 0.8289059736778421, + "grad_norm": 0.19226491451263428, + "learning_rate": 1.985247480542467e-05, + "loss": 1.2999, + "step": 2783 + }, + { + "epoch": 0.8292038198775108, + "grad_norm": 0.19346089661121368, + "learning_rate": 1.9852309687924084e-05, + "loss": 1.3104, + "step": 2784 + }, + { + "epoch": 0.8295016660771793, + "grad_norm": 0.19168633222579956, + "learning_rate": 1.9852144478758763e-05, + "loss": 1.3205, + "step": 2785 + }, + { + "epoch": 0.829799512276848, + "grad_norm": 0.19923147559165955, + "learning_rate": 1.9851979177930243e-05, + "loss": 1.3308, + "step": 2786 + }, + { + "epoch": 0.8300973584765167, + "grad_norm": 0.1852472871541977, + "learning_rate": 1.985181378544007e-05, + "loss": 1.2994, + "step": 2787 + }, + { + "epoch": 0.8303952046761853, + "grad_norm": 0.20116518437862396, + "learning_rate": 1.985164830128977e-05, + "loss": 1.3182, + "step": 2788 + }, + { + "epoch": 0.830693050875854, + "grad_norm": 0.1894659548997879, + "learning_rate": 1.9851482725480896e-05, + "loss": 1.3044, + "step": 2789 + }, + { + "epoch": 0.8309908970755226, + "grad_norm": 0.19924505054950714, + "learning_rate": 1.9851317058014978e-05, + "loss": 1.3123, + "step": 2790 + }, + { + "epoch": 0.8312887432751913, + "grad_norm": 0.19592399895191193, + "learning_rate": 1.9851151298893563e-05, + "loss": 1.3243, + "step": 2791 + }, + { + "epoch": 0.83158658947486, + "grad_norm": 0.19199055433273315, + "learning_rate": 1.9850985448118192e-05, + "loss": 1.3114, + "step": 2792 + }, + { + "epoch": 0.8318844356745285, + "grad_norm": 0.1868075728416443, + "learning_rate": 1.9850819505690408e-05, + "loss": 1.3117, + "step": 2793 + }, + { + "epoch": 0.8321822818741972, + "grad_norm": 0.19179585576057434, + "learning_rate": 1.985065347161175e-05, + "loss": 1.3254, + "step": 2794 + }, + { + "epoch": 0.8324801280738658, + "grad_norm": 0.2259640246629715, + "learning_rate": 1.985048734588377e-05, + "loss": 1.3308, + "step": 2795 + }, + { + "epoch": 0.8327779742735345, + "grad_norm": 0.19590279459953308, + "learning_rate": 1.9850321128508013e-05, + "loss": 1.3217, + "step": 2796 + }, + { + "epoch": 0.8330758204732032, + "grad_norm": 0.19112949073314667, + "learning_rate": 1.985015481948602e-05, + "loss": 1.3131, + "step": 2797 + }, + { + "epoch": 0.8333736666728718, + "grad_norm": 0.20110125839710236, + "learning_rate": 1.9849988418819342e-05, + "loss": 1.3052, + "step": 2798 + }, + { + "epoch": 0.8336715128725405, + "grad_norm": 0.1952708512544632, + "learning_rate": 1.984982192650953e-05, + "loss": 1.2947, + "step": 2799 + }, + { + "epoch": 0.833969359072209, + "grad_norm": 0.18995912373065948, + "learning_rate": 1.9849655342558126e-05, + "loss": 1.3191, + "step": 2800 + }, + { + "epoch": 0.8342672052718777, + "grad_norm": 0.1994224488735199, + "learning_rate": 1.9849488666966686e-05, + "loss": 1.3229, + "step": 2801 + }, + { + "epoch": 0.8345650514715464, + "grad_norm": 0.19360418617725372, + "learning_rate": 1.9849321899736757e-05, + "loss": 1.3131, + "step": 2802 + }, + { + "epoch": 0.834862897671215, + "grad_norm": 0.20493118464946747, + "learning_rate": 1.9849155040869895e-05, + "loss": 1.3249, + "step": 2803 + }, + { + "epoch": 0.8351607438708837, + "grad_norm": 0.1936962753534317, + "learning_rate": 1.9848988090367648e-05, + "loss": 1.3174, + "step": 2804 + }, + { + "epoch": 0.8354585900705523, + "grad_norm": 0.18214362859725952, + "learning_rate": 1.9848821048231567e-05, + "loss": 1.3082, + "step": 2805 + }, + { + "epoch": 0.835756436270221, + "grad_norm": 0.21772930026054382, + "learning_rate": 1.9848653914463214e-05, + "loss": 1.3053, + "step": 2806 + }, + { + "epoch": 0.8360542824698897, + "grad_norm": 0.19386501610279083, + "learning_rate": 1.9848486689064138e-05, + "loss": 1.3072, + "step": 2807 + }, + { + "epoch": 0.8363521286695582, + "grad_norm": 0.183438241481781, + "learning_rate": 1.9848319372035898e-05, + "loss": 1.3009, + "step": 2808 + }, + { + "epoch": 0.8366499748692269, + "grad_norm": 0.21069328486919403, + "learning_rate": 1.9848151963380048e-05, + "loss": 1.3227, + "step": 2809 + }, + { + "epoch": 0.8369478210688955, + "grad_norm": 0.2096351981163025, + "learning_rate": 1.984798446309815e-05, + "loss": 1.313, + "step": 2810 + }, + { + "epoch": 0.8372456672685642, + "grad_norm": 0.20137789845466614, + "learning_rate": 1.9847816871191757e-05, + "loss": 1.3208, + "step": 2811 + }, + { + "epoch": 0.8375435134682329, + "grad_norm": 0.19148589670658112, + "learning_rate": 1.9847649187662433e-05, + "loss": 1.3121, + "step": 2812 + }, + { + "epoch": 0.8378413596679015, + "grad_norm": 0.19229096174240112, + "learning_rate": 1.9847481412511734e-05, + "loss": 1.2878, + "step": 2813 + }, + { + "epoch": 0.8381392058675702, + "grad_norm": 0.1903059333562851, + "learning_rate": 1.984731354574122e-05, + "loss": 1.3072, + "step": 2814 + }, + { + "epoch": 0.8384370520672387, + "grad_norm": 0.19770605862140656, + "learning_rate": 1.9847145587352458e-05, + "loss": 1.3059, + "step": 2815 + }, + { + "epoch": 0.8387348982669074, + "grad_norm": 0.20117489993572235, + "learning_rate": 1.984697753734701e-05, + "loss": 1.3155, + "step": 2816 + }, + { + "epoch": 0.839032744466576, + "grad_norm": 0.20878930389881134, + "learning_rate": 1.9846809395726433e-05, + "loss": 1.3177, + "step": 2817 + }, + { + "epoch": 0.8393305906662447, + "grad_norm": 0.19036711752414703, + "learning_rate": 1.9846641162492303e-05, + "loss": 1.3054, + "step": 2818 + }, + { + "epoch": 0.8396284368659134, + "grad_norm": 0.19927635788917542, + "learning_rate": 1.9846472837646173e-05, + "loss": 1.3266, + "step": 2819 + }, + { + "epoch": 0.839926283065582, + "grad_norm": 0.19310376048088074, + "learning_rate": 1.9846304421189618e-05, + "loss": 1.3124, + "step": 2820 + }, + { + "epoch": 0.8402241292652507, + "grad_norm": 0.19466964900493622, + "learning_rate": 1.9846135913124196e-05, + "loss": 1.3077, + "step": 2821 + }, + { + "epoch": 0.8405219754649194, + "grad_norm": 0.19182047247886658, + "learning_rate": 1.9845967313451484e-05, + "loss": 1.3104, + "step": 2822 + }, + { + "epoch": 0.8408198216645879, + "grad_norm": 0.18639199435710907, + "learning_rate": 1.9845798622173046e-05, + "loss": 1.3178, + "step": 2823 + }, + { + "epoch": 0.8411176678642566, + "grad_norm": 0.19101166725158691, + "learning_rate": 1.984562983929045e-05, + "loss": 1.3171, + "step": 2824 + }, + { + "epoch": 0.8414155140639252, + "grad_norm": 0.18609404563903809, + "learning_rate": 1.984546096480527e-05, + "loss": 1.3144, + "step": 2825 + }, + { + "epoch": 0.8417133602635939, + "grad_norm": 0.1885635405778885, + "learning_rate": 1.984529199871908e-05, + "loss": 1.3198, + "step": 2826 + }, + { + "epoch": 0.8420112064632626, + "grad_norm": 0.1921912133693695, + "learning_rate": 1.9845122941033443e-05, + "loss": 1.3247, + "step": 2827 + }, + { + "epoch": 0.8423090526629312, + "grad_norm": 0.21644912660121918, + "learning_rate": 1.9844953791749938e-05, + "loss": 1.3158, + "step": 2828 + }, + { + "epoch": 0.8426068988625999, + "grad_norm": 0.1851980835199356, + "learning_rate": 1.9844784550870137e-05, + "loss": 1.2949, + "step": 2829 + }, + { + "epoch": 0.8429047450622684, + "grad_norm": 0.19381019473075867, + "learning_rate": 1.9844615218395615e-05, + "loss": 1.3182, + "step": 2830 + }, + { + "epoch": 0.8432025912619371, + "grad_norm": 0.1965644806623459, + "learning_rate": 1.9844445794327947e-05, + "loss": 1.3122, + "step": 2831 + }, + { + "epoch": 0.8435004374616057, + "grad_norm": 0.20047470927238464, + "learning_rate": 1.984427627866871e-05, + "loss": 1.3124, + "step": 2832 + }, + { + "epoch": 0.8437982836612744, + "grad_norm": 0.1878950148820877, + "learning_rate": 1.984410667141948e-05, + "loss": 1.3033, + "step": 2833 + }, + { + "epoch": 0.8440961298609431, + "grad_norm": 0.18842440843582153, + "learning_rate": 1.9843936972581838e-05, + "loss": 1.3162, + "step": 2834 + }, + { + "epoch": 0.8443939760606117, + "grad_norm": 0.19343741238117218, + "learning_rate": 1.984376718215736e-05, + "loss": 1.3271, + "step": 2835 + }, + { + "epoch": 0.8446918222602804, + "grad_norm": 0.19063957035541534, + "learning_rate": 1.9843597300147625e-05, + "loss": 1.3083, + "step": 2836 + }, + { + "epoch": 0.8449896684599489, + "grad_norm": 0.20658956468105316, + "learning_rate": 1.9843427326554218e-05, + "loss": 1.3055, + "step": 2837 + }, + { + "epoch": 0.8452875146596176, + "grad_norm": 0.19133248925209045, + "learning_rate": 1.9843257261378717e-05, + "loss": 1.3177, + "step": 2838 + }, + { + "epoch": 0.8455853608592863, + "grad_norm": 0.1955590844154358, + "learning_rate": 1.9843087104622702e-05, + "loss": 1.2879, + "step": 2839 + }, + { + "epoch": 0.8458832070589549, + "grad_norm": 0.1960470825433731, + "learning_rate": 1.984291685628776e-05, + "loss": 1.3118, + "step": 2840 + }, + { + "epoch": 0.8461810532586236, + "grad_norm": 0.2028670758008957, + "learning_rate": 1.9842746516375474e-05, + "loss": 1.318, + "step": 2841 + }, + { + "epoch": 0.8464788994582922, + "grad_norm": 0.18661056458950043, + "learning_rate": 1.984257608488743e-05, + "loss": 1.2969, + "step": 2842 + }, + { + "epoch": 0.8467767456579609, + "grad_norm": 0.18581277132034302, + "learning_rate": 1.984240556182521e-05, + "loss": 1.3079, + "step": 2843 + }, + { + "epoch": 0.8470745918576296, + "grad_norm": 0.19332443177700043, + "learning_rate": 1.9842234947190406e-05, + "loss": 1.3074, + "step": 2844 + }, + { + "epoch": 0.8473724380572981, + "grad_norm": 0.19430364668369293, + "learning_rate": 1.98420642409846e-05, + "loss": 1.3131, + "step": 2845 + }, + { + "epoch": 0.8476702842569668, + "grad_norm": 0.1899929642677307, + "learning_rate": 1.9841893443209385e-05, + "loss": 1.3215, + "step": 2846 + }, + { + "epoch": 0.8479681304566354, + "grad_norm": 0.18724815547466278, + "learning_rate": 1.9841722553866344e-05, + "loss": 1.3109, + "step": 2847 + }, + { + "epoch": 0.8482659766563041, + "grad_norm": 0.19492894411087036, + "learning_rate": 1.984155157295707e-05, + "loss": 1.3271, + "step": 2848 + }, + { + "epoch": 0.8485638228559728, + "grad_norm": 0.18969008326530457, + "learning_rate": 1.9841380500483157e-05, + "loss": 1.3087, + "step": 2849 + }, + { + "epoch": 0.8488616690556414, + "grad_norm": 0.19285663962364197, + "learning_rate": 1.984120933644619e-05, + "loss": 1.2828, + "step": 2850 + }, + { + "epoch": 0.8491595152553101, + "grad_norm": 0.19459478557109833, + "learning_rate": 1.984103808084777e-05, + "loss": 1.3242, + "step": 2851 + }, + { + "epoch": 0.8494573614549786, + "grad_norm": 0.1840110868215561, + "learning_rate": 1.9840866733689482e-05, + "loss": 1.3112, + "step": 2852 + }, + { + "epoch": 0.8497552076546473, + "grad_norm": 0.19513243436813354, + "learning_rate": 1.9840695294972926e-05, + "loss": 1.3048, + "step": 2853 + }, + { + "epoch": 0.850053053854316, + "grad_norm": 0.1993199735879898, + "learning_rate": 1.98405237646997e-05, + "loss": 1.3422, + "step": 2854 + }, + { + "epoch": 0.8503509000539846, + "grad_norm": 0.1872604489326477, + "learning_rate": 1.9840352142871384e-05, + "loss": 1.3077, + "step": 2855 + }, + { + "epoch": 0.8506487462536533, + "grad_norm": 0.18576233088970184, + "learning_rate": 1.9840180429489593e-05, + "loss": 1.3156, + "step": 2856 + }, + { + "epoch": 0.8509465924533219, + "grad_norm": 0.18810655176639557, + "learning_rate": 1.9840008624555917e-05, + "loss": 1.3133, + "step": 2857 + }, + { + "epoch": 0.8512444386529906, + "grad_norm": 0.18760626018047333, + "learning_rate": 1.983983672807195e-05, + "loss": 1.2985, + "step": 2858 + }, + { + "epoch": 0.8515422848526593, + "grad_norm": 0.18436740338802338, + "learning_rate": 1.98396647400393e-05, + "loss": 1.3197, + "step": 2859 + }, + { + "epoch": 0.8518401310523278, + "grad_norm": 0.18469035625457764, + "learning_rate": 1.983949266045956e-05, + "loss": 1.3238, + "step": 2860 + }, + { + "epoch": 0.8521379772519965, + "grad_norm": 0.18595953285694122, + "learning_rate": 1.983932048933434e-05, + "loss": 1.3141, + "step": 2861 + }, + { + "epoch": 0.8524358234516651, + "grad_norm": 0.19043129682540894, + "learning_rate": 1.983914822666523e-05, + "loss": 1.3231, + "step": 2862 + }, + { + "epoch": 0.8527336696513338, + "grad_norm": 0.19436709582805634, + "learning_rate": 1.9838975872453842e-05, + "loss": 1.331, + "step": 2863 + }, + { + "epoch": 0.8530315158510025, + "grad_norm": 0.19808824360370636, + "learning_rate": 1.9838803426701774e-05, + "loss": 1.3049, + "step": 2864 + }, + { + "epoch": 0.8533293620506711, + "grad_norm": 0.18367069959640503, + "learning_rate": 1.9838630889410635e-05, + "loss": 1.3097, + "step": 2865 + }, + { + "epoch": 0.8536272082503398, + "grad_norm": 0.20176853239536285, + "learning_rate": 1.9838458260582024e-05, + "loss": 1.3348, + "step": 2866 + }, + { + "epoch": 0.8539250544500083, + "grad_norm": 0.1957586705684662, + "learning_rate": 1.9838285540217554e-05, + "loss": 1.3194, + "step": 2867 + }, + { + "epoch": 0.854222900649677, + "grad_norm": 0.19776873290538788, + "learning_rate": 1.983811272831883e-05, + "loss": 1.3224, + "step": 2868 + }, + { + "epoch": 0.8545207468493456, + "grad_norm": 0.1966826468706131, + "learning_rate": 1.983793982488746e-05, + "loss": 1.3134, + "step": 2869 + }, + { + "epoch": 0.8548185930490143, + "grad_norm": 0.19042354822158813, + "learning_rate": 1.9837766829925045e-05, + "loss": 1.2999, + "step": 2870 + }, + { + "epoch": 0.855116439248683, + "grad_norm": 0.20047244429588318, + "learning_rate": 1.9837593743433207e-05, + "loss": 1.3303, + "step": 2871 + }, + { + "epoch": 0.8554142854483516, + "grad_norm": 0.20083603262901306, + "learning_rate": 1.9837420565413546e-05, + "loss": 1.3063, + "step": 2872 + }, + { + "epoch": 0.8557121316480203, + "grad_norm": 0.2025173008441925, + "learning_rate": 1.9837247295867682e-05, + "loss": 1.3232, + "step": 2873 + }, + { + "epoch": 0.856009977847689, + "grad_norm": 0.20584246516227722, + "learning_rate": 1.9837073934797222e-05, + "loss": 1.322, + "step": 2874 + }, + { + "epoch": 0.8563078240473575, + "grad_norm": 0.1971740424633026, + "learning_rate": 1.9836900482203778e-05, + "loss": 1.3105, + "step": 2875 + }, + { + "epoch": 0.8566056702470262, + "grad_norm": 0.18648609519004822, + "learning_rate": 1.9836726938088966e-05, + "loss": 1.3218, + "step": 2876 + }, + { + "epoch": 0.8569035164466948, + "grad_norm": 0.2044125497341156, + "learning_rate": 1.9836553302454402e-05, + "loss": 1.3245, + "step": 2877 + }, + { + "epoch": 0.8572013626463635, + "grad_norm": 0.20012997090816498, + "learning_rate": 1.9836379575301696e-05, + "loss": 1.3213, + "step": 2878 + }, + { + "epoch": 0.8574992088460321, + "grad_norm": 0.1945681869983673, + "learning_rate": 1.983620575663247e-05, + "loss": 1.3169, + "step": 2879 + }, + { + "epoch": 0.8577970550457008, + "grad_norm": 0.19905398786067963, + "learning_rate": 1.9836031846448338e-05, + "loss": 1.3251, + "step": 2880 + }, + { + "epoch": 0.8580949012453695, + "grad_norm": 0.2026236206293106, + "learning_rate": 1.9835857844750922e-05, + "loss": 1.3394, + "step": 2881 + }, + { + "epoch": 0.858392747445038, + "grad_norm": 0.19498775899410248, + "learning_rate": 1.9835683751541835e-05, + "loss": 1.2935, + "step": 2882 + }, + { + "epoch": 0.8586905936447067, + "grad_norm": 0.20352014899253845, + "learning_rate": 1.9835509566822703e-05, + "loss": 1.3141, + "step": 2883 + }, + { + "epoch": 0.8589884398443753, + "grad_norm": 0.19510705769062042, + "learning_rate": 1.983533529059514e-05, + "loss": 1.3114, + "step": 2884 + }, + { + "epoch": 0.859286286044044, + "grad_norm": 0.19283436238765717, + "learning_rate": 1.983516092286077e-05, + "loss": 1.3209, + "step": 2885 + }, + { + "epoch": 0.8595841322437127, + "grad_norm": 0.1914401352405548, + "learning_rate": 1.9834986463621218e-05, + "loss": 1.3057, + "step": 2886 + }, + { + "epoch": 0.8598819784433813, + "grad_norm": 0.19460298120975494, + "learning_rate": 1.9834811912878106e-05, + "loss": 1.2959, + "step": 2887 + }, + { + "epoch": 0.86017982464305, + "grad_norm": 0.19352369010448456, + "learning_rate": 1.9834637270633057e-05, + "loss": 1.295, + "step": 2888 + }, + { + "epoch": 0.8604776708427186, + "grad_norm": 0.19900284707546234, + "learning_rate": 1.9834462536887694e-05, + "loss": 1.3332, + "step": 2889 + }, + { + "epoch": 0.8607755170423872, + "grad_norm": 0.19056642055511475, + "learning_rate": 1.9834287711643647e-05, + "loss": 1.297, + "step": 2890 + }, + { + "epoch": 0.8610733632420559, + "grad_norm": 0.19439257681369781, + "learning_rate": 1.983411279490254e-05, + "loss": 1.3127, + "step": 2891 + }, + { + "epoch": 0.8613712094417245, + "grad_norm": 0.18397583067417145, + "learning_rate": 1.9833937786666e-05, + "loss": 1.3181, + "step": 2892 + }, + { + "epoch": 0.8616690556413932, + "grad_norm": 0.19947071373462677, + "learning_rate": 1.9833762686935656e-05, + "loss": 1.3075, + "step": 2893 + }, + { + "epoch": 0.8619669018410618, + "grad_norm": 0.18260259926319122, + "learning_rate": 1.9833587495713138e-05, + "loss": 1.3129, + "step": 2894 + }, + { + "epoch": 0.8622647480407305, + "grad_norm": 0.1963987797498703, + "learning_rate": 1.9833412213000077e-05, + "loss": 1.3141, + "step": 2895 + }, + { + "epoch": 0.8625625942403992, + "grad_norm": 0.2164333015680313, + "learning_rate": 1.9833236838798097e-05, + "loss": 1.3123, + "step": 2896 + }, + { + "epoch": 0.8628604404400677, + "grad_norm": 0.19688157737255096, + "learning_rate": 1.9833061373108836e-05, + "loss": 1.3178, + "step": 2897 + }, + { + "epoch": 0.8631582866397364, + "grad_norm": 0.1961808204650879, + "learning_rate": 1.9832885815933926e-05, + "loss": 1.313, + "step": 2898 + }, + { + "epoch": 0.863456132839405, + "grad_norm": 0.1982816457748413, + "learning_rate": 1.9832710167275002e-05, + "loss": 1.3211, + "step": 2899 + }, + { + "epoch": 0.8637539790390737, + "grad_norm": 0.20556198060512543, + "learning_rate": 1.983253442713369e-05, + "loss": 1.3202, + "step": 2900 + }, + { + "epoch": 0.8640518252387424, + "grad_norm": 0.2019490897655487, + "learning_rate": 1.9832358595511638e-05, + "loss": 1.3018, + "step": 2901 + }, + { + "epoch": 0.864349671438411, + "grad_norm": 0.19240225851535797, + "learning_rate": 1.9832182672410468e-05, + "loss": 1.3138, + "step": 2902 + }, + { + "epoch": 0.8646475176380797, + "grad_norm": 0.19900253415107727, + "learning_rate": 1.9832006657831827e-05, + "loss": 1.3014, + "step": 2903 + }, + { + "epoch": 0.8649453638377483, + "grad_norm": 0.19329264760017395, + "learning_rate": 1.9831830551777348e-05, + "loss": 1.3122, + "step": 2904 + }, + { + "epoch": 0.8652432100374169, + "grad_norm": 0.20352724194526672, + "learning_rate": 1.983165435424867e-05, + "loss": 1.3035, + "step": 2905 + }, + { + "epoch": 0.8655410562370855, + "grad_norm": 0.22703319787979126, + "learning_rate": 1.9831478065247434e-05, + "loss": 1.2987, + "step": 2906 + }, + { + "epoch": 0.8658389024367542, + "grad_norm": 0.19756467640399933, + "learning_rate": 1.9831301684775276e-05, + "loss": 1.306, + "step": 2907 + }, + { + "epoch": 0.8661367486364229, + "grad_norm": 0.21882663667201996, + "learning_rate": 1.9831125212833842e-05, + "loss": 1.3342, + "step": 2908 + }, + { + "epoch": 0.8664345948360915, + "grad_norm": 0.20317678153514862, + "learning_rate": 1.9830948649424773e-05, + "loss": 1.3162, + "step": 2909 + }, + { + "epoch": 0.8667324410357602, + "grad_norm": 0.20027881860733032, + "learning_rate": 1.9830771994549712e-05, + "loss": 1.3061, + "step": 2910 + }, + { + "epoch": 0.8670302872354289, + "grad_norm": 0.19904261827468872, + "learning_rate": 1.98305952482103e-05, + "loss": 1.3042, + "step": 2911 + }, + { + "epoch": 0.8673281334350974, + "grad_norm": 0.2004762887954712, + "learning_rate": 1.983041841040818e-05, + "loss": 1.2973, + "step": 2912 + }, + { + "epoch": 0.8676259796347661, + "grad_norm": 0.20205415785312653, + "learning_rate": 1.9830241481145e-05, + "loss": 1.3091, + "step": 2913 + }, + { + "epoch": 0.8679238258344347, + "grad_norm": 0.2154819518327713, + "learning_rate": 1.983006446042241e-05, + "loss": 1.3296, + "step": 2914 + }, + { + "epoch": 0.8682216720341034, + "grad_norm": 0.19992339611053467, + "learning_rate": 1.982988734824205e-05, + "loss": 1.3025, + "step": 2915 + }, + { + "epoch": 0.868519518233772, + "grad_norm": 0.19117744266986847, + "learning_rate": 1.9829710144605568e-05, + "loss": 1.313, + "step": 2916 + }, + { + "epoch": 0.8688173644334407, + "grad_norm": 0.20051947236061096, + "learning_rate": 1.9829532849514623e-05, + "loss": 1.3172, + "step": 2917 + }, + { + "epoch": 0.8691152106331094, + "grad_norm": 0.18870866298675537, + "learning_rate": 1.9829355462970852e-05, + "loss": 1.308, + "step": 2918 + }, + { + "epoch": 0.8694130568327779, + "grad_norm": 0.18585895001888275, + "learning_rate": 1.9829177984975912e-05, + "loss": 1.3361, + "step": 2919 + }, + { + "epoch": 0.8697109030324466, + "grad_norm": 0.19265083968639374, + "learning_rate": 1.982900041553145e-05, + "loss": 1.2989, + "step": 2920 + }, + { + "epoch": 0.8700087492321152, + "grad_norm": 0.20708735287189484, + "learning_rate": 1.9828822754639124e-05, + "loss": 1.3295, + "step": 2921 + }, + { + "epoch": 0.8703065954317839, + "grad_norm": 0.19332322478294373, + "learning_rate": 1.982864500230058e-05, + "loss": 1.3164, + "step": 2922 + }, + { + "epoch": 0.8706044416314526, + "grad_norm": 0.19034935534000397, + "learning_rate": 1.982846715851748e-05, + "loss": 1.2992, + "step": 2923 + }, + { + "epoch": 0.8709022878311212, + "grad_norm": 0.21060289442539215, + "learning_rate": 1.982828922329147e-05, + "loss": 1.3204, + "step": 2924 + }, + { + "epoch": 0.8712001340307899, + "grad_norm": 0.20401045680046082, + "learning_rate": 1.9828111196624212e-05, + "loss": 1.2966, + "step": 2925 + }, + { + "epoch": 0.8714979802304585, + "grad_norm": 0.1838827133178711, + "learning_rate": 1.982793307851736e-05, + "loss": 1.3039, + "step": 2926 + }, + { + "epoch": 0.8717958264301271, + "grad_norm": 0.20430974662303925, + "learning_rate": 1.9827754868972572e-05, + "loss": 1.3115, + "step": 2927 + }, + { + "epoch": 0.8720936726297958, + "grad_norm": 0.19398505985736847, + "learning_rate": 1.9827576567991503e-05, + "loss": 1.3243, + "step": 2928 + }, + { + "epoch": 0.8723915188294644, + "grad_norm": 0.1987292468547821, + "learning_rate": 1.9827398175575815e-05, + "loss": 1.3095, + "step": 2929 + }, + { + "epoch": 0.8726893650291331, + "grad_norm": 0.19124366343021393, + "learning_rate": 1.9827219691727167e-05, + "loss": 1.3105, + "step": 2930 + }, + { + "epoch": 0.8729872112288017, + "grad_norm": 0.19534295797348022, + "learning_rate": 1.9827041116447217e-05, + "loss": 1.3082, + "step": 2931 + }, + { + "epoch": 0.8732850574284704, + "grad_norm": 0.20367717742919922, + "learning_rate": 1.9826862449737633e-05, + "loss": 1.3146, + "step": 2932 + }, + { + "epoch": 0.8735829036281391, + "grad_norm": 0.19920623302459717, + "learning_rate": 1.982668369160007e-05, + "loss": 1.3078, + "step": 2933 + }, + { + "epoch": 0.8738807498278076, + "grad_norm": 0.19633528590202332, + "learning_rate": 1.9826504842036193e-05, + "loss": 1.3069, + "step": 2934 + }, + { + "epoch": 0.8741785960274763, + "grad_norm": 0.20765548944473267, + "learning_rate": 1.982632590104767e-05, + "loss": 1.3167, + "step": 2935 + }, + { + "epoch": 0.8744764422271449, + "grad_norm": 0.20531393587589264, + "learning_rate": 1.9826146868636162e-05, + "loss": 1.3116, + "step": 2936 + }, + { + "epoch": 0.8747742884268136, + "grad_norm": 0.2838412821292877, + "learning_rate": 1.9825967744803337e-05, + "loss": 1.3075, + "step": 2937 + }, + { + "epoch": 0.8750721346264823, + "grad_norm": 0.2089710533618927, + "learning_rate": 1.982578852955086e-05, + "loss": 1.3105, + "step": 2938 + }, + { + "epoch": 0.8753699808261509, + "grad_norm": 0.2029941827058792, + "learning_rate": 1.9825609222880396e-05, + "loss": 1.2992, + "step": 2939 + }, + { + "epoch": 0.8756678270258196, + "grad_norm": 0.19510619342327118, + "learning_rate": 1.982542982479362e-05, + "loss": 1.3293, + "step": 2940 + }, + { + "epoch": 0.8759656732254882, + "grad_norm": 0.19722811877727509, + "learning_rate": 1.9825250335292196e-05, + "loss": 1.3185, + "step": 2941 + }, + { + "epoch": 0.8762635194251568, + "grad_norm": 0.1850764900445938, + "learning_rate": 1.9825070754377794e-05, + "loss": 1.3113, + "step": 2942 + }, + { + "epoch": 0.8765613656248255, + "grad_norm": 0.18542790412902832, + "learning_rate": 1.9824891082052088e-05, + "loss": 1.2884, + "step": 2943 + }, + { + "epoch": 0.8768592118244941, + "grad_norm": 0.2818898558616638, + "learning_rate": 1.9824711318316744e-05, + "loss": 1.3054, + "step": 2944 + }, + { + "epoch": 0.8771570580241628, + "grad_norm": 0.19794979691505432, + "learning_rate": 1.9824531463173443e-05, + "loss": 1.3149, + "step": 2945 + }, + { + "epoch": 0.8774549042238314, + "grad_norm": 0.19850584864616394, + "learning_rate": 1.982435151662385e-05, + "loss": 1.3059, + "step": 2946 + }, + { + "epoch": 0.8777527504235001, + "grad_norm": 0.1981966346502304, + "learning_rate": 1.982417147866964e-05, + "loss": 1.3225, + "step": 2947 + }, + { + "epoch": 0.8780505966231688, + "grad_norm": 0.19317342340946198, + "learning_rate": 1.9823991349312495e-05, + "loss": 1.2975, + "step": 2948 + }, + { + "epoch": 0.8783484428228373, + "grad_norm": 0.19680213928222656, + "learning_rate": 1.9823811128554084e-05, + "loss": 1.3083, + "step": 2949 + }, + { + "epoch": 0.878646289022506, + "grad_norm": 0.1831223964691162, + "learning_rate": 1.9823630816396087e-05, + "loss": 1.3169, + "step": 2950 + }, + { + "epoch": 0.8789441352221746, + "grad_norm": 0.19879232347011566, + "learning_rate": 1.9823450412840183e-05, + "loss": 1.296, + "step": 2951 + }, + { + "epoch": 0.8792419814218433, + "grad_norm": 0.20635774731636047, + "learning_rate": 1.9823269917888045e-05, + "loss": 1.3359, + "step": 2952 + }, + { + "epoch": 0.879539827621512, + "grad_norm": 0.31576263904571533, + "learning_rate": 1.9823089331541357e-05, + "loss": 1.308, + "step": 2953 + }, + { + "epoch": 0.8798376738211806, + "grad_norm": 0.19298005104064941, + "learning_rate": 1.9822908653801796e-05, + "loss": 1.3243, + "step": 2954 + }, + { + "epoch": 0.8801355200208493, + "grad_norm": 0.19962824881076813, + "learning_rate": 1.9822727884671046e-05, + "loss": 1.3214, + "step": 2955 + }, + { + "epoch": 0.8804333662205179, + "grad_norm": 0.19718356430530548, + "learning_rate": 1.982254702415079e-05, + "loss": 1.3124, + "step": 2956 + }, + { + "epoch": 0.8807312124201865, + "grad_norm": 0.19522196054458618, + "learning_rate": 1.9822366072242704e-05, + "loss": 1.3185, + "step": 2957 + }, + { + "epoch": 0.8810290586198551, + "grad_norm": 0.19573675096035004, + "learning_rate": 1.9822185028948483e-05, + "loss": 1.2923, + "step": 2958 + }, + { + "epoch": 0.8813269048195238, + "grad_norm": 0.1976947784423828, + "learning_rate": 1.9822003894269795e-05, + "loss": 1.3237, + "step": 2959 + }, + { + "epoch": 0.8816247510191925, + "grad_norm": 0.20155999064445496, + "learning_rate": 1.982182266820834e-05, + "loss": 1.2854, + "step": 2960 + }, + { + "epoch": 0.8819225972188611, + "grad_norm": 0.1935465931892395, + "learning_rate": 1.98216413507658e-05, + "loss": 1.3018, + "step": 2961 + }, + { + "epoch": 0.8822204434185298, + "grad_norm": 0.1943049430847168, + "learning_rate": 1.9821459941943857e-05, + "loss": 1.3169, + "step": 2962 + }, + { + "epoch": 0.8825182896181984, + "grad_norm": 0.19589929282665253, + "learning_rate": 1.9821278441744205e-05, + "loss": 1.3065, + "step": 2963 + }, + { + "epoch": 0.882816135817867, + "grad_norm": 0.18929025530815125, + "learning_rate": 1.982109685016853e-05, + "loss": 1.307, + "step": 2964 + }, + { + "epoch": 0.8831139820175357, + "grad_norm": 0.19319988787174225, + "learning_rate": 1.9820915167218517e-05, + "loss": 1.3358, + "step": 2965 + }, + { + "epoch": 0.8834118282172043, + "grad_norm": 0.19552074372768402, + "learning_rate": 1.9820733392895865e-05, + "loss": 1.3163, + "step": 2966 + }, + { + "epoch": 0.883709674416873, + "grad_norm": 0.1948016732931137, + "learning_rate": 1.9820551527202262e-05, + "loss": 1.3193, + "step": 2967 + }, + { + "epoch": 0.8840075206165416, + "grad_norm": 0.19571538269519806, + "learning_rate": 1.9820369570139397e-05, + "loss": 1.316, + "step": 2968 + }, + { + "epoch": 0.8843053668162103, + "grad_norm": 0.1959875226020813, + "learning_rate": 1.9820187521708966e-05, + "loss": 1.3061, + "step": 2969 + }, + { + "epoch": 0.884603213015879, + "grad_norm": 0.2011423259973526, + "learning_rate": 1.9820005381912662e-05, + "loss": 1.3186, + "step": 2970 + }, + { + "epoch": 0.8849010592155476, + "grad_norm": 0.19470910727977753, + "learning_rate": 1.981982315075218e-05, + "loss": 1.3121, + "step": 2971 + }, + { + "epoch": 0.8851989054152162, + "grad_norm": 0.20103910565376282, + "learning_rate": 1.9819640828229214e-05, + "loss": 1.3182, + "step": 2972 + }, + { + "epoch": 0.8854967516148848, + "grad_norm": 0.19423173367977142, + "learning_rate": 1.9819458414345462e-05, + "loss": 1.3048, + "step": 2973 + }, + { + "epoch": 0.8857945978145535, + "grad_norm": 0.19678495824337006, + "learning_rate": 1.981927590910262e-05, + "loss": 1.3117, + "step": 2974 + }, + { + "epoch": 0.8860924440142222, + "grad_norm": 0.19346599280834198, + "learning_rate": 1.9819093312502383e-05, + "loss": 1.3251, + "step": 2975 + }, + { + "epoch": 0.8863902902138908, + "grad_norm": 0.1933828890323639, + "learning_rate": 1.981891062454646e-05, + "loss": 1.312, + "step": 2976 + }, + { + "epoch": 0.8866881364135595, + "grad_norm": 0.21723021566867828, + "learning_rate": 1.9818727845236537e-05, + "loss": 1.3115, + "step": 2977 + }, + { + "epoch": 0.8869859826132281, + "grad_norm": 0.20165273547172546, + "learning_rate": 1.9818544974574327e-05, + "loss": 1.3037, + "step": 2978 + }, + { + "epoch": 0.8872838288128967, + "grad_norm": 0.19502496719360352, + "learning_rate": 1.9818362012561525e-05, + "loss": 1.3188, + "step": 2979 + }, + { + "epoch": 0.8875816750125654, + "grad_norm": 0.19082093238830566, + "learning_rate": 1.981817895919983e-05, + "loss": 1.3179, + "step": 2980 + }, + { + "epoch": 0.887879521212234, + "grad_norm": 0.19495829939842224, + "learning_rate": 1.9817995814490956e-05, + "loss": 1.315, + "step": 2981 + }, + { + "epoch": 0.8881773674119027, + "grad_norm": 0.19247324764728546, + "learning_rate": 1.98178125784366e-05, + "loss": 1.3182, + "step": 2982 + }, + { + "epoch": 0.8884752136115713, + "grad_norm": 0.19553199410438538, + "learning_rate": 1.981762925103846e-05, + "loss": 1.2941, + "step": 2983 + }, + { + "epoch": 0.88877305981124, + "grad_norm": 0.1885903775691986, + "learning_rate": 1.981744583229825e-05, + "loss": 1.2995, + "step": 2984 + }, + { + "epoch": 0.8890709060109087, + "grad_norm": 0.19529998302459717, + "learning_rate": 1.981726232221768e-05, + "loss": 1.3087, + "step": 2985 + }, + { + "epoch": 0.8893687522105773, + "grad_norm": 0.1879928857088089, + "learning_rate": 1.981707872079845e-05, + "loss": 1.3053, + "step": 2986 + }, + { + "epoch": 0.8896665984102459, + "grad_norm": 0.21500670909881592, + "learning_rate": 1.9816895028042272e-05, + "loss": 1.3084, + "step": 2987 + }, + { + "epoch": 0.8899644446099145, + "grad_norm": 0.19979077577590942, + "learning_rate": 1.9816711243950852e-05, + "loss": 1.2911, + "step": 2988 + }, + { + "epoch": 0.8902622908095832, + "grad_norm": 0.18259525299072266, + "learning_rate": 1.98165273685259e-05, + "loss": 1.3061, + "step": 2989 + }, + { + "epoch": 0.8905601370092519, + "grad_norm": 0.19319696724414825, + "learning_rate": 1.9816343401769136e-05, + "loss": 1.3174, + "step": 2990 + }, + { + "epoch": 0.8908579832089205, + "grad_norm": 0.19616177678108215, + "learning_rate": 1.9816159343682256e-05, + "loss": 1.2965, + "step": 2991 + }, + { + "epoch": 0.8911558294085892, + "grad_norm": 0.19472496211528778, + "learning_rate": 1.9815975194266986e-05, + "loss": 1.3183, + "step": 2992 + }, + { + "epoch": 0.8914536756082578, + "grad_norm": 0.19293633103370667, + "learning_rate": 1.9815790953525033e-05, + "loss": 1.305, + "step": 2993 + }, + { + "epoch": 0.8917515218079264, + "grad_norm": 0.19016510248184204, + "learning_rate": 1.981560662145811e-05, + "loss": 1.2958, + "step": 2994 + }, + { + "epoch": 0.892049368007595, + "grad_norm": 0.19922995567321777, + "learning_rate": 1.9815422198067935e-05, + "loss": 1.3044, + "step": 2995 + }, + { + "epoch": 0.8923472142072637, + "grad_norm": 0.19845548272132874, + "learning_rate": 1.981523768335622e-05, + "loss": 1.3211, + "step": 2996 + }, + { + "epoch": 0.8926450604069324, + "grad_norm": 0.20516902208328247, + "learning_rate": 1.981505307732469e-05, + "loss": 1.3338, + "step": 2997 + }, + { + "epoch": 0.892942906606601, + "grad_norm": 0.21636910736560822, + "learning_rate": 1.981486837997505e-05, + "loss": 1.3116, + "step": 2998 + }, + { + "epoch": 0.8932407528062697, + "grad_norm": 0.20371657609939575, + "learning_rate": 1.9814683591309034e-05, + "loss": 1.3135, + "step": 2999 + }, + { + "epoch": 0.8935385990059383, + "grad_norm": 0.19144035875797272, + "learning_rate": 1.9814498711328348e-05, + "loss": 1.3089, + "step": 3000 + }, + { + "epoch": 0.8935385990059383, + "eval_loss": 1.3625597953796387, + "eval_runtime": 21.0759, + "eval_samples_per_second": 82.274, + "eval_steps_per_second": 5.172, + "step": 3000 + }, + { + "epoch": 0.8938364452056069, + "grad_norm": 0.19272580742835999, + "learning_rate": 1.9814313740034715e-05, + "loss": 1.301, + "step": 3001 + }, + { + "epoch": 0.8941342914052756, + "grad_norm": 0.19311420619487762, + "learning_rate": 1.9814128677429864e-05, + "loss": 1.3022, + "step": 3002 + }, + { + "epoch": 0.8944321376049442, + "grad_norm": 0.1974402368068695, + "learning_rate": 1.9813943523515507e-05, + "loss": 1.3036, + "step": 3003 + }, + { + "epoch": 0.8947299838046129, + "grad_norm": 0.187892347574234, + "learning_rate": 1.981375827829337e-05, + "loss": 1.2897, + "step": 3004 + }, + { + "epoch": 0.8950278300042815, + "grad_norm": 0.19705121219158173, + "learning_rate": 1.9813572941765176e-05, + "loss": 1.3024, + "step": 3005 + }, + { + "epoch": 0.8953256762039502, + "grad_norm": 0.2087492048740387, + "learning_rate": 1.981338751393265e-05, + "loss": 1.3222, + "step": 3006 + }, + { + "epoch": 0.8956235224036189, + "grad_norm": 0.19134962558746338, + "learning_rate": 1.981320199479752e-05, + "loss": 1.2942, + "step": 3007 + }, + { + "epoch": 0.8959213686032875, + "grad_norm": 0.1930139660835266, + "learning_rate": 1.981301638436151e-05, + "loss": 1.329, + "step": 3008 + }, + { + "epoch": 0.8962192148029561, + "grad_norm": 0.1902092695236206, + "learning_rate": 1.9812830682626342e-05, + "loss": 1.3007, + "step": 3009 + }, + { + "epoch": 0.8965170610026247, + "grad_norm": 0.1995134800672531, + "learning_rate": 1.9812644889593752e-05, + "loss": 1.3157, + "step": 3010 + }, + { + "epoch": 0.8968149072022934, + "grad_norm": 0.19240212440490723, + "learning_rate": 1.9812459005265463e-05, + "loss": 1.3122, + "step": 3011 + }, + { + "epoch": 0.8971127534019621, + "grad_norm": 0.20270533859729767, + "learning_rate": 1.9812273029643205e-05, + "loss": 1.3035, + "step": 3012 + }, + { + "epoch": 0.8974105996016307, + "grad_norm": 0.1889352947473526, + "learning_rate": 1.981208696272871e-05, + "loss": 1.3073, + "step": 3013 + }, + { + "epoch": 0.8977084458012994, + "grad_norm": 0.1877407431602478, + "learning_rate": 1.981190080452371e-05, + "loss": 1.2942, + "step": 3014 + }, + { + "epoch": 0.898006292000968, + "grad_norm": 0.192073792219162, + "learning_rate": 1.9811714555029937e-05, + "loss": 1.2944, + "step": 3015 + }, + { + "epoch": 0.8983041382006366, + "grad_norm": 0.20263244211673737, + "learning_rate": 1.9811528214249122e-05, + "loss": 1.3089, + "step": 3016 + }, + { + "epoch": 0.8986019844003053, + "grad_norm": 0.19734568893909454, + "learning_rate": 1.9811341782182994e-05, + "loss": 1.3108, + "step": 3017 + }, + { + "epoch": 0.8988998305999739, + "grad_norm": 0.19960430264472961, + "learning_rate": 1.9811155258833294e-05, + "loss": 1.3119, + "step": 3018 + }, + { + "epoch": 0.8991976767996426, + "grad_norm": 0.1904885470867157, + "learning_rate": 1.981096864420176e-05, + "loss": 1.291, + "step": 3019 + }, + { + "epoch": 0.8994955229993112, + "grad_norm": 0.19666573405265808, + "learning_rate": 1.9810781938290124e-05, + "loss": 1.2999, + "step": 3020 + }, + { + "epoch": 0.8997933691989799, + "grad_norm": 0.1923913210630417, + "learning_rate": 1.9810595141100116e-05, + "loss": 1.3184, + "step": 3021 + }, + { + "epoch": 0.9000912153986486, + "grad_norm": 0.19742454588413239, + "learning_rate": 1.981040825263349e-05, + "loss": 1.3083, + "step": 3022 + }, + { + "epoch": 0.9003890615983172, + "grad_norm": 0.19502176344394684, + "learning_rate": 1.981022127289197e-05, + "loss": 1.3085, + "step": 3023 + }, + { + "epoch": 0.9006869077979858, + "grad_norm": 0.19793961942195892, + "learning_rate": 1.9810034201877304e-05, + "loss": 1.3189, + "step": 3024 + }, + { + "epoch": 0.9009847539976544, + "grad_norm": 0.19931679964065552, + "learning_rate": 1.980984703959123e-05, + "loss": 1.3224, + "step": 3025 + }, + { + "epoch": 0.9012826001973231, + "grad_norm": 0.198928564786911, + "learning_rate": 1.980965978603549e-05, + "loss": 1.3149, + "step": 3026 + }, + { + "epoch": 0.9015804463969918, + "grad_norm": 0.19738368690013885, + "learning_rate": 1.9809472441211826e-05, + "loss": 1.3177, + "step": 3027 + }, + { + "epoch": 0.9018782925966604, + "grad_norm": 0.1833115518093109, + "learning_rate": 1.980928500512198e-05, + "loss": 1.313, + "step": 3028 + }, + { + "epoch": 0.9021761387963291, + "grad_norm": 0.19273918867111206, + "learning_rate": 1.9809097477767695e-05, + "loss": 1.3233, + "step": 3029 + }, + { + "epoch": 0.9024739849959977, + "grad_norm": 0.19289207458496094, + "learning_rate": 1.980890985915072e-05, + "loss": 1.2983, + "step": 3030 + }, + { + "epoch": 0.9027718311956663, + "grad_norm": 0.2008187621831894, + "learning_rate": 1.98087221492728e-05, + "loss": 1.3066, + "step": 3031 + }, + { + "epoch": 0.903069677395335, + "grad_norm": 0.20150333642959595, + "learning_rate": 1.9808534348135676e-05, + "loss": 1.2904, + "step": 3032 + }, + { + "epoch": 0.9033675235950036, + "grad_norm": 0.19519232213497162, + "learning_rate": 1.9808346455741097e-05, + "loss": 1.3192, + "step": 3033 + }, + { + "epoch": 0.9036653697946723, + "grad_norm": 0.19195297360420227, + "learning_rate": 1.9808158472090814e-05, + "loss": 1.3096, + "step": 3034 + }, + { + "epoch": 0.9039632159943409, + "grad_norm": 0.2011096477508545, + "learning_rate": 1.980797039718658e-05, + "loss": 1.3089, + "step": 3035 + }, + { + "epoch": 0.9042610621940096, + "grad_norm": 0.18987266719341278, + "learning_rate": 1.980778223103013e-05, + "loss": 1.2981, + "step": 3036 + }, + { + "epoch": 0.9045589083936783, + "grad_norm": 0.20272229611873627, + "learning_rate": 1.980759397362323e-05, + "loss": 1.3111, + "step": 3037 + }, + { + "epoch": 0.9048567545933469, + "grad_norm": 0.20297445356845856, + "learning_rate": 1.9807405624967627e-05, + "loss": 1.3028, + "step": 3038 + }, + { + "epoch": 0.9051546007930155, + "grad_norm": 0.1968143731355667, + "learning_rate": 1.980721718506507e-05, + "loss": 1.2874, + "step": 3039 + }, + { + "epoch": 0.9054524469926841, + "grad_norm": 0.19670012593269348, + "learning_rate": 1.9807028653917315e-05, + "loss": 1.2864, + "step": 3040 + }, + { + "epoch": 0.9057502931923528, + "grad_norm": 0.1936255693435669, + "learning_rate": 1.9806840031526113e-05, + "loss": 1.3033, + "step": 3041 + }, + { + "epoch": 0.9060481393920214, + "grad_norm": 0.2041553556919098, + "learning_rate": 1.9806651317893224e-05, + "loss": 1.314, + "step": 3042 + }, + { + "epoch": 0.9063459855916901, + "grad_norm": 0.20097097754478455, + "learning_rate": 1.9806462513020402e-05, + "loss": 1.3107, + "step": 3043 + }, + { + "epoch": 0.9066438317913588, + "grad_norm": 0.20446543395519257, + "learning_rate": 1.9806273616909403e-05, + "loss": 1.3226, + "step": 3044 + }, + { + "epoch": 0.9069416779910274, + "grad_norm": 0.19455336034297943, + "learning_rate": 1.980608462956198e-05, + "loss": 1.3077, + "step": 3045 + }, + { + "epoch": 0.907239524190696, + "grad_norm": 0.20479156076908112, + "learning_rate": 1.9805895550979898e-05, + "loss": 1.3041, + "step": 3046 + }, + { + "epoch": 0.9075373703903646, + "grad_norm": 0.1962958574295044, + "learning_rate": 1.9805706381164917e-05, + "loss": 1.3184, + "step": 3047 + }, + { + "epoch": 0.9078352165900333, + "grad_norm": 0.2033453732728958, + "learning_rate": 1.9805517120118788e-05, + "loss": 1.3003, + "step": 3048 + }, + { + "epoch": 0.908133062789702, + "grad_norm": 0.21646438539028168, + "learning_rate": 1.980532776784328e-05, + "loss": 1.3184, + "step": 3049 + }, + { + "epoch": 0.9084309089893706, + "grad_norm": 0.20765748620033264, + "learning_rate": 1.9805138324340152e-05, + "loss": 1.3167, + "step": 3050 + }, + { + "epoch": 0.9087287551890393, + "grad_norm": 0.2132749706506729, + "learning_rate": 1.9804948789611166e-05, + "loss": 1.3433, + "step": 3051 + }, + { + "epoch": 0.9090266013887079, + "grad_norm": 0.20811358094215393, + "learning_rate": 1.980475916365809e-05, + "loss": 1.3159, + "step": 3052 + }, + { + "epoch": 0.9093244475883766, + "grad_norm": 0.2119991034269333, + "learning_rate": 1.980456944648268e-05, + "loss": 1.3237, + "step": 3053 + }, + { + "epoch": 0.9096222937880452, + "grad_norm": 0.20286352932453156, + "learning_rate": 1.9804379638086706e-05, + "loss": 1.3194, + "step": 3054 + }, + { + "epoch": 0.9099201399877138, + "grad_norm": 0.20021235942840576, + "learning_rate": 1.9804189738471935e-05, + "loss": 1.2853, + "step": 3055 + }, + { + "epoch": 0.9102179861873825, + "grad_norm": 0.19465458393096924, + "learning_rate": 1.9803999747640135e-05, + "loss": 1.3183, + "step": 3056 + }, + { + "epoch": 0.9105158323870511, + "grad_norm": 0.21487313508987427, + "learning_rate": 1.980380966559307e-05, + "loss": 1.3205, + "step": 3057 + }, + { + "epoch": 0.9108136785867198, + "grad_norm": 0.19248226284980774, + "learning_rate": 1.9803619492332507e-05, + "loss": 1.317, + "step": 3058 + }, + { + "epoch": 0.9111115247863885, + "grad_norm": 0.20386140048503876, + "learning_rate": 1.9803429227860218e-05, + "loss": 1.3019, + "step": 3059 + }, + { + "epoch": 0.9114093709860571, + "grad_norm": 0.18888108432292938, + "learning_rate": 1.9803238872177972e-05, + "loss": 1.3095, + "step": 3060 + }, + { + "epoch": 0.9117072171857257, + "grad_norm": 0.20344072580337524, + "learning_rate": 1.9803048425287543e-05, + "loss": 1.3146, + "step": 3061 + }, + { + "epoch": 0.9120050633853943, + "grad_norm": 0.20831163227558136, + "learning_rate": 1.98028578871907e-05, + "loss": 1.337, + "step": 3062 + }, + { + "epoch": 0.912302909585063, + "grad_norm": 0.19399350881576538, + "learning_rate": 1.980266725788922e-05, + "loss": 1.3102, + "step": 3063 + }, + { + "epoch": 0.9126007557847317, + "grad_norm": 0.2027069479227066, + "learning_rate": 1.9802476537384868e-05, + "loss": 1.3089, + "step": 3064 + }, + { + "epoch": 0.9128986019844003, + "grad_norm": 0.20166629552841187, + "learning_rate": 1.980228572567943e-05, + "loss": 1.2962, + "step": 3065 + }, + { + "epoch": 0.913196448184069, + "grad_norm": 0.2001592516899109, + "learning_rate": 1.980209482277467e-05, + "loss": 1.3127, + "step": 3066 + }, + { + "epoch": 0.9134942943837376, + "grad_norm": 0.2128322273492813, + "learning_rate": 1.9801903828672372e-05, + "loss": 1.3051, + "step": 3067 + }, + { + "epoch": 0.9137921405834063, + "grad_norm": 0.1976434886455536, + "learning_rate": 1.9801712743374312e-05, + "loss": 1.3123, + "step": 3068 + }, + { + "epoch": 0.9140899867830748, + "grad_norm": 0.20201466977596283, + "learning_rate": 1.980152156688226e-05, + "loss": 1.2979, + "step": 3069 + }, + { + "epoch": 0.9143878329827435, + "grad_norm": 0.20096318423748016, + "learning_rate": 1.9801330299198005e-05, + "loss": 1.2987, + "step": 3070 + }, + { + "epoch": 0.9146856791824122, + "grad_norm": 0.19993111491203308, + "learning_rate": 1.980113894032332e-05, + "loss": 1.299, + "step": 3071 + }, + { + "epoch": 0.9149835253820808, + "grad_norm": 0.20647186040878296, + "learning_rate": 1.980094749025999e-05, + "loss": 1.2875, + "step": 3072 + }, + { + "epoch": 0.9152813715817495, + "grad_norm": 0.224049374461174, + "learning_rate": 1.9800755949009794e-05, + "loss": 1.3375, + "step": 3073 + }, + { + "epoch": 0.9155792177814182, + "grad_norm": 0.2097865641117096, + "learning_rate": 1.980056431657451e-05, + "loss": 1.2911, + "step": 3074 + }, + { + "epoch": 0.9158770639810868, + "grad_norm": 0.20605894923210144, + "learning_rate": 1.980037259295593e-05, + "loss": 1.3106, + "step": 3075 + }, + { + "epoch": 0.9161749101807554, + "grad_norm": 0.19733017683029175, + "learning_rate": 1.9800180778155832e-05, + "loss": 1.3071, + "step": 3076 + }, + { + "epoch": 0.916472756380424, + "grad_norm": 0.2055257260799408, + "learning_rate": 1.9799988872175996e-05, + "loss": 1.2896, + "step": 3077 + }, + { + "epoch": 0.9167706025800927, + "grad_norm": 0.20163756608963013, + "learning_rate": 1.979979687501822e-05, + "loss": 1.32, + "step": 3078 + }, + { + "epoch": 0.9170684487797613, + "grad_norm": 0.20674440264701843, + "learning_rate": 1.979960478668428e-05, + "loss": 1.302, + "step": 3079 + }, + { + "epoch": 0.91736629497943, + "grad_norm": 0.19469445943832397, + "learning_rate": 1.9799412607175963e-05, + "loss": 1.3096, + "step": 3080 + }, + { + "epoch": 0.9176641411790987, + "grad_norm": 0.1986764520406723, + "learning_rate": 1.9799220336495063e-05, + "loss": 1.2967, + "step": 3081 + }, + { + "epoch": 0.9179619873787673, + "grad_norm": 0.20165130496025085, + "learning_rate": 1.9799027974643365e-05, + "loss": 1.3115, + "step": 3082 + }, + { + "epoch": 0.9182598335784359, + "grad_norm": 0.1868019998073578, + "learning_rate": 1.9798835521622662e-05, + "loss": 1.3056, + "step": 3083 + }, + { + "epoch": 0.9185576797781045, + "grad_norm": 0.19767910242080688, + "learning_rate": 1.9798642977434742e-05, + "loss": 1.313, + "step": 3084 + }, + { + "epoch": 0.9188555259777732, + "grad_norm": 0.20172198116779327, + "learning_rate": 1.9798450342081398e-05, + "loss": 1.305, + "step": 3085 + }, + { + "epoch": 0.9191533721774419, + "grad_norm": 0.2029125988483429, + "learning_rate": 1.9798257615564415e-05, + "loss": 1.3079, + "step": 3086 + }, + { + "epoch": 0.9194512183771105, + "grad_norm": 0.19753821194171906, + "learning_rate": 1.9798064797885596e-05, + "loss": 1.3058, + "step": 3087 + }, + { + "epoch": 0.9197490645767792, + "grad_norm": 0.1987670212984085, + "learning_rate": 1.9797871889046733e-05, + "loss": 1.3208, + "step": 3088 + }, + { + "epoch": 0.9200469107764478, + "grad_norm": 0.1995089054107666, + "learning_rate": 1.9797678889049615e-05, + "loss": 1.323, + "step": 3089 + }, + { + "epoch": 0.9203447569761165, + "grad_norm": 0.19714906811714172, + "learning_rate": 1.9797485797896045e-05, + "loss": 1.3018, + "step": 3090 + }, + { + "epoch": 0.9206426031757851, + "grad_norm": 0.19496291875839233, + "learning_rate": 1.9797292615587814e-05, + "loss": 1.3045, + "step": 3091 + }, + { + "epoch": 0.9209404493754537, + "grad_norm": 0.20265202224254608, + "learning_rate": 1.9797099342126726e-05, + "loss": 1.3135, + "step": 3092 + }, + { + "epoch": 0.9212382955751224, + "grad_norm": 0.19452430307865143, + "learning_rate": 1.979690597751457e-05, + "loss": 1.3063, + "step": 3093 + }, + { + "epoch": 0.921536141774791, + "grad_norm": 0.19185176491737366, + "learning_rate": 1.979671252175315e-05, + "loss": 1.3013, + "step": 3094 + }, + { + "epoch": 0.9218339879744597, + "grad_norm": 0.21012099087238312, + "learning_rate": 1.9796518974844265e-05, + "loss": 1.3092, + "step": 3095 + }, + { + "epoch": 0.9221318341741284, + "grad_norm": 0.22558675706386566, + "learning_rate": 1.9796325336789716e-05, + "loss": 1.3408, + "step": 3096 + }, + { + "epoch": 0.922429680373797, + "grad_norm": 0.1874483972787857, + "learning_rate": 1.9796131607591305e-05, + "loss": 1.2966, + "step": 3097 + }, + { + "epoch": 0.9227275265734656, + "grad_norm": 0.2448127418756485, + "learning_rate": 1.9795937787250835e-05, + "loss": 1.3082, + "step": 3098 + }, + { + "epoch": 0.9230253727731342, + "grad_norm": 0.18637888133525848, + "learning_rate": 1.979574387577011e-05, + "loss": 1.3185, + "step": 3099 + }, + { + "epoch": 0.9233232189728029, + "grad_norm": 0.2024553120136261, + "learning_rate": 1.979554987315093e-05, + "loss": 1.3131, + "step": 3100 + }, + { + "epoch": 0.9236210651724716, + "grad_norm": 0.18883754312992096, + "learning_rate": 1.9795355779395107e-05, + "loss": 1.3215, + "step": 3101 + }, + { + "epoch": 0.9239189113721402, + "grad_norm": 0.19369982182979584, + "learning_rate": 1.979516159450444e-05, + "loss": 1.3142, + "step": 3102 + }, + { + "epoch": 0.9242167575718089, + "grad_norm": 0.20784643292427063, + "learning_rate": 1.9794967318480735e-05, + "loss": 1.2958, + "step": 3103 + }, + { + "epoch": 0.9245146037714775, + "grad_norm": 0.1899401843547821, + "learning_rate": 1.979477295132581e-05, + "loss": 1.3032, + "step": 3104 + }, + { + "epoch": 0.9248124499711462, + "grad_norm": 0.19492416083812714, + "learning_rate": 1.9794578493041458e-05, + "loss": 1.3201, + "step": 3105 + }, + { + "epoch": 0.9251102961708147, + "grad_norm": 0.19163598120212555, + "learning_rate": 1.97943839436295e-05, + "loss": 1.3131, + "step": 3106 + }, + { + "epoch": 0.9254081423704834, + "grad_norm": 0.1966073364019394, + "learning_rate": 1.979418930309174e-05, + "loss": 1.3071, + "step": 3107 + }, + { + "epoch": 0.9257059885701521, + "grad_norm": 0.2052278071641922, + "learning_rate": 1.9793994571429996e-05, + "loss": 1.3031, + "step": 3108 + }, + { + "epoch": 0.9260038347698207, + "grad_norm": 0.1949099451303482, + "learning_rate": 1.9793799748646073e-05, + "loss": 1.3307, + "step": 3109 + }, + { + "epoch": 0.9263016809694894, + "grad_norm": 0.20263433456420898, + "learning_rate": 1.9793604834741785e-05, + "loss": 1.2873, + "step": 3110 + }, + { + "epoch": 0.926599527169158, + "grad_norm": 0.19447313249111176, + "learning_rate": 1.9793409829718947e-05, + "loss": 1.322, + "step": 3111 + }, + { + "epoch": 0.9268973733688267, + "grad_norm": 0.18885290622711182, + "learning_rate": 1.9793214733579373e-05, + "loss": 1.3018, + "step": 3112 + }, + { + "epoch": 0.9271952195684953, + "grad_norm": 0.19127033650875092, + "learning_rate": 1.9793019546324874e-05, + "loss": 1.2934, + "step": 3113 + }, + { + "epoch": 0.9274930657681639, + "grad_norm": 0.19292551279067993, + "learning_rate": 1.979282426795727e-05, + "loss": 1.302, + "step": 3114 + }, + { + "epoch": 0.9277909119678326, + "grad_norm": 0.2079010307788849, + "learning_rate": 1.9792628898478382e-05, + "loss": 1.3152, + "step": 3115 + }, + { + "epoch": 0.9280887581675012, + "grad_norm": 0.19239436089992523, + "learning_rate": 1.9792433437890017e-05, + "loss": 1.3109, + "step": 3116 + }, + { + "epoch": 0.9283866043671699, + "grad_norm": 0.19920912384986877, + "learning_rate": 1.9792237886194002e-05, + "loss": 1.2922, + "step": 3117 + }, + { + "epoch": 0.9286844505668386, + "grad_norm": 0.19173072278499603, + "learning_rate": 1.9792042243392157e-05, + "loss": 1.3057, + "step": 3118 + }, + { + "epoch": 0.9289822967665072, + "grad_norm": 0.18758858740329742, + "learning_rate": 1.9791846509486294e-05, + "loss": 1.3072, + "step": 3119 + }, + { + "epoch": 0.9292801429661759, + "grad_norm": 0.1958310753107071, + "learning_rate": 1.9791650684478247e-05, + "loss": 1.3054, + "step": 3120 + }, + { + "epoch": 0.9295779891658444, + "grad_norm": 0.20528602600097656, + "learning_rate": 1.9791454768369823e-05, + "loss": 1.318, + "step": 3121 + }, + { + "epoch": 0.9298758353655131, + "grad_norm": 0.1926400512456894, + "learning_rate": 1.979125876116285e-05, + "loss": 1.3155, + "step": 3122 + }, + { + "epoch": 0.9301736815651818, + "grad_norm": 0.20159479975700378, + "learning_rate": 1.9791062662859162e-05, + "loss": 1.2954, + "step": 3123 + }, + { + "epoch": 0.9304715277648504, + "grad_norm": 0.20442180335521698, + "learning_rate": 1.979086647346057e-05, + "loss": 1.3036, + "step": 3124 + }, + { + "epoch": 0.9307693739645191, + "grad_norm": 0.21335050463676453, + "learning_rate": 1.9790670192968906e-05, + "loss": 1.3026, + "step": 3125 + }, + { + "epoch": 0.9310672201641877, + "grad_norm": 0.19624581933021545, + "learning_rate": 1.9790473821385995e-05, + "loss": 1.3245, + "step": 3126 + }, + { + "epoch": 0.9313650663638564, + "grad_norm": 0.20077237486839294, + "learning_rate": 1.9790277358713662e-05, + "loss": 1.3139, + "step": 3127 + }, + { + "epoch": 0.931662912563525, + "grad_norm": 0.2491120547056198, + "learning_rate": 1.979008080495374e-05, + "loss": 1.3303, + "step": 3128 + }, + { + "epoch": 0.9319607587631936, + "grad_norm": 0.2023068517446518, + "learning_rate": 1.978988416010805e-05, + "loss": 1.3117, + "step": 3129 + }, + { + "epoch": 0.9322586049628623, + "grad_norm": 0.19905631244182587, + "learning_rate": 1.978968742417843e-05, + "loss": 1.3005, + "step": 3130 + }, + { + "epoch": 0.9325564511625309, + "grad_norm": 0.20414213836193085, + "learning_rate": 1.97894905971667e-05, + "loss": 1.3076, + "step": 3131 + }, + { + "epoch": 0.9328542973621996, + "grad_norm": 0.2147301733493805, + "learning_rate": 1.9789293679074704e-05, + "loss": 1.3346, + "step": 3132 + }, + { + "epoch": 0.9331521435618683, + "grad_norm": 0.19705048203468323, + "learning_rate": 1.9789096669904266e-05, + "loss": 1.3014, + "step": 3133 + }, + { + "epoch": 0.9334499897615369, + "grad_norm": 0.1927923560142517, + "learning_rate": 1.9788899569657216e-05, + "loss": 1.334, + "step": 3134 + }, + { + "epoch": 0.9337478359612056, + "grad_norm": 0.20143796503543854, + "learning_rate": 1.97887023783354e-05, + "loss": 1.3158, + "step": 3135 + }, + { + "epoch": 0.9340456821608741, + "grad_norm": 0.20049843192100525, + "learning_rate": 1.9788505095940636e-05, + "loss": 1.3028, + "step": 3136 + }, + { + "epoch": 0.9343435283605428, + "grad_norm": 0.1882764995098114, + "learning_rate": 1.9788307722474774e-05, + "loss": 1.3072, + "step": 3137 + }, + { + "epoch": 0.9346413745602115, + "grad_norm": 0.19787998497486115, + "learning_rate": 1.9788110257939644e-05, + "loss": 1.3258, + "step": 3138 + }, + { + "epoch": 0.9349392207598801, + "grad_norm": 0.1986810564994812, + "learning_rate": 1.978791270233708e-05, + "loss": 1.3077, + "step": 3139 + }, + { + "epoch": 0.9352370669595488, + "grad_norm": 0.2063145935535431, + "learning_rate": 1.978771505566893e-05, + "loss": 1.3167, + "step": 3140 + }, + { + "epoch": 0.9355349131592174, + "grad_norm": 0.19711002707481384, + "learning_rate": 1.9787517317937025e-05, + "loss": 1.3141, + "step": 3141 + }, + { + "epoch": 0.9358327593588861, + "grad_norm": 0.19143569469451904, + "learning_rate": 1.978731948914321e-05, + "loss": 1.309, + "step": 3142 + }, + { + "epoch": 0.9361306055585547, + "grad_norm": 0.19706933200359344, + "learning_rate": 1.9787121569289316e-05, + "loss": 1.3104, + "step": 3143 + }, + { + "epoch": 0.9364284517582233, + "grad_norm": 0.19449691474437714, + "learning_rate": 1.9786923558377192e-05, + "loss": 1.3102, + "step": 3144 + }, + { + "epoch": 0.936726297957892, + "grad_norm": 0.2025444209575653, + "learning_rate": 1.978672545640868e-05, + "loss": 1.2949, + "step": 3145 + }, + { + "epoch": 0.9370241441575606, + "grad_norm": 0.19922438263893127, + "learning_rate": 1.978652726338562e-05, + "loss": 1.3044, + "step": 3146 + }, + { + "epoch": 0.9373219903572293, + "grad_norm": 0.19880349934101105, + "learning_rate": 1.9786328979309865e-05, + "loss": 1.3072, + "step": 3147 + }, + { + "epoch": 0.937619836556898, + "grad_norm": 0.19347430765628815, + "learning_rate": 1.9786130604183244e-05, + "loss": 1.304, + "step": 3148 + }, + { + "epoch": 0.9379176827565666, + "grad_norm": 0.1958763152360916, + "learning_rate": 1.9785932138007617e-05, + "loss": 1.3203, + "step": 3149 + }, + { + "epoch": 0.9382155289562353, + "grad_norm": 0.1980467438697815, + "learning_rate": 1.9785733580784823e-05, + "loss": 1.3051, + "step": 3150 + }, + { + "epoch": 0.9385133751559038, + "grad_norm": 0.19710774719715118, + "learning_rate": 1.978553493251671e-05, + "loss": 1.3101, + "step": 3151 + }, + { + "epoch": 0.9388112213555725, + "grad_norm": 0.18674476444721222, + "learning_rate": 1.978533619320513e-05, + "loss": 1.303, + "step": 3152 + }, + { + "epoch": 0.9391090675552412, + "grad_norm": 0.20070084929466248, + "learning_rate": 1.978513736285193e-05, + "loss": 1.3192, + "step": 3153 + }, + { + "epoch": 0.9394069137549098, + "grad_norm": 0.1897594928741455, + "learning_rate": 1.9784938441458957e-05, + "loss": 1.2943, + "step": 3154 + }, + { + "epoch": 0.9397047599545785, + "grad_norm": 0.206159770488739, + "learning_rate": 1.9784739429028063e-05, + "loss": 1.3178, + "step": 3155 + }, + { + "epoch": 0.9400026061542471, + "grad_norm": 0.19705045223236084, + "learning_rate": 1.9784540325561104e-05, + "loss": 1.2959, + "step": 3156 + }, + { + "epoch": 0.9403004523539158, + "grad_norm": 0.1905844360589981, + "learning_rate": 1.978434113105993e-05, + "loss": 1.2994, + "step": 3157 + }, + { + "epoch": 0.9405982985535843, + "grad_norm": 0.19088006019592285, + "learning_rate": 1.978414184552639e-05, + "loss": 1.3053, + "step": 3158 + }, + { + "epoch": 0.940896144753253, + "grad_norm": 0.19885419309139252, + "learning_rate": 1.9783942468962343e-05, + "loss": 1.3257, + "step": 3159 + }, + { + "epoch": 0.9411939909529217, + "grad_norm": 0.20647475123405457, + "learning_rate": 1.978374300136964e-05, + "loss": 1.301, + "step": 3160 + }, + { + "epoch": 0.9414918371525903, + "grad_norm": 0.20827622711658478, + "learning_rate": 1.9783543442750144e-05, + "loss": 1.2988, + "step": 3161 + }, + { + "epoch": 0.941789683352259, + "grad_norm": 0.1971600353717804, + "learning_rate": 1.9783343793105705e-05, + "loss": 1.2978, + "step": 3162 + }, + { + "epoch": 0.9420875295519276, + "grad_norm": 0.20790520310401917, + "learning_rate": 1.9783144052438184e-05, + "loss": 1.3042, + "step": 3163 + }, + { + "epoch": 0.9423853757515963, + "grad_norm": 0.19935233891010284, + "learning_rate": 1.9782944220749438e-05, + "loss": 1.2978, + "step": 3164 + }, + { + "epoch": 0.9426832219512649, + "grad_norm": 0.19943548738956451, + "learning_rate": 1.9782744298041322e-05, + "loss": 1.3118, + "step": 3165 + }, + { + "epoch": 0.9429810681509335, + "grad_norm": 0.19326171278953552, + "learning_rate": 1.9782544284315702e-05, + "loss": 1.3204, + "step": 3166 + }, + { + "epoch": 0.9432789143506022, + "grad_norm": 0.21011726558208466, + "learning_rate": 1.978234417957444e-05, + "loss": 1.3276, + "step": 3167 + }, + { + "epoch": 0.9435767605502708, + "grad_norm": 0.20445969700813293, + "learning_rate": 1.9782143983819392e-05, + "loss": 1.3084, + "step": 3168 + }, + { + "epoch": 0.9438746067499395, + "grad_norm": 0.19919198751449585, + "learning_rate": 1.9781943697052427e-05, + "loss": 1.2952, + "step": 3169 + }, + { + "epoch": 0.9441724529496082, + "grad_norm": 0.2089526355266571, + "learning_rate": 1.97817433192754e-05, + "loss": 1.304, + "step": 3170 + }, + { + "epoch": 0.9444702991492768, + "grad_norm": 0.19269444048404694, + "learning_rate": 1.9781542850490182e-05, + "loss": 1.2865, + "step": 3171 + }, + { + "epoch": 0.9447681453489455, + "grad_norm": 0.20415502786636353, + "learning_rate": 1.978134229069864e-05, + "loss": 1.3236, + "step": 3172 + }, + { + "epoch": 0.945065991548614, + "grad_norm": 0.19797283411026, + "learning_rate": 1.9781141639902636e-05, + "loss": 1.3045, + "step": 3173 + }, + { + "epoch": 0.9453638377482827, + "grad_norm": 0.18118560314178467, + "learning_rate": 1.978094089810403e-05, + "loss": 1.2933, + "step": 3174 + }, + { + "epoch": 0.9456616839479514, + "grad_norm": 0.19790440797805786, + "learning_rate": 1.9780740065304703e-05, + "loss": 1.2991, + "step": 3175 + }, + { + "epoch": 0.94595953014762, + "grad_norm": 0.19850385189056396, + "learning_rate": 1.9780539141506515e-05, + "loss": 1.3124, + "step": 3176 + }, + { + "epoch": 0.9462573763472887, + "grad_norm": 0.19748616218566895, + "learning_rate": 1.978033812671134e-05, + "loss": 1.3223, + "step": 3177 + }, + { + "epoch": 0.9465552225469573, + "grad_norm": 0.19483156502246857, + "learning_rate": 1.9780137020921045e-05, + "loss": 1.3071, + "step": 3178 + }, + { + "epoch": 0.946853068746626, + "grad_norm": 0.194817915558815, + "learning_rate": 1.97799358241375e-05, + "loss": 1.3002, + "step": 3179 + }, + { + "epoch": 0.9471509149462946, + "grad_norm": 0.19546271860599518, + "learning_rate": 1.977973453636258e-05, + "loss": 1.3112, + "step": 3180 + }, + { + "epoch": 0.9474487611459632, + "grad_norm": 0.1972402185201645, + "learning_rate": 1.977953315759816e-05, + "loss": 1.2967, + "step": 3181 + }, + { + "epoch": 0.9477466073456319, + "grad_norm": 0.20926529169082642, + "learning_rate": 1.9779331687846105e-05, + "loss": 1.3099, + "step": 3182 + }, + { + "epoch": 0.9480444535453005, + "grad_norm": 0.20544731616973877, + "learning_rate": 1.97791301271083e-05, + "loss": 1.3136, + "step": 3183 + }, + { + "epoch": 0.9483422997449692, + "grad_norm": 0.20195014774799347, + "learning_rate": 1.977892847538661e-05, + "loss": 1.3066, + "step": 3184 + }, + { + "epoch": 0.9486401459446379, + "grad_norm": 0.20106039941310883, + "learning_rate": 1.977872673268292e-05, + "loss": 1.3008, + "step": 3185 + }, + { + "epoch": 0.9489379921443065, + "grad_norm": 0.20220544934272766, + "learning_rate": 1.9778524898999102e-05, + "loss": 1.3044, + "step": 3186 + }, + { + "epoch": 0.9492358383439752, + "grad_norm": 0.19450978934764862, + "learning_rate": 1.9778322974337036e-05, + "loss": 1.3047, + "step": 3187 + }, + { + "epoch": 0.9495336845436437, + "grad_norm": 0.20149749517440796, + "learning_rate": 1.97781209586986e-05, + "loss": 1.3135, + "step": 3188 + }, + { + "epoch": 0.9498315307433124, + "grad_norm": 0.20093722641468048, + "learning_rate": 1.977791885208567e-05, + "loss": 1.3067, + "step": 3189 + }, + { + "epoch": 0.950129376942981, + "grad_norm": 0.1976437121629715, + "learning_rate": 1.977771665450013e-05, + "loss": 1.3073, + "step": 3190 + }, + { + "epoch": 0.9504272231426497, + "grad_norm": 0.19089743494987488, + "learning_rate": 1.977751436594386e-05, + "loss": 1.3196, + "step": 3191 + }, + { + "epoch": 0.9507250693423184, + "grad_norm": 0.21099534630775452, + "learning_rate": 1.977731198641875e-05, + "loss": 1.3068, + "step": 3192 + }, + { + "epoch": 0.951022915541987, + "grad_norm": 0.1997574418783188, + "learning_rate": 1.977710951592667e-05, + "loss": 1.3014, + "step": 3193 + }, + { + "epoch": 0.9513207617416557, + "grad_norm": 0.20798201858997345, + "learning_rate": 1.977690695446951e-05, + "loss": 1.2956, + "step": 3194 + }, + { + "epoch": 0.9516186079413242, + "grad_norm": 0.2052495777606964, + "learning_rate": 1.9776704302049155e-05, + "loss": 1.3071, + "step": 3195 + }, + { + "epoch": 0.9519164541409929, + "grad_norm": 0.21590670943260193, + "learning_rate": 1.9776501558667488e-05, + "loss": 1.299, + "step": 3196 + }, + { + "epoch": 0.9522143003406616, + "grad_norm": 0.19587722420692444, + "learning_rate": 1.9776298724326398e-05, + "loss": 1.3029, + "step": 3197 + }, + { + "epoch": 0.9525121465403302, + "grad_norm": 0.21008187532424927, + "learning_rate": 1.9776095799027773e-05, + "loss": 1.3249, + "step": 3198 + }, + { + "epoch": 0.9528099927399989, + "grad_norm": 0.19470874965190887, + "learning_rate": 1.9775892782773497e-05, + "loss": 1.2969, + "step": 3199 + }, + { + "epoch": 0.9531078389396676, + "grad_norm": 0.19459150731563568, + "learning_rate": 1.977568967556546e-05, + "loss": 1.3153, + "step": 3200 + }, + { + "epoch": 0.9534056851393362, + "grad_norm": 0.1974397748708725, + "learning_rate": 1.9775486477405556e-05, + "loss": 1.3192, + "step": 3201 + }, + { + "epoch": 0.9537035313390049, + "grad_norm": 0.19983869791030884, + "learning_rate": 1.977528318829567e-05, + "loss": 1.3096, + "step": 3202 + }, + { + "epoch": 0.9540013775386734, + "grad_norm": 0.20790590345859528, + "learning_rate": 1.9775079808237695e-05, + "loss": 1.3079, + "step": 3203 + }, + { + "epoch": 0.9542992237383421, + "grad_norm": 0.20371964573860168, + "learning_rate": 1.9774876337233527e-05, + "loss": 1.3018, + "step": 3204 + }, + { + "epoch": 0.9545970699380107, + "grad_norm": 0.19802193343639374, + "learning_rate": 1.977467277528505e-05, + "loss": 1.3104, + "step": 3205 + }, + { + "epoch": 0.9548949161376794, + "grad_norm": 0.195772185921669, + "learning_rate": 1.9774469122394167e-05, + "loss": 1.3017, + "step": 3206 + }, + { + "epoch": 0.9551927623373481, + "grad_norm": 0.19083097577095032, + "learning_rate": 1.977426537856277e-05, + "loss": 1.3069, + "step": 3207 + }, + { + "epoch": 0.9554906085370167, + "grad_norm": 0.20081329345703125, + "learning_rate": 1.9774061543792754e-05, + "loss": 1.3071, + "step": 3208 + }, + { + "epoch": 0.9557884547366854, + "grad_norm": 0.2067907452583313, + "learning_rate": 1.9773857618086016e-05, + "loss": 1.3019, + "step": 3209 + }, + { + "epoch": 0.9560863009363539, + "grad_norm": 0.21616604924201965, + "learning_rate": 1.9773653601444453e-05, + "loss": 1.3047, + "step": 3210 + }, + { + "epoch": 0.9563841471360226, + "grad_norm": 0.2052096724510193, + "learning_rate": 1.9773449493869963e-05, + "loss": 1.2714, + "step": 3211 + }, + { + "epoch": 0.9566819933356913, + "grad_norm": 0.20484837889671326, + "learning_rate": 1.9773245295364443e-05, + "loss": 1.2986, + "step": 3212 + }, + { + "epoch": 0.9569798395353599, + "grad_norm": 0.19925053417682648, + "learning_rate": 1.9773041005929797e-05, + "loss": 1.303, + "step": 3213 + }, + { + "epoch": 0.9572776857350286, + "grad_norm": 0.19449517130851746, + "learning_rate": 1.9772836625567923e-05, + "loss": 1.3057, + "step": 3214 + }, + { + "epoch": 0.9575755319346972, + "grad_norm": 0.20237061381340027, + "learning_rate": 1.977263215428072e-05, + "loss": 1.3025, + "step": 3215 + }, + { + "epoch": 0.9578733781343659, + "grad_norm": 0.1875840276479721, + "learning_rate": 1.9772427592070095e-05, + "loss": 1.2857, + "step": 3216 + }, + { + "epoch": 0.9581712243340346, + "grad_norm": 0.1839030534029007, + "learning_rate": 1.977222293893795e-05, + "loss": 1.3042, + "step": 3217 + }, + { + "epoch": 0.9584690705337031, + "grad_norm": 0.19607189297676086, + "learning_rate": 1.977201819488619e-05, + "loss": 1.307, + "step": 3218 + }, + { + "epoch": 0.9587669167333718, + "grad_norm": 0.2031727433204651, + "learning_rate": 1.977181335991672e-05, + "loss": 1.3069, + "step": 3219 + }, + { + "epoch": 0.9590647629330404, + "grad_norm": 0.20891068875789642, + "learning_rate": 1.9771608434031443e-05, + "loss": 1.29, + "step": 3220 + }, + { + "epoch": 0.9593626091327091, + "grad_norm": 0.2035285234451294, + "learning_rate": 1.9771403417232265e-05, + "loss": 1.3103, + "step": 3221 + }, + { + "epoch": 0.9596604553323778, + "grad_norm": 0.1997441202402115, + "learning_rate": 1.9771198309521095e-05, + "loss": 1.3185, + "step": 3222 + }, + { + "epoch": 0.9599583015320464, + "grad_norm": 0.21472379565238953, + "learning_rate": 1.9770993110899847e-05, + "loss": 1.305, + "step": 3223 + }, + { + "epoch": 0.9602561477317151, + "grad_norm": 0.20216310024261475, + "learning_rate": 1.9770787821370422e-05, + "loss": 1.3031, + "step": 3224 + }, + { + "epoch": 0.9605539939313836, + "grad_norm": 0.2030552327632904, + "learning_rate": 1.977058244093473e-05, + "loss": 1.3062, + "step": 3225 + }, + { + "epoch": 0.9608518401310523, + "grad_norm": 0.19075995683670044, + "learning_rate": 1.9770376969594685e-05, + "loss": 1.3046, + "step": 3226 + }, + { + "epoch": 0.961149686330721, + "grad_norm": 0.19615478813648224, + "learning_rate": 1.97701714073522e-05, + "loss": 1.3173, + "step": 3227 + }, + { + "epoch": 0.9614475325303896, + "grad_norm": 0.21271677315235138, + "learning_rate": 1.9769965754209188e-05, + "loss": 1.3039, + "step": 3228 + }, + { + "epoch": 0.9617453787300583, + "grad_norm": 0.2042091339826584, + "learning_rate": 1.976976001016756e-05, + "loss": 1.3038, + "step": 3229 + }, + { + "epoch": 0.9620432249297269, + "grad_norm": 0.19975119829177856, + "learning_rate": 1.9769554175229228e-05, + "loss": 1.3012, + "step": 3230 + }, + { + "epoch": 0.9623410711293956, + "grad_norm": 0.20455537736415863, + "learning_rate": 1.976934824939611e-05, + "loss": 1.2987, + "step": 3231 + }, + { + "epoch": 0.9626389173290643, + "grad_norm": 0.19876553118228912, + "learning_rate": 1.9769142232670123e-05, + "loss": 1.3123, + "step": 3232 + }, + { + "epoch": 0.9629367635287328, + "grad_norm": 0.19535748660564423, + "learning_rate": 1.976893612505318e-05, + "loss": 1.3003, + "step": 3233 + }, + { + "epoch": 0.9632346097284015, + "grad_norm": 0.19817012548446655, + "learning_rate": 1.9768729926547205e-05, + "loss": 1.2986, + "step": 3234 + }, + { + "epoch": 0.9635324559280701, + "grad_norm": 0.20017985999584198, + "learning_rate": 1.976852363715411e-05, + "loss": 1.3041, + "step": 3235 + }, + { + "epoch": 0.9638303021277388, + "grad_norm": 0.19800353050231934, + "learning_rate": 1.9768317256875814e-05, + "loss": 1.3256, + "step": 3236 + }, + { + "epoch": 0.9641281483274075, + "grad_norm": 0.208049014210701, + "learning_rate": 1.9768110785714242e-05, + "loss": 1.2828, + "step": 3237 + }, + { + "epoch": 0.9644259945270761, + "grad_norm": 0.21552208065986633, + "learning_rate": 1.9767904223671313e-05, + "loss": 1.3083, + "step": 3238 + }, + { + "epoch": 0.9647238407267448, + "grad_norm": 0.21490371227264404, + "learning_rate": 1.976769757074895e-05, + "loss": 1.3045, + "step": 3239 + }, + { + "epoch": 0.9650216869264133, + "grad_norm": 0.19391708076000214, + "learning_rate": 1.9767490826949074e-05, + "loss": 1.3125, + "step": 3240 + }, + { + "epoch": 0.965319533126082, + "grad_norm": 0.1888916939496994, + "learning_rate": 1.976728399227361e-05, + "loss": 1.2935, + "step": 3241 + }, + { + "epoch": 0.9656173793257506, + "grad_norm": 0.20408473908901215, + "learning_rate": 1.9767077066724475e-05, + "loss": 1.3003, + "step": 3242 + }, + { + "epoch": 0.9659152255254193, + "grad_norm": 0.20652951300144196, + "learning_rate": 1.9766870050303603e-05, + "loss": 1.3059, + "step": 3243 + }, + { + "epoch": 0.966213071725088, + "grad_norm": 0.19031745195388794, + "learning_rate": 1.9766662943012918e-05, + "loss": 1.2848, + "step": 3244 + }, + { + "epoch": 0.9665109179247566, + "grad_norm": 0.19920264184474945, + "learning_rate": 1.9766455744854348e-05, + "loss": 1.3031, + "step": 3245 + }, + { + "epoch": 0.9668087641244253, + "grad_norm": 0.19462813436985016, + "learning_rate": 1.9766248455829817e-05, + "loss": 1.2851, + "step": 3246 + }, + { + "epoch": 0.9671066103240938, + "grad_norm": 0.19669468700885773, + "learning_rate": 1.9766041075941253e-05, + "loss": 1.2877, + "step": 3247 + }, + { + "epoch": 0.9674044565237625, + "grad_norm": 0.19745111465454102, + "learning_rate": 1.9765833605190594e-05, + "loss": 1.314, + "step": 3248 + }, + { + "epoch": 0.9677023027234312, + "grad_norm": 0.19436196982860565, + "learning_rate": 1.976562604357976e-05, + "loss": 1.3028, + "step": 3249 + }, + { + "epoch": 0.9680001489230998, + "grad_norm": 0.19147449731826782, + "learning_rate": 1.976541839111069e-05, + "loss": 1.2855, + "step": 3250 + }, + { + "epoch": 0.9682979951227685, + "grad_norm": 0.20012982189655304, + "learning_rate": 1.9765210647785308e-05, + "loss": 1.3043, + "step": 3251 + }, + { + "epoch": 0.9685958413224371, + "grad_norm": 0.20200534164905548, + "learning_rate": 1.9765002813605554e-05, + "loss": 1.3101, + "step": 3252 + }, + { + "epoch": 0.9688936875221058, + "grad_norm": 0.20013867318630219, + "learning_rate": 1.976479488857336e-05, + "loss": 1.3185, + "step": 3253 + }, + { + "epoch": 0.9691915337217745, + "grad_norm": 0.20642933249473572, + "learning_rate": 1.9764586872690655e-05, + "loss": 1.3037, + "step": 3254 + }, + { + "epoch": 0.969489379921443, + "grad_norm": 0.196793794631958, + "learning_rate": 1.9764378765959385e-05, + "loss": 1.3181, + "step": 3255 + }, + { + "epoch": 0.9697872261211117, + "grad_norm": 0.20247621834278107, + "learning_rate": 1.9764170568381477e-05, + "loss": 1.2888, + "step": 3256 + }, + { + "epoch": 0.9700850723207803, + "grad_norm": 0.1943470686674118, + "learning_rate": 1.9763962279958872e-05, + "loss": 1.3143, + "step": 3257 + }, + { + "epoch": 0.970382918520449, + "grad_norm": 0.20508818328380585, + "learning_rate": 1.9763753900693504e-05, + "loss": 1.3134, + "step": 3258 + }, + { + "epoch": 0.9706807647201177, + "grad_norm": 0.21431271731853485, + "learning_rate": 1.9763545430587313e-05, + "loss": 1.3126, + "step": 3259 + }, + { + "epoch": 0.9709786109197863, + "grad_norm": 0.22474008798599243, + "learning_rate": 1.9763336869642248e-05, + "loss": 1.319, + "step": 3260 + }, + { + "epoch": 0.971276457119455, + "grad_norm": 0.20004312694072723, + "learning_rate": 1.9763128217860236e-05, + "loss": 1.3073, + "step": 3261 + }, + { + "epoch": 0.9715743033191235, + "grad_norm": 0.20104417204856873, + "learning_rate": 1.9762919475243226e-05, + "loss": 1.3062, + "step": 3262 + }, + { + "epoch": 0.9718721495187922, + "grad_norm": 0.20001773536205292, + "learning_rate": 1.9762710641793155e-05, + "loss": 1.2997, + "step": 3263 + }, + { + "epoch": 0.9721699957184609, + "grad_norm": 0.20377035439014435, + "learning_rate": 1.9762501717511972e-05, + "loss": 1.3179, + "step": 3264 + }, + { + "epoch": 0.9724678419181295, + "grad_norm": 0.19937177002429962, + "learning_rate": 1.9762292702401614e-05, + "loss": 1.2991, + "step": 3265 + }, + { + "epoch": 0.9727656881177982, + "grad_norm": 0.19473156332969666, + "learning_rate": 1.9762083596464035e-05, + "loss": 1.2993, + "step": 3266 + }, + { + "epoch": 0.9730635343174668, + "grad_norm": 0.20990370213985443, + "learning_rate": 1.9761874399701173e-05, + "loss": 1.3063, + "step": 3267 + }, + { + "epoch": 0.9733613805171355, + "grad_norm": 0.20723816752433777, + "learning_rate": 1.9761665112114973e-05, + "loss": 1.2908, + "step": 3268 + }, + { + "epoch": 0.9736592267168042, + "grad_norm": 0.20216530561447144, + "learning_rate": 1.976145573370739e-05, + "loss": 1.3003, + "step": 3269 + }, + { + "epoch": 0.9739570729164727, + "grad_norm": 0.22414833307266235, + "learning_rate": 1.976124626448037e-05, + "loss": 1.2981, + "step": 3270 + }, + { + "epoch": 0.9742549191161414, + "grad_norm": 0.20434461534023285, + "learning_rate": 1.9761036704435853e-05, + "loss": 1.2964, + "step": 3271 + }, + { + "epoch": 0.97455276531581, + "grad_norm": 0.20486347377300262, + "learning_rate": 1.9760827053575796e-05, + "loss": 1.3062, + "step": 3272 + }, + { + "epoch": 0.9748506115154787, + "grad_norm": 0.19999034702777863, + "learning_rate": 1.976061731190215e-05, + "loss": 1.3125, + "step": 3273 + }, + { + "epoch": 0.9751484577151474, + "grad_norm": 0.20354914665222168, + "learning_rate": 1.9760407479416864e-05, + "loss": 1.3, + "step": 3274 + }, + { + "epoch": 0.975446303914816, + "grad_norm": 0.1978183090686798, + "learning_rate": 1.9760197556121893e-05, + "loss": 1.2989, + "step": 3275 + }, + { + "epoch": 0.9757441501144847, + "grad_norm": 0.20380771160125732, + "learning_rate": 1.975998754201919e-05, + "loss": 1.2998, + "step": 3276 + }, + { + "epoch": 0.9760419963141532, + "grad_norm": 0.19634027779102325, + "learning_rate": 1.9759777437110706e-05, + "loss": 1.3153, + "step": 3277 + }, + { + "epoch": 0.9763398425138219, + "grad_norm": 0.19065183401107788, + "learning_rate": 1.9759567241398396e-05, + "loss": 1.2888, + "step": 3278 + }, + { + "epoch": 0.9766376887134905, + "grad_norm": 0.21245373785495758, + "learning_rate": 1.9759356954884218e-05, + "loss": 1.3291, + "step": 3279 + }, + { + "epoch": 0.9769355349131592, + "grad_norm": 0.20729570090770721, + "learning_rate": 1.9759146577570124e-05, + "loss": 1.3185, + "step": 3280 + }, + { + "epoch": 0.9772333811128279, + "grad_norm": 0.2136635184288025, + "learning_rate": 1.975893610945808e-05, + "loss": 1.3042, + "step": 3281 + }, + { + "epoch": 0.9775312273124965, + "grad_norm": 0.20022720098495483, + "learning_rate": 1.9758725550550036e-05, + "loss": 1.3015, + "step": 3282 + }, + { + "epoch": 0.9778290735121652, + "grad_norm": 0.20077665150165558, + "learning_rate": 1.9758514900847955e-05, + "loss": 1.3183, + "step": 3283 + }, + { + "epoch": 0.9781269197118339, + "grad_norm": 0.21058335900306702, + "learning_rate": 1.9758304160353794e-05, + "loss": 1.3094, + "step": 3284 + }, + { + "epoch": 0.9784247659115024, + "grad_norm": 0.19979368150234222, + "learning_rate": 1.9758093329069516e-05, + "loss": 1.2985, + "step": 3285 + }, + { + "epoch": 0.9787226121111711, + "grad_norm": 0.20603543519973755, + "learning_rate": 1.9757882406997085e-05, + "loss": 1.2928, + "step": 3286 + }, + { + "epoch": 0.9790204583108397, + "grad_norm": 0.21984848380088806, + "learning_rate": 1.9757671394138457e-05, + "loss": 1.3117, + "step": 3287 + }, + { + "epoch": 0.9793183045105084, + "grad_norm": 0.20999737083911896, + "learning_rate": 1.97574602904956e-05, + "loss": 1.3065, + "step": 3288 + }, + { + "epoch": 0.979616150710177, + "grad_norm": 0.19377809762954712, + "learning_rate": 1.9757249096070475e-05, + "loss": 1.2862, + "step": 3289 + }, + { + "epoch": 0.9799139969098457, + "grad_norm": 0.2010391801595688, + "learning_rate": 1.975703781086505e-05, + "loss": 1.3056, + "step": 3290 + }, + { + "epoch": 0.9802118431095144, + "grad_norm": 0.21323293447494507, + "learning_rate": 1.9756826434881286e-05, + "loss": 1.3036, + "step": 3291 + }, + { + "epoch": 0.9805096893091829, + "grad_norm": 0.21092991530895233, + "learning_rate": 1.9756614968121157e-05, + "loss": 1.3103, + "step": 3292 + }, + { + "epoch": 0.9808075355088516, + "grad_norm": 0.21400882303714752, + "learning_rate": 1.9756403410586625e-05, + "loss": 1.3083, + "step": 3293 + }, + { + "epoch": 0.9811053817085202, + "grad_norm": 0.19310444593429565, + "learning_rate": 1.9756191762279662e-05, + "loss": 1.2848, + "step": 3294 + }, + { + "epoch": 0.9814032279081889, + "grad_norm": 0.22360482811927795, + "learning_rate": 1.9755980023202232e-05, + "loss": 1.3122, + "step": 3295 + }, + { + "epoch": 0.9817010741078576, + "grad_norm": 0.219071626663208, + "learning_rate": 1.9755768193356308e-05, + "loss": 1.3023, + "step": 3296 + }, + { + "epoch": 0.9819989203075262, + "grad_norm": 0.22543108463287354, + "learning_rate": 1.9755556272743864e-05, + "loss": 1.3079, + "step": 3297 + }, + { + "epoch": 0.9822967665071949, + "grad_norm": 0.20201154053211212, + "learning_rate": 1.9755344261366863e-05, + "loss": 1.3031, + "step": 3298 + }, + { + "epoch": 0.9825946127068635, + "grad_norm": 0.21543379127979279, + "learning_rate": 1.9755132159227287e-05, + "loss": 1.3065, + "step": 3299 + }, + { + "epoch": 0.9828924589065321, + "grad_norm": 0.2380930483341217, + "learning_rate": 1.9754919966327107e-05, + "loss": 1.2956, + "step": 3300 + }, + { + "epoch": 0.9831903051062008, + "grad_norm": 0.2125539481639862, + "learning_rate": 1.975470768266829e-05, + "loss": 1.318, + "step": 3301 + }, + { + "epoch": 0.9834881513058694, + "grad_norm": 0.20666386187076569, + "learning_rate": 1.975449530825282e-05, + "loss": 1.3041, + "step": 3302 + }, + { + "epoch": 0.9837859975055381, + "grad_norm": 0.19961614906787872, + "learning_rate": 1.975428284308267e-05, + "loss": 1.2951, + "step": 3303 + }, + { + "epoch": 0.9840838437052067, + "grad_norm": 0.2076793760061264, + "learning_rate": 1.9754070287159815e-05, + "loss": 1.2906, + "step": 3304 + }, + { + "epoch": 0.9843816899048754, + "grad_norm": 0.21194185316562653, + "learning_rate": 1.9753857640486233e-05, + "loss": 1.3005, + "step": 3305 + }, + { + "epoch": 0.9846795361045441, + "grad_norm": 0.22201673686504364, + "learning_rate": 1.9753644903063906e-05, + "loss": 1.3032, + "step": 3306 + }, + { + "epoch": 0.9849773823042126, + "grad_norm": 0.20467990636825562, + "learning_rate": 1.975343207489481e-05, + "loss": 1.3336, + "step": 3307 + }, + { + "epoch": 0.9852752285038813, + "grad_norm": 0.20192810893058777, + "learning_rate": 1.9753219155980922e-05, + "loss": 1.2886, + "step": 3308 + }, + { + "epoch": 0.9855730747035499, + "grad_norm": 0.2171163409948349, + "learning_rate": 1.9753006146324232e-05, + "loss": 1.3066, + "step": 3309 + }, + { + "epoch": 0.9858709209032186, + "grad_norm": 0.20057716965675354, + "learning_rate": 1.9752793045926712e-05, + "loss": 1.2986, + "step": 3310 + }, + { + "epoch": 0.9861687671028873, + "grad_norm": 0.19825346767902374, + "learning_rate": 1.9752579854790353e-05, + "loss": 1.3079, + "step": 3311 + }, + { + "epoch": 0.9864666133025559, + "grad_norm": 0.1995212435722351, + "learning_rate": 1.9752366572917135e-05, + "loss": 1.3016, + "step": 3312 + }, + { + "epoch": 0.9867644595022246, + "grad_norm": 0.21994702517986298, + "learning_rate": 1.9752153200309037e-05, + "loss": 1.3127, + "step": 3313 + }, + { + "epoch": 0.9870623057018932, + "grad_norm": 0.20895038545131683, + "learning_rate": 1.9751939736968053e-05, + "loss": 1.3024, + "step": 3314 + }, + { + "epoch": 0.9873601519015618, + "grad_norm": 0.20712026953697205, + "learning_rate": 1.9751726182896166e-05, + "loss": 1.294, + "step": 3315 + }, + { + "epoch": 0.9876579981012304, + "grad_norm": 0.21284706890583038, + "learning_rate": 1.9751512538095357e-05, + "loss": 1.2929, + "step": 3316 + }, + { + "epoch": 0.9879558443008991, + "grad_norm": 0.1990649700164795, + "learning_rate": 1.9751298802567624e-05, + "loss": 1.313, + "step": 3317 + }, + { + "epoch": 0.9882536905005678, + "grad_norm": 0.19659705460071564, + "learning_rate": 1.975108497631495e-05, + "loss": 1.3124, + "step": 3318 + }, + { + "epoch": 0.9885515367002364, + "grad_norm": 0.19998084008693695, + "learning_rate": 1.975087105933932e-05, + "loss": 1.3047, + "step": 3319 + }, + { + "epoch": 0.9888493828999051, + "grad_norm": 0.2667716145515442, + "learning_rate": 1.9750657051642738e-05, + "loss": 1.2841, + "step": 3320 + }, + { + "epoch": 0.9891472290995738, + "grad_norm": 0.20697744190692902, + "learning_rate": 1.975044295322718e-05, + "loss": 1.2899, + "step": 3321 + }, + { + "epoch": 0.9894450752992423, + "grad_norm": 0.20054593682289124, + "learning_rate": 1.9750228764094645e-05, + "loss": 1.2943, + "step": 3322 + }, + { + "epoch": 0.989742921498911, + "grad_norm": 0.2037118375301361, + "learning_rate": 1.9750014484247124e-05, + "loss": 1.2944, + "step": 3323 + }, + { + "epoch": 0.9900407676985796, + "grad_norm": 0.20013418793678284, + "learning_rate": 1.9749800113686613e-05, + "loss": 1.2788, + "step": 3324 + }, + { + "epoch": 0.9903386138982483, + "grad_norm": 0.19793759286403656, + "learning_rate": 1.9749585652415104e-05, + "loss": 1.3095, + "step": 3325 + }, + { + "epoch": 0.990636460097917, + "grad_norm": 0.20117361843585968, + "learning_rate": 1.9749371100434596e-05, + "loss": 1.2982, + "step": 3326 + }, + { + "epoch": 0.9909343062975856, + "grad_norm": 0.2205909639596939, + "learning_rate": 1.9749156457747078e-05, + "loss": 1.2955, + "step": 3327 + }, + { + "epoch": 0.9912321524972543, + "grad_norm": 0.21903647482395172, + "learning_rate": 1.9748941724354555e-05, + "loss": 1.283, + "step": 3328 + }, + { + "epoch": 0.9915299986969229, + "grad_norm": 0.20330850780010223, + "learning_rate": 1.974872690025902e-05, + "loss": 1.3012, + "step": 3329 + }, + { + "epoch": 0.9918278448965915, + "grad_norm": 0.297417014837265, + "learning_rate": 1.9748511985462474e-05, + "loss": 1.3106, + "step": 3330 + }, + { + "epoch": 0.9921256910962601, + "grad_norm": 0.1906329095363617, + "learning_rate": 1.9748296979966916e-05, + "loss": 1.3036, + "step": 3331 + }, + { + "epoch": 0.9924235372959288, + "grad_norm": 0.21068215370178223, + "learning_rate": 1.9748081883774346e-05, + "loss": 1.3204, + "step": 3332 + }, + { + "epoch": 0.9927213834955975, + "grad_norm": 0.20962785184383392, + "learning_rate": 1.9747866696886767e-05, + "loss": 1.3056, + "step": 3333 + }, + { + "epoch": 0.9930192296952661, + "grad_norm": 0.20342715084552765, + "learning_rate": 1.9747651419306174e-05, + "loss": 1.3013, + "step": 3334 + }, + { + "epoch": 0.9933170758949348, + "grad_norm": 0.199946790933609, + "learning_rate": 1.9747436051034577e-05, + "loss": 1.3148, + "step": 3335 + }, + { + "epoch": 0.9936149220946034, + "grad_norm": 0.2034381926059723, + "learning_rate": 1.974722059207398e-05, + "loss": 1.3077, + "step": 3336 + }, + { + "epoch": 0.993912768294272, + "grad_norm": 0.20843319594860077, + "learning_rate": 1.974700504242639e-05, + "loss": 1.3088, + "step": 3337 + }, + { + "epoch": 0.9942106144939407, + "grad_norm": 0.1890590339899063, + "learning_rate": 1.97467894020938e-05, + "loss": 1.2777, + "step": 3338 + }, + { + "epoch": 0.9945084606936093, + "grad_norm": 0.20892484486103058, + "learning_rate": 1.9746573671078225e-05, + "loss": 1.3049, + "step": 3339 + }, + { + "epoch": 0.994806306893278, + "grad_norm": 0.19718541204929352, + "learning_rate": 1.9746357849381675e-05, + "loss": 1.2859, + "step": 3340 + }, + { + "epoch": 0.9951041530929466, + "grad_norm": 0.21576572954654694, + "learning_rate": 1.9746141937006155e-05, + "loss": 1.3173, + "step": 3341 + }, + { + "epoch": 0.9954019992926153, + "grad_norm": 0.20363959670066833, + "learning_rate": 1.974592593395367e-05, + "loss": 1.2754, + "step": 3342 + }, + { + "epoch": 0.995699845492284, + "grad_norm": 0.20009107887744904, + "learning_rate": 1.9745709840226236e-05, + "loss": 1.2979, + "step": 3343 + }, + { + "epoch": 0.9959976916919525, + "grad_norm": 0.19112053513526917, + "learning_rate": 1.9745493655825858e-05, + "loss": 1.305, + "step": 3344 + }, + { + "epoch": 0.9962955378916212, + "grad_norm": 0.20680101215839386, + "learning_rate": 1.9745277380754553e-05, + "loss": 1.3173, + "step": 3345 + }, + { + "epoch": 0.9965933840912898, + "grad_norm": 0.19246166944503784, + "learning_rate": 1.9745061015014325e-05, + "loss": 1.3163, + "step": 3346 + }, + { + "epoch": 0.9968912302909585, + "grad_norm": 0.1975969821214676, + "learning_rate": 1.97448445586072e-05, + "loss": 1.2984, + "step": 3347 + }, + { + "epoch": 0.9971890764906272, + "grad_norm": 0.1960238516330719, + "learning_rate": 1.9744628011535176e-05, + "loss": 1.3003, + "step": 3348 + }, + { + "epoch": 0.9974869226902958, + "grad_norm": 0.20400959253311157, + "learning_rate": 1.974441137380028e-05, + "loss": 1.2883, + "step": 3349 + }, + { + "epoch": 0.9977847688899645, + "grad_norm": 0.1912810355424881, + "learning_rate": 1.9744194645404523e-05, + "loss": 1.2951, + "step": 3350 + }, + { + "epoch": 0.9980826150896331, + "grad_norm": 0.18957996368408203, + "learning_rate": 1.9743977826349917e-05, + "loss": 1.2963, + "step": 3351 + }, + { + "epoch": 0.9983804612893017, + "grad_norm": 0.2020920068025589, + "learning_rate": 1.974376091663849e-05, + "loss": 1.3111, + "step": 3352 + }, + { + "epoch": 0.9986783074889704, + "grad_norm": 0.19816243648529053, + "learning_rate": 1.9743543916272252e-05, + "loss": 1.2878, + "step": 3353 + }, + { + "epoch": 0.998976153688639, + "grad_norm": 0.19619014859199524, + "learning_rate": 1.9743326825253225e-05, + "loss": 1.3091, + "step": 3354 + }, + { + "epoch": 0.9992739998883077, + "grad_norm": 0.202503502368927, + "learning_rate": 1.9743109643583425e-05, + "loss": 1.2974, + "step": 3355 + }, + { + "epoch": 0.9995718460879763, + "grad_norm": 0.19494196772575378, + "learning_rate": 1.9742892371264876e-05, + "loss": 1.3062, + "step": 3356 + }, + { + "epoch": 0.999869692287645, + "grad_norm": 0.2075500786304474, + "learning_rate": 1.97426750082996e-05, + "loss": 1.311, + "step": 3357 + }, + { + "epoch": 1.0001675384873137, + "grad_norm": 0.20512253046035767, + "learning_rate": 1.974245755468962e-05, + "loss": 1.3058, + "step": 3358 + }, + { + "epoch": 1.0004653846869822, + "grad_norm": 0.2019047886133194, + "learning_rate": 1.9742240010436955e-05, + "loss": 1.2981, + "step": 3359 + }, + { + "epoch": 1.000763230886651, + "grad_norm": 0.20687521994113922, + "learning_rate": 1.974202237554363e-05, + "loss": 1.3051, + "step": 3360 + }, + { + "epoch": 1.0010610770863195, + "grad_norm": 0.2059999257326126, + "learning_rate": 1.9741804650011673e-05, + "loss": 1.3112, + "step": 3361 + }, + { + "epoch": 1.0013589232859883, + "grad_norm": 0.2035730481147766, + "learning_rate": 1.974158683384311e-05, + "loss": 1.3189, + "step": 3362 + }, + { + "epoch": 1.0016567694856569, + "grad_norm": 0.21043939888477325, + "learning_rate": 1.9741368927039962e-05, + "loss": 1.296, + "step": 3363 + }, + { + "epoch": 1.0019546156853254, + "grad_norm": 0.20265917479991913, + "learning_rate": 1.974115092960426e-05, + "loss": 1.3064, + "step": 3364 + }, + { + "epoch": 1.0022524618849942, + "grad_norm": 0.1880495548248291, + "learning_rate": 1.9740932841538035e-05, + "loss": 1.3062, + "step": 3365 + }, + { + "epoch": 1.0025503080846627, + "grad_norm": 0.19218531250953674, + "learning_rate": 1.9740714662843313e-05, + "loss": 1.3058, + "step": 3366 + }, + { + "epoch": 1.0028481542843315, + "grad_norm": 0.19349585473537445, + "learning_rate": 1.9740496393522123e-05, + "loss": 1.3055, + "step": 3367 + }, + { + "epoch": 1.003146000484, + "grad_norm": 0.23449759185314178, + "learning_rate": 1.9740278033576496e-05, + "loss": 1.3067, + "step": 3368 + }, + { + "epoch": 1.0034438466836688, + "grad_norm": 0.1942831575870514, + "learning_rate": 1.9740059583008463e-05, + "loss": 1.2853, + "step": 3369 + }, + { + "epoch": 1.0037416928833374, + "grad_norm": 0.19284681975841522, + "learning_rate": 1.973984104182006e-05, + "loss": 1.2824, + "step": 3370 + }, + { + "epoch": 1.004039539083006, + "grad_norm": 0.2020491063594818, + "learning_rate": 1.973962241001332e-05, + "loss": 1.2929, + "step": 3371 + }, + { + "epoch": 1.0043373852826747, + "grad_norm": 0.19560948014259338, + "learning_rate": 1.9739403687590268e-05, + "loss": 1.2821, + "step": 3372 + }, + { + "epoch": 1.0046352314823432, + "grad_norm": 0.20239092409610748, + "learning_rate": 1.973918487455295e-05, + "loss": 1.297, + "step": 3373 + }, + { + "epoch": 1.004933077682012, + "grad_norm": 0.19475553929805756, + "learning_rate": 1.9738965970903397e-05, + "loss": 1.3142, + "step": 3374 + }, + { + "epoch": 1.0052309238816806, + "grad_norm": 0.2040357142686844, + "learning_rate": 1.973874697664365e-05, + "loss": 1.3028, + "step": 3375 + }, + { + "epoch": 1.0055287700813493, + "grad_norm": 0.1978982836008072, + "learning_rate": 1.9738527891775746e-05, + "loss": 1.303, + "step": 3376 + }, + { + "epoch": 1.0058266162810179, + "grad_norm": 0.20032471418380737, + "learning_rate": 1.973830871630171e-05, + "loss": 1.3013, + "step": 3377 + }, + { + "epoch": 1.0061244624806864, + "grad_norm": 0.2011338174343109, + "learning_rate": 1.9738089450223602e-05, + "loss": 1.29, + "step": 3378 + }, + { + "epoch": 1.0064223086803552, + "grad_norm": 0.19828371703624725, + "learning_rate": 1.9737870093543446e-05, + "loss": 1.3042, + "step": 3379 + }, + { + "epoch": 1.0067201548800238, + "grad_norm": 0.20131751894950867, + "learning_rate": 1.973765064626329e-05, + "loss": 1.2847, + "step": 3380 + }, + { + "epoch": 1.0070180010796925, + "grad_norm": 0.19987799227237701, + "learning_rate": 1.9737431108385174e-05, + "loss": 1.3098, + "step": 3381 + }, + { + "epoch": 1.007315847279361, + "grad_norm": 0.20842686295509338, + "learning_rate": 1.9737211479911143e-05, + "loss": 1.3177, + "step": 3382 + }, + { + "epoch": 1.0076136934790298, + "grad_norm": 0.19841377437114716, + "learning_rate": 1.9736991760843235e-05, + "loss": 1.2875, + "step": 3383 + }, + { + "epoch": 1.0079115396786984, + "grad_norm": 0.1997886598110199, + "learning_rate": 1.97367719511835e-05, + "loss": 1.2997, + "step": 3384 + }, + { + "epoch": 1.008209385878367, + "grad_norm": 0.20692330598831177, + "learning_rate": 1.973655205093398e-05, + "loss": 1.3109, + "step": 3385 + }, + { + "epoch": 1.0085072320780357, + "grad_norm": 0.19390694797039032, + "learning_rate": 1.973633206009672e-05, + "loss": 1.327, + "step": 3386 + }, + { + "epoch": 1.0088050782777043, + "grad_norm": 0.207501620054245, + "learning_rate": 1.973611197867377e-05, + "loss": 1.2863, + "step": 3387 + }, + { + "epoch": 1.009102924477373, + "grad_norm": 0.2012711763381958, + "learning_rate": 1.9735891806667173e-05, + "loss": 1.2983, + "step": 3388 + }, + { + "epoch": 1.0094007706770416, + "grad_norm": 0.20405852794647217, + "learning_rate": 1.9735671544078985e-05, + "loss": 1.3109, + "step": 3389 + }, + { + "epoch": 1.0096986168767104, + "grad_norm": 0.20827427506446838, + "learning_rate": 1.973545119091125e-05, + "loss": 1.3079, + "step": 3390 + }, + { + "epoch": 1.009996463076379, + "grad_norm": 0.21407508850097656, + "learning_rate": 1.9735230747166014e-05, + "loss": 1.2979, + "step": 3391 + }, + { + "epoch": 1.0102943092760477, + "grad_norm": 0.20271380245685577, + "learning_rate": 1.9735010212845336e-05, + "loss": 1.3111, + "step": 3392 + }, + { + "epoch": 1.0105921554757162, + "grad_norm": 0.19732552766799927, + "learning_rate": 1.9734789587951264e-05, + "loss": 1.3056, + "step": 3393 + }, + { + "epoch": 1.0108900016753848, + "grad_norm": 0.20793870091438293, + "learning_rate": 1.9734568872485852e-05, + "loss": 1.3091, + "step": 3394 + }, + { + "epoch": 1.0111878478750536, + "grad_norm": 0.21210235357284546, + "learning_rate": 1.973434806645115e-05, + "loss": 1.2873, + "step": 3395 + }, + { + "epoch": 1.011485694074722, + "grad_norm": 0.19632075726985931, + "learning_rate": 1.973412716984922e-05, + "loss": 1.2919, + "step": 3396 + }, + { + "epoch": 1.0117835402743909, + "grad_norm": 0.20312587916851044, + "learning_rate": 1.9733906182682107e-05, + "loss": 1.2898, + "step": 3397 + }, + { + "epoch": 1.0120813864740594, + "grad_norm": 0.19858725368976593, + "learning_rate": 1.9733685104951874e-05, + "loss": 1.3011, + "step": 3398 + }, + { + "epoch": 1.0123792326737282, + "grad_norm": 0.20250661671161652, + "learning_rate": 1.973346393666058e-05, + "loss": 1.3047, + "step": 3399 + }, + { + "epoch": 1.0126770788733968, + "grad_norm": 0.2005874067544937, + "learning_rate": 1.9733242677810277e-05, + "loss": 1.3041, + "step": 3400 + }, + { + "epoch": 1.0129749250730653, + "grad_norm": 0.2064727246761322, + "learning_rate": 1.9733021328403022e-05, + "loss": 1.2991, + "step": 3401 + }, + { + "epoch": 1.013272771272734, + "grad_norm": 0.21274413168430328, + "learning_rate": 1.973279988844088e-05, + "loss": 1.2957, + "step": 3402 + }, + { + "epoch": 1.0135706174724026, + "grad_norm": 0.2098504602909088, + "learning_rate": 1.9732578357925913e-05, + "loss": 1.3196, + "step": 3403 + }, + { + "epoch": 1.0138684636720714, + "grad_norm": 0.20712429285049438, + "learning_rate": 1.9732356736860172e-05, + "loss": 1.3001, + "step": 3404 + }, + { + "epoch": 1.01416630987174, + "grad_norm": 0.2014516294002533, + "learning_rate": 1.973213502524573e-05, + "loss": 1.2929, + "step": 3405 + }, + { + "epoch": 1.0144641560714087, + "grad_norm": 0.20681215822696686, + "learning_rate": 1.973191322308464e-05, + "loss": 1.307, + "step": 3406 + }, + { + "epoch": 1.0147620022710773, + "grad_norm": 0.21458099782466888, + "learning_rate": 1.9731691330378972e-05, + "loss": 1.3124, + "step": 3407 + }, + { + "epoch": 1.0150598484707458, + "grad_norm": 0.20352648198604584, + "learning_rate": 1.9731469347130793e-05, + "loss": 1.29, + "step": 3408 + }, + { + "epoch": 1.0153576946704146, + "grad_norm": 0.20370642840862274, + "learning_rate": 1.973124727334216e-05, + "loss": 1.2969, + "step": 3409 + }, + { + "epoch": 1.0156555408700831, + "grad_norm": 0.21255552768707275, + "learning_rate": 1.9731025109015146e-05, + "loss": 1.3053, + "step": 3410 + }, + { + "epoch": 1.015953387069752, + "grad_norm": 0.21125616133213043, + "learning_rate": 1.9730802854151815e-05, + "loss": 1.3053, + "step": 3411 + }, + { + "epoch": 1.0162512332694205, + "grad_norm": 0.1962079554796219, + "learning_rate": 1.9730580508754235e-05, + "loss": 1.312, + "step": 3412 + }, + { + "epoch": 1.0165490794690892, + "grad_norm": 0.19924552738666534, + "learning_rate": 1.9730358072824476e-05, + "loss": 1.2835, + "step": 3413 + }, + { + "epoch": 1.0168469256687578, + "grad_norm": 0.2046997994184494, + "learning_rate": 1.9730135546364605e-05, + "loss": 1.2881, + "step": 3414 + }, + { + "epoch": 1.0171447718684263, + "grad_norm": 0.21070091426372528, + "learning_rate": 1.9729912929376695e-05, + "loss": 1.3205, + "step": 3415 + }, + { + "epoch": 1.017442618068095, + "grad_norm": 0.19929635524749756, + "learning_rate": 1.9729690221862816e-05, + "loss": 1.3032, + "step": 3416 + }, + { + "epoch": 1.0177404642677637, + "grad_norm": 0.19725117087364197, + "learning_rate": 1.972946742382504e-05, + "loss": 1.312, + "step": 3417 + }, + { + "epoch": 1.0180383104674324, + "grad_norm": 0.21880078315734863, + "learning_rate": 1.972924453526544e-05, + "loss": 1.2986, + "step": 3418 + }, + { + "epoch": 1.018336156667101, + "grad_norm": 0.21586327254772186, + "learning_rate": 1.972902155618609e-05, + "loss": 1.3067, + "step": 3419 + }, + { + "epoch": 1.0186340028667697, + "grad_norm": 0.21872232854366302, + "learning_rate": 1.9728798486589065e-05, + "loss": 1.3118, + "step": 3420 + }, + { + "epoch": 1.0189318490664383, + "grad_norm": 0.2075866013765335, + "learning_rate": 1.9728575326476437e-05, + "loss": 1.3075, + "step": 3421 + }, + { + "epoch": 1.019229695266107, + "grad_norm": 0.21058566868305206, + "learning_rate": 1.9728352075850287e-05, + "loss": 1.3022, + "step": 3422 + }, + { + "epoch": 1.0195275414657756, + "grad_norm": 0.20899464190006256, + "learning_rate": 1.972812873471269e-05, + "loss": 1.2853, + "step": 3423 + }, + { + "epoch": 1.0198253876654442, + "grad_norm": 0.22943811118602753, + "learning_rate": 1.9727905303065724e-05, + "loss": 1.3279, + "step": 3424 + }, + { + "epoch": 1.020123233865113, + "grad_norm": 0.20464380085468292, + "learning_rate": 1.9727681780911467e-05, + "loss": 1.3055, + "step": 3425 + }, + { + "epoch": 1.0204210800647815, + "grad_norm": 0.20870985090732574, + "learning_rate": 1.9727458168252e-05, + "loss": 1.3253, + "step": 3426 + }, + { + "epoch": 1.0207189262644503, + "grad_norm": 0.2142518013715744, + "learning_rate": 1.9727234465089406e-05, + "loss": 1.3113, + "step": 3427 + }, + { + "epoch": 1.0210167724641188, + "grad_norm": 0.20791509747505188, + "learning_rate": 1.9727010671425758e-05, + "loss": 1.2828, + "step": 3428 + }, + { + "epoch": 1.0213146186637876, + "grad_norm": 0.20511488616466522, + "learning_rate": 1.9726786787263145e-05, + "loss": 1.2879, + "step": 3429 + }, + { + "epoch": 1.0216124648634561, + "grad_norm": 0.20509251952171326, + "learning_rate": 1.972656281260365e-05, + "loss": 1.293, + "step": 3430 + }, + { + "epoch": 1.0219103110631247, + "grad_norm": 0.20584897696971893, + "learning_rate": 1.9726338747449356e-05, + "loss": 1.2958, + "step": 3431 + }, + { + "epoch": 1.0222081572627935, + "grad_norm": 0.2150140404701233, + "learning_rate": 1.9726114591802343e-05, + "loss": 1.3014, + "step": 3432 + }, + { + "epoch": 1.022506003462462, + "grad_norm": 0.22393961250782013, + "learning_rate": 1.9725890345664703e-05, + "loss": 1.3094, + "step": 3433 + }, + { + "epoch": 1.0228038496621308, + "grad_norm": 0.20525576174259186, + "learning_rate": 1.972566600903852e-05, + "loss": 1.3008, + "step": 3434 + }, + { + "epoch": 1.0231016958617993, + "grad_norm": 0.20266354084014893, + "learning_rate": 1.972544158192588e-05, + "loss": 1.3082, + "step": 3435 + }, + { + "epoch": 1.023399542061468, + "grad_norm": 0.21380510926246643, + "learning_rate": 1.972521706432887e-05, + "loss": 1.312, + "step": 3436 + }, + { + "epoch": 1.0236973882611367, + "grad_norm": 0.20366153120994568, + "learning_rate": 1.9724992456249584e-05, + "loss": 1.3105, + "step": 3437 + }, + { + "epoch": 1.0239952344608052, + "grad_norm": 0.21148470044136047, + "learning_rate": 1.972476775769011e-05, + "loss": 1.3014, + "step": 3438 + }, + { + "epoch": 1.024293080660474, + "grad_norm": 0.21774984896183014, + "learning_rate": 1.9724542968652532e-05, + "loss": 1.2974, + "step": 3439 + }, + { + "epoch": 1.0245909268601425, + "grad_norm": 0.20611019432544708, + "learning_rate": 1.9724318089138948e-05, + "loss": 1.2826, + "step": 3440 + }, + { + "epoch": 1.0248887730598113, + "grad_norm": 0.20914271473884583, + "learning_rate": 1.9724093119151454e-05, + "loss": 1.3097, + "step": 3441 + }, + { + "epoch": 1.0251866192594798, + "grad_norm": 0.21499302983283997, + "learning_rate": 1.972386805869213e-05, + "loss": 1.2998, + "step": 3442 + }, + { + "epoch": 1.0254844654591486, + "grad_norm": 0.19540037214756012, + "learning_rate": 1.9723642907763082e-05, + "loss": 1.2874, + "step": 3443 + }, + { + "epoch": 1.0257823116588172, + "grad_norm": 0.20741817355155945, + "learning_rate": 1.97234176663664e-05, + "loss": 1.3061, + "step": 3444 + }, + { + "epoch": 1.0260801578584857, + "grad_norm": 0.2004505693912506, + "learning_rate": 1.9723192334504183e-05, + "loss": 1.3131, + "step": 3445 + }, + { + "epoch": 1.0263780040581545, + "grad_norm": 0.19974607229232788, + "learning_rate": 1.972296691217852e-05, + "loss": 1.3027, + "step": 3446 + }, + { + "epoch": 1.026675850257823, + "grad_norm": 0.2027173936367035, + "learning_rate": 1.9722741399391517e-05, + "loss": 1.3005, + "step": 3447 + }, + { + "epoch": 1.0269736964574918, + "grad_norm": 0.20132912695407867, + "learning_rate": 1.972251579614527e-05, + "loss": 1.2963, + "step": 3448 + }, + { + "epoch": 1.0272715426571604, + "grad_norm": 0.2098199427127838, + "learning_rate": 1.972229010244187e-05, + "loss": 1.3115, + "step": 3449 + }, + { + "epoch": 1.0275693888568291, + "grad_norm": 0.20256924629211426, + "learning_rate": 1.9722064318283425e-05, + "loss": 1.3161, + "step": 3450 + }, + { + "epoch": 1.0278672350564977, + "grad_norm": 0.20966893434524536, + "learning_rate": 1.9721838443672035e-05, + "loss": 1.2963, + "step": 3451 + }, + { + "epoch": 1.0281650812561662, + "grad_norm": 0.2011045664548874, + "learning_rate": 1.97216124786098e-05, + "loss": 1.3011, + "step": 3452 + }, + { + "epoch": 1.028462927455835, + "grad_norm": 0.19789999723434448, + "learning_rate": 1.972138642309882e-05, + "loss": 1.2844, + "step": 3453 + }, + { + "epoch": 1.0287607736555036, + "grad_norm": 0.20889869332313538, + "learning_rate": 1.9721160277141205e-05, + "loss": 1.2916, + "step": 3454 + }, + { + "epoch": 1.0290586198551723, + "grad_norm": 0.19314032793045044, + "learning_rate": 1.972093404073905e-05, + "loss": 1.3052, + "step": 3455 + }, + { + "epoch": 1.0293564660548409, + "grad_norm": 0.1976383626461029, + "learning_rate": 1.972070771389447e-05, + "loss": 1.2919, + "step": 3456 + }, + { + "epoch": 1.0296543122545097, + "grad_norm": 0.20616964995861053, + "learning_rate": 1.9720481296609562e-05, + "loss": 1.3037, + "step": 3457 + }, + { + "epoch": 1.0299521584541782, + "grad_norm": 0.19698336720466614, + "learning_rate": 1.9720254788886435e-05, + "loss": 1.3013, + "step": 3458 + }, + { + "epoch": 1.030250004653847, + "grad_norm": 0.2089948058128357, + "learning_rate": 1.97200281907272e-05, + "loss": 1.2874, + "step": 3459 + }, + { + "epoch": 1.0305478508535155, + "grad_norm": 0.20039913058280945, + "learning_rate": 1.971980150213396e-05, + "loss": 1.2924, + "step": 3460 + }, + { + "epoch": 1.030845697053184, + "grad_norm": 0.192903533577919, + "learning_rate": 1.9719574723108828e-05, + "loss": 1.2985, + "step": 3461 + }, + { + "epoch": 1.0311435432528528, + "grad_norm": 0.19958768784999847, + "learning_rate": 1.971934785365391e-05, + "loss": 1.3123, + "step": 3462 + }, + { + "epoch": 1.0314413894525214, + "grad_norm": 0.20145928859710693, + "learning_rate": 1.9719120893771328e-05, + "loss": 1.3063, + "step": 3463 + }, + { + "epoch": 1.0317392356521902, + "grad_norm": 0.20627877116203308, + "learning_rate": 1.9718893843463177e-05, + "loss": 1.3132, + "step": 3464 + }, + { + "epoch": 1.0320370818518587, + "grad_norm": 0.20193171501159668, + "learning_rate": 1.971866670273158e-05, + "loss": 1.3049, + "step": 3465 + }, + { + "epoch": 1.0323349280515275, + "grad_norm": 0.21276478469371796, + "learning_rate": 1.9718439471578645e-05, + "loss": 1.3172, + "step": 3466 + }, + { + "epoch": 1.032632774251196, + "grad_norm": 0.21268029510974884, + "learning_rate": 1.971821215000649e-05, + "loss": 1.3027, + "step": 3467 + }, + { + "epoch": 1.0329306204508646, + "grad_norm": 0.20449820160865784, + "learning_rate": 1.971798473801723e-05, + "loss": 1.3016, + "step": 3468 + }, + { + "epoch": 1.0332284666505334, + "grad_norm": 0.2022082656621933, + "learning_rate": 1.9717757235612977e-05, + "loss": 1.2869, + "step": 3469 + }, + { + "epoch": 1.033526312850202, + "grad_norm": 0.22149288654327393, + "learning_rate": 1.9717529642795853e-05, + "loss": 1.2966, + "step": 3470 + }, + { + "epoch": 1.0338241590498707, + "grad_norm": 0.20105499029159546, + "learning_rate": 1.971730195956797e-05, + "loss": 1.2822, + "step": 3471 + }, + { + "epoch": 1.0341220052495392, + "grad_norm": 0.21253730356693268, + "learning_rate": 1.9717074185931454e-05, + "loss": 1.2882, + "step": 3472 + }, + { + "epoch": 1.034419851449208, + "grad_norm": 0.22005505859851837, + "learning_rate": 1.9716846321888417e-05, + "loss": 1.3278, + "step": 3473 + }, + { + "epoch": 1.0347176976488766, + "grad_norm": 0.19879037141799927, + "learning_rate": 1.9716618367440978e-05, + "loss": 1.3033, + "step": 3474 + }, + { + "epoch": 1.035015543848545, + "grad_norm": 0.2200528383255005, + "learning_rate": 1.9716390322591264e-05, + "loss": 1.3149, + "step": 3475 + }, + { + "epoch": 1.0353133900482139, + "grad_norm": 0.21222293376922607, + "learning_rate": 1.9716162187341393e-05, + "loss": 1.3334, + "step": 3476 + }, + { + "epoch": 1.0356112362478824, + "grad_norm": 0.22255544364452362, + "learning_rate": 1.971593396169349e-05, + "loss": 1.2955, + "step": 3477 + }, + { + "epoch": 1.0359090824475512, + "grad_norm": 0.2219313234090805, + "learning_rate": 1.9715705645649677e-05, + "loss": 1.3059, + "step": 3478 + }, + { + "epoch": 1.0362069286472197, + "grad_norm": 0.19292521476745605, + "learning_rate": 1.9715477239212074e-05, + "loss": 1.2983, + "step": 3479 + }, + { + "epoch": 1.0365047748468885, + "grad_norm": 0.21676228940486908, + "learning_rate": 1.9715248742382815e-05, + "loss": 1.2815, + "step": 3480 + }, + { + "epoch": 1.036802621046557, + "grad_norm": 0.21063315868377686, + "learning_rate": 1.9715020155164017e-05, + "loss": 1.3115, + "step": 3481 + }, + { + "epoch": 1.0371004672462258, + "grad_norm": 0.2131437510251999, + "learning_rate": 1.971479147755781e-05, + "loss": 1.2765, + "step": 3482 + }, + { + "epoch": 1.0373983134458944, + "grad_norm": 0.22631488740444183, + "learning_rate": 1.9714562709566326e-05, + "loss": 1.3272, + "step": 3483 + }, + { + "epoch": 1.037696159645563, + "grad_norm": 0.20695947110652924, + "learning_rate": 1.9714333851191688e-05, + "loss": 1.2974, + "step": 3484 + }, + { + "epoch": 1.0379940058452317, + "grad_norm": 0.22460728883743286, + "learning_rate": 1.971410490243603e-05, + "loss": 1.308, + "step": 3485 + }, + { + "epoch": 1.0382918520449003, + "grad_norm": 0.21392583847045898, + "learning_rate": 1.9713875863301475e-05, + "loss": 1.2898, + "step": 3486 + }, + { + "epoch": 1.038589698244569, + "grad_norm": 0.21392081677913666, + "learning_rate": 1.9713646733790158e-05, + "loss": 1.2902, + "step": 3487 + }, + { + "epoch": 1.0388875444442376, + "grad_norm": 0.21687346696853638, + "learning_rate": 1.9713417513904213e-05, + "loss": 1.304, + "step": 3488 + }, + { + "epoch": 1.0391853906439064, + "grad_norm": 0.2224649339914322, + "learning_rate": 1.971318820364577e-05, + "loss": 1.3146, + "step": 3489 + }, + { + "epoch": 1.039483236843575, + "grad_norm": 0.2142845094203949, + "learning_rate": 1.9712958803016962e-05, + "loss": 1.2824, + "step": 3490 + }, + { + "epoch": 1.0397810830432435, + "grad_norm": 0.19883057475090027, + "learning_rate": 1.9712729312019925e-05, + "loss": 1.2832, + "step": 3491 + }, + { + "epoch": 1.0400789292429122, + "grad_norm": 0.20167869329452515, + "learning_rate": 1.9712499730656796e-05, + "loss": 1.3068, + "step": 3492 + }, + { + "epoch": 1.0403767754425808, + "grad_norm": 0.21614550054073334, + "learning_rate": 1.9712270058929704e-05, + "loss": 1.3258, + "step": 3493 + }, + { + "epoch": 1.0406746216422496, + "grad_norm": 0.22374406456947327, + "learning_rate": 1.9712040296840795e-05, + "loss": 1.3295, + "step": 3494 + }, + { + "epoch": 1.040972467841918, + "grad_norm": 0.2011774331331253, + "learning_rate": 1.9711810444392198e-05, + "loss": 1.3044, + "step": 3495 + }, + { + "epoch": 1.0412703140415869, + "grad_norm": 0.2045108824968338, + "learning_rate": 1.971158050158606e-05, + "loss": 1.3218, + "step": 3496 + }, + { + "epoch": 1.0415681602412554, + "grad_norm": 0.22346441447734833, + "learning_rate": 1.971135046842451e-05, + "loss": 1.3086, + "step": 3497 + }, + { + "epoch": 1.041866006440924, + "grad_norm": 0.22070644795894623, + "learning_rate": 1.9711120344909695e-05, + "loss": 1.3092, + "step": 3498 + }, + { + "epoch": 1.0421638526405927, + "grad_norm": 0.2148740440607071, + "learning_rate": 1.971089013104376e-05, + "loss": 1.3033, + "step": 3499 + }, + { + "epoch": 1.0424616988402613, + "grad_norm": 0.20926442742347717, + "learning_rate": 1.971065982682884e-05, + "loss": 1.312, + "step": 3500 + }, + { + "epoch": 1.0424616988402613, + "eval_loss": 1.3607652187347412, + "eval_runtime": 21.2984, + "eval_samples_per_second": 81.415, + "eval_steps_per_second": 5.118, + "step": 3500 + }, + { + "epoch": 1.04275954503993, + "grad_norm": 0.21613319218158722, + "learning_rate": 1.9710429432267076e-05, + "loss": 1.3031, + "step": 3501 + }, + { + "epoch": 1.0430573912395986, + "grad_norm": 0.21077358722686768, + "learning_rate": 1.9710198947360616e-05, + "loss": 1.2972, + "step": 3502 + }, + { + "epoch": 1.0433552374392674, + "grad_norm": 0.2114875614643097, + "learning_rate": 1.9709968372111604e-05, + "loss": 1.3196, + "step": 3503 + }, + { + "epoch": 1.043653083638936, + "grad_norm": 0.19951535761356354, + "learning_rate": 1.970973770652219e-05, + "loss": 1.2752, + "step": 3504 + }, + { + "epoch": 1.0439509298386045, + "grad_norm": 0.22120660543441772, + "learning_rate": 1.970950695059451e-05, + "loss": 1.3142, + "step": 3505 + }, + { + "epoch": 1.0442487760382733, + "grad_norm": 0.2088823914527893, + "learning_rate": 1.970927610433072e-05, + "loss": 1.3014, + "step": 3506 + }, + { + "epoch": 1.0445466222379418, + "grad_norm": 0.2029641717672348, + "learning_rate": 1.9709045167732958e-05, + "loss": 1.3043, + "step": 3507 + }, + { + "epoch": 1.0448444684376106, + "grad_norm": 0.20846408605575562, + "learning_rate": 1.970881414080338e-05, + "loss": 1.3097, + "step": 3508 + }, + { + "epoch": 1.0451423146372791, + "grad_norm": 0.208656907081604, + "learning_rate": 1.9708583023544138e-05, + "loss": 1.3067, + "step": 3509 + }, + { + "epoch": 1.045440160836948, + "grad_norm": 0.22487233579158783, + "learning_rate": 1.9708351815957374e-05, + "loss": 1.2978, + "step": 3510 + }, + { + "epoch": 1.0457380070366165, + "grad_norm": 0.22238117456436157, + "learning_rate": 1.9708120518045245e-05, + "loss": 1.3077, + "step": 3511 + }, + { + "epoch": 1.046035853236285, + "grad_norm": 0.22144672274589539, + "learning_rate": 1.9707889129809898e-05, + "loss": 1.2787, + "step": 3512 + }, + { + "epoch": 1.0463336994359538, + "grad_norm": 0.2036600261926651, + "learning_rate": 1.9707657651253492e-05, + "loss": 1.2807, + "step": 3513 + }, + { + "epoch": 1.0466315456356223, + "grad_norm": 0.2069157212972641, + "learning_rate": 1.9707426082378177e-05, + "loss": 1.2925, + "step": 3514 + }, + { + "epoch": 1.046929391835291, + "grad_norm": 0.2219168096780777, + "learning_rate": 1.970719442318611e-05, + "loss": 1.3058, + "step": 3515 + }, + { + "epoch": 1.0472272380349597, + "grad_norm": 0.2090456485748291, + "learning_rate": 1.970696267367944e-05, + "loss": 1.3094, + "step": 3516 + }, + { + "epoch": 1.0475250842346284, + "grad_norm": 0.21614153683185577, + "learning_rate": 1.970673083386033e-05, + "loss": 1.2913, + "step": 3517 + }, + { + "epoch": 1.047822930434297, + "grad_norm": 0.20335333049297333, + "learning_rate": 1.9706498903730934e-05, + "loss": 1.293, + "step": 3518 + }, + { + "epoch": 1.0481207766339655, + "grad_norm": 0.21833182871341705, + "learning_rate": 1.9706266883293413e-05, + "loss": 1.3076, + "step": 3519 + }, + { + "epoch": 1.0484186228336343, + "grad_norm": 0.2128637731075287, + "learning_rate": 1.970603477254992e-05, + "loss": 1.2974, + "step": 3520 + }, + { + "epoch": 1.0487164690333028, + "grad_norm": 0.21557752788066864, + "learning_rate": 1.970580257150262e-05, + "loss": 1.3052, + "step": 3521 + }, + { + "epoch": 1.0490143152329716, + "grad_norm": 0.21013541519641876, + "learning_rate": 1.970557028015367e-05, + "loss": 1.3026, + "step": 3522 + }, + { + "epoch": 1.0493121614326402, + "grad_norm": 0.2258487045764923, + "learning_rate": 1.970533789850523e-05, + "loss": 1.3012, + "step": 3523 + }, + { + "epoch": 1.049610007632309, + "grad_norm": 0.2073233425617218, + "learning_rate": 1.970510542655947e-05, + "loss": 1.3225, + "step": 3524 + }, + { + "epoch": 1.0499078538319775, + "grad_norm": 0.2119024246931076, + "learning_rate": 1.970487286431854e-05, + "loss": 1.3007, + "step": 3525 + }, + { + "epoch": 1.0502057000316463, + "grad_norm": 0.20058347284793854, + "learning_rate": 1.9704640211784617e-05, + "loss": 1.2962, + "step": 3526 + }, + { + "epoch": 1.0505035462313148, + "grad_norm": 0.20473431050777435, + "learning_rate": 1.9704407468959855e-05, + "loss": 1.2948, + "step": 3527 + }, + { + "epoch": 1.0508013924309834, + "grad_norm": 0.20855119824409485, + "learning_rate": 1.9704174635846426e-05, + "loss": 1.3028, + "step": 3528 + }, + { + "epoch": 1.0510992386306521, + "grad_norm": 0.20362649857997894, + "learning_rate": 1.9703941712446495e-05, + "loss": 1.3033, + "step": 3529 + }, + { + "epoch": 1.0513970848303207, + "grad_norm": 0.20610179007053375, + "learning_rate": 1.9703708698762226e-05, + "loss": 1.2986, + "step": 3530 + }, + { + "epoch": 1.0516949310299895, + "grad_norm": 0.20430563390254974, + "learning_rate": 1.9703475594795792e-05, + "loss": 1.311, + "step": 3531 + }, + { + "epoch": 1.051992777229658, + "grad_norm": 0.20857155323028564, + "learning_rate": 1.9703242400549358e-05, + "loss": 1.3088, + "step": 3532 + }, + { + "epoch": 1.0522906234293268, + "grad_norm": 0.19734951853752136, + "learning_rate": 1.970300911602509e-05, + "loss": 1.3023, + "step": 3533 + }, + { + "epoch": 1.0525884696289953, + "grad_norm": 0.20390652120113373, + "learning_rate": 1.970277574122517e-05, + "loss": 1.2888, + "step": 3534 + }, + { + "epoch": 1.0528863158286639, + "grad_norm": 0.20076511800289154, + "learning_rate": 1.970254227615176e-05, + "loss": 1.3049, + "step": 3535 + }, + { + "epoch": 1.0531841620283326, + "grad_norm": 0.19973789155483246, + "learning_rate": 1.970230872080703e-05, + "loss": 1.3008, + "step": 3536 + }, + { + "epoch": 1.0534820082280012, + "grad_norm": 0.20469968020915985, + "learning_rate": 1.9702075075193162e-05, + "loss": 1.3116, + "step": 3537 + }, + { + "epoch": 1.05377985442767, + "grad_norm": 0.19541417062282562, + "learning_rate": 1.9701841339312326e-05, + "loss": 1.3026, + "step": 3538 + }, + { + "epoch": 1.0540777006273385, + "grad_norm": 0.2089960277080536, + "learning_rate": 1.9701607513166695e-05, + "loss": 1.3073, + "step": 3539 + }, + { + "epoch": 1.0543755468270073, + "grad_norm": 0.22149527072906494, + "learning_rate": 1.9701373596758442e-05, + "loss": 1.2924, + "step": 3540 + }, + { + "epoch": 1.0546733930266758, + "grad_norm": 0.20074641704559326, + "learning_rate": 1.970113959008975e-05, + "loss": 1.3005, + "step": 3541 + }, + { + "epoch": 1.0549712392263444, + "grad_norm": 0.2029009610414505, + "learning_rate": 1.9700905493162792e-05, + "loss": 1.303, + "step": 3542 + }, + { + "epoch": 1.0552690854260132, + "grad_norm": 0.20230336487293243, + "learning_rate": 1.9700671305979746e-05, + "loss": 1.2961, + "step": 3543 + }, + { + "epoch": 1.0555669316256817, + "grad_norm": 0.1958988606929779, + "learning_rate": 1.9700437028542794e-05, + "loss": 1.2986, + "step": 3544 + }, + { + "epoch": 1.0558647778253505, + "grad_norm": 0.20778551697731018, + "learning_rate": 1.9700202660854113e-05, + "loss": 1.2911, + "step": 3545 + }, + { + "epoch": 1.056162624025019, + "grad_norm": 0.20250651240348816, + "learning_rate": 1.9699968202915884e-05, + "loss": 1.3175, + "step": 3546 + }, + { + "epoch": 1.0564604702246878, + "grad_norm": 0.2022535353899002, + "learning_rate": 1.9699733654730285e-05, + "loss": 1.2876, + "step": 3547 + }, + { + "epoch": 1.0567583164243564, + "grad_norm": 0.19783522188663483, + "learning_rate": 1.9699499016299506e-05, + "loss": 1.2855, + "step": 3548 + }, + { + "epoch": 1.0570561626240251, + "grad_norm": 0.20000971853733063, + "learning_rate": 1.9699264287625722e-05, + "loss": 1.312, + "step": 3549 + }, + { + "epoch": 1.0573540088236937, + "grad_norm": 0.199110746383667, + "learning_rate": 1.969902946871112e-05, + "loss": 1.3082, + "step": 3550 + }, + { + "epoch": 1.0576518550233622, + "grad_norm": 0.19644653797149658, + "learning_rate": 1.969879455955789e-05, + "loss": 1.3039, + "step": 3551 + }, + { + "epoch": 1.057949701223031, + "grad_norm": 0.2020893692970276, + "learning_rate": 1.969855956016821e-05, + "loss": 1.2969, + "step": 3552 + }, + { + "epoch": 1.0582475474226996, + "grad_norm": 0.20060493052005768, + "learning_rate": 1.9698324470544268e-05, + "loss": 1.2959, + "step": 3553 + }, + { + "epoch": 1.0585453936223683, + "grad_norm": 0.1976126879453659, + "learning_rate": 1.9698089290688253e-05, + "loss": 1.2967, + "step": 3554 + }, + { + "epoch": 1.0588432398220369, + "grad_norm": 0.2165234386920929, + "learning_rate": 1.9697854020602353e-05, + "loss": 1.2919, + "step": 3555 + }, + { + "epoch": 1.0591410860217056, + "grad_norm": 0.202513188123703, + "learning_rate": 1.9697618660288757e-05, + "loss": 1.2934, + "step": 3556 + }, + { + "epoch": 1.0594389322213742, + "grad_norm": 0.20844443142414093, + "learning_rate": 1.9697383209749653e-05, + "loss": 1.3099, + "step": 3557 + }, + { + "epoch": 1.0597367784210427, + "grad_norm": 0.1990274339914322, + "learning_rate": 1.9697147668987235e-05, + "loss": 1.3027, + "step": 3558 + }, + { + "epoch": 1.0600346246207115, + "grad_norm": 0.19725582003593445, + "learning_rate": 1.969691203800369e-05, + "loss": 1.2981, + "step": 3559 + }, + { + "epoch": 1.06033247082038, + "grad_norm": 0.2047271728515625, + "learning_rate": 1.9696676316801213e-05, + "loss": 1.2986, + "step": 3560 + }, + { + "epoch": 1.0606303170200488, + "grad_norm": 0.21688127517700195, + "learning_rate": 1.9696440505381997e-05, + "loss": 1.3113, + "step": 3561 + }, + { + "epoch": 1.0609281632197174, + "grad_norm": 0.2020910382270813, + "learning_rate": 1.9696204603748236e-05, + "loss": 1.3008, + "step": 3562 + }, + { + "epoch": 1.0612260094193862, + "grad_norm": 0.19921623170375824, + "learning_rate": 1.9695968611902122e-05, + "loss": 1.3083, + "step": 3563 + }, + { + "epoch": 1.0615238556190547, + "grad_norm": 0.21965418756008148, + "learning_rate": 1.9695732529845854e-05, + "loss": 1.3077, + "step": 3564 + }, + { + "epoch": 1.0618217018187233, + "grad_norm": 0.2067146599292755, + "learning_rate": 1.969549635758163e-05, + "loss": 1.3171, + "step": 3565 + }, + { + "epoch": 1.062119548018392, + "grad_norm": 0.1980430632829666, + "learning_rate": 1.9695260095111644e-05, + "loss": 1.2916, + "step": 3566 + }, + { + "epoch": 1.0624173942180606, + "grad_norm": 0.20229408144950867, + "learning_rate": 1.9695023742438093e-05, + "loss": 1.281, + "step": 3567 + }, + { + "epoch": 1.0627152404177294, + "grad_norm": 0.20833523571491241, + "learning_rate": 1.9694787299563177e-05, + "loss": 1.3001, + "step": 3568 + }, + { + "epoch": 1.063013086617398, + "grad_norm": 0.1953403651714325, + "learning_rate": 1.96945507664891e-05, + "loss": 1.2966, + "step": 3569 + }, + { + "epoch": 1.0633109328170667, + "grad_norm": 0.20932403206825256, + "learning_rate": 1.969431414321806e-05, + "loss": 1.3006, + "step": 3570 + }, + { + "epoch": 1.0636087790167352, + "grad_norm": 0.19709676504135132, + "learning_rate": 1.9694077429752258e-05, + "loss": 1.2995, + "step": 3571 + }, + { + "epoch": 1.0639066252164038, + "grad_norm": 0.20356795191764832, + "learning_rate": 1.969384062609389e-05, + "loss": 1.2836, + "step": 3572 + }, + { + "epoch": 1.0642044714160726, + "grad_norm": 0.19794374704360962, + "learning_rate": 1.9693603732245176e-05, + "loss": 1.2843, + "step": 3573 + }, + { + "epoch": 1.064502317615741, + "grad_norm": 0.19218191504478455, + "learning_rate": 1.9693366748208303e-05, + "loss": 1.3023, + "step": 3574 + }, + { + "epoch": 1.0648001638154099, + "grad_norm": 0.2103659212589264, + "learning_rate": 1.9693129673985484e-05, + "loss": 1.3042, + "step": 3575 + }, + { + "epoch": 1.0650980100150784, + "grad_norm": 0.18991057574748993, + "learning_rate": 1.969289250957892e-05, + "loss": 1.2992, + "step": 3576 + }, + { + "epoch": 1.0653958562147472, + "grad_norm": 0.2115504890680313, + "learning_rate": 1.9692655254990824e-05, + "loss": 1.2892, + "step": 3577 + }, + { + "epoch": 1.0656937024144157, + "grad_norm": 0.200431689620018, + "learning_rate": 1.96924179102234e-05, + "loss": 1.2873, + "step": 3578 + }, + { + "epoch": 1.0659915486140843, + "grad_norm": 0.20939333736896515, + "learning_rate": 1.9692180475278853e-05, + "loss": 1.3031, + "step": 3579 + }, + { + "epoch": 1.066289394813753, + "grad_norm": 0.20650987327098846, + "learning_rate": 1.96919429501594e-05, + "loss": 1.3008, + "step": 3580 + }, + { + "epoch": 1.0665872410134216, + "grad_norm": 0.20073242485523224, + "learning_rate": 1.9691705334867246e-05, + "loss": 1.3007, + "step": 3581 + }, + { + "epoch": 1.0668850872130904, + "grad_norm": 0.19522999227046967, + "learning_rate": 1.9691467629404602e-05, + "loss": 1.2886, + "step": 3582 + }, + { + "epoch": 1.067182933412759, + "grad_norm": 0.2064772993326187, + "learning_rate": 1.9691229833773678e-05, + "loss": 1.3139, + "step": 3583 + }, + { + "epoch": 1.0674807796124277, + "grad_norm": 0.2075064331293106, + "learning_rate": 1.9690991947976686e-05, + "loss": 1.2917, + "step": 3584 + }, + { + "epoch": 1.0677786258120963, + "grad_norm": 0.2000180035829544, + "learning_rate": 1.9690753972015846e-05, + "loss": 1.3011, + "step": 3585 + }, + { + "epoch": 1.0680764720117648, + "grad_norm": 0.20665602385997772, + "learning_rate": 1.9690515905893367e-05, + "loss": 1.2927, + "step": 3586 + }, + { + "epoch": 1.0683743182114336, + "grad_norm": 0.20087812840938568, + "learning_rate": 1.969027774961146e-05, + "loss": 1.2936, + "step": 3587 + }, + { + "epoch": 1.0686721644111021, + "grad_norm": 0.19530273973941803, + "learning_rate": 1.9690039503172346e-05, + "loss": 1.2681, + "step": 3588 + }, + { + "epoch": 1.068970010610771, + "grad_norm": 0.2068195939064026, + "learning_rate": 1.9689801166578247e-05, + "loss": 1.2885, + "step": 3589 + }, + { + "epoch": 1.0692678568104395, + "grad_norm": 0.2021002173423767, + "learning_rate": 1.9689562739831368e-05, + "loss": 1.2993, + "step": 3590 + }, + { + "epoch": 1.0695657030101082, + "grad_norm": 0.20480947196483612, + "learning_rate": 1.9689324222933936e-05, + "loss": 1.3172, + "step": 3591 + }, + { + "epoch": 1.0698635492097768, + "grad_norm": 0.20652180910110474, + "learning_rate": 1.9689085615888166e-05, + "loss": 1.2853, + "step": 3592 + }, + { + "epoch": 1.0701613954094455, + "grad_norm": 0.20586484670639038, + "learning_rate": 1.968884691869628e-05, + "loss": 1.3097, + "step": 3593 + }, + { + "epoch": 1.070459241609114, + "grad_norm": 0.1960112303495407, + "learning_rate": 1.96886081313605e-05, + "loss": 1.3006, + "step": 3594 + }, + { + "epoch": 1.0707570878087826, + "grad_norm": 0.22065342962741852, + "learning_rate": 1.9688369253883043e-05, + "loss": 1.3018, + "step": 3595 + }, + { + "epoch": 1.0710549340084514, + "grad_norm": 0.21142372488975525, + "learning_rate": 1.968813028626614e-05, + "loss": 1.3154, + "step": 3596 + }, + { + "epoch": 1.07135278020812, + "grad_norm": 0.2077939659357071, + "learning_rate": 1.9687891228512003e-05, + "loss": 1.2903, + "step": 3597 + }, + { + "epoch": 1.0716506264077887, + "grad_norm": 0.19702668488025665, + "learning_rate": 1.9687652080622866e-05, + "loss": 1.311, + "step": 3598 + }, + { + "epoch": 1.0719484726074573, + "grad_norm": 0.19933748245239258, + "learning_rate": 1.968741284260095e-05, + "loss": 1.2832, + "step": 3599 + }, + { + "epoch": 1.072246318807126, + "grad_norm": 0.2061723917722702, + "learning_rate": 1.968717351444848e-05, + "loss": 1.2934, + "step": 3600 + }, + { + "epoch": 1.0725441650067946, + "grad_norm": 0.2108973264694214, + "learning_rate": 1.968693409616768e-05, + "loss": 1.2809, + "step": 3601 + }, + { + "epoch": 1.0728420112064632, + "grad_norm": 0.2182292491197586, + "learning_rate": 1.9686694587760786e-05, + "loss": 1.304, + "step": 3602 + }, + { + "epoch": 1.073139857406132, + "grad_norm": 0.21988658607006073, + "learning_rate": 1.968645498923002e-05, + "loss": 1.3043, + "step": 3603 + }, + { + "epoch": 1.0734377036058005, + "grad_norm": 0.21043458580970764, + "learning_rate": 1.9686215300577613e-05, + "loss": 1.3036, + "step": 3604 + }, + { + "epoch": 1.0737355498054693, + "grad_norm": 0.1996757984161377, + "learning_rate": 1.9685975521805793e-05, + "loss": 1.3026, + "step": 3605 + }, + { + "epoch": 1.0740333960051378, + "grad_norm": 0.20943333208560944, + "learning_rate": 1.9685735652916797e-05, + "loss": 1.2773, + "step": 3606 + }, + { + "epoch": 1.0743312422048066, + "grad_norm": 0.20317253470420837, + "learning_rate": 1.9685495693912846e-05, + "loss": 1.3079, + "step": 3607 + }, + { + "epoch": 1.0746290884044751, + "grad_norm": 0.20812860131263733, + "learning_rate": 1.9685255644796184e-05, + "loss": 1.3113, + "step": 3608 + }, + { + "epoch": 1.074926934604144, + "grad_norm": 0.2107335329055786, + "learning_rate": 1.9685015505569036e-05, + "loss": 1.3022, + "step": 3609 + }, + { + "epoch": 1.0752247808038125, + "grad_norm": 0.2116023153066635, + "learning_rate": 1.968477527623364e-05, + "loss": 1.2959, + "step": 3610 + }, + { + "epoch": 1.075522627003481, + "grad_norm": 0.21139049530029297, + "learning_rate": 1.9684534956792233e-05, + "loss": 1.2899, + "step": 3611 + }, + { + "epoch": 1.0758204732031498, + "grad_norm": 0.21621239185333252, + "learning_rate": 1.9684294547247046e-05, + "loss": 1.3221, + "step": 3612 + }, + { + "epoch": 1.0761183194028183, + "grad_norm": 0.19819071888923645, + "learning_rate": 1.9684054047600315e-05, + "loss": 1.2978, + "step": 3613 + }, + { + "epoch": 1.076416165602487, + "grad_norm": 0.2018895149230957, + "learning_rate": 1.968381345785429e-05, + "loss": 1.292, + "step": 3614 + }, + { + "epoch": 1.0767140118021556, + "grad_norm": 0.2259438931941986, + "learning_rate": 1.9683572778011193e-05, + "loss": 1.2992, + "step": 3615 + }, + { + "epoch": 1.0770118580018244, + "grad_norm": 0.20852015912532806, + "learning_rate": 1.968333200807327e-05, + "loss": 1.3029, + "step": 3616 + }, + { + "epoch": 1.077309704201493, + "grad_norm": 0.21100470423698425, + "learning_rate": 1.9683091148042765e-05, + "loss": 1.2844, + "step": 3617 + }, + { + "epoch": 1.0776075504011615, + "grad_norm": 0.1936296671628952, + "learning_rate": 1.9682850197921914e-05, + "loss": 1.2876, + "step": 3618 + }, + { + "epoch": 1.0779053966008303, + "grad_norm": 0.2020305097103119, + "learning_rate": 1.968260915771296e-05, + "loss": 1.2876, + "step": 3619 + }, + { + "epoch": 1.0782032428004988, + "grad_norm": 0.20538529753684998, + "learning_rate": 1.9682368027418147e-05, + "loss": 1.2761, + "step": 3620 + }, + { + "epoch": 1.0785010890001676, + "grad_norm": 0.20383626222610474, + "learning_rate": 1.9682126807039714e-05, + "loss": 1.293, + "step": 3621 + }, + { + "epoch": 1.0787989351998362, + "grad_norm": 0.19912081956863403, + "learning_rate": 1.9681885496579914e-05, + "loss": 1.2898, + "step": 3622 + }, + { + "epoch": 1.079096781399505, + "grad_norm": 0.2121354043483734, + "learning_rate": 1.968164409604098e-05, + "loss": 1.3029, + "step": 3623 + }, + { + "epoch": 1.0793946275991735, + "grad_norm": 0.2046370953321457, + "learning_rate": 1.968140260542517e-05, + "loss": 1.3002, + "step": 3624 + }, + { + "epoch": 1.079692473798842, + "grad_norm": 0.2005256861448288, + "learning_rate": 1.968116102473472e-05, + "loss": 1.2891, + "step": 3625 + }, + { + "epoch": 1.0799903199985108, + "grad_norm": 0.2049214243888855, + "learning_rate": 1.968091935397189e-05, + "loss": 1.3096, + "step": 3626 + }, + { + "epoch": 1.0802881661981794, + "grad_norm": 0.21116481721401215, + "learning_rate": 1.968067759313892e-05, + "loss": 1.2935, + "step": 3627 + }, + { + "epoch": 1.0805860123978481, + "grad_norm": 0.2080266922712326, + "learning_rate": 1.9680435742238056e-05, + "loss": 1.2879, + "step": 3628 + }, + { + "epoch": 1.0808838585975167, + "grad_norm": 0.19524142146110535, + "learning_rate": 1.9680193801271558e-05, + "loss": 1.304, + "step": 3629 + }, + { + "epoch": 1.0811817047971854, + "grad_norm": 0.20208951830863953, + "learning_rate": 1.9679951770241673e-05, + "loss": 1.2854, + "step": 3630 + }, + { + "epoch": 1.081479550996854, + "grad_norm": 0.2034522444009781, + "learning_rate": 1.9679709649150647e-05, + "loss": 1.3, + "step": 3631 + }, + { + "epoch": 1.0817773971965225, + "grad_norm": 0.2104310393333435, + "learning_rate": 1.967946743800074e-05, + "loss": 1.2851, + "step": 3632 + }, + { + "epoch": 1.0820752433961913, + "grad_norm": 0.2176777422428131, + "learning_rate": 1.9679225136794203e-05, + "loss": 1.3078, + "step": 3633 + }, + { + "epoch": 1.0823730895958599, + "grad_norm": 0.20740337669849396, + "learning_rate": 1.967898274553329e-05, + "loss": 1.2937, + "step": 3634 + }, + { + "epoch": 1.0826709357955286, + "grad_norm": 0.21155376732349396, + "learning_rate": 1.9678740264220257e-05, + "loss": 1.3129, + "step": 3635 + }, + { + "epoch": 1.0829687819951972, + "grad_norm": 0.21064534783363342, + "learning_rate": 1.967849769285736e-05, + "loss": 1.3073, + "step": 3636 + }, + { + "epoch": 1.083266628194866, + "grad_norm": 0.21301575005054474, + "learning_rate": 1.9678255031446855e-05, + "loss": 1.2812, + "step": 3637 + }, + { + "epoch": 1.0835644743945345, + "grad_norm": 0.21204115450382233, + "learning_rate": 1.9678012279991e-05, + "loss": 1.3044, + "step": 3638 + }, + { + "epoch": 1.083862320594203, + "grad_norm": 0.2081066370010376, + "learning_rate": 1.9677769438492055e-05, + "loss": 1.2933, + "step": 3639 + }, + { + "epoch": 1.0841601667938718, + "grad_norm": 0.20554843544960022, + "learning_rate": 1.9677526506952275e-05, + "loss": 1.2992, + "step": 3640 + }, + { + "epoch": 1.0844580129935404, + "grad_norm": 0.2064088135957718, + "learning_rate": 1.9677283485373923e-05, + "loss": 1.307, + "step": 3641 + }, + { + "epoch": 1.0847558591932092, + "grad_norm": 0.20249561965465546, + "learning_rate": 1.9677040373759266e-05, + "loss": 1.2989, + "step": 3642 + }, + { + "epoch": 1.0850537053928777, + "grad_norm": 0.20630252361297607, + "learning_rate": 1.9676797172110552e-05, + "loss": 1.3039, + "step": 3643 + }, + { + "epoch": 1.0853515515925465, + "grad_norm": 0.20152369141578674, + "learning_rate": 1.9676553880430056e-05, + "loss": 1.2981, + "step": 3644 + }, + { + "epoch": 1.085649397792215, + "grad_norm": 0.2110234946012497, + "learning_rate": 1.967631049872004e-05, + "loss": 1.3199, + "step": 3645 + }, + { + "epoch": 1.0859472439918836, + "grad_norm": 0.19750213623046875, + "learning_rate": 1.9676067026982762e-05, + "loss": 1.2809, + "step": 3646 + }, + { + "epoch": 1.0862450901915524, + "grad_norm": 0.2041853368282318, + "learning_rate": 1.967582346522049e-05, + "loss": 1.3162, + "step": 3647 + }, + { + "epoch": 1.086542936391221, + "grad_norm": 0.20011192560195923, + "learning_rate": 1.9675579813435495e-05, + "loss": 1.2837, + "step": 3648 + }, + { + "epoch": 1.0868407825908897, + "grad_norm": 0.20544438064098358, + "learning_rate": 1.967533607163004e-05, + "loss": 1.2828, + "step": 3649 + }, + { + "epoch": 1.0871386287905582, + "grad_norm": 0.20484983921051025, + "learning_rate": 1.967509223980639e-05, + "loss": 1.2918, + "step": 3650 + }, + { + "epoch": 1.087436474990227, + "grad_norm": 0.21132327616214752, + "learning_rate": 1.967484831796682e-05, + "loss": 1.2938, + "step": 3651 + }, + { + "epoch": 1.0877343211898955, + "grad_norm": 0.1959044486284256, + "learning_rate": 1.967460430611359e-05, + "loss": 1.3045, + "step": 3652 + }, + { + "epoch": 1.088032167389564, + "grad_norm": 0.20495103299617767, + "learning_rate": 1.967436020424898e-05, + "loss": 1.276, + "step": 3653 + }, + { + "epoch": 1.0883300135892329, + "grad_norm": 0.203419491648674, + "learning_rate": 1.967411601237526e-05, + "loss": 1.3149, + "step": 3654 + }, + { + "epoch": 1.0886278597889014, + "grad_norm": 0.19674086570739746, + "learning_rate": 1.967387173049469e-05, + "loss": 1.288, + "step": 3655 + }, + { + "epoch": 1.0889257059885702, + "grad_norm": 0.21166275441646576, + "learning_rate": 1.967362735860956e-05, + "loss": 1.3057, + "step": 3656 + }, + { + "epoch": 1.0892235521882387, + "grad_norm": 0.2079380601644516, + "learning_rate": 1.9673382896722134e-05, + "loss": 1.2847, + "step": 3657 + }, + { + "epoch": 1.0895213983879075, + "grad_norm": 0.21181327104568481, + "learning_rate": 1.9673138344834686e-05, + "loss": 1.3122, + "step": 3658 + }, + { + "epoch": 1.089819244587576, + "grad_norm": 0.20894701778888702, + "learning_rate": 1.9672893702949492e-05, + "loss": 1.2882, + "step": 3659 + }, + { + "epoch": 1.0901170907872448, + "grad_norm": 0.2010115683078766, + "learning_rate": 1.9672648971068833e-05, + "loss": 1.2841, + "step": 3660 + }, + { + "epoch": 1.0904149369869134, + "grad_norm": 0.24069705605506897, + "learning_rate": 1.967240414919498e-05, + "loss": 1.3047, + "step": 3661 + }, + { + "epoch": 1.090712783186582, + "grad_norm": 0.2139674872159958, + "learning_rate": 1.967215923733021e-05, + "loss": 1.2796, + "step": 3662 + }, + { + "epoch": 1.0910106293862507, + "grad_norm": 0.22698570787906647, + "learning_rate": 1.967191423547681e-05, + "loss": 1.285, + "step": 3663 + }, + { + "epoch": 1.0913084755859193, + "grad_norm": 0.21402917802333832, + "learning_rate": 1.9671669143637054e-05, + "loss": 1.3145, + "step": 3664 + }, + { + "epoch": 1.091606321785588, + "grad_norm": 0.2062504142522812, + "learning_rate": 1.9671423961813222e-05, + "loss": 1.3042, + "step": 3665 + }, + { + "epoch": 1.0919041679852566, + "grad_norm": 0.22338880598545074, + "learning_rate": 1.9671178690007596e-05, + "loss": 1.2936, + "step": 3666 + }, + { + "epoch": 1.0922020141849254, + "grad_norm": 0.19754505157470703, + "learning_rate": 1.9670933328222453e-05, + "loss": 1.2981, + "step": 3667 + }, + { + "epoch": 1.092499860384594, + "grad_norm": 0.21521669626235962, + "learning_rate": 1.9670687876460084e-05, + "loss": 1.2916, + "step": 3668 + }, + { + "epoch": 1.0927977065842625, + "grad_norm": 0.22141075134277344, + "learning_rate": 1.9670442334722767e-05, + "loss": 1.2899, + "step": 3669 + }, + { + "epoch": 1.0930955527839312, + "grad_norm": 0.20699742436408997, + "learning_rate": 1.967019670301279e-05, + "loss": 1.2974, + "step": 3670 + }, + { + "epoch": 1.0933933989835998, + "grad_norm": 0.21716786921024323, + "learning_rate": 1.9669950981332436e-05, + "loss": 1.293, + "step": 3671 + }, + { + "epoch": 1.0936912451832685, + "grad_norm": 0.21116264164447784, + "learning_rate": 1.9669705169683996e-05, + "loss": 1.2923, + "step": 3672 + }, + { + "epoch": 1.093989091382937, + "grad_norm": 0.2112829089164734, + "learning_rate": 1.966945926806975e-05, + "loss": 1.3111, + "step": 3673 + }, + { + "epoch": 1.0942869375826059, + "grad_norm": 0.20704688131809235, + "learning_rate": 1.9669213276491988e-05, + "loss": 1.2943, + "step": 3674 + }, + { + "epoch": 1.0945847837822744, + "grad_norm": 0.19894598424434662, + "learning_rate": 1.9668967194953e-05, + "loss": 1.3108, + "step": 3675 + }, + { + "epoch": 1.0948826299819432, + "grad_norm": 0.20967112481594086, + "learning_rate": 1.9668721023455072e-05, + "loss": 1.2952, + "step": 3676 + }, + { + "epoch": 1.0951804761816117, + "grad_norm": 0.21888650953769684, + "learning_rate": 1.96684747620005e-05, + "loss": 1.2752, + "step": 3677 + }, + { + "epoch": 1.0954783223812803, + "grad_norm": 0.21551842987537384, + "learning_rate": 1.9668228410591577e-05, + "loss": 1.2913, + "step": 3678 + }, + { + "epoch": 1.095776168580949, + "grad_norm": 0.2061617374420166, + "learning_rate": 1.9667981969230586e-05, + "loss": 1.2855, + "step": 3679 + }, + { + "epoch": 1.0960740147806176, + "grad_norm": 0.2146829217672348, + "learning_rate": 1.9667735437919826e-05, + "loss": 1.2983, + "step": 3680 + }, + { + "epoch": 1.0963718609802864, + "grad_norm": 0.21890433132648468, + "learning_rate": 1.966748881666159e-05, + "loss": 1.2913, + "step": 3681 + }, + { + "epoch": 1.096669707179955, + "grad_norm": 0.217780202627182, + "learning_rate": 1.966724210545817e-05, + "loss": 1.2984, + "step": 3682 + }, + { + "epoch": 1.0969675533796237, + "grad_norm": 0.20973043143749237, + "learning_rate": 1.966699530431186e-05, + "loss": 1.2921, + "step": 3683 + }, + { + "epoch": 1.0972653995792923, + "grad_norm": 0.2042854130268097, + "learning_rate": 1.9666748413224966e-05, + "loss": 1.3071, + "step": 3684 + }, + { + "epoch": 1.0975632457789608, + "grad_norm": 0.21178506314754486, + "learning_rate": 1.9666501432199772e-05, + "loss": 1.2983, + "step": 3685 + }, + { + "epoch": 1.0978610919786296, + "grad_norm": 0.21143445372581482, + "learning_rate": 1.9666254361238585e-05, + "loss": 1.2918, + "step": 3686 + }, + { + "epoch": 1.0981589381782981, + "grad_norm": 0.21675217151641846, + "learning_rate": 1.9666007200343702e-05, + "loss": 1.2936, + "step": 3687 + }, + { + "epoch": 1.098456784377967, + "grad_norm": 0.22217164933681488, + "learning_rate": 1.9665759949517424e-05, + "loss": 1.2981, + "step": 3688 + }, + { + "epoch": 1.0987546305776354, + "grad_norm": 0.20536118745803833, + "learning_rate": 1.9665512608762043e-05, + "loss": 1.2848, + "step": 3689 + }, + { + "epoch": 1.0990524767773042, + "grad_norm": 0.2172800451517105, + "learning_rate": 1.966526517807987e-05, + "loss": 1.3013, + "step": 3690 + }, + { + "epoch": 1.0993503229769728, + "grad_norm": 0.20742656290531158, + "learning_rate": 1.96650176574732e-05, + "loss": 1.2808, + "step": 3691 + }, + { + "epoch": 1.0996481691766413, + "grad_norm": 0.22240039706230164, + "learning_rate": 1.966477004694434e-05, + "loss": 1.3126, + "step": 3692 + }, + { + "epoch": 1.09994601537631, + "grad_norm": 0.20464514195919037, + "learning_rate": 1.9664522346495593e-05, + "loss": 1.2789, + "step": 3693 + }, + { + "epoch": 1.1002438615759786, + "grad_norm": 0.21234962344169617, + "learning_rate": 1.9664274556129266e-05, + "loss": 1.3126, + "step": 3694 + }, + { + "epoch": 1.1005417077756474, + "grad_norm": 0.20635056495666504, + "learning_rate": 1.966402667584766e-05, + "loss": 1.2924, + "step": 3695 + }, + { + "epoch": 1.100839553975316, + "grad_norm": 0.19965322315692902, + "learning_rate": 1.9663778705653082e-05, + "loss": 1.3111, + "step": 3696 + }, + { + "epoch": 1.1011374001749847, + "grad_norm": 0.2301105260848999, + "learning_rate": 1.9663530645547842e-05, + "loss": 1.3003, + "step": 3697 + }, + { + "epoch": 1.1014352463746533, + "grad_norm": 0.2165602147579193, + "learning_rate": 1.9663282495534247e-05, + "loss": 1.3017, + "step": 3698 + }, + { + "epoch": 1.1017330925743218, + "grad_norm": 0.19816917181015015, + "learning_rate": 1.9663034255614602e-05, + "loss": 1.3024, + "step": 3699 + }, + { + "epoch": 1.1020309387739906, + "grad_norm": 0.21041910350322723, + "learning_rate": 1.966278592579122e-05, + "loss": 1.302, + "step": 3700 + }, + { + "epoch": 1.1023287849736592, + "grad_norm": 0.20530866086483002, + "learning_rate": 1.966253750606641e-05, + "loss": 1.2768, + "step": 3701 + }, + { + "epoch": 1.102626631173328, + "grad_norm": 0.20916764438152313, + "learning_rate": 1.966228899644249e-05, + "loss": 1.2983, + "step": 3702 + }, + { + "epoch": 1.1029244773729965, + "grad_norm": 0.206373929977417, + "learning_rate": 1.966204039692176e-05, + "loss": 1.2997, + "step": 3703 + }, + { + "epoch": 1.1032223235726653, + "grad_norm": 0.1992659568786621, + "learning_rate": 1.966179170750654e-05, + "loss": 1.3076, + "step": 3704 + }, + { + "epoch": 1.1035201697723338, + "grad_norm": 0.20194512605667114, + "learning_rate": 1.9661542928199144e-05, + "loss": 1.2932, + "step": 3705 + }, + { + "epoch": 1.1038180159720024, + "grad_norm": 0.2105811983346939, + "learning_rate": 1.9661294059001884e-05, + "loss": 1.2847, + "step": 3706 + }, + { + "epoch": 1.1041158621716711, + "grad_norm": 0.21456459164619446, + "learning_rate": 1.966104509991708e-05, + "loss": 1.3056, + "step": 3707 + }, + { + "epoch": 1.1044137083713397, + "grad_norm": 0.2006983458995819, + "learning_rate": 1.9660796050947045e-05, + "loss": 1.2968, + "step": 3708 + }, + { + "epoch": 1.1047115545710084, + "grad_norm": 0.19508807361125946, + "learning_rate": 1.9660546912094095e-05, + "loss": 1.2826, + "step": 3709 + }, + { + "epoch": 1.105009400770677, + "grad_norm": 0.19879397749900818, + "learning_rate": 1.9660297683360548e-05, + "loss": 1.3001, + "step": 3710 + }, + { + "epoch": 1.1053072469703458, + "grad_norm": 0.21688780188560486, + "learning_rate": 1.9660048364748724e-05, + "loss": 1.3096, + "step": 3711 + }, + { + "epoch": 1.1056050931700143, + "grad_norm": 0.20888207852840424, + "learning_rate": 1.9659798956260948e-05, + "loss": 1.3002, + "step": 3712 + }, + { + "epoch": 1.1059029393696829, + "grad_norm": 0.20637017488479614, + "learning_rate": 1.965954945789953e-05, + "loss": 1.3093, + "step": 3713 + }, + { + "epoch": 1.1062007855693516, + "grad_norm": 0.20435267686843872, + "learning_rate": 1.96592998696668e-05, + "loss": 1.2938, + "step": 3714 + }, + { + "epoch": 1.1064986317690202, + "grad_norm": 0.21587617695331573, + "learning_rate": 1.9659050191565076e-05, + "loss": 1.2859, + "step": 3715 + }, + { + "epoch": 1.106796477968689, + "grad_norm": 0.20713822543621063, + "learning_rate": 1.9658800423596676e-05, + "loss": 1.2925, + "step": 3716 + }, + { + "epoch": 1.1070943241683575, + "grad_norm": 0.20413658022880554, + "learning_rate": 1.965855056576394e-05, + "loss": 1.2846, + "step": 3717 + }, + { + "epoch": 1.1073921703680263, + "grad_norm": 0.2183109074831009, + "learning_rate": 1.9658300618069175e-05, + "loss": 1.3003, + "step": 3718 + }, + { + "epoch": 1.1076900165676948, + "grad_norm": 0.2023807168006897, + "learning_rate": 1.9658050580514712e-05, + "loss": 1.303, + "step": 3719 + }, + { + "epoch": 1.1079878627673634, + "grad_norm": 0.20855534076690674, + "learning_rate": 1.9657800453102884e-05, + "loss": 1.3078, + "step": 3720 + }, + { + "epoch": 1.1082857089670322, + "grad_norm": 0.2180613875389099, + "learning_rate": 1.9657550235836012e-05, + "loss": 1.2914, + "step": 3721 + }, + { + "epoch": 1.1085835551667007, + "grad_norm": 0.20898592472076416, + "learning_rate": 1.9657299928716423e-05, + "loss": 1.2891, + "step": 3722 + }, + { + "epoch": 1.1088814013663695, + "grad_norm": 0.2195393294095993, + "learning_rate": 1.965704953174645e-05, + "loss": 1.2995, + "step": 3723 + }, + { + "epoch": 1.109179247566038, + "grad_norm": 0.2105409801006317, + "learning_rate": 1.965679904492842e-05, + "loss": 1.2744, + "step": 3724 + }, + { + "epoch": 1.1094770937657068, + "grad_norm": 0.20014241337776184, + "learning_rate": 1.9656548468264664e-05, + "loss": 1.3026, + "step": 3725 + }, + { + "epoch": 1.1097749399653754, + "grad_norm": 0.21112357079982758, + "learning_rate": 1.9656297801757514e-05, + "loss": 1.2834, + "step": 3726 + }, + { + "epoch": 1.1100727861650441, + "grad_norm": 0.2157319337129593, + "learning_rate": 1.9656047045409302e-05, + "loss": 1.287, + "step": 3727 + }, + { + "epoch": 1.1103706323647127, + "grad_norm": 0.20597907900810242, + "learning_rate": 1.9655796199222357e-05, + "loss": 1.3017, + "step": 3728 + }, + { + "epoch": 1.1106684785643812, + "grad_norm": 0.20393458008766174, + "learning_rate": 1.965554526319902e-05, + "loss": 1.2901, + "step": 3729 + }, + { + "epoch": 1.11096632476405, + "grad_norm": 0.2154955267906189, + "learning_rate": 1.9655294237341622e-05, + "loss": 1.2982, + "step": 3730 + }, + { + "epoch": 1.1112641709637185, + "grad_norm": 0.20532436668872833, + "learning_rate": 1.9655043121652496e-05, + "loss": 1.2765, + "step": 3731 + }, + { + "epoch": 1.1115620171633873, + "grad_norm": 0.20799566805362701, + "learning_rate": 1.9654791916133986e-05, + "loss": 1.2956, + "step": 3732 + }, + { + "epoch": 1.1118598633630559, + "grad_norm": 0.22477100789546967, + "learning_rate": 1.965454062078842e-05, + "loss": 1.2966, + "step": 3733 + }, + { + "epoch": 1.1121577095627246, + "grad_norm": 0.20537179708480835, + "learning_rate": 1.965428923561814e-05, + "loss": 1.3057, + "step": 3734 + }, + { + "epoch": 1.1124555557623932, + "grad_norm": 0.20808099210262299, + "learning_rate": 1.9654037760625486e-05, + "loss": 1.3127, + "step": 3735 + }, + { + "epoch": 1.1127534019620617, + "grad_norm": 0.20625001192092896, + "learning_rate": 1.9653786195812797e-05, + "loss": 1.2887, + "step": 3736 + }, + { + "epoch": 1.1130512481617305, + "grad_norm": 0.22057880461215973, + "learning_rate": 1.9653534541182412e-05, + "loss": 1.3152, + "step": 3737 + }, + { + "epoch": 1.113349094361399, + "grad_norm": 0.21256573498249054, + "learning_rate": 1.9653282796736677e-05, + "loss": 1.3075, + "step": 3738 + }, + { + "epoch": 1.1136469405610678, + "grad_norm": 0.21121853590011597, + "learning_rate": 1.965303096247793e-05, + "loss": 1.2952, + "step": 3739 + }, + { + "epoch": 1.1139447867607364, + "grad_norm": 0.20226754248142242, + "learning_rate": 1.965277903840851e-05, + "loss": 1.287, + "step": 3740 + }, + { + "epoch": 1.1142426329604052, + "grad_norm": 0.21369782090187073, + "learning_rate": 1.965252702453077e-05, + "loss": 1.3082, + "step": 3741 + }, + { + "epoch": 1.1145404791600737, + "grad_norm": 0.1950555294752121, + "learning_rate": 1.965227492084705e-05, + "loss": 1.2965, + "step": 3742 + }, + { + "epoch": 1.1148383253597425, + "grad_norm": 0.2038709968328476, + "learning_rate": 1.96520227273597e-05, + "loss": 1.2898, + "step": 3743 + }, + { + "epoch": 1.115136171559411, + "grad_norm": 0.2121773660182953, + "learning_rate": 1.9651770444071058e-05, + "loss": 1.3005, + "step": 3744 + }, + { + "epoch": 1.1154340177590796, + "grad_norm": 0.21432389318943024, + "learning_rate": 1.9651518070983474e-05, + "loss": 1.2952, + "step": 3745 + }, + { + "epoch": 1.1157318639587483, + "grad_norm": 0.20774976909160614, + "learning_rate": 1.96512656080993e-05, + "loss": 1.2983, + "step": 3746 + }, + { + "epoch": 1.116029710158417, + "grad_norm": 0.1962873339653015, + "learning_rate": 1.9651013055420882e-05, + "loss": 1.2897, + "step": 3747 + }, + { + "epoch": 1.1163275563580857, + "grad_norm": 0.1967422515153885, + "learning_rate": 1.965076041295057e-05, + "loss": 1.2972, + "step": 3748 + }, + { + "epoch": 1.1166254025577542, + "grad_norm": 0.20895633101463318, + "learning_rate": 1.9650507680690716e-05, + "loss": 1.2871, + "step": 3749 + }, + { + "epoch": 1.116923248757423, + "grad_norm": 0.24089834094047546, + "learning_rate": 1.965025485864367e-05, + "loss": 1.2985, + "step": 3750 + }, + { + "epoch": 1.1172210949570915, + "grad_norm": 0.21199411153793335, + "learning_rate": 1.9650001946811784e-05, + "loss": 1.3011, + "step": 3751 + }, + { + "epoch": 1.11751894115676, + "grad_norm": 0.21411864459514618, + "learning_rate": 1.9649748945197412e-05, + "loss": 1.2979, + "step": 3752 + }, + { + "epoch": 1.1178167873564289, + "grad_norm": 0.20356041193008423, + "learning_rate": 1.9649495853802907e-05, + "loss": 1.2948, + "step": 3753 + }, + { + "epoch": 1.1181146335560974, + "grad_norm": 0.20256462693214417, + "learning_rate": 1.9649242672630625e-05, + "loss": 1.2915, + "step": 3754 + }, + { + "epoch": 1.1184124797557662, + "grad_norm": 0.20421351492404938, + "learning_rate": 1.964898940168292e-05, + "loss": 1.2914, + "step": 3755 + }, + { + "epoch": 1.1187103259554347, + "grad_norm": 0.20901718735694885, + "learning_rate": 1.964873604096215e-05, + "loss": 1.2886, + "step": 3756 + }, + { + "epoch": 1.1190081721551035, + "grad_norm": 0.19340622425079346, + "learning_rate": 1.9648482590470666e-05, + "loss": 1.2966, + "step": 3757 + }, + { + "epoch": 1.119306018354772, + "grad_norm": 0.206883043050766, + "learning_rate": 1.9648229050210838e-05, + "loss": 1.308, + "step": 3758 + }, + { + "epoch": 1.1196038645544406, + "grad_norm": 0.20688460767269135, + "learning_rate": 1.9647975420185016e-05, + "loss": 1.3051, + "step": 3759 + }, + { + "epoch": 1.1199017107541094, + "grad_norm": 0.20681655406951904, + "learning_rate": 1.964772170039556e-05, + "loss": 1.3133, + "step": 3760 + }, + { + "epoch": 1.120199556953778, + "grad_norm": 0.20772404968738556, + "learning_rate": 1.9647467890844836e-05, + "loss": 1.301, + "step": 3761 + }, + { + "epoch": 1.1204974031534467, + "grad_norm": 0.20942550897598267, + "learning_rate": 1.9647213991535202e-05, + "loss": 1.2888, + "step": 3762 + }, + { + "epoch": 1.1207952493531153, + "grad_norm": 0.20836666226387024, + "learning_rate": 1.9646960002469017e-05, + "loss": 1.2976, + "step": 3763 + }, + { + "epoch": 1.121093095552784, + "grad_norm": 0.2175648808479309, + "learning_rate": 1.964670592364865e-05, + "loss": 1.2924, + "step": 3764 + }, + { + "epoch": 1.1213909417524526, + "grad_norm": 0.21817858517169952, + "learning_rate": 1.9646451755076464e-05, + "loss": 1.294, + "step": 3765 + }, + { + "epoch": 1.1216887879521211, + "grad_norm": 0.21252726018428802, + "learning_rate": 1.964619749675482e-05, + "loss": 1.2988, + "step": 3766 + }, + { + "epoch": 1.12198663415179, + "grad_norm": 0.23812422156333923, + "learning_rate": 1.9645943148686082e-05, + "loss": 1.2986, + "step": 3767 + }, + { + "epoch": 1.1222844803514584, + "grad_norm": 0.22017017006874084, + "learning_rate": 1.9645688710872622e-05, + "loss": 1.2837, + "step": 3768 + }, + { + "epoch": 1.1225823265511272, + "grad_norm": 0.21784719824790955, + "learning_rate": 1.9645434183316808e-05, + "loss": 1.3051, + "step": 3769 + }, + { + "epoch": 1.1228801727507958, + "grad_norm": 0.2230665236711502, + "learning_rate": 1.9645179566021007e-05, + "loss": 1.304, + "step": 3770 + }, + { + "epoch": 1.1231780189504645, + "grad_norm": 0.20696522295475006, + "learning_rate": 1.9644924858987582e-05, + "loss": 1.2892, + "step": 3771 + }, + { + "epoch": 1.123475865150133, + "grad_norm": 0.23003578186035156, + "learning_rate": 1.964467006221891e-05, + "loss": 1.3022, + "step": 3772 + }, + { + "epoch": 1.1237737113498016, + "grad_norm": 0.20720861852169037, + "learning_rate": 1.9644415175717356e-05, + "loss": 1.2933, + "step": 3773 + }, + { + "epoch": 1.1240715575494704, + "grad_norm": 0.20392398536205292, + "learning_rate": 1.9644160199485297e-05, + "loss": 1.2834, + "step": 3774 + }, + { + "epoch": 1.124369403749139, + "grad_norm": 0.20635534822940826, + "learning_rate": 1.9643905133525102e-05, + "loss": 1.2945, + "step": 3775 + }, + { + "epoch": 1.1246672499488077, + "grad_norm": 0.20737838745117188, + "learning_rate": 1.964364997783914e-05, + "loss": 1.2704, + "step": 3776 + }, + { + "epoch": 1.1249650961484763, + "grad_norm": 0.20279735326766968, + "learning_rate": 1.9643394732429795e-05, + "loss": 1.2932, + "step": 3777 + }, + { + "epoch": 1.125262942348145, + "grad_norm": 0.21164770424365997, + "learning_rate": 1.9643139397299437e-05, + "loss": 1.3012, + "step": 3778 + }, + { + "epoch": 1.1255607885478136, + "grad_norm": 0.2219165563583374, + "learning_rate": 1.9642883972450434e-05, + "loss": 1.295, + "step": 3779 + }, + { + "epoch": 1.1258586347474822, + "grad_norm": 0.2005193829536438, + "learning_rate": 1.9642628457885175e-05, + "loss": 1.2954, + "step": 3780 + }, + { + "epoch": 1.126156480947151, + "grad_norm": 0.2079988270998001, + "learning_rate": 1.964237285360603e-05, + "loss": 1.2879, + "step": 3781 + }, + { + "epoch": 1.1264543271468195, + "grad_norm": 0.20791326463222504, + "learning_rate": 1.964211715961538e-05, + "loss": 1.2914, + "step": 3782 + }, + { + "epoch": 1.1267521733464883, + "grad_norm": 0.20197267830371857, + "learning_rate": 1.96418613759156e-05, + "loss": 1.2985, + "step": 3783 + }, + { + "epoch": 1.1270500195461568, + "grad_norm": 0.2005607932806015, + "learning_rate": 1.9641605502509074e-05, + "loss": 1.3072, + "step": 3784 + }, + { + "epoch": 1.1273478657458256, + "grad_norm": 0.21449939906597137, + "learning_rate": 1.9641349539398182e-05, + "loss": 1.3081, + "step": 3785 + }, + { + "epoch": 1.1276457119454941, + "grad_norm": 0.20632421970367432, + "learning_rate": 1.96410934865853e-05, + "loss": 1.3078, + "step": 3786 + }, + { + "epoch": 1.1279435581451627, + "grad_norm": 0.21048493683338165, + "learning_rate": 1.9640837344072825e-05, + "loss": 1.28, + "step": 3787 + }, + { + "epoch": 1.1282414043448314, + "grad_norm": 0.2121082842350006, + "learning_rate": 1.964058111186312e-05, + "loss": 1.301, + "step": 3788 + }, + { + "epoch": 1.1285392505445, + "grad_norm": 0.19674839079380035, + "learning_rate": 1.964032478995858e-05, + "loss": 1.2883, + "step": 3789 + }, + { + "epoch": 1.1288370967441688, + "grad_norm": 0.2155827283859253, + "learning_rate": 1.9640068378361592e-05, + "loss": 1.2841, + "step": 3790 + }, + { + "epoch": 1.1291349429438373, + "grad_norm": 0.20801493525505066, + "learning_rate": 1.9639811877074537e-05, + "loss": 1.2914, + "step": 3791 + }, + { + "epoch": 1.129432789143506, + "grad_norm": 0.20542113482952118, + "learning_rate": 1.9639555286099802e-05, + "loss": 1.3002, + "step": 3792 + }, + { + "epoch": 1.1297306353431746, + "grad_norm": 0.201734721660614, + "learning_rate": 1.9639298605439775e-05, + "loss": 1.3046, + "step": 3793 + }, + { + "epoch": 1.1300284815428434, + "grad_norm": 0.20444509387016296, + "learning_rate": 1.9639041835096845e-05, + "loss": 1.2954, + "step": 3794 + }, + { + "epoch": 1.130326327742512, + "grad_norm": 0.20374253392219543, + "learning_rate": 1.96387849750734e-05, + "loss": 1.2746, + "step": 3795 + }, + { + "epoch": 1.1306241739421805, + "grad_norm": 0.21780647337436676, + "learning_rate": 1.963852802537183e-05, + "loss": 1.3071, + "step": 3796 + }, + { + "epoch": 1.1309220201418493, + "grad_norm": 0.20837095379829407, + "learning_rate": 1.9638270985994526e-05, + "loss": 1.2804, + "step": 3797 + }, + { + "epoch": 1.1312198663415178, + "grad_norm": 0.20445789396762848, + "learning_rate": 1.963801385694388e-05, + "loss": 1.3028, + "step": 3798 + }, + { + "epoch": 1.1315177125411866, + "grad_norm": 0.2184823602437973, + "learning_rate": 1.963775663822228e-05, + "loss": 1.3017, + "step": 3799 + }, + { + "epoch": 1.1318155587408552, + "grad_norm": 0.21296125650405884, + "learning_rate": 1.9637499329832123e-05, + "loss": 1.2998, + "step": 3800 + }, + { + "epoch": 1.132113404940524, + "grad_norm": 0.20897722244262695, + "learning_rate": 1.9637241931775803e-05, + "loss": 1.2891, + "step": 3801 + }, + { + "epoch": 1.1324112511401925, + "grad_norm": 0.20662549138069153, + "learning_rate": 1.9636984444055716e-05, + "loss": 1.2835, + "step": 3802 + }, + { + "epoch": 1.1327090973398612, + "grad_norm": 0.20672594010829926, + "learning_rate": 1.963672686667425e-05, + "loss": 1.2908, + "step": 3803 + }, + { + "epoch": 1.1330069435395298, + "grad_norm": 0.21389494836330414, + "learning_rate": 1.9636469199633813e-05, + "loss": 1.3081, + "step": 3804 + }, + { + "epoch": 1.1333047897391983, + "grad_norm": 0.20938587188720703, + "learning_rate": 1.9636211442936798e-05, + "loss": 1.2825, + "step": 3805 + }, + { + "epoch": 1.1336026359388671, + "grad_norm": 0.21103520691394806, + "learning_rate": 1.9635953596585597e-05, + "loss": 1.287, + "step": 3806 + }, + { + "epoch": 1.1339004821385357, + "grad_norm": 0.20821918547153473, + "learning_rate": 1.9635695660582617e-05, + "loss": 1.292, + "step": 3807 + }, + { + "epoch": 1.1341983283382044, + "grad_norm": 0.20691712200641632, + "learning_rate": 1.9635437634930252e-05, + "loss": 1.305, + "step": 3808 + }, + { + "epoch": 1.134496174537873, + "grad_norm": 0.19941256940364838, + "learning_rate": 1.9635179519630905e-05, + "loss": 1.2947, + "step": 3809 + }, + { + "epoch": 1.1347940207375418, + "grad_norm": 0.20479938387870789, + "learning_rate": 1.963492131468698e-05, + "loss": 1.2878, + "step": 3810 + }, + { + "epoch": 1.1350918669372103, + "grad_norm": 0.2046506404876709, + "learning_rate": 1.9634663020100877e-05, + "loss": 1.2918, + "step": 3811 + }, + { + "epoch": 1.1353897131368789, + "grad_norm": 0.21905964612960815, + "learning_rate": 1.9634404635874996e-05, + "loss": 1.3204, + "step": 3812 + }, + { + "epoch": 1.1356875593365476, + "grad_norm": 0.214660182595253, + "learning_rate": 1.963414616201175e-05, + "loss": 1.291, + "step": 3813 + }, + { + "epoch": 1.1359854055362162, + "grad_norm": 0.20889678597450256, + "learning_rate": 1.963388759851353e-05, + "loss": 1.3095, + "step": 3814 + }, + { + "epoch": 1.136283251735885, + "grad_norm": 0.21485598385334015, + "learning_rate": 1.9633628945382754e-05, + "loss": 1.3088, + "step": 3815 + }, + { + "epoch": 1.1365810979355535, + "grad_norm": 0.2200709581375122, + "learning_rate": 1.9633370202621823e-05, + "loss": 1.2874, + "step": 3816 + }, + { + "epoch": 1.1368789441352223, + "grad_norm": 0.22136938571929932, + "learning_rate": 1.9633111370233148e-05, + "loss": 1.288, + "step": 3817 + }, + { + "epoch": 1.1371767903348908, + "grad_norm": 0.20491695404052734, + "learning_rate": 1.963285244821913e-05, + "loss": 1.2945, + "step": 3818 + }, + { + "epoch": 1.1374746365345594, + "grad_norm": 0.20777413249015808, + "learning_rate": 1.9632593436582187e-05, + "loss": 1.2855, + "step": 3819 + }, + { + "epoch": 1.1377724827342282, + "grad_norm": 0.21913692355155945, + "learning_rate": 1.9632334335324723e-05, + "loss": 1.2853, + "step": 3820 + }, + { + "epoch": 1.1380703289338967, + "grad_norm": 0.19954827427864075, + "learning_rate": 1.9632075144449146e-05, + "loss": 1.2795, + "step": 3821 + }, + { + "epoch": 1.1383681751335655, + "grad_norm": 0.2131366729736328, + "learning_rate": 1.9631815863957873e-05, + "loss": 1.2922, + "step": 3822 + }, + { + "epoch": 1.138666021333234, + "grad_norm": 0.2245870679616928, + "learning_rate": 1.9631556493853317e-05, + "loss": 1.289, + "step": 3823 + }, + { + "epoch": 1.1389638675329028, + "grad_norm": 0.21591255068778992, + "learning_rate": 1.9631297034137886e-05, + "loss": 1.2894, + "step": 3824 + }, + { + "epoch": 1.1392617137325713, + "grad_norm": 0.20875537395477295, + "learning_rate": 1.9631037484814e-05, + "loss": 1.2887, + "step": 3825 + }, + { + "epoch": 1.13955955993224, + "grad_norm": 0.2085467278957367, + "learning_rate": 1.9630777845884068e-05, + "loss": 1.2922, + "step": 3826 + }, + { + "epoch": 1.1398574061319087, + "grad_norm": 0.20918689668178558, + "learning_rate": 1.963051811735051e-05, + "loss": 1.2929, + "step": 3827 + }, + { + "epoch": 1.1401552523315772, + "grad_norm": 0.22871707379817963, + "learning_rate": 1.963025829921574e-05, + "loss": 1.2919, + "step": 3828 + }, + { + "epoch": 1.140453098531246, + "grad_norm": 0.21244798600673676, + "learning_rate": 1.9629998391482177e-05, + "loss": 1.2905, + "step": 3829 + }, + { + "epoch": 1.1407509447309145, + "grad_norm": 0.2155030071735382, + "learning_rate": 1.9629738394152237e-05, + "loss": 1.2915, + "step": 3830 + }, + { + "epoch": 1.1410487909305833, + "grad_norm": 0.20495347678661346, + "learning_rate": 1.962947830722834e-05, + "loss": 1.283, + "step": 3831 + }, + { + "epoch": 1.1413466371302519, + "grad_norm": 0.23412112891674042, + "learning_rate": 1.9629218130712906e-05, + "loss": 1.2975, + "step": 3832 + }, + { + "epoch": 1.1416444833299204, + "grad_norm": 0.21597813069820404, + "learning_rate": 1.962895786460836e-05, + "loss": 1.2876, + "step": 3833 + }, + { + "epoch": 1.1419423295295892, + "grad_norm": 0.23608894646167755, + "learning_rate": 1.9628697508917113e-05, + "loss": 1.2987, + "step": 3834 + }, + { + "epoch": 1.1422401757292577, + "grad_norm": 0.20092350244522095, + "learning_rate": 1.9628437063641595e-05, + "loss": 1.2761, + "step": 3835 + }, + { + "epoch": 1.1425380219289265, + "grad_norm": 0.2102275788784027, + "learning_rate": 1.9628176528784228e-05, + "loss": 1.3009, + "step": 3836 + }, + { + "epoch": 1.142835868128595, + "grad_norm": 0.22043468058109283, + "learning_rate": 1.9627915904347435e-05, + "loss": 1.2758, + "step": 3837 + }, + { + "epoch": 1.1431337143282638, + "grad_norm": 0.21429385244846344, + "learning_rate": 1.9627655190333645e-05, + "loss": 1.2837, + "step": 3838 + }, + { + "epoch": 1.1434315605279324, + "grad_norm": 0.207056924700737, + "learning_rate": 1.9627394386745274e-05, + "loss": 1.2877, + "step": 3839 + }, + { + "epoch": 1.143729406727601, + "grad_norm": 0.21268786489963531, + "learning_rate": 1.962713349358476e-05, + "loss": 1.2794, + "step": 3840 + }, + { + "epoch": 1.1440272529272697, + "grad_norm": 0.20578525960445404, + "learning_rate": 1.9626872510854525e-05, + "loss": 1.3017, + "step": 3841 + }, + { + "epoch": 1.1443250991269382, + "grad_norm": 0.21620962023735046, + "learning_rate": 1.9626611438556997e-05, + "loss": 1.2905, + "step": 3842 + }, + { + "epoch": 1.144622945326607, + "grad_norm": 0.20550653338432312, + "learning_rate": 1.96263502766946e-05, + "loss": 1.3023, + "step": 3843 + }, + { + "epoch": 1.1449207915262756, + "grad_norm": 0.20768411457538605, + "learning_rate": 1.9626089025269773e-05, + "loss": 1.2964, + "step": 3844 + }, + { + "epoch": 1.1452186377259443, + "grad_norm": 0.2171637862920761, + "learning_rate": 1.9625827684284943e-05, + "loss": 1.3013, + "step": 3845 + }, + { + "epoch": 1.145516483925613, + "grad_norm": 0.2186325192451477, + "learning_rate": 1.962556625374254e-05, + "loss": 1.2831, + "step": 3846 + }, + { + "epoch": 1.1458143301252814, + "grad_norm": 0.2084307074546814, + "learning_rate": 1.9625304733644998e-05, + "loss": 1.2874, + "step": 3847 + }, + { + "epoch": 1.1461121763249502, + "grad_norm": 0.217898890376091, + "learning_rate": 1.9625043123994748e-05, + "loss": 1.2932, + "step": 3848 + }, + { + "epoch": 1.1464100225246188, + "grad_norm": 0.20947939157485962, + "learning_rate": 1.9624781424794226e-05, + "loss": 1.2984, + "step": 3849 + }, + { + "epoch": 1.1467078687242875, + "grad_norm": 0.21262046694755554, + "learning_rate": 1.9624519636045866e-05, + "loss": 1.2939, + "step": 3850 + }, + { + "epoch": 1.147005714923956, + "grad_norm": 0.22141948342323303, + "learning_rate": 1.9624257757752104e-05, + "loss": 1.2946, + "step": 3851 + }, + { + "epoch": 1.1473035611236249, + "grad_norm": 0.21174995601177216, + "learning_rate": 1.9623995789915374e-05, + "loss": 1.3007, + "step": 3852 + }, + { + "epoch": 1.1476014073232934, + "grad_norm": 0.21072399616241455, + "learning_rate": 1.9623733732538118e-05, + "loss": 1.2946, + "step": 3853 + }, + { + "epoch": 1.147899253522962, + "grad_norm": 0.2091841995716095, + "learning_rate": 1.9623471585622774e-05, + "loss": 1.3003, + "step": 3854 + }, + { + "epoch": 1.1481970997226307, + "grad_norm": 0.20846165716648102, + "learning_rate": 1.9623209349171775e-05, + "loss": 1.298, + "step": 3855 + }, + { + "epoch": 1.1484949459222993, + "grad_norm": 0.21168039739131927, + "learning_rate": 1.962294702318757e-05, + "loss": 1.279, + "step": 3856 + }, + { + "epoch": 1.148792792121968, + "grad_norm": 0.21227902173995972, + "learning_rate": 1.962268460767259e-05, + "loss": 1.3035, + "step": 3857 + }, + { + "epoch": 1.1490906383216366, + "grad_norm": 0.4189324378967285, + "learning_rate": 1.9622422102629284e-05, + "loss": 1.2847, + "step": 3858 + }, + { + "epoch": 1.1493884845213054, + "grad_norm": 0.20862701535224915, + "learning_rate": 1.9622159508060087e-05, + "loss": 1.2975, + "step": 3859 + }, + { + "epoch": 1.149686330720974, + "grad_norm": 0.21685558557510376, + "learning_rate": 1.962189682396745e-05, + "loss": 1.2963, + "step": 3860 + }, + { + "epoch": 1.1499841769206427, + "grad_norm": 0.20306533575057983, + "learning_rate": 1.9621634050353813e-05, + "loss": 1.2913, + "step": 3861 + }, + { + "epoch": 1.1502820231203112, + "grad_norm": 0.20939095318317413, + "learning_rate": 1.962137118722162e-05, + "loss": 1.2862, + "step": 3862 + }, + { + "epoch": 1.15057986931998, + "grad_norm": 0.20440231263637543, + "learning_rate": 1.9621108234573316e-05, + "loss": 1.2678, + "step": 3863 + }, + { + "epoch": 1.1508777155196486, + "grad_norm": 0.21635262668132782, + "learning_rate": 1.962084519241135e-05, + "loss": 1.298, + "step": 3864 + }, + { + "epoch": 1.1511755617193171, + "grad_norm": 0.21755145490169525, + "learning_rate": 1.9620582060738172e-05, + "loss": 1.3014, + "step": 3865 + }, + { + "epoch": 1.151473407918986, + "grad_norm": 0.19997747242450714, + "learning_rate": 1.9620318839556223e-05, + "loss": 1.293, + "step": 3866 + }, + { + "epoch": 1.1517712541186544, + "grad_norm": 0.20665189623832703, + "learning_rate": 1.9620055528867957e-05, + "loss": 1.2842, + "step": 3867 + }, + { + "epoch": 1.1520691003183232, + "grad_norm": 0.20914733409881592, + "learning_rate": 1.961979212867582e-05, + "loss": 1.2974, + "step": 3868 + }, + { + "epoch": 1.1523669465179918, + "grad_norm": 0.21082919836044312, + "learning_rate": 1.961952863898227e-05, + "loss": 1.2705, + "step": 3869 + }, + { + "epoch": 1.1526647927176605, + "grad_norm": 0.22354228794574738, + "learning_rate": 1.961926505978975e-05, + "loss": 1.3124, + "step": 3870 + }, + { + "epoch": 1.152962638917329, + "grad_norm": 0.21259479224681854, + "learning_rate": 1.9619001391100715e-05, + "loss": 1.2724, + "step": 3871 + }, + { + "epoch": 1.1532604851169976, + "grad_norm": 0.20475426316261292, + "learning_rate": 1.961873763291762e-05, + "loss": 1.2936, + "step": 3872 + }, + { + "epoch": 1.1535583313166664, + "grad_norm": 0.2088763266801834, + "learning_rate": 1.961847378524292e-05, + "loss": 1.301, + "step": 3873 + }, + { + "epoch": 1.153856177516335, + "grad_norm": 0.2091834992170334, + "learning_rate": 1.9618209848079066e-05, + "loss": 1.3026, + "step": 3874 + }, + { + "epoch": 1.1541540237160037, + "grad_norm": 0.21396473050117493, + "learning_rate": 1.9617945821428517e-05, + "loss": 1.2975, + "step": 3875 + }, + { + "epoch": 1.1544518699156723, + "grad_norm": 0.19916091859340668, + "learning_rate": 1.961768170529373e-05, + "loss": 1.281, + "step": 3876 + }, + { + "epoch": 1.154749716115341, + "grad_norm": 0.20929083228111267, + "learning_rate": 1.9617417499677152e-05, + "loss": 1.298, + "step": 3877 + }, + { + "epoch": 1.1550475623150096, + "grad_norm": 0.21480098366737366, + "learning_rate": 1.9617153204581256e-05, + "loss": 1.2833, + "step": 3878 + }, + { + "epoch": 1.1553454085146782, + "grad_norm": 0.2138931155204773, + "learning_rate": 1.9616888820008492e-05, + "loss": 1.2994, + "step": 3879 + }, + { + "epoch": 1.155643254714347, + "grad_norm": 0.21063470840454102, + "learning_rate": 1.9616624345961324e-05, + "loss": 1.292, + "step": 3880 + }, + { + "epoch": 1.1559411009140155, + "grad_norm": 0.21075375378131866, + "learning_rate": 1.961635978244221e-05, + "loss": 1.2809, + "step": 3881 + }, + { + "epoch": 1.1562389471136842, + "grad_norm": 0.2145451158285141, + "learning_rate": 1.961609512945361e-05, + "loss": 1.2942, + "step": 3882 + }, + { + "epoch": 1.1565367933133528, + "grad_norm": 0.21413937211036682, + "learning_rate": 1.961583038699799e-05, + "loss": 1.2944, + "step": 3883 + }, + { + "epoch": 1.1568346395130216, + "grad_norm": 0.21136625111103058, + "learning_rate": 1.9615565555077817e-05, + "loss": 1.2964, + "step": 3884 + }, + { + "epoch": 1.1571324857126901, + "grad_norm": 0.22657860815525055, + "learning_rate": 1.9615300633695545e-05, + "loss": 1.2982, + "step": 3885 + }, + { + "epoch": 1.1574303319123587, + "grad_norm": 0.21359318494796753, + "learning_rate": 1.9615035622853643e-05, + "loss": 1.3067, + "step": 3886 + }, + { + "epoch": 1.1577281781120274, + "grad_norm": 0.19199714064598083, + "learning_rate": 1.9614770522554576e-05, + "loss": 1.3028, + "step": 3887 + }, + { + "epoch": 1.158026024311696, + "grad_norm": 0.23190546035766602, + "learning_rate": 1.9614505332800814e-05, + "loss": 1.2931, + "step": 3888 + }, + { + "epoch": 1.1583238705113648, + "grad_norm": 0.2225271463394165, + "learning_rate": 1.961424005359482e-05, + "loss": 1.307, + "step": 3889 + }, + { + "epoch": 1.1586217167110333, + "grad_norm": 0.22097532451152802, + "learning_rate": 1.9613974684939062e-05, + "loss": 1.2974, + "step": 3890 + }, + { + "epoch": 1.158919562910702, + "grad_norm": 0.21925872564315796, + "learning_rate": 1.9613709226836016e-05, + "loss": 1.2807, + "step": 3891 + }, + { + "epoch": 1.1592174091103706, + "grad_norm": 0.20450474321842194, + "learning_rate": 1.9613443679288144e-05, + "loss": 1.3048, + "step": 3892 + }, + { + "epoch": 1.1595152553100392, + "grad_norm": 0.2073204219341278, + "learning_rate": 1.961317804229792e-05, + "loss": 1.2886, + "step": 3893 + }, + { + "epoch": 1.159813101509708, + "grad_norm": 0.19831877946853638, + "learning_rate": 1.9612912315867815e-05, + "loss": 1.3044, + "step": 3894 + }, + { + "epoch": 1.1601109477093765, + "grad_norm": 0.21263408660888672, + "learning_rate": 1.96126465000003e-05, + "loss": 1.2783, + "step": 3895 + }, + { + "epoch": 1.1604087939090453, + "grad_norm": 0.21071314811706543, + "learning_rate": 1.9612380594697852e-05, + "loss": 1.2862, + "step": 3896 + }, + { + "epoch": 1.1607066401087138, + "grad_norm": 0.2105439007282257, + "learning_rate": 1.961211459996294e-05, + "loss": 1.2792, + "step": 3897 + }, + { + "epoch": 1.1610044863083826, + "grad_norm": 0.2100057452917099, + "learning_rate": 1.961184851579804e-05, + "loss": 1.2865, + "step": 3898 + }, + { + "epoch": 1.1613023325080511, + "grad_norm": 0.20325833559036255, + "learning_rate": 1.961158234220563e-05, + "loss": 1.2874, + "step": 3899 + }, + { + "epoch": 1.1616001787077197, + "grad_norm": 0.19613565504550934, + "learning_rate": 1.9611316079188185e-05, + "loss": 1.2889, + "step": 3900 + }, + { + "epoch": 1.1618980249073885, + "grad_norm": 0.2009575217962265, + "learning_rate": 1.961104972674818e-05, + "loss": 1.3017, + "step": 3901 + }, + { + "epoch": 1.162195871107057, + "grad_norm": 0.19757536053657532, + "learning_rate": 1.96107832848881e-05, + "loss": 1.3061, + "step": 3902 + }, + { + "epoch": 1.1624937173067258, + "grad_norm": 0.2167089879512787, + "learning_rate": 1.9610516753610412e-05, + "loss": 1.2811, + "step": 3903 + }, + { + "epoch": 1.1627915635063943, + "grad_norm": 0.20637845993041992, + "learning_rate": 1.961025013291761e-05, + "loss": 1.2984, + "step": 3904 + }, + { + "epoch": 1.1630894097060631, + "grad_norm": 0.203271746635437, + "learning_rate": 1.9609983422812163e-05, + "loss": 1.2832, + "step": 3905 + }, + { + "epoch": 1.1633872559057317, + "grad_norm": 0.20609252154827118, + "learning_rate": 1.9609716623296563e-05, + "loss": 1.2721, + "step": 3906 + }, + { + "epoch": 1.1636851021054002, + "grad_norm": 0.2045542150735855, + "learning_rate": 1.9609449734373282e-05, + "loss": 1.2894, + "step": 3907 + }, + { + "epoch": 1.163982948305069, + "grad_norm": 0.20650260150432587, + "learning_rate": 1.960918275604481e-05, + "loss": 1.2896, + "step": 3908 + }, + { + "epoch": 1.1642807945047375, + "grad_norm": 0.19946768879890442, + "learning_rate": 1.9608915688313626e-05, + "loss": 1.303, + "step": 3909 + }, + { + "epoch": 1.1645786407044063, + "grad_norm": 0.2126103639602661, + "learning_rate": 1.9608648531182217e-05, + "loss": 1.2986, + "step": 3910 + }, + { + "epoch": 1.1648764869040749, + "grad_norm": 0.2057417929172516, + "learning_rate": 1.960838128465307e-05, + "loss": 1.2979, + "step": 3911 + }, + { + "epoch": 1.1651743331037436, + "grad_norm": 0.22044256329536438, + "learning_rate": 1.960811394872867e-05, + "loss": 1.3012, + "step": 3912 + }, + { + "epoch": 1.1654721793034122, + "grad_norm": 0.19745013117790222, + "learning_rate": 1.9607846523411506e-05, + "loss": 1.307, + "step": 3913 + }, + { + "epoch": 1.1657700255030807, + "grad_norm": 0.2026630938053131, + "learning_rate": 1.9607579008704064e-05, + "loss": 1.299, + "step": 3914 + }, + { + "epoch": 1.1660678717027495, + "grad_norm": 0.21625575423240662, + "learning_rate": 1.960731140460883e-05, + "loss": 1.2911, + "step": 3915 + }, + { + "epoch": 1.166365717902418, + "grad_norm": 0.2037813812494278, + "learning_rate": 1.96070437111283e-05, + "loss": 1.3124, + "step": 3916 + }, + { + "epoch": 1.1666635641020868, + "grad_norm": 0.22128376364707947, + "learning_rate": 1.9606775928264964e-05, + "loss": 1.2894, + "step": 3917 + }, + { + "epoch": 1.1669614103017554, + "grad_norm": 0.21458415687084198, + "learning_rate": 1.960650805602131e-05, + "loss": 1.2915, + "step": 3918 + }, + { + "epoch": 1.1672592565014241, + "grad_norm": 0.19575026631355286, + "learning_rate": 1.960624009439983e-05, + "loss": 1.2744, + "step": 3919 + }, + { + "epoch": 1.1675571027010927, + "grad_norm": 0.22207599878311157, + "learning_rate": 1.960597204340302e-05, + "loss": 1.2786, + "step": 3920 + }, + { + "epoch": 1.1678549489007612, + "grad_norm": 0.21496036648750305, + "learning_rate": 1.9605703903033374e-05, + "loss": 1.3091, + "step": 3921 + }, + { + "epoch": 1.16815279510043, + "grad_norm": 0.2081436961889267, + "learning_rate": 1.9605435673293384e-05, + "loss": 1.2839, + "step": 3922 + }, + { + "epoch": 1.1684506413000986, + "grad_norm": 0.20992983877658844, + "learning_rate": 1.9605167354185542e-05, + "loss": 1.3063, + "step": 3923 + }, + { + "epoch": 1.1687484874997673, + "grad_norm": 0.21274542808532715, + "learning_rate": 1.9604898945712357e-05, + "loss": 1.2795, + "step": 3924 + }, + { + "epoch": 1.169046333699436, + "grad_norm": 0.21605077385902405, + "learning_rate": 1.9604630447876315e-05, + "loss": 1.293, + "step": 3925 + }, + { + "epoch": 1.1693441798991047, + "grad_norm": 0.20542150735855103, + "learning_rate": 1.960436186067992e-05, + "loss": 1.2992, + "step": 3926 + }, + { + "epoch": 1.1696420260987732, + "grad_norm": 0.20835378766059875, + "learning_rate": 1.9604093184125666e-05, + "loss": 1.3003, + "step": 3927 + }, + { + "epoch": 1.169939872298442, + "grad_norm": 0.21384067833423615, + "learning_rate": 1.9603824418216052e-05, + "loss": 1.2728, + "step": 3928 + }, + { + "epoch": 1.1702377184981105, + "grad_norm": 0.2059001922607422, + "learning_rate": 1.9603555562953587e-05, + "loss": 1.289, + "step": 3929 + }, + { + "epoch": 1.1705355646977793, + "grad_norm": 0.20917369425296783, + "learning_rate": 1.9603286618340768e-05, + "loss": 1.2876, + "step": 3930 + }, + { + "epoch": 1.1708334108974479, + "grad_norm": 0.21927359700202942, + "learning_rate": 1.9603017584380094e-05, + "loss": 1.2995, + "step": 3931 + }, + { + "epoch": 1.1711312570971164, + "grad_norm": 0.20671169459819794, + "learning_rate": 1.960274846107407e-05, + "loss": 1.2934, + "step": 3932 + }, + { + "epoch": 1.1714291032967852, + "grad_norm": 0.21301668882369995, + "learning_rate": 1.96024792484252e-05, + "loss": 1.2877, + "step": 3933 + }, + { + "epoch": 1.1717269494964537, + "grad_norm": 0.2051594853401184, + "learning_rate": 1.960220994643599e-05, + "loss": 1.2832, + "step": 3934 + }, + { + "epoch": 1.1720247956961225, + "grad_norm": 0.21233688294887543, + "learning_rate": 1.9601940555108943e-05, + "loss": 1.2885, + "step": 3935 + }, + { + "epoch": 1.172322641895791, + "grad_norm": 0.21571063995361328, + "learning_rate": 1.9601671074446572e-05, + "loss": 1.3206, + "step": 3936 + }, + { + "epoch": 1.1726204880954598, + "grad_norm": 0.20769110321998596, + "learning_rate": 1.9601401504451374e-05, + "loss": 1.2917, + "step": 3937 + }, + { + "epoch": 1.1729183342951284, + "grad_norm": 0.21504393219947815, + "learning_rate": 1.9601131845125865e-05, + "loss": 1.3085, + "step": 3938 + }, + { + "epoch": 1.173216180494797, + "grad_norm": 0.2108887881040573, + "learning_rate": 1.960086209647255e-05, + "loss": 1.2993, + "step": 3939 + }, + { + "epoch": 1.1735140266944657, + "grad_norm": 0.2132851630449295, + "learning_rate": 1.960059225849394e-05, + "loss": 1.3069, + "step": 3940 + }, + { + "epoch": 1.1738118728941342, + "grad_norm": 0.20661744475364685, + "learning_rate": 1.960032233119255e-05, + "loss": 1.2708, + "step": 3941 + }, + { + "epoch": 1.174109719093803, + "grad_norm": 0.19309358298778534, + "learning_rate": 1.960005231457088e-05, + "loss": 1.2986, + "step": 3942 + }, + { + "epoch": 1.1744075652934716, + "grad_norm": 0.2165200263261795, + "learning_rate": 1.959978220863145e-05, + "loss": 1.2891, + "step": 3943 + }, + { + "epoch": 1.1747054114931403, + "grad_norm": 0.22003872692584991, + "learning_rate": 1.9599512013376775e-05, + "loss": 1.2956, + "step": 3944 + }, + { + "epoch": 1.175003257692809, + "grad_norm": 0.21869656443595886, + "learning_rate": 1.9599241728809363e-05, + "loss": 1.2977, + "step": 3945 + }, + { + "epoch": 1.1753011038924774, + "grad_norm": 0.20122909545898438, + "learning_rate": 1.9598971354931735e-05, + "loss": 1.3045, + "step": 3946 + }, + { + "epoch": 1.1755989500921462, + "grad_norm": 0.19450651109218597, + "learning_rate": 1.9598700891746403e-05, + "loss": 1.2806, + "step": 3947 + }, + { + "epoch": 1.1758967962918148, + "grad_norm": 0.23108182847499847, + "learning_rate": 1.959843033925588e-05, + "loss": 1.3044, + "step": 3948 + }, + { + "epoch": 1.1761946424914835, + "grad_norm": 0.21403539180755615, + "learning_rate": 1.959815969746269e-05, + "loss": 1.2963, + "step": 3949 + }, + { + "epoch": 1.176492488691152, + "grad_norm": 0.21135477721691132, + "learning_rate": 1.9597888966369347e-05, + "loss": 1.2954, + "step": 3950 + }, + { + "epoch": 1.1767903348908209, + "grad_norm": 0.21040703356266022, + "learning_rate": 1.959761814597837e-05, + "loss": 1.2879, + "step": 3951 + }, + { + "epoch": 1.1770881810904894, + "grad_norm": 0.21072392165660858, + "learning_rate": 1.959734723629228e-05, + "loss": 1.2762, + "step": 3952 + }, + { + "epoch": 1.177386027290158, + "grad_norm": 0.22450478374958038, + "learning_rate": 1.9597076237313594e-05, + "loss": 1.3059, + "step": 3953 + }, + { + "epoch": 1.1776838734898267, + "grad_norm": 0.23020806908607483, + "learning_rate": 1.9596805149044838e-05, + "loss": 1.2826, + "step": 3954 + }, + { + "epoch": 1.1779817196894953, + "grad_norm": 0.21168041229248047, + "learning_rate": 1.9596533971488533e-05, + "loss": 1.3038, + "step": 3955 + }, + { + "epoch": 1.178279565889164, + "grad_norm": 0.2160586565732956, + "learning_rate": 1.95962627046472e-05, + "loss": 1.2917, + "step": 3956 + }, + { + "epoch": 1.1785774120888326, + "grad_norm": 0.2052982598543167, + "learning_rate": 1.9595991348523363e-05, + "loss": 1.2921, + "step": 3957 + }, + { + "epoch": 1.1788752582885014, + "grad_norm": 0.21587428450584412, + "learning_rate": 1.9595719903119554e-05, + "loss": 1.3024, + "step": 3958 + }, + { + "epoch": 1.17917310448817, + "grad_norm": 0.22295457124710083, + "learning_rate": 1.9595448368438285e-05, + "loss": 1.293, + "step": 3959 + }, + { + "epoch": 1.1794709506878385, + "grad_norm": 0.20518071949481964, + "learning_rate": 1.9595176744482095e-05, + "loss": 1.2778, + "step": 3960 + }, + { + "epoch": 1.1797687968875072, + "grad_norm": 0.21127352118492126, + "learning_rate": 1.95949050312535e-05, + "loss": 1.2959, + "step": 3961 + }, + { + "epoch": 1.1800666430871758, + "grad_norm": 0.21096312999725342, + "learning_rate": 1.9594633228755038e-05, + "loss": 1.2829, + "step": 3962 + }, + { + "epoch": 1.1803644892868446, + "grad_norm": 0.21224980056285858, + "learning_rate": 1.9594361336989232e-05, + "loss": 1.3133, + "step": 3963 + }, + { + "epoch": 1.1806623354865131, + "grad_norm": 0.22840841114521027, + "learning_rate": 1.9594089355958612e-05, + "loss": 1.3053, + "step": 3964 + }, + { + "epoch": 1.1809601816861819, + "grad_norm": 0.21935367584228516, + "learning_rate": 1.9593817285665712e-05, + "loss": 1.2869, + "step": 3965 + }, + { + "epoch": 1.1812580278858504, + "grad_norm": 0.21192151308059692, + "learning_rate": 1.9593545126113063e-05, + "loss": 1.2963, + "step": 3966 + }, + { + "epoch": 1.181555874085519, + "grad_norm": 0.2130948156118393, + "learning_rate": 1.9593272877303192e-05, + "loss": 1.3048, + "step": 3967 + }, + { + "epoch": 1.1818537202851878, + "grad_norm": 0.21849285066127777, + "learning_rate": 1.9593000539238637e-05, + "loss": 1.299, + "step": 3968 + }, + { + "epoch": 1.1821515664848563, + "grad_norm": 0.22445747256278992, + "learning_rate": 1.9592728111921926e-05, + "loss": 1.2823, + "step": 3969 + }, + { + "epoch": 1.182449412684525, + "grad_norm": 0.21361754834651947, + "learning_rate": 1.95924555953556e-05, + "loss": 1.2737, + "step": 3970 + }, + { + "epoch": 1.1827472588841936, + "grad_norm": 0.21532507240772247, + "learning_rate": 1.959218298954219e-05, + "loss": 1.3136, + "step": 3971 + }, + { + "epoch": 1.1830451050838624, + "grad_norm": 0.2088194340467453, + "learning_rate": 1.959191029448424e-05, + "loss": 1.2899, + "step": 3972 + }, + { + "epoch": 1.183342951283531, + "grad_norm": 0.2056037336587906, + "learning_rate": 1.9591637510184277e-05, + "loss": 1.2808, + "step": 3973 + }, + { + "epoch": 1.1836407974831995, + "grad_norm": 0.20478075742721558, + "learning_rate": 1.9591364636644842e-05, + "loss": 1.2664, + "step": 3974 + }, + { + "epoch": 1.1839386436828683, + "grad_norm": 0.2041953206062317, + "learning_rate": 1.9591091673868477e-05, + "loss": 1.2755, + "step": 3975 + }, + { + "epoch": 1.1842364898825368, + "grad_norm": 0.2047312706708908, + "learning_rate": 1.959081862185772e-05, + "loss": 1.2782, + "step": 3976 + }, + { + "epoch": 1.1845343360822056, + "grad_norm": 0.21848949790000916, + "learning_rate": 1.959054548061511e-05, + "loss": 1.2986, + "step": 3977 + }, + { + "epoch": 1.1848321822818741, + "grad_norm": 0.21306787431240082, + "learning_rate": 1.9590272250143193e-05, + "loss": 1.2924, + "step": 3978 + }, + { + "epoch": 1.185130028481543, + "grad_norm": 0.21525615453720093, + "learning_rate": 1.9589998930444508e-05, + "loss": 1.2936, + "step": 3979 + }, + { + "epoch": 1.1854278746812115, + "grad_norm": 0.21527034044265747, + "learning_rate": 1.9589725521521596e-05, + "loss": 1.2891, + "step": 3980 + }, + { + "epoch": 1.18572572088088, + "grad_norm": 0.2184000164270401, + "learning_rate": 1.9589452023377e-05, + "loss": 1.2918, + "step": 3981 + }, + { + "epoch": 1.1860235670805488, + "grad_norm": 0.22402919828891754, + "learning_rate": 1.9589178436013268e-05, + "loss": 1.2926, + "step": 3982 + }, + { + "epoch": 1.1863214132802173, + "grad_norm": 0.21852335333824158, + "learning_rate": 1.9588904759432945e-05, + "loss": 1.3042, + "step": 3983 + }, + { + "epoch": 1.1866192594798861, + "grad_norm": 0.21109206974506378, + "learning_rate": 1.9588630993638575e-05, + "loss": 1.2826, + "step": 3984 + }, + { + "epoch": 1.1869171056795547, + "grad_norm": 0.20981113612651825, + "learning_rate": 1.9588357138632706e-05, + "loss": 1.3027, + "step": 3985 + }, + { + "epoch": 1.1872149518792234, + "grad_norm": 0.22667832672595978, + "learning_rate": 1.958808319441789e-05, + "loss": 1.3037, + "step": 3986 + }, + { + "epoch": 1.187512798078892, + "grad_norm": 0.2064162641763687, + "learning_rate": 1.958780916099667e-05, + "loss": 1.2846, + "step": 3987 + }, + { + "epoch": 1.1878106442785605, + "grad_norm": 0.2067207247018814, + "learning_rate": 1.95875350383716e-05, + "loss": 1.2758, + "step": 3988 + }, + { + "epoch": 1.1881084904782293, + "grad_norm": 0.21313981711864471, + "learning_rate": 1.9587260826545225e-05, + "loss": 1.2912, + "step": 3989 + }, + { + "epoch": 1.1884063366778979, + "grad_norm": 0.21533231437206268, + "learning_rate": 1.9586986525520104e-05, + "loss": 1.3031, + "step": 3990 + }, + { + "epoch": 1.1887041828775666, + "grad_norm": 0.2270992547273636, + "learning_rate": 1.9586712135298778e-05, + "loss": 1.2797, + "step": 3991 + }, + { + "epoch": 1.1890020290772352, + "grad_norm": 0.21351057291030884, + "learning_rate": 1.9586437655883812e-05, + "loss": 1.2974, + "step": 3992 + }, + { + "epoch": 1.189299875276904, + "grad_norm": 0.20550043880939484, + "learning_rate": 1.958616308727775e-05, + "loss": 1.3004, + "step": 3993 + }, + { + "epoch": 1.1895977214765725, + "grad_norm": 0.20054568350315094, + "learning_rate": 1.9585888429483155e-05, + "loss": 1.2796, + "step": 3994 + }, + { + "epoch": 1.1898955676762413, + "grad_norm": 0.2166435420513153, + "learning_rate": 1.9585613682502574e-05, + "loss": 1.2906, + "step": 3995 + }, + { + "epoch": 1.1901934138759098, + "grad_norm": 0.20004740357398987, + "learning_rate": 1.9585338846338574e-05, + "loss": 1.2846, + "step": 3996 + }, + { + "epoch": 1.1904912600755786, + "grad_norm": 0.20989029109477997, + "learning_rate": 1.9585063920993698e-05, + "loss": 1.2887, + "step": 3997 + }, + { + "epoch": 1.1907891062752471, + "grad_norm": 0.21801543235778809, + "learning_rate": 1.9584788906470516e-05, + "loss": 1.2885, + "step": 3998 + }, + { + "epoch": 1.1910869524749157, + "grad_norm": 0.20614711940288544, + "learning_rate": 1.958451380277158e-05, + "loss": 1.2954, + "step": 3999 + }, + { + "epoch": 1.1913847986745845, + "grad_norm": 0.2140260934829712, + "learning_rate": 1.9584238609899452e-05, + "loss": 1.2771, + "step": 4000 + }, + { + "epoch": 1.1913847986745845, + "eval_loss": 1.355595588684082, + "eval_runtime": 20.1193, + "eval_samples_per_second": 86.186, + "eval_steps_per_second": 5.418, + "step": 4000 + }, + { + "epoch": 1.191682644874253, + "grad_norm": 0.19806234538555145, + "learning_rate": 1.9583963327856696e-05, + "loss": 1.2977, + "step": 4001 + }, + { + "epoch": 1.1919804910739218, + "grad_norm": 0.21819539368152618, + "learning_rate": 1.958368795664586e-05, + "loss": 1.3065, + "step": 4002 + }, + { + "epoch": 1.1922783372735903, + "grad_norm": 0.2082003653049469, + "learning_rate": 1.958341249626952e-05, + "loss": 1.3048, + "step": 4003 + }, + { + "epoch": 1.192576183473259, + "grad_norm": 0.22938930988311768, + "learning_rate": 1.9583136946730237e-05, + "loss": 1.3025, + "step": 4004 + }, + { + "epoch": 1.1928740296729277, + "grad_norm": 0.2137821465730667, + "learning_rate": 1.9582861308030567e-05, + "loss": 1.293, + "step": 4005 + }, + { + "epoch": 1.1931718758725962, + "grad_norm": 0.21068082749843597, + "learning_rate": 1.9582585580173083e-05, + "loss": 1.2854, + "step": 4006 + }, + { + "epoch": 1.193469722072265, + "grad_norm": 0.21020939946174622, + "learning_rate": 1.9582309763160343e-05, + "loss": 1.2937, + "step": 4007 + }, + { + "epoch": 1.1937675682719335, + "grad_norm": 0.21396276354789734, + "learning_rate": 1.958203385699492e-05, + "loss": 1.2914, + "step": 4008 + }, + { + "epoch": 1.1940654144716023, + "grad_norm": 0.22837236523628235, + "learning_rate": 1.9581757861679372e-05, + "loss": 1.287, + "step": 4009 + }, + { + "epoch": 1.1943632606712709, + "grad_norm": 0.21628418564796448, + "learning_rate": 1.9581481777216277e-05, + "loss": 1.3058, + "step": 4010 + }, + { + "epoch": 1.1946611068709396, + "grad_norm": 0.22252708673477173, + "learning_rate": 1.95812056036082e-05, + "loss": 1.2798, + "step": 4011 + }, + { + "epoch": 1.1949589530706082, + "grad_norm": 0.2062803953886032, + "learning_rate": 1.9580929340857707e-05, + "loss": 1.3059, + "step": 4012 + }, + { + "epoch": 1.1952567992702767, + "grad_norm": 0.2138001173734665, + "learning_rate": 1.958065298896737e-05, + "loss": 1.2957, + "step": 4013 + }, + { + "epoch": 1.1955546454699455, + "grad_norm": 0.21340374648571014, + "learning_rate": 1.9580376547939763e-05, + "loss": 1.3015, + "step": 4014 + }, + { + "epoch": 1.195852491669614, + "grad_norm": 0.20899172127246857, + "learning_rate": 1.9580100017777455e-05, + "loss": 1.2774, + "step": 4015 + }, + { + "epoch": 1.1961503378692828, + "grad_norm": 0.21481002867221832, + "learning_rate": 1.957982339848302e-05, + "loss": 1.2859, + "step": 4016 + }, + { + "epoch": 1.1964481840689514, + "grad_norm": 0.20328159630298615, + "learning_rate": 1.9579546690059033e-05, + "loss": 1.2875, + "step": 4017 + }, + { + "epoch": 1.1967460302686201, + "grad_norm": 0.22197499871253967, + "learning_rate": 1.9579269892508065e-05, + "loss": 1.2953, + "step": 4018 + }, + { + "epoch": 1.1970438764682887, + "grad_norm": 0.2199348658323288, + "learning_rate": 1.9578993005832697e-05, + "loss": 1.301, + "step": 4019 + }, + { + "epoch": 1.1973417226679572, + "grad_norm": 0.22500820457935333, + "learning_rate": 1.9578716030035497e-05, + "loss": 1.2943, + "step": 4020 + }, + { + "epoch": 1.197639568867626, + "grad_norm": 0.2298382967710495, + "learning_rate": 1.957843896511905e-05, + "loss": 1.2977, + "step": 4021 + }, + { + "epoch": 1.1979374150672946, + "grad_norm": 0.22839154303073883, + "learning_rate": 1.9578161811085924e-05, + "loss": 1.2899, + "step": 4022 + }, + { + "epoch": 1.1982352612669633, + "grad_norm": 0.2266552597284317, + "learning_rate": 1.957788456793871e-05, + "loss": 1.283, + "step": 4023 + }, + { + "epoch": 1.1985331074666319, + "grad_norm": 0.21820247173309326, + "learning_rate": 1.957760723567998e-05, + "loss": 1.2815, + "step": 4024 + }, + { + "epoch": 1.1988309536663007, + "grad_norm": 0.2251671552658081, + "learning_rate": 1.9577329814312314e-05, + "loss": 1.2832, + "step": 4025 + }, + { + "epoch": 1.1991287998659692, + "grad_norm": 0.22719109058380127, + "learning_rate": 1.9577052303838292e-05, + "loss": 1.2841, + "step": 4026 + }, + { + "epoch": 1.1994266460656378, + "grad_norm": 0.2208300679922104, + "learning_rate": 1.9576774704260503e-05, + "loss": 1.2923, + "step": 4027 + }, + { + "epoch": 1.1997244922653065, + "grad_norm": 0.23036691546440125, + "learning_rate": 1.9576497015581523e-05, + "loss": 1.2983, + "step": 4028 + }, + { + "epoch": 1.200022338464975, + "grad_norm": 0.20784254372119904, + "learning_rate": 1.957621923780394e-05, + "loss": 1.2892, + "step": 4029 + }, + { + "epoch": 1.2003201846646439, + "grad_norm": 0.22049713134765625, + "learning_rate": 1.9575941370930333e-05, + "loss": 1.3079, + "step": 4030 + }, + { + "epoch": 1.2006180308643124, + "grad_norm": 0.22592094540596008, + "learning_rate": 1.957566341496329e-05, + "loss": 1.2937, + "step": 4031 + }, + { + "epoch": 1.2009158770639812, + "grad_norm": 0.22759899497032166, + "learning_rate": 1.95753853699054e-05, + "loss": 1.2873, + "step": 4032 + }, + { + "epoch": 1.2012137232636497, + "grad_norm": 0.21229848265647888, + "learning_rate": 1.9575107235759248e-05, + "loss": 1.2747, + "step": 4033 + }, + { + "epoch": 1.2015115694633183, + "grad_norm": 0.1977112889289856, + "learning_rate": 1.957482901252742e-05, + "loss": 1.2851, + "step": 4034 + }, + { + "epoch": 1.201809415662987, + "grad_norm": 0.2177518755197525, + "learning_rate": 1.9574550700212505e-05, + "loss": 1.2916, + "step": 4035 + }, + { + "epoch": 1.2021072618626556, + "grad_norm": 0.2394571155309677, + "learning_rate": 1.9574272298817093e-05, + "loss": 1.2904, + "step": 4036 + }, + { + "epoch": 1.2024051080623244, + "grad_norm": 0.21085403859615326, + "learning_rate": 1.9573993808343776e-05, + "loss": 1.3214, + "step": 4037 + }, + { + "epoch": 1.202702954261993, + "grad_norm": 0.20579291880130768, + "learning_rate": 1.9573715228795142e-05, + "loss": 1.2918, + "step": 4038 + }, + { + "epoch": 1.2030008004616617, + "grad_norm": 0.22272194921970367, + "learning_rate": 1.9573436560173784e-05, + "loss": 1.307, + "step": 4039 + }, + { + "epoch": 1.2032986466613302, + "grad_norm": 0.22960400581359863, + "learning_rate": 1.9573157802482294e-05, + "loss": 1.2914, + "step": 4040 + }, + { + "epoch": 1.2035964928609988, + "grad_norm": 0.21425718069076538, + "learning_rate": 1.9572878955723266e-05, + "loss": 1.2935, + "step": 4041 + }, + { + "epoch": 1.2038943390606676, + "grad_norm": 0.22439956665039062, + "learning_rate": 1.95726000198993e-05, + "loss": 1.2716, + "step": 4042 + }, + { + "epoch": 1.2041921852603361, + "grad_norm": 0.21399863064289093, + "learning_rate": 1.957232099501298e-05, + "loss": 1.2917, + "step": 4043 + }, + { + "epoch": 1.2044900314600049, + "grad_norm": 0.22308999300003052, + "learning_rate": 1.957204188106691e-05, + "loss": 1.301, + "step": 4044 + }, + { + "epoch": 1.2047878776596734, + "grad_norm": 0.23780715465545654, + "learning_rate": 1.9571762678063685e-05, + "loss": 1.2773, + "step": 4045 + }, + { + "epoch": 1.2050857238593422, + "grad_norm": 0.2037344127893448, + "learning_rate": 1.9571483386005905e-05, + "loss": 1.3037, + "step": 4046 + }, + { + "epoch": 1.2053835700590108, + "grad_norm": 0.20831087231636047, + "learning_rate": 1.957120400489616e-05, + "loss": 1.2815, + "step": 4047 + }, + { + "epoch": 1.2056814162586793, + "grad_norm": 0.2218935489654541, + "learning_rate": 1.957092453473706e-05, + "loss": 1.2883, + "step": 4048 + }, + { + "epoch": 1.205979262458348, + "grad_norm": 0.22106003761291504, + "learning_rate": 1.9570644975531202e-05, + "loss": 1.312, + "step": 4049 + }, + { + "epoch": 1.2062771086580166, + "grad_norm": 0.20549699664115906, + "learning_rate": 1.9570365327281184e-05, + "loss": 1.2767, + "step": 4050 + }, + { + "epoch": 1.2065749548576854, + "grad_norm": 0.20258985459804535, + "learning_rate": 1.9570085589989605e-05, + "loss": 1.2831, + "step": 4051 + }, + { + "epoch": 1.206872801057354, + "grad_norm": 0.21634162962436676, + "learning_rate": 1.9569805763659075e-05, + "loss": 1.2911, + "step": 4052 + }, + { + "epoch": 1.2071706472570227, + "grad_norm": 0.22728785872459412, + "learning_rate": 1.9569525848292192e-05, + "loss": 1.2984, + "step": 4053 + }, + { + "epoch": 1.2074684934566913, + "grad_norm": 0.23223990201950073, + "learning_rate": 1.9569245843891565e-05, + "loss": 1.2949, + "step": 4054 + }, + { + "epoch": 1.20776633965636, + "grad_norm": 0.22025689482688904, + "learning_rate": 1.95689657504598e-05, + "loss": 1.293, + "step": 4055 + }, + { + "epoch": 1.2080641858560286, + "grad_norm": 0.22677235305309296, + "learning_rate": 1.9568685567999495e-05, + "loss": 1.2875, + "step": 4056 + }, + { + "epoch": 1.2083620320556971, + "grad_norm": 0.22293587028980255, + "learning_rate": 1.9568405296513262e-05, + "loss": 1.2921, + "step": 4057 + }, + { + "epoch": 1.208659878255366, + "grad_norm": 0.22843754291534424, + "learning_rate": 1.956812493600371e-05, + "loss": 1.2974, + "step": 4058 + }, + { + "epoch": 1.2089577244550345, + "grad_norm": 0.2277127504348755, + "learning_rate": 1.9567844486473447e-05, + "loss": 1.2793, + "step": 4059 + }, + { + "epoch": 1.2092555706547032, + "grad_norm": 0.20787858963012695, + "learning_rate": 1.9567563947925075e-05, + "loss": 1.2745, + "step": 4060 + }, + { + "epoch": 1.2095534168543718, + "grad_norm": 0.20440879464149475, + "learning_rate": 1.956728332036122e-05, + "loss": 1.2835, + "step": 4061 + }, + { + "epoch": 1.2098512630540406, + "grad_norm": 0.21900875866413116, + "learning_rate": 1.956700260378447e-05, + "loss": 1.3026, + "step": 4062 + }, + { + "epoch": 1.210149109253709, + "grad_norm": 0.22136928141117096, + "learning_rate": 1.956672179819746e-05, + "loss": 1.2818, + "step": 4063 + }, + { + "epoch": 1.2104469554533779, + "grad_norm": 0.2110716998577118, + "learning_rate": 1.9566440903602787e-05, + "loss": 1.3016, + "step": 4064 + }, + { + "epoch": 1.2107448016530464, + "grad_norm": 0.21238602697849274, + "learning_rate": 1.956615992000307e-05, + "loss": 1.2836, + "step": 4065 + }, + { + "epoch": 1.211042647852715, + "grad_norm": 0.21743468940258026, + "learning_rate": 1.9565878847400924e-05, + "loss": 1.2932, + "step": 4066 + }, + { + "epoch": 1.2113404940523838, + "grad_norm": 0.22191287577152252, + "learning_rate": 1.9565597685798965e-05, + "loss": 1.287, + "step": 4067 + }, + { + "epoch": 1.2116383402520523, + "grad_norm": 0.23670993745326996, + "learning_rate": 1.9565316435199807e-05, + "loss": 1.2826, + "step": 4068 + }, + { + "epoch": 1.211936186451721, + "grad_norm": 0.21914072334766388, + "learning_rate": 1.956503509560606e-05, + "loss": 1.276, + "step": 4069 + }, + { + "epoch": 1.2122340326513896, + "grad_norm": 0.22580796480178833, + "learning_rate": 1.9564753667020354e-05, + "loss": 1.2866, + "step": 4070 + }, + { + "epoch": 1.2125318788510584, + "grad_norm": 0.24082769453525543, + "learning_rate": 1.95644721494453e-05, + "loss": 1.2804, + "step": 4071 + }, + { + "epoch": 1.212829725050727, + "grad_norm": 0.21872304379940033, + "learning_rate": 1.9564190542883522e-05, + "loss": 1.3067, + "step": 4072 + }, + { + "epoch": 1.2131275712503955, + "grad_norm": 0.2314034253358841, + "learning_rate": 1.956390884733763e-05, + "loss": 1.2861, + "step": 4073 + }, + { + "epoch": 1.2134254174500643, + "grad_norm": 0.212004154920578, + "learning_rate": 1.956362706281026e-05, + "loss": 1.2795, + "step": 4074 + }, + { + "epoch": 1.2137232636497328, + "grad_norm": 0.2215016484260559, + "learning_rate": 1.9563345189304016e-05, + "loss": 1.289, + "step": 4075 + }, + { + "epoch": 1.2140211098494016, + "grad_norm": 0.21931003034114838, + "learning_rate": 1.9563063226821533e-05, + "loss": 1.3087, + "step": 4076 + }, + { + "epoch": 1.2143189560490701, + "grad_norm": 0.22194616496562958, + "learning_rate": 1.9562781175365436e-05, + "loss": 1.3011, + "step": 4077 + }, + { + "epoch": 1.214616802248739, + "grad_norm": 0.2346036732196808, + "learning_rate": 1.956249903493834e-05, + "loss": 1.2898, + "step": 4078 + }, + { + "epoch": 1.2149146484484075, + "grad_norm": 0.20830166339874268, + "learning_rate": 1.9562216805542876e-05, + "loss": 1.2936, + "step": 4079 + }, + { + "epoch": 1.215212494648076, + "grad_norm": 0.21001702547073364, + "learning_rate": 1.9561934487181665e-05, + "loss": 1.2792, + "step": 4080 + }, + { + "epoch": 1.2155103408477448, + "grad_norm": 0.22858309745788574, + "learning_rate": 1.956165207985734e-05, + "loss": 1.3071, + "step": 4081 + }, + { + "epoch": 1.2158081870474133, + "grad_norm": 0.237175852060318, + "learning_rate": 1.956136958357252e-05, + "loss": 1.2896, + "step": 4082 + }, + { + "epoch": 1.216106033247082, + "grad_norm": 0.21885618567466736, + "learning_rate": 1.9561086998329847e-05, + "loss": 1.2969, + "step": 4083 + }, + { + "epoch": 1.2164038794467507, + "grad_norm": 0.21815459430217743, + "learning_rate": 1.9560804324131938e-05, + "loss": 1.3052, + "step": 4084 + }, + { + "epoch": 1.2167017256464194, + "grad_norm": 0.22342005372047424, + "learning_rate": 1.9560521560981428e-05, + "loss": 1.288, + "step": 4085 + }, + { + "epoch": 1.216999571846088, + "grad_norm": 0.25384339690208435, + "learning_rate": 1.9560238708880942e-05, + "loss": 1.2987, + "step": 4086 + }, + { + "epoch": 1.2172974180457565, + "grad_norm": 0.2196073979139328, + "learning_rate": 1.955995576783312e-05, + "loss": 1.2933, + "step": 4087 + }, + { + "epoch": 1.2175952642454253, + "grad_norm": 0.22214414179325104, + "learning_rate": 1.955967273784059e-05, + "loss": 1.298, + "step": 4088 + }, + { + "epoch": 1.2178931104450939, + "grad_norm": 0.21233679354190826, + "learning_rate": 1.9559389618905986e-05, + "loss": 1.2773, + "step": 4089 + }, + { + "epoch": 1.2181909566447626, + "grad_norm": 0.20780721306800842, + "learning_rate": 1.955910641103194e-05, + "loss": 1.2952, + "step": 4090 + }, + { + "epoch": 1.2184888028444312, + "grad_norm": 0.20027214288711548, + "learning_rate": 1.955882311422109e-05, + "loss": 1.2931, + "step": 4091 + }, + { + "epoch": 1.2187866490441, + "grad_norm": 0.2059791088104248, + "learning_rate": 1.9558539728476072e-05, + "loss": 1.2821, + "step": 4092 + }, + { + "epoch": 1.2190844952437685, + "grad_norm": 0.20750513672828674, + "learning_rate": 1.9558256253799523e-05, + "loss": 1.2948, + "step": 4093 + }, + { + "epoch": 1.219382341443437, + "grad_norm": 0.20728327333927155, + "learning_rate": 1.9557972690194075e-05, + "loss": 1.2817, + "step": 4094 + }, + { + "epoch": 1.2196801876431058, + "grad_norm": 0.21938657760620117, + "learning_rate": 1.9557689037662372e-05, + "loss": 1.2947, + "step": 4095 + }, + { + "epoch": 1.2199780338427744, + "grad_norm": 0.21456745266914368, + "learning_rate": 1.955740529620705e-05, + "loss": 1.2688, + "step": 4096 + }, + { + "epoch": 1.2202758800424431, + "grad_norm": 0.20828519761562347, + "learning_rate": 1.9557121465830747e-05, + "loss": 1.2797, + "step": 4097 + }, + { + "epoch": 1.2205737262421117, + "grad_norm": 0.20779909193515778, + "learning_rate": 1.955683754653611e-05, + "loss": 1.3035, + "step": 4098 + }, + { + "epoch": 1.2208715724417805, + "grad_norm": 0.2051311582326889, + "learning_rate": 1.955655353832578e-05, + "loss": 1.2778, + "step": 4099 + }, + { + "epoch": 1.221169418641449, + "grad_norm": 0.2165333777666092, + "learning_rate": 1.9556269441202392e-05, + "loss": 1.3092, + "step": 4100 + }, + { + "epoch": 1.2214672648411176, + "grad_norm": 0.22863154113292694, + "learning_rate": 1.9555985255168593e-05, + "loss": 1.2889, + "step": 4101 + }, + { + "epoch": 1.2217651110407863, + "grad_norm": 0.213213250041008, + "learning_rate": 1.9555700980227028e-05, + "loss": 1.2863, + "step": 4102 + }, + { + "epoch": 1.2220629572404549, + "grad_norm": 0.20332065224647522, + "learning_rate": 1.9555416616380346e-05, + "loss": 1.2952, + "step": 4103 + }, + { + "epoch": 1.2223608034401237, + "grad_norm": 0.2252652943134308, + "learning_rate": 1.9555132163631186e-05, + "loss": 1.3054, + "step": 4104 + }, + { + "epoch": 1.2226586496397922, + "grad_norm": 0.23016788065433502, + "learning_rate": 1.9554847621982195e-05, + "loss": 1.3008, + "step": 4105 + }, + { + "epoch": 1.222956495839461, + "grad_norm": 0.21542948484420776, + "learning_rate": 1.9554562991436022e-05, + "loss": 1.2791, + "step": 4106 + }, + { + "epoch": 1.2232543420391295, + "grad_norm": 0.23817753791809082, + "learning_rate": 1.955427827199532e-05, + "loss": 1.2995, + "step": 4107 + }, + { + "epoch": 1.223552188238798, + "grad_norm": 0.22850258648395538, + "learning_rate": 1.955399346366273e-05, + "loss": 1.294, + "step": 4108 + }, + { + "epoch": 1.2238500344384668, + "grad_norm": 0.2405814826488495, + "learning_rate": 1.9553708566440904e-05, + "loss": 1.2986, + "step": 4109 + }, + { + "epoch": 1.2241478806381354, + "grad_norm": 0.2581832706928253, + "learning_rate": 1.9553423580332495e-05, + "loss": 1.2792, + "step": 4110 + }, + { + "epoch": 1.2244457268378042, + "grad_norm": 0.20295637845993042, + "learning_rate": 1.9553138505340156e-05, + "loss": 1.2843, + "step": 4111 + }, + { + "epoch": 1.2247435730374727, + "grad_norm": 0.22549006342887878, + "learning_rate": 1.9552853341466532e-05, + "loss": 1.3055, + "step": 4112 + }, + { + "epoch": 1.2250414192371415, + "grad_norm": 0.2101021260023117, + "learning_rate": 1.9552568088714283e-05, + "loss": 1.29, + "step": 4113 + }, + { + "epoch": 1.22533926543681, + "grad_norm": 0.2183251976966858, + "learning_rate": 1.955228274708606e-05, + "loss": 1.2791, + "step": 4114 + }, + { + "epoch": 1.2256371116364786, + "grad_norm": 0.2154933512210846, + "learning_rate": 1.955199731658452e-05, + "loss": 1.2951, + "step": 4115 + }, + { + "epoch": 1.2259349578361474, + "grad_norm": 0.22604544460773468, + "learning_rate": 1.9551711797212317e-05, + "loss": 1.2906, + "step": 4116 + }, + { + "epoch": 1.226232804035816, + "grad_norm": 0.21755193173885345, + "learning_rate": 1.9551426188972104e-05, + "loss": 1.3036, + "step": 4117 + }, + { + "epoch": 1.2265306502354847, + "grad_norm": 0.21554438769817352, + "learning_rate": 1.9551140491866546e-05, + "loss": 1.2921, + "step": 4118 + }, + { + "epoch": 1.2268284964351532, + "grad_norm": 0.22313088178634644, + "learning_rate": 1.9550854705898295e-05, + "loss": 1.2953, + "step": 4119 + }, + { + "epoch": 1.227126342634822, + "grad_norm": 0.20665155351161957, + "learning_rate": 1.9550568831070013e-05, + "loss": 1.2845, + "step": 4120 + }, + { + "epoch": 1.2274241888344906, + "grad_norm": 0.20775890350341797, + "learning_rate": 1.955028286738436e-05, + "loss": 1.2737, + "step": 4121 + }, + { + "epoch": 1.2277220350341593, + "grad_norm": 0.2068718671798706, + "learning_rate": 1.954999681484399e-05, + "loss": 1.2931, + "step": 4122 + }, + { + "epoch": 1.2280198812338279, + "grad_norm": 0.20639167726039886, + "learning_rate": 1.9549710673451574e-05, + "loss": 1.31, + "step": 4123 + }, + { + "epoch": 1.2283177274334964, + "grad_norm": 0.21572212874889374, + "learning_rate": 1.9549424443209768e-05, + "loss": 1.2864, + "step": 4124 + }, + { + "epoch": 1.2286155736331652, + "grad_norm": 0.21003632247447968, + "learning_rate": 1.9549138124121236e-05, + "loss": 1.2768, + "step": 4125 + }, + { + "epoch": 1.2289134198328338, + "grad_norm": 0.21323645114898682, + "learning_rate": 1.9548851716188645e-05, + "loss": 1.2853, + "step": 4126 + }, + { + "epoch": 1.2292112660325025, + "grad_norm": 0.2155407965183258, + "learning_rate": 1.9548565219414658e-05, + "loss": 1.2947, + "step": 4127 + }, + { + "epoch": 1.229509112232171, + "grad_norm": 0.20775429904460907, + "learning_rate": 1.9548278633801937e-05, + "loss": 1.2916, + "step": 4128 + }, + { + "epoch": 1.2298069584318398, + "grad_norm": 0.21007290482521057, + "learning_rate": 1.9547991959353153e-05, + "loss": 1.3014, + "step": 4129 + }, + { + "epoch": 1.2301048046315084, + "grad_norm": 0.2189212143421173, + "learning_rate": 1.954770519607097e-05, + "loss": 1.2872, + "step": 4130 + }, + { + "epoch": 1.2304026508311772, + "grad_norm": 0.21716181933879852, + "learning_rate": 1.9547418343958058e-05, + "loss": 1.2849, + "step": 4131 + }, + { + "epoch": 1.2307004970308457, + "grad_norm": 0.20262238383293152, + "learning_rate": 1.9547131403017086e-05, + "loss": 1.3047, + "step": 4132 + }, + { + "epoch": 1.2309983432305143, + "grad_norm": 0.20321613550186157, + "learning_rate": 1.954684437325072e-05, + "loss": 1.2857, + "step": 4133 + }, + { + "epoch": 1.231296189430183, + "grad_norm": 0.21657630801200867, + "learning_rate": 1.9546557254661634e-05, + "loss": 1.2935, + "step": 4134 + }, + { + "epoch": 1.2315940356298516, + "grad_norm": 0.24504989385604858, + "learning_rate": 1.9546270047252506e-05, + "loss": 1.2841, + "step": 4135 + }, + { + "epoch": 1.2318918818295204, + "grad_norm": 0.19349680840969086, + "learning_rate": 1.9545982751025994e-05, + "loss": 1.2751, + "step": 4136 + }, + { + "epoch": 1.232189728029189, + "grad_norm": 0.30486607551574707, + "learning_rate": 1.9545695365984777e-05, + "loss": 1.2776, + "step": 4137 + }, + { + "epoch": 1.2324875742288577, + "grad_norm": 0.22059954702854156, + "learning_rate": 1.9545407892131533e-05, + "loss": 1.2872, + "step": 4138 + }, + { + "epoch": 1.2327854204285262, + "grad_norm": 0.22120457887649536, + "learning_rate": 1.9545120329468933e-05, + "loss": 1.2954, + "step": 4139 + }, + { + "epoch": 1.2330832666281948, + "grad_norm": 0.21268272399902344, + "learning_rate": 1.9544832677999653e-05, + "loss": 1.2859, + "step": 4140 + }, + { + "epoch": 1.2333811128278636, + "grad_norm": 0.21759754419326782, + "learning_rate": 1.9544544937726367e-05, + "loss": 1.3013, + "step": 4141 + }, + { + "epoch": 1.233678959027532, + "grad_norm": 0.21478748321533203, + "learning_rate": 1.954425710865176e-05, + "loss": 1.2927, + "step": 4142 + }, + { + "epoch": 1.2339768052272009, + "grad_norm": 0.19693058729171753, + "learning_rate": 1.9543969190778494e-05, + "loss": 1.2988, + "step": 4143 + }, + { + "epoch": 1.2342746514268694, + "grad_norm": 0.2068811058998108, + "learning_rate": 1.9543681184109267e-05, + "loss": 1.2864, + "step": 4144 + }, + { + "epoch": 1.2345724976265382, + "grad_norm": 0.21200990676879883, + "learning_rate": 1.9543393088646746e-05, + "loss": 1.2916, + "step": 4145 + }, + { + "epoch": 1.2348703438262068, + "grad_norm": 0.20587991178035736, + "learning_rate": 1.9543104904393612e-05, + "loss": 1.2882, + "step": 4146 + }, + { + "epoch": 1.2351681900258753, + "grad_norm": 0.20354631543159485, + "learning_rate": 1.9542816631352554e-05, + "loss": 1.2864, + "step": 4147 + }, + { + "epoch": 1.235466036225544, + "grad_norm": 0.21679729223251343, + "learning_rate": 1.9542528269526248e-05, + "loss": 1.3085, + "step": 4148 + }, + { + "epoch": 1.2357638824252126, + "grad_norm": 0.24312865734100342, + "learning_rate": 1.9542239818917373e-05, + "loss": 1.3036, + "step": 4149 + }, + { + "epoch": 1.2360617286248814, + "grad_norm": 0.2099834382534027, + "learning_rate": 1.9541951279528624e-05, + "loss": 1.2929, + "step": 4150 + }, + { + "epoch": 1.23635957482455, + "grad_norm": 0.22744804620742798, + "learning_rate": 1.9541662651362677e-05, + "loss": 1.3021, + "step": 4151 + }, + { + "epoch": 1.2366574210242187, + "grad_norm": 0.20686295628547668, + "learning_rate": 1.954137393442222e-05, + "loss": 1.2835, + "step": 4152 + }, + { + "epoch": 1.2369552672238873, + "grad_norm": 0.20510515570640564, + "learning_rate": 1.9541085128709937e-05, + "loss": 1.296, + "step": 4153 + }, + { + "epoch": 1.2372531134235558, + "grad_norm": 0.2252977192401886, + "learning_rate": 1.954079623422852e-05, + "loss": 1.29, + "step": 4154 + }, + { + "epoch": 1.2375509596232246, + "grad_norm": 0.21374380588531494, + "learning_rate": 1.954050725098065e-05, + "loss": 1.3085, + "step": 4155 + }, + { + "epoch": 1.2378488058228931, + "grad_norm": 0.2219097912311554, + "learning_rate": 1.9540218178969018e-05, + "loss": 1.2814, + "step": 4156 + }, + { + "epoch": 1.238146652022562, + "grad_norm": 0.22387543320655823, + "learning_rate": 1.953992901819632e-05, + "loss": 1.3054, + "step": 4157 + }, + { + "epoch": 1.2384444982222305, + "grad_norm": 0.21464106440544128, + "learning_rate": 1.953963976866524e-05, + "loss": 1.3065, + "step": 4158 + }, + { + "epoch": 1.2387423444218992, + "grad_norm": 0.21472743153572083, + "learning_rate": 1.953935043037847e-05, + "loss": 1.302, + "step": 4159 + }, + { + "epoch": 1.2390401906215678, + "grad_norm": 0.20701003074645996, + "learning_rate": 1.95390610033387e-05, + "loss": 1.2993, + "step": 4160 + }, + { + "epoch": 1.2393380368212363, + "grad_norm": 0.21344922482967377, + "learning_rate": 1.9538771487548628e-05, + "loss": 1.2914, + "step": 4161 + }, + { + "epoch": 1.239635883020905, + "grad_norm": 0.21337179839611053, + "learning_rate": 1.9538481883010943e-05, + "loss": 1.3071, + "step": 4162 + }, + { + "epoch": 1.2399337292205737, + "grad_norm": 0.22430700063705444, + "learning_rate": 1.953819218972834e-05, + "loss": 1.2926, + "step": 4163 + }, + { + "epoch": 1.2402315754202424, + "grad_norm": 0.2090141922235489, + "learning_rate": 1.9537902407703514e-05, + "loss": 1.2822, + "step": 4164 + }, + { + "epoch": 1.240529421619911, + "grad_norm": 0.21654996275901794, + "learning_rate": 1.953761253693917e-05, + "loss": 1.2977, + "step": 4165 + }, + { + "epoch": 1.2408272678195797, + "grad_norm": 0.2089291512966156, + "learning_rate": 1.953732257743799e-05, + "loss": 1.3051, + "step": 4166 + }, + { + "epoch": 1.2411251140192483, + "grad_norm": 0.21138688921928406, + "learning_rate": 1.953703252920268e-05, + "loss": 1.2856, + "step": 4167 + }, + { + "epoch": 1.2414229602189168, + "grad_norm": 0.23540814220905304, + "learning_rate": 1.953674239223594e-05, + "loss": 1.2965, + "step": 4168 + }, + { + "epoch": 1.2417208064185856, + "grad_norm": 0.20688270032405853, + "learning_rate": 1.9536452166540468e-05, + "loss": 1.2927, + "step": 4169 + }, + { + "epoch": 1.2420186526182542, + "grad_norm": 0.20564410090446472, + "learning_rate": 1.953616185211896e-05, + "loss": 1.2848, + "step": 4170 + }, + { + "epoch": 1.242316498817923, + "grad_norm": 0.2275800108909607, + "learning_rate": 1.953587144897412e-05, + "loss": 1.2762, + "step": 4171 + }, + { + "epoch": 1.2426143450175915, + "grad_norm": 0.2123938351869583, + "learning_rate": 1.953558095710865e-05, + "loss": 1.2932, + "step": 4172 + }, + { + "epoch": 1.2429121912172603, + "grad_norm": 0.24131610989570618, + "learning_rate": 1.953529037652526e-05, + "loss": 1.2655, + "step": 4173 + }, + { + "epoch": 1.2432100374169288, + "grad_norm": 0.2212681919336319, + "learning_rate": 1.953499970722664e-05, + "loss": 1.2736, + "step": 4174 + }, + { + "epoch": 1.2435078836165974, + "grad_norm": 0.21585984528064728, + "learning_rate": 1.9534708949215505e-05, + "loss": 1.2828, + "step": 4175 + }, + { + "epoch": 1.2438057298162661, + "grad_norm": 0.20756615698337555, + "learning_rate": 1.9534418102494554e-05, + "loss": 1.2779, + "step": 4176 + }, + { + "epoch": 1.2441035760159347, + "grad_norm": 0.20586632192134857, + "learning_rate": 1.9534127167066495e-05, + "loss": 1.2708, + "step": 4177 + }, + { + "epoch": 1.2444014222156035, + "grad_norm": 0.21256816387176514, + "learning_rate": 1.9533836142934033e-05, + "loss": 1.2897, + "step": 4178 + }, + { + "epoch": 1.244699268415272, + "grad_norm": 0.21810956299304962, + "learning_rate": 1.953354503009988e-05, + "loss": 1.2873, + "step": 4179 + }, + { + "epoch": 1.2449971146149408, + "grad_norm": 0.20932936668395996, + "learning_rate": 1.953325382856674e-05, + "loss": 1.2842, + "step": 4180 + }, + { + "epoch": 1.2452949608146093, + "grad_norm": 0.20388562977313995, + "learning_rate": 1.9532962538337326e-05, + "loss": 1.2945, + "step": 4181 + }, + { + "epoch": 1.2455928070142779, + "grad_norm": 0.2170279175043106, + "learning_rate": 1.9532671159414346e-05, + "loss": 1.2986, + "step": 4182 + }, + { + "epoch": 1.2458906532139467, + "grad_norm": 0.20950999855995178, + "learning_rate": 1.953237969180051e-05, + "loss": 1.2841, + "step": 4183 + }, + { + "epoch": 1.2461884994136152, + "grad_norm": 0.20044785737991333, + "learning_rate": 1.9532088135498535e-05, + "loss": 1.2891, + "step": 4184 + }, + { + "epoch": 1.246486345613284, + "grad_norm": 0.22087042033672333, + "learning_rate": 1.9531796490511126e-05, + "loss": 1.2682, + "step": 4185 + }, + { + "epoch": 1.2467841918129525, + "grad_norm": 0.2074468731880188, + "learning_rate": 1.9531504756841003e-05, + "loss": 1.2948, + "step": 4186 + }, + { + "epoch": 1.2470820380126213, + "grad_norm": 0.21442781388759613, + "learning_rate": 1.9531212934490874e-05, + "loss": 1.2814, + "step": 4187 + }, + { + "epoch": 1.2473798842122898, + "grad_norm": 0.21842724084854126, + "learning_rate": 1.953092102346346e-05, + "loss": 1.2786, + "step": 4188 + }, + { + "epoch": 1.2476777304119586, + "grad_norm": 0.22215518355369568, + "learning_rate": 1.9530629023761475e-05, + "loss": 1.2873, + "step": 4189 + }, + { + "epoch": 1.2479755766116272, + "grad_norm": 0.2289196401834488, + "learning_rate": 1.9530336935387632e-05, + "loss": 1.3195, + "step": 4190 + }, + { + "epoch": 1.248273422811296, + "grad_norm": 0.20055343210697174, + "learning_rate": 1.9530044758344652e-05, + "loss": 1.283, + "step": 4191 + }, + { + "epoch": 1.2485712690109645, + "grad_norm": 0.2111535668373108, + "learning_rate": 1.952975249263526e-05, + "loss": 1.2922, + "step": 4192 + }, + { + "epoch": 1.248869115210633, + "grad_norm": 0.21124912798404694, + "learning_rate": 1.952946013826216e-05, + "loss": 1.3003, + "step": 4193 + }, + { + "epoch": 1.2491669614103018, + "grad_norm": 0.20609290897846222, + "learning_rate": 1.9529167695228084e-05, + "loss": 1.3008, + "step": 4194 + }, + { + "epoch": 1.2494648076099704, + "grad_norm": 0.2232304811477661, + "learning_rate": 1.9528875163535747e-05, + "loss": 1.267, + "step": 4195 + }, + { + "epoch": 1.2497626538096391, + "grad_norm": 0.22072570025920868, + "learning_rate": 1.9528582543187876e-05, + "loss": 1.2904, + "step": 4196 + }, + { + "epoch": 1.2500605000093077, + "grad_norm": 0.21208128333091736, + "learning_rate": 1.952828983418719e-05, + "loss": 1.296, + "step": 4197 + }, + { + "epoch": 1.2503583462089765, + "grad_norm": 0.2197960913181305, + "learning_rate": 1.952799703653641e-05, + "loss": 1.2944, + "step": 4198 + }, + { + "epoch": 1.250656192408645, + "grad_norm": 0.21740934252738953, + "learning_rate": 1.9527704150238268e-05, + "loss": 1.2844, + "step": 4199 + }, + { + "epoch": 1.2509540386083136, + "grad_norm": 0.21597795188426971, + "learning_rate": 1.952741117529548e-05, + "loss": 1.2693, + "step": 4200 + }, + { + "epoch": 1.2512518848079823, + "grad_norm": 0.2201114147901535, + "learning_rate": 1.9527118111710775e-05, + "loss": 1.3139, + "step": 4201 + }, + { + "epoch": 1.2515497310076509, + "grad_norm": 0.22156377136707306, + "learning_rate": 1.9526824959486884e-05, + "loss": 1.2858, + "step": 4202 + }, + { + "epoch": 1.2518475772073197, + "grad_norm": 0.22167210280895233, + "learning_rate": 1.9526531718626525e-05, + "loss": 1.2826, + "step": 4203 + }, + { + "epoch": 1.2521454234069882, + "grad_norm": 0.2108449786901474, + "learning_rate": 1.9526238389132435e-05, + "loss": 1.284, + "step": 4204 + }, + { + "epoch": 1.252443269606657, + "grad_norm": 0.2104608416557312, + "learning_rate": 1.952594497100734e-05, + "loss": 1.2766, + "step": 4205 + }, + { + "epoch": 1.2527411158063255, + "grad_norm": 0.20697778463363647, + "learning_rate": 1.9525651464253972e-05, + "loss": 1.2892, + "step": 4206 + }, + { + "epoch": 1.253038962005994, + "grad_norm": 0.22090132534503937, + "learning_rate": 1.9525357868875057e-05, + "loss": 1.2807, + "step": 4207 + }, + { + "epoch": 1.2533368082056628, + "grad_norm": 0.2159069925546646, + "learning_rate": 1.9525064184873332e-05, + "loss": 1.2731, + "step": 4208 + }, + { + "epoch": 1.2536346544053314, + "grad_norm": 0.22110089659690857, + "learning_rate": 1.9524770412251523e-05, + "loss": 1.2902, + "step": 4209 + }, + { + "epoch": 1.2539325006050002, + "grad_norm": 0.21377326548099518, + "learning_rate": 1.952447655101237e-05, + "loss": 1.2809, + "step": 4210 + }, + { + "epoch": 1.2542303468046687, + "grad_norm": 0.22198982536792755, + "learning_rate": 1.9524182601158604e-05, + "loss": 1.2934, + "step": 4211 + }, + { + "epoch": 1.2545281930043375, + "grad_norm": 0.22517961263656616, + "learning_rate": 1.952388856269296e-05, + "loss": 1.2922, + "step": 4212 + }, + { + "epoch": 1.254826039204006, + "grad_norm": 0.21857579052448273, + "learning_rate": 1.9523594435618173e-05, + "loss": 1.2959, + "step": 4213 + }, + { + "epoch": 1.2551238854036746, + "grad_norm": 0.2065434604883194, + "learning_rate": 1.952330021993698e-05, + "loss": 1.2927, + "step": 4214 + }, + { + "epoch": 1.2554217316033434, + "grad_norm": 0.21812330186367035, + "learning_rate": 1.952300591565212e-05, + "loss": 1.2897, + "step": 4215 + }, + { + "epoch": 1.255719577803012, + "grad_norm": 0.2338680475950241, + "learning_rate": 1.9522711522766328e-05, + "loss": 1.2997, + "step": 4216 + }, + { + "epoch": 1.2560174240026807, + "grad_norm": 0.3979673683643341, + "learning_rate": 1.9522417041282344e-05, + "loss": 1.2781, + "step": 4217 + }, + { + "epoch": 1.2563152702023492, + "grad_norm": 0.25537681579589844, + "learning_rate": 1.952212247120291e-05, + "loss": 1.305, + "step": 4218 + }, + { + "epoch": 1.256613116402018, + "grad_norm": 0.24418216943740845, + "learning_rate": 1.952182781253077e-05, + "loss": 1.2891, + "step": 4219 + }, + { + "epoch": 1.2569109626016866, + "grad_norm": 0.24024534225463867, + "learning_rate": 1.9521533065268652e-05, + "loss": 1.2926, + "step": 4220 + }, + { + "epoch": 1.257208808801355, + "grad_norm": 0.23503734171390533, + "learning_rate": 1.952123822941931e-05, + "loss": 1.2916, + "step": 4221 + }, + { + "epoch": 1.2575066550010239, + "grad_norm": 0.21015004813671112, + "learning_rate": 1.9520943304985484e-05, + "loss": 1.2885, + "step": 4222 + }, + { + "epoch": 1.2578045012006924, + "grad_norm": 0.22911418974399567, + "learning_rate": 1.9520648291969918e-05, + "loss": 1.2957, + "step": 4223 + }, + { + "epoch": 1.2581023474003612, + "grad_norm": 0.22934724390506744, + "learning_rate": 1.9520353190375356e-05, + "loss": 1.2866, + "step": 4224 + }, + { + "epoch": 1.2584001936000297, + "grad_norm": 0.2139049470424652, + "learning_rate": 1.9520058000204546e-05, + "loss": 1.2728, + "step": 4225 + }, + { + "epoch": 1.2586980397996985, + "grad_norm": 0.23983533680438995, + "learning_rate": 1.9519762721460233e-05, + "loss": 1.3024, + "step": 4226 + }, + { + "epoch": 1.258995885999367, + "grad_norm": 0.21902920305728912, + "learning_rate": 1.9519467354145165e-05, + "loss": 1.2855, + "step": 4227 + }, + { + "epoch": 1.2592937321990356, + "grad_norm": 0.2280544638633728, + "learning_rate": 1.9519171898262084e-05, + "loss": 1.2984, + "step": 4228 + }, + { + "epoch": 1.2595915783987044, + "grad_norm": 0.2220582515001297, + "learning_rate": 1.951887635381375e-05, + "loss": 1.2788, + "step": 4229 + }, + { + "epoch": 1.259889424598373, + "grad_norm": 0.22434242069721222, + "learning_rate": 1.9518580720802902e-05, + "loss": 1.2805, + "step": 4230 + }, + { + "epoch": 1.2601872707980417, + "grad_norm": 0.22028429806232452, + "learning_rate": 1.9518284999232298e-05, + "loss": 1.3068, + "step": 4231 + }, + { + "epoch": 1.2604851169977103, + "grad_norm": 0.21655960381031036, + "learning_rate": 1.9517989189104685e-05, + "loss": 1.2782, + "step": 4232 + }, + { + "epoch": 1.260782963197379, + "grad_norm": 0.22345635294914246, + "learning_rate": 1.9517693290422815e-05, + "loss": 1.288, + "step": 4233 + }, + { + "epoch": 1.2610808093970476, + "grad_norm": 0.24837669730186462, + "learning_rate": 1.9517397303189445e-05, + "loss": 1.277, + "step": 4234 + }, + { + "epoch": 1.2613786555967161, + "grad_norm": 0.21125711500644684, + "learning_rate": 1.9517101227407326e-05, + "loss": 1.2771, + "step": 4235 + }, + { + "epoch": 1.261676501796385, + "grad_norm": 0.23232734203338623, + "learning_rate": 1.9516805063079217e-05, + "loss": 1.3007, + "step": 4236 + }, + { + "epoch": 1.2619743479960535, + "grad_norm": 0.22613781690597534, + "learning_rate": 1.9516508810207865e-05, + "loss": 1.2769, + "step": 4237 + }, + { + "epoch": 1.2622721941957222, + "grad_norm": 0.21599330008029938, + "learning_rate": 1.9516212468796033e-05, + "loss": 1.3, + "step": 4238 + }, + { + "epoch": 1.2625700403953908, + "grad_norm": 0.2066168487071991, + "learning_rate": 1.9515916038846474e-05, + "loss": 1.2621, + "step": 4239 + }, + { + "epoch": 1.2628678865950596, + "grad_norm": 0.2147272527217865, + "learning_rate": 1.951561952036195e-05, + "loss": 1.2962, + "step": 4240 + }, + { + "epoch": 1.263165732794728, + "grad_norm": 0.2461201250553131, + "learning_rate": 1.9515322913345218e-05, + "loss": 1.2731, + "step": 4241 + }, + { + "epoch": 1.2634635789943967, + "grad_norm": 0.21921150386333466, + "learning_rate": 1.951502621779904e-05, + "loss": 1.3004, + "step": 4242 + }, + { + "epoch": 1.2637614251940654, + "grad_norm": 0.21837396919727325, + "learning_rate": 1.9514729433726166e-05, + "loss": 1.2832, + "step": 4243 + }, + { + "epoch": 1.2640592713937342, + "grad_norm": 0.21291287243366241, + "learning_rate": 1.951443256112937e-05, + "loss": 1.2755, + "step": 4244 + }, + { + "epoch": 1.2643571175934027, + "grad_norm": 0.21658475697040558, + "learning_rate": 1.951413560001141e-05, + "loss": 1.2783, + "step": 4245 + }, + { + "epoch": 1.2646549637930713, + "grad_norm": 0.24197445809841156, + "learning_rate": 1.951383855037505e-05, + "loss": 1.2942, + "step": 4246 + }, + { + "epoch": 1.26495280999274, + "grad_norm": 0.23163466155529022, + "learning_rate": 1.9513541412223045e-05, + "loss": 1.3004, + "step": 4247 + }, + { + "epoch": 1.2652506561924086, + "grad_norm": 0.22297687828540802, + "learning_rate": 1.951324418555817e-05, + "loss": 1.2846, + "step": 4248 + }, + { + "epoch": 1.2655485023920772, + "grad_norm": 0.2045927494764328, + "learning_rate": 1.9512946870383186e-05, + "loss": 1.2692, + "step": 4249 + }, + { + "epoch": 1.265846348591746, + "grad_norm": 0.2474483996629715, + "learning_rate": 1.951264946670086e-05, + "loss": 1.2793, + "step": 4250 + }, + { + "epoch": 1.2661441947914147, + "grad_norm": 0.21718846261501312, + "learning_rate": 1.9512351974513963e-05, + "loss": 1.294, + "step": 4251 + }, + { + "epoch": 1.2664420409910833, + "grad_norm": 0.22226892411708832, + "learning_rate": 1.9512054393825255e-05, + "loss": 1.2919, + "step": 4252 + }, + { + "epoch": 1.2667398871907518, + "grad_norm": 0.22698166966438293, + "learning_rate": 1.9511756724637508e-05, + "loss": 1.267, + "step": 4253 + }, + { + "epoch": 1.2670377333904206, + "grad_norm": 0.21398533880710602, + "learning_rate": 1.9511458966953493e-05, + "loss": 1.2783, + "step": 4254 + }, + { + "epoch": 1.2673355795900891, + "grad_norm": 0.2410353273153305, + "learning_rate": 1.951116112077598e-05, + "loss": 1.304, + "step": 4255 + }, + { + "epoch": 1.2676334257897577, + "grad_norm": 0.2103150188922882, + "learning_rate": 1.9510863186107737e-05, + "loss": 1.2822, + "step": 4256 + }, + { + "epoch": 1.2679312719894265, + "grad_norm": 0.22095987200737, + "learning_rate": 1.9510565162951538e-05, + "loss": 1.2747, + "step": 4257 + }, + { + "epoch": 1.2682291181890952, + "grad_norm": 0.20994897186756134, + "learning_rate": 1.9510267051310157e-05, + "loss": 1.2905, + "step": 4258 + }, + { + "epoch": 1.2685269643887638, + "grad_norm": 0.19952505826950073, + "learning_rate": 1.9509968851186367e-05, + "loss": 1.2934, + "step": 4259 + }, + { + "epoch": 1.2688248105884323, + "grad_norm": 0.22760345041751862, + "learning_rate": 1.950967056258294e-05, + "loss": 1.2726, + "step": 4260 + }, + { + "epoch": 1.269122656788101, + "grad_norm": 0.21213096380233765, + "learning_rate": 1.9509372185502655e-05, + "loss": 1.2858, + "step": 4261 + }, + { + "epoch": 1.2694205029877697, + "grad_norm": 0.21417930722236633, + "learning_rate": 1.9509073719948287e-05, + "loss": 1.284, + "step": 4262 + }, + { + "epoch": 1.2697183491874382, + "grad_norm": 0.21398523449897766, + "learning_rate": 1.950877516592261e-05, + "loss": 1.2757, + "step": 4263 + }, + { + "epoch": 1.270016195387107, + "grad_norm": 0.1974363923072815, + "learning_rate": 1.9508476523428407e-05, + "loss": 1.2671, + "step": 4264 + }, + { + "epoch": 1.2703140415867757, + "grad_norm": 0.22823457419872284, + "learning_rate": 1.950817779246845e-05, + "loss": 1.289, + "step": 4265 + }, + { + "epoch": 1.2706118877864443, + "grad_norm": 0.20725269615650177, + "learning_rate": 1.9507878973045524e-05, + "loss": 1.2962, + "step": 4266 + }, + { + "epoch": 1.2709097339861128, + "grad_norm": 0.21264007687568665, + "learning_rate": 1.9507580065162405e-05, + "loss": 1.2832, + "step": 4267 + }, + { + "epoch": 1.2712075801857816, + "grad_norm": 0.2234097719192505, + "learning_rate": 1.950728106882188e-05, + "loss": 1.2933, + "step": 4268 + }, + { + "epoch": 1.2715054263854502, + "grad_norm": 0.20301519334316254, + "learning_rate": 1.9506981984026723e-05, + "loss": 1.2858, + "step": 4269 + }, + { + "epoch": 1.271803272585119, + "grad_norm": 0.20755331218242645, + "learning_rate": 1.9506682810779722e-05, + "loss": 1.2939, + "step": 4270 + }, + { + "epoch": 1.2721011187847875, + "grad_norm": 0.2089110016822815, + "learning_rate": 1.950638354908366e-05, + "loss": 1.3018, + "step": 4271 + }, + { + "epoch": 1.2723989649844563, + "grad_norm": 0.20934276282787323, + "learning_rate": 1.9506084198941316e-05, + "loss": 1.2788, + "step": 4272 + }, + { + "epoch": 1.2726968111841248, + "grad_norm": 0.2058245688676834, + "learning_rate": 1.9505784760355485e-05, + "loss": 1.279, + "step": 4273 + }, + { + "epoch": 1.2729946573837934, + "grad_norm": 0.2077820897102356, + "learning_rate": 1.9505485233328944e-05, + "loss": 1.2879, + "step": 4274 + }, + { + "epoch": 1.2732925035834621, + "grad_norm": 0.21222275495529175, + "learning_rate": 1.9505185617864483e-05, + "loss": 1.2802, + "step": 4275 + }, + { + "epoch": 1.2735903497831307, + "grad_norm": 0.21486896276474, + "learning_rate": 1.950488591396489e-05, + "loss": 1.2902, + "step": 4276 + }, + { + "epoch": 1.2738881959827995, + "grad_norm": 0.24208930134773254, + "learning_rate": 1.9504586121632957e-05, + "loss": 1.2848, + "step": 4277 + }, + { + "epoch": 1.274186042182468, + "grad_norm": 0.211529940366745, + "learning_rate": 1.950428624087146e-05, + "loss": 1.3098, + "step": 4278 + }, + { + "epoch": 1.2744838883821368, + "grad_norm": 0.2138810157775879, + "learning_rate": 1.9503986271683206e-05, + "loss": 1.2828, + "step": 4279 + }, + { + "epoch": 1.2747817345818053, + "grad_norm": 0.2132432907819748, + "learning_rate": 1.9503686214070975e-05, + "loss": 1.2861, + "step": 4280 + }, + { + "epoch": 1.2750795807814739, + "grad_norm": 0.2315203845500946, + "learning_rate": 1.9503386068037568e-05, + "loss": 1.2804, + "step": 4281 + }, + { + "epoch": 1.2753774269811426, + "grad_norm": 0.21458743512630463, + "learning_rate": 1.9503085833585766e-05, + "loss": 1.2748, + "step": 4282 + }, + { + "epoch": 1.2756752731808112, + "grad_norm": 0.2249085009098053, + "learning_rate": 1.9502785510718368e-05, + "loss": 1.2868, + "step": 4283 + }, + { + "epoch": 1.27597311938048, + "grad_norm": 0.2124832719564438, + "learning_rate": 1.9502485099438165e-05, + "loss": 1.2944, + "step": 4284 + }, + { + "epoch": 1.2762709655801485, + "grad_norm": 0.2258549928665161, + "learning_rate": 1.950218459974796e-05, + "loss": 1.2939, + "step": 4285 + }, + { + "epoch": 1.2765688117798173, + "grad_norm": 0.22272054851055145, + "learning_rate": 1.9501884011650542e-05, + "loss": 1.2998, + "step": 4286 + }, + { + "epoch": 1.2768666579794858, + "grad_norm": 0.21465766429901123, + "learning_rate": 1.950158333514871e-05, + "loss": 1.2914, + "step": 4287 + }, + { + "epoch": 1.2771645041791544, + "grad_norm": 0.21221554279327393, + "learning_rate": 1.950128257024526e-05, + "loss": 1.2734, + "step": 4288 + }, + { + "epoch": 1.2774623503788232, + "grad_norm": 0.2124350517988205, + "learning_rate": 1.950098171694299e-05, + "loss": 1.2894, + "step": 4289 + }, + { + "epoch": 1.2777601965784917, + "grad_norm": 0.2039698362350464, + "learning_rate": 1.9500680775244702e-05, + "loss": 1.2772, + "step": 4290 + }, + { + "epoch": 1.2780580427781605, + "grad_norm": 0.2236998826265335, + "learning_rate": 1.9500379745153193e-05, + "loss": 1.305, + "step": 4291 + }, + { + "epoch": 1.278355888977829, + "grad_norm": 0.21744026243686676, + "learning_rate": 1.9500078626671268e-05, + "loss": 1.299, + "step": 4292 + }, + { + "epoch": 1.2786537351774978, + "grad_norm": 0.215065598487854, + "learning_rate": 1.9499777419801722e-05, + "loss": 1.2762, + "step": 4293 + }, + { + "epoch": 1.2789515813771664, + "grad_norm": 0.20630770921707153, + "learning_rate": 1.949947612454736e-05, + "loss": 1.2982, + "step": 4294 + }, + { + "epoch": 1.279249427576835, + "grad_norm": 0.22154337167739868, + "learning_rate": 1.949917474091099e-05, + "loss": 1.2827, + "step": 4295 + }, + { + "epoch": 1.2795472737765037, + "grad_norm": 0.20596937835216522, + "learning_rate": 1.949887326889541e-05, + "loss": 1.2794, + "step": 4296 + }, + { + "epoch": 1.2798451199761722, + "grad_norm": 0.20480772852897644, + "learning_rate": 1.949857170850343e-05, + "loss": 1.304, + "step": 4297 + }, + { + "epoch": 1.280142966175841, + "grad_norm": 0.24925456941127777, + "learning_rate": 1.9498270059737846e-05, + "loss": 1.2814, + "step": 4298 + }, + { + "epoch": 1.2804408123755096, + "grad_norm": 0.23596645891666412, + "learning_rate": 1.9497968322601478e-05, + "loss": 1.2875, + "step": 4299 + }, + { + "epoch": 1.2807386585751783, + "grad_norm": 0.26264750957489014, + "learning_rate": 1.9497666497097123e-05, + "loss": 1.2729, + "step": 4300 + }, + { + "epoch": 1.2810365047748469, + "grad_norm": 0.2836257815361023, + "learning_rate": 1.9497364583227594e-05, + "loss": 1.2886, + "step": 4301 + }, + { + "epoch": 1.2813343509745154, + "grad_norm": 0.21951961517333984, + "learning_rate": 1.9497062580995697e-05, + "loss": 1.2787, + "step": 4302 + }, + { + "epoch": 1.2816321971741842, + "grad_norm": 0.2535304129123688, + "learning_rate": 1.9496760490404246e-05, + "loss": 1.289, + "step": 4303 + }, + { + "epoch": 1.2819300433738527, + "grad_norm": 0.22259165346622467, + "learning_rate": 1.9496458311456048e-05, + "loss": 1.279, + "step": 4304 + }, + { + "epoch": 1.2822278895735215, + "grad_norm": 0.22210922837257385, + "learning_rate": 1.9496156044153914e-05, + "loss": 1.2869, + "step": 4305 + }, + { + "epoch": 1.28252573577319, + "grad_norm": 0.250213086605072, + "learning_rate": 1.949585368850066e-05, + "loss": 1.2945, + "step": 4306 + }, + { + "epoch": 1.2828235819728588, + "grad_norm": 0.22138406336307526, + "learning_rate": 1.9495551244499092e-05, + "loss": 1.2767, + "step": 4307 + }, + { + "epoch": 1.2831214281725274, + "grad_norm": 0.21412420272827148, + "learning_rate": 1.9495248712152035e-05, + "loss": 1.279, + "step": 4308 + }, + { + "epoch": 1.283419274372196, + "grad_norm": 0.22160974144935608, + "learning_rate": 1.9494946091462294e-05, + "loss": 1.2808, + "step": 4309 + }, + { + "epoch": 1.2837171205718647, + "grad_norm": 0.2283344566822052, + "learning_rate": 1.949464338243269e-05, + "loss": 1.2845, + "step": 4310 + }, + { + "epoch": 1.2840149667715335, + "grad_norm": 0.2155960649251938, + "learning_rate": 1.9494340585066033e-05, + "loss": 1.2865, + "step": 4311 + }, + { + "epoch": 1.284312812971202, + "grad_norm": 0.21037152409553528, + "learning_rate": 1.9494037699365148e-05, + "loss": 1.2828, + "step": 4312 + }, + { + "epoch": 1.2846106591708706, + "grad_norm": 0.22889646887779236, + "learning_rate": 1.949373472533285e-05, + "loss": 1.2761, + "step": 4313 + }, + { + "epoch": 1.2849085053705394, + "grad_norm": 0.23508226871490479, + "learning_rate": 1.9493431662971956e-05, + "loss": 1.2841, + "step": 4314 + }, + { + "epoch": 1.285206351570208, + "grad_norm": 0.2230076789855957, + "learning_rate": 1.949312851228529e-05, + "loss": 1.2909, + "step": 4315 + }, + { + "epoch": 1.2855041977698765, + "grad_norm": 0.21971538662910461, + "learning_rate": 1.9492825273275667e-05, + "loss": 1.2876, + "step": 4316 + }, + { + "epoch": 1.2858020439695452, + "grad_norm": 0.22598077356815338, + "learning_rate": 1.9492521945945912e-05, + "loss": 1.2783, + "step": 4317 + }, + { + "epoch": 1.286099890169214, + "grad_norm": 0.2192184180021286, + "learning_rate": 1.9492218530298843e-05, + "loss": 1.2868, + "step": 4318 + }, + { + "epoch": 1.2863977363688825, + "grad_norm": 0.225177600979805, + "learning_rate": 1.949191502633729e-05, + "loss": 1.2822, + "step": 4319 + }, + { + "epoch": 1.286695582568551, + "grad_norm": 0.23144379258155823, + "learning_rate": 1.9491611434064072e-05, + "loss": 1.2797, + "step": 4320 + }, + { + "epoch": 1.2869934287682199, + "grad_norm": 0.21540115773677826, + "learning_rate": 1.949130775348201e-05, + "loss": 1.275, + "step": 4321 + }, + { + "epoch": 1.2872912749678884, + "grad_norm": 0.20263580977916718, + "learning_rate": 1.9491003984593936e-05, + "loss": 1.2993, + "step": 4322 + }, + { + "epoch": 1.287589121167557, + "grad_norm": 0.20649705827236176, + "learning_rate": 1.9490700127402676e-05, + "loss": 1.3125, + "step": 4323 + }, + { + "epoch": 1.2878869673672257, + "grad_norm": 0.22655902802944183, + "learning_rate": 1.9490396181911054e-05, + "loss": 1.2771, + "step": 4324 + }, + { + "epoch": 1.2881848135668945, + "grad_norm": 0.21653781831264496, + "learning_rate": 1.9490092148121898e-05, + "loss": 1.2728, + "step": 4325 + }, + { + "epoch": 1.288482659766563, + "grad_norm": 0.2190086841583252, + "learning_rate": 1.9489788026038038e-05, + "loss": 1.2875, + "step": 4326 + }, + { + "epoch": 1.2887805059662316, + "grad_norm": 0.22180475294589996, + "learning_rate": 1.94894838156623e-05, + "loss": 1.2889, + "step": 4327 + }, + { + "epoch": 1.2890783521659004, + "grad_norm": 0.21253114938735962, + "learning_rate": 1.9489179516997522e-05, + "loss": 1.3026, + "step": 4328 + }, + { + "epoch": 1.289376198365569, + "grad_norm": 0.21621105074882507, + "learning_rate": 1.9488875130046528e-05, + "loss": 1.2805, + "step": 4329 + }, + { + "epoch": 1.2896740445652375, + "grad_norm": 0.2294853925704956, + "learning_rate": 1.9488570654812152e-05, + "loss": 1.3019, + "step": 4330 + }, + { + "epoch": 1.2899718907649063, + "grad_norm": 0.21360130608081818, + "learning_rate": 1.948826609129723e-05, + "loss": 1.3032, + "step": 4331 + }, + { + "epoch": 1.290269736964575, + "grad_norm": 0.21647316217422485, + "learning_rate": 1.948796143950459e-05, + "loss": 1.2728, + "step": 4332 + }, + { + "epoch": 1.2905675831642436, + "grad_norm": 0.21768029034137726, + "learning_rate": 1.9487656699437073e-05, + "loss": 1.2735, + "step": 4333 + }, + { + "epoch": 1.2908654293639121, + "grad_norm": 0.2219466269016266, + "learning_rate": 1.9487351871097507e-05, + "loss": 1.2781, + "step": 4334 + }, + { + "epoch": 1.291163275563581, + "grad_norm": 0.21864573657512665, + "learning_rate": 1.9487046954488737e-05, + "loss": 1.292, + "step": 4335 + }, + { + "epoch": 1.2914611217632495, + "grad_norm": 0.23011457920074463, + "learning_rate": 1.9486741949613587e-05, + "loss": 1.2991, + "step": 4336 + }, + { + "epoch": 1.2917589679629182, + "grad_norm": 0.22989176213741302, + "learning_rate": 1.9486436856474907e-05, + "loss": 1.2688, + "step": 4337 + }, + { + "epoch": 1.2920568141625868, + "grad_norm": 0.2277642786502838, + "learning_rate": 1.948613167507553e-05, + "loss": 1.2934, + "step": 4338 + }, + { + "epoch": 1.2923546603622555, + "grad_norm": 0.21044525504112244, + "learning_rate": 1.9485826405418297e-05, + "loss": 1.2959, + "step": 4339 + }, + { + "epoch": 1.292652506561924, + "grad_norm": 0.22159777581691742, + "learning_rate": 1.9485521047506045e-05, + "loss": 1.302, + "step": 4340 + }, + { + "epoch": 1.2929503527615926, + "grad_norm": 0.22066263854503632, + "learning_rate": 1.948521560134162e-05, + "loss": 1.2753, + "step": 4341 + }, + { + "epoch": 1.2932481989612614, + "grad_norm": 0.20811598002910614, + "learning_rate": 1.9484910066927862e-05, + "loss": 1.295, + "step": 4342 + }, + { + "epoch": 1.29354604516093, + "grad_norm": 0.2093423455953598, + "learning_rate": 1.9484604444267613e-05, + "loss": 1.3022, + "step": 4343 + }, + { + "epoch": 1.2938438913605987, + "grad_norm": 0.21169036626815796, + "learning_rate": 1.9484298733363715e-05, + "loss": 1.2762, + "step": 4344 + }, + { + "epoch": 1.2941417375602673, + "grad_norm": 0.2275347262620926, + "learning_rate": 1.9483992934219014e-05, + "loss": 1.2982, + "step": 4345 + }, + { + "epoch": 1.294439583759936, + "grad_norm": 0.20272690057754517, + "learning_rate": 1.9483687046836354e-05, + "loss": 1.276, + "step": 4346 + }, + { + "epoch": 1.2947374299596046, + "grad_norm": 0.21795092523097992, + "learning_rate": 1.9483381071218583e-05, + "loss": 1.2839, + "step": 4347 + }, + { + "epoch": 1.2950352761592732, + "grad_norm": 0.22539231181144714, + "learning_rate": 1.9483075007368544e-05, + "loss": 1.2993, + "step": 4348 + }, + { + "epoch": 1.295333122358942, + "grad_norm": 0.2134052813053131, + "learning_rate": 1.948276885528909e-05, + "loss": 1.2973, + "step": 4349 + }, + { + "epoch": 1.2956309685586105, + "grad_norm": 0.21409595012664795, + "learning_rate": 1.9482462614983065e-05, + "loss": 1.2858, + "step": 4350 + }, + { + "epoch": 1.2959288147582793, + "grad_norm": 0.2155877649784088, + "learning_rate": 1.9482156286453323e-05, + "loss": 1.2809, + "step": 4351 + }, + { + "epoch": 1.2962266609579478, + "grad_norm": 0.22553592920303345, + "learning_rate": 1.9481849869702708e-05, + "loss": 1.276, + "step": 4352 + }, + { + "epoch": 1.2965245071576166, + "grad_norm": 0.2257729023694992, + "learning_rate": 1.9481543364734075e-05, + "loss": 1.2804, + "step": 4353 + }, + { + "epoch": 1.2968223533572851, + "grad_norm": 0.20933204889297485, + "learning_rate": 1.9481236771550275e-05, + "loss": 1.2852, + "step": 4354 + }, + { + "epoch": 1.2971201995569537, + "grad_norm": 0.21046243607997894, + "learning_rate": 1.948093009015416e-05, + "loss": 1.2729, + "step": 4355 + }, + { + "epoch": 1.2974180457566225, + "grad_norm": 0.21620683372020721, + "learning_rate": 1.948062332054858e-05, + "loss": 1.2835, + "step": 4356 + }, + { + "epoch": 1.297715891956291, + "grad_norm": 0.22007808089256287, + "learning_rate": 1.9480316462736394e-05, + "loss": 1.2873, + "step": 4357 + }, + { + "epoch": 1.2980137381559598, + "grad_norm": 0.2278037965297699, + "learning_rate": 1.9480009516720457e-05, + "loss": 1.291, + "step": 4358 + }, + { + "epoch": 1.2983115843556283, + "grad_norm": 0.20690450072288513, + "learning_rate": 1.947970248250362e-05, + "loss": 1.2852, + "step": 4359 + }, + { + "epoch": 1.298609430555297, + "grad_norm": 0.21415205299854279, + "learning_rate": 1.9479395360088744e-05, + "loss": 1.2865, + "step": 4360 + }, + { + "epoch": 1.2989072767549656, + "grad_norm": 0.21674586832523346, + "learning_rate": 1.9479088149478688e-05, + "loss": 1.2806, + "step": 4361 + }, + { + "epoch": 1.2992051229546342, + "grad_norm": 0.21887333691120148, + "learning_rate": 1.9478780850676303e-05, + "loss": 1.3123, + "step": 4362 + }, + { + "epoch": 1.299502969154303, + "grad_norm": 0.22247789800167084, + "learning_rate": 1.9478473463684456e-05, + "loss": 1.2868, + "step": 4363 + }, + { + "epoch": 1.2998008153539715, + "grad_norm": 0.2039203941822052, + "learning_rate": 1.9478165988506003e-05, + "loss": 1.287, + "step": 4364 + }, + { + "epoch": 1.3000986615536403, + "grad_norm": 0.22355547547340393, + "learning_rate": 1.9477858425143804e-05, + "loss": 1.2792, + "step": 4365 + }, + { + "epoch": 1.3003965077533088, + "grad_norm": 0.21715961396694183, + "learning_rate": 1.9477550773600718e-05, + "loss": 1.304, + "step": 4366 + }, + { + "epoch": 1.3006943539529776, + "grad_norm": 0.20409496128559113, + "learning_rate": 1.9477243033879615e-05, + "loss": 1.2747, + "step": 4367 + }, + { + "epoch": 1.3009922001526462, + "grad_norm": 0.2287827432155609, + "learning_rate": 1.9476935205983355e-05, + "loss": 1.2928, + "step": 4368 + }, + { + "epoch": 1.3012900463523147, + "grad_norm": 0.21227742731571198, + "learning_rate": 1.94766272899148e-05, + "loss": 1.274, + "step": 4369 + }, + { + "epoch": 1.3015878925519835, + "grad_norm": 0.2068890929222107, + "learning_rate": 1.9476319285676817e-05, + "loss": 1.2727, + "step": 4370 + }, + { + "epoch": 1.301885738751652, + "grad_norm": 0.2015000283718109, + "learning_rate": 1.947601119327227e-05, + "loss": 1.2805, + "step": 4371 + }, + { + "epoch": 1.3021835849513208, + "grad_norm": 0.21332566440105438, + "learning_rate": 1.9475703012704026e-05, + "loss": 1.2968, + "step": 4372 + }, + { + "epoch": 1.3024814311509894, + "grad_norm": 0.2227504700422287, + "learning_rate": 1.947539474397495e-05, + "loss": 1.2885, + "step": 4373 + }, + { + "epoch": 1.3027792773506581, + "grad_norm": 0.22368992865085602, + "learning_rate": 1.947508638708792e-05, + "loss": 1.2901, + "step": 4374 + }, + { + "epoch": 1.3030771235503267, + "grad_norm": 0.2185707837343216, + "learning_rate": 1.9474777942045787e-05, + "loss": 1.2711, + "step": 4375 + }, + { + "epoch": 1.3033749697499952, + "grad_norm": 0.23849822580814362, + "learning_rate": 1.9474469408851436e-05, + "loss": 1.3015, + "step": 4376 + }, + { + "epoch": 1.303672815949664, + "grad_norm": 0.22657613456249237, + "learning_rate": 1.9474160787507735e-05, + "loss": 1.3063, + "step": 4377 + }, + { + "epoch": 1.3039706621493328, + "grad_norm": 0.23561915755271912, + "learning_rate": 1.947385207801755e-05, + "loss": 1.2935, + "step": 4378 + }, + { + "epoch": 1.3042685083490013, + "grad_norm": 0.23508505523204803, + "learning_rate": 1.9473543280383755e-05, + "loss": 1.2923, + "step": 4379 + }, + { + "epoch": 1.3045663545486699, + "grad_norm": 0.23660191893577576, + "learning_rate": 1.947323439460923e-05, + "loss": 1.2775, + "step": 4380 + }, + { + "epoch": 1.3048642007483386, + "grad_norm": 0.2111383080482483, + "learning_rate": 1.9472925420696836e-05, + "loss": 1.2892, + "step": 4381 + }, + { + "epoch": 1.3051620469480072, + "grad_norm": 0.22012507915496826, + "learning_rate": 1.9472616358649458e-05, + "loss": 1.273, + "step": 4382 + }, + { + "epoch": 1.3054598931476757, + "grad_norm": 0.2353675812482834, + "learning_rate": 1.947230720846997e-05, + "loss": 1.2778, + "step": 4383 + }, + { + "epoch": 1.3057577393473445, + "grad_norm": 0.2224927544593811, + "learning_rate": 1.9471997970161242e-05, + "loss": 1.3011, + "step": 4384 + }, + { + "epoch": 1.3060555855470133, + "grad_norm": 0.22048144042491913, + "learning_rate": 1.947168864372616e-05, + "loss": 1.2934, + "step": 4385 + }, + { + "epoch": 1.3063534317466818, + "grad_norm": 0.22624224424362183, + "learning_rate": 1.9471379229167597e-05, + "loss": 1.2919, + "step": 4386 + }, + { + "epoch": 1.3066512779463504, + "grad_norm": 0.21678951382637024, + "learning_rate": 1.9471069726488432e-05, + "loss": 1.2714, + "step": 4387 + }, + { + "epoch": 1.3069491241460192, + "grad_norm": 0.21833226084709167, + "learning_rate": 1.947076013569154e-05, + "loss": 1.296, + "step": 4388 + }, + { + "epoch": 1.3072469703456877, + "grad_norm": 0.22498853504657745, + "learning_rate": 1.9470450456779812e-05, + "loss": 1.2889, + "step": 4389 + }, + { + "epoch": 1.3075448165453563, + "grad_norm": 0.20521242916584015, + "learning_rate": 1.947014068975612e-05, + "loss": 1.2675, + "step": 4390 + }, + { + "epoch": 1.307842662745025, + "grad_norm": 0.21246260404586792, + "learning_rate": 1.9469830834623352e-05, + "loss": 1.2813, + "step": 4391 + }, + { + "epoch": 1.3081405089446938, + "grad_norm": 0.20792168378829956, + "learning_rate": 1.946952089138439e-05, + "loss": 1.2886, + "step": 4392 + }, + { + "epoch": 1.3084383551443624, + "grad_norm": 0.22202304005622864, + "learning_rate": 1.9469210860042117e-05, + "loss": 1.2799, + "step": 4393 + }, + { + "epoch": 1.308736201344031, + "grad_norm": 0.2078225314617157, + "learning_rate": 1.946890074059941e-05, + "loss": 1.2934, + "step": 4394 + }, + { + "epoch": 1.3090340475436997, + "grad_norm": 0.2236662060022354, + "learning_rate": 1.9468590533059167e-05, + "loss": 1.2811, + "step": 4395 + }, + { + "epoch": 1.3093318937433682, + "grad_norm": 0.22034013271331787, + "learning_rate": 1.9468280237424263e-05, + "loss": 1.289, + "step": 4396 + }, + { + "epoch": 1.309629739943037, + "grad_norm": 0.23171408474445343, + "learning_rate": 1.946796985369759e-05, + "loss": 1.2669, + "step": 4397 + }, + { + "epoch": 1.3099275861427055, + "grad_norm": 0.26519349217414856, + "learning_rate": 1.9467659381882044e-05, + "loss": 1.3003, + "step": 4398 + }, + { + "epoch": 1.3102254323423743, + "grad_norm": 0.22557543218135834, + "learning_rate": 1.9467348821980495e-05, + "loss": 1.3045, + "step": 4399 + }, + { + "epoch": 1.3105232785420429, + "grad_norm": 0.2138628512620926, + "learning_rate": 1.946703817399585e-05, + "loss": 1.278, + "step": 4400 + }, + { + "epoch": 1.3108211247417114, + "grad_norm": 0.23420557379722595, + "learning_rate": 1.9466727437930987e-05, + "loss": 1.2868, + "step": 4401 + }, + { + "epoch": 1.3111189709413802, + "grad_norm": 0.21214647591114044, + "learning_rate": 1.9466416613788806e-05, + "loss": 1.2949, + "step": 4402 + }, + { + "epoch": 1.3114168171410487, + "grad_norm": 0.21572305262088776, + "learning_rate": 1.9466105701572193e-05, + "loss": 1.2784, + "step": 4403 + }, + { + "epoch": 1.3117146633407175, + "grad_norm": 0.21837739646434784, + "learning_rate": 1.946579470128404e-05, + "loss": 1.2985, + "step": 4404 + }, + { + "epoch": 1.312012509540386, + "grad_norm": 0.22033147513866425, + "learning_rate": 1.9465483612927246e-05, + "loss": 1.2955, + "step": 4405 + }, + { + "epoch": 1.3123103557400548, + "grad_norm": 0.21132859587669373, + "learning_rate": 1.9465172436504705e-05, + "loss": 1.2848, + "step": 4406 + }, + { + "epoch": 1.3126082019397234, + "grad_norm": 0.22771236300468445, + "learning_rate": 1.9464861172019307e-05, + "loss": 1.2878, + "step": 4407 + }, + { + "epoch": 1.312906048139392, + "grad_norm": 0.2159237563610077, + "learning_rate": 1.946454981947395e-05, + "loss": 1.291, + "step": 4408 + }, + { + "epoch": 1.3132038943390607, + "grad_norm": 0.22716277837753296, + "learning_rate": 1.9464238378871535e-05, + "loss": 1.2824, + "step": 4409 + }, + { + "epoch": 1.3135017405387293, + "grad_norm": 0.21937571465969086, + "learning_rate": 1.946392685021495e-05, + "loss": 1.2848, + "step": 4410 + }, + { + "epoch": 1.313799586738398, + "grad_norm": 0.21618427336215973, + "learning_rate": 1.9463615233507104e-05, + "loss": 1.2919, + "step": 4411 + }, + { + "epoch": 1.3140974329380666, + "grad_norm": 0.2274778038263321, + "learning_rate": 1.946330352875089e-05, + "loss": 1.2827, + "step": 4412 + }, + { + "epoch": 1.3143952791377354, + "grad_norm": 0.2229549139738083, + "learning_rate": 1.9462991735949206e-05, + "loss": 1.2814, + "step": 4413 + }, + { + "epoch": 1.314693125337404, + "grad_norm": 0.20636674761772156, + "learning_rate": 1.9462679855104963e-05, + "loss": 1.2873, + "step": 4414 + }, + { + "epoch": 1.3149909715370725, + "grad_norm": 0.21750383079051971, + "learning_rate": 1.9462367886221054e-05, + "loss": 1.2778, + "step": 4415 + }, + { + "epoch": 1.3152888177367412, + "grad_norm": 0.21384619176387787, + "learning_rate": 1.9462055829300382e-05, + "loss": 1.2866, + "step": 4416 + }, + { + "epoch": 1.3155866639364098, + "grad_norm": 0.23439331352710724, + "learning_rate": 1.9461743684345855e-05, + "loss": 1.2687, + "step": 4417 + }, + { + "epoch": 1.3158845101360785, + "grad_norm": 0.23477253317832947, + "learning_rate": 1.946143145136037e-05, + "loss": 1.2966, + "step": 4418 + }, + { + "epoch": 1.316182356335747, + "grad_norm": 0.21923504769802094, + "learning_rate": 1.946111913034684e-05, + "loss": 1.2911, + "step": 4419 + }, + { + "epoch": 1.3164802025354159, + "grad_norm": 0.28623026609420776, + "learning_rate": 1.9460806721308167e-05, + "loss": 1.2931, + "step": 4420 + }, + { + "epoch": 1.3167780487350844, + "grad_norm": 0.24381518363952637, + "learning_rate": 1.9460494224247255e-05, + "loss": 1.2724, + "step": 4421 + }, + { + "epoch": 1.317075894934753, + "grad_norm": 0.22767974436283112, + "learning_rate": 1.9460181639167015e-05, + "loss": 1.2802, + "step": 4422 + }, + { + "epoch": 1.3173737411344217, + "grad_norm": 0.23355694115161896, + "learning_rate": 1.9459868966070356e-05, + "loss": 1.2672, + "step": 4423 + }, + { + "epoch": 1.3176715873340903, + "grad_norm": 0.23531609773635864, + "learning_rate": 1.9459556204960183e-05, + "loss": 1.2877, + "step": 4424 + }, + { + "epoch": 1.317969433533759, + "grad_norm": 0.21968941390514374, + "learning_rate": 1.945924335583941e-05, + "loss": 1.2863, + "step": 4425 + }, + { + "epoch": 1.3182672797334276, + "grad_norm": 0.2364933043718338, + "learning_rate": 1.9458930418710947e-05, + "loss": 1.2825, + "step": 4426 + }, + { + "epoch": 1.3185651259330964, + "grad_norm": 0.21582764387130737, + "learning_rate": 1.9458617393577705e-05, + "loss": 1.2746, + "step": 4427 + }, + { + "epoch": 1.318862972132765, + "grad_norm": 0.21979326009750366, + "learning_rate": 1.9458304280442594e-05, + "loss": 1.2879, + "step": 4428 + }, + { + "epoch": 1.3191608183324335, + "grad_norm": 0.23910918831825256, + "learning_rate": 1.945799107930853e-05, + "loss": 1.2991, + "step": 4429 + }, + { + "epoch": 1.3194586645321023, + "grad_norm": 0.2085385024547577, + "learning_rate": 1.9457677790178424e-05, + "loss": 1.287, + "step": 4430 + }, + { + "epoch": 1.3197565107317708, + "grad_norm": 0.22950591146945953, + "learning_rate": 1.9457364413055196e-05, + "loss": 1.2865, + "step": 4431 + }, + { + "epoch": 1.3200543569314396, + "grad_norm": 0.21736650168895721, + "learning_rate": 1.9457050947941755e-05, + "loss": 1.2918, + "step": 4432 + }, + { + "epoch": 1.3203522031311081, + "grad_norm": 0.20605461299419403, + "learning_rate": 1.9456737394841024e-05, + "loss": 1.2887, + "step": 4433 + }, + { + "epoch": 1.320650049330777, + "grad_norm": 0.22259333729743958, + "learning_rate": 1.945642375375592e-05, + "loss": 1.2758, + "step": 4434 + }, + { + "epoch": 1.3209478955304454, + "grad_norm": 0.21373486518859863, + "learning_rate": 1.9456110024689353e-05, + "loss": 1.2764, + "step": 4435 + }, + { + "epoch": 1.321245741730114, + "grad_norm": 0.21873639523983002, + "learning_rate": 1.945579620764425e-05, + "loss": 1.2999, + "step": 4436 + }, + { + "epoch": 1.3215435879297828, + "grad_norm": 0.2185695916414261, + "learning_rate": 1.9455482302623525e-05, + "loss": 1.2976, + "step": 4437 + }, + { + "epoch": 1.3218414341294515, + "grad_norm": 0.21117202937602997, + "learning_rate": 1.9455168309630104e-05, + "loss": 1.2917, + "step": 4438 + }, + { + "epoch": 1.32213928032912, + "grad_norm": 0.21831433475017548, + "learning_rate": 1.9454854228666905e-05, + "loss": 1.2764, + "step": 4439 + }, + { + "epoch": 1.3224371265287886, + "grad_norm": 0.22479133307933807, + "learning_rate": 1.945454005973685e-05, + "loss": 1.2787, + "step": 4440 + }, + { + "epoch": 1.3227349727284574, + "grad_norm": 0.22005313634872437, + "learning_rate": 1.9454225802842865e-05, + "loss": 1.2897, + "step": 4441 + }, + { + "epoch": 1.323032818928126, + "grad_norm": 0.22108127176761627, + "learning_rate": 1.9453911457987872e-05, + "loss": 1.3085, + "step": 4442 + }, + { + "epoch": 1.3233306651277945, + "grad_norm": 0.21450850367546082, + "learning_rate": 1.9453597025174793e-05, + "loss": 1.2766, + "step": 4443 + }, + { + "epoch": 1.3236285113274633, + "grad_norm": 0.23771388828754425, + "learning_rate": 1.9453282504406558e-05, + "loss": 1.286, + "step": 4444 + }, + { + "epoch": 1.323926357527132, + "grad_norm": 0.20356306433677673, + "learning_rate": 1.945296789568609e-05, + "loss": 1.2827, + "step": 4445 + }, + { + "epoch": 1.3242242037268006, + "grad_norm": 0.22749559581279755, + "learning_rate": 1.9452653199016316e-05, + "loss": 1.2841, + "step": 4446 + }, + { + "epoch": 1.3245220499264692, + "grad_norm": 0.22910209000110626, + "learning_rate": 1.945233841440017e-05, + "loss": 1.2902, + "step": 4447 + }, + { + "epoch": 1.324819896126138, + "grad_norm": 0.22924593091011047, + "learning_rate": 1.945202354184057e-05, + "loss": 1.2734, + "step": 4448 + }, + { + "epoch": 1.3251177423258065, + "grad_norm": 0.23718386888504028, + "learning_rate": 1.9451708581340454e-05, + "loss": 1.274, + "step": 4449 + }, + { + "epoch": 1.325415588525475, + "grad_norm": 0.20674824714660645, + "learning_rate": 1.9451393532902748e-05, + "loss": 1.3067, + "step": 4450 + }, + { + "epoch": 1.3257134347251438, + "grad_norm": 0.21308964490890503, + "learning_rate": 1.945107839653039e-05, + "loss": 1.2862, + "step": 4451 + }, + { + "epoch": 1.3260112809248126, + "grad_norm": 0.21982550621032715, + "learning_rate": 1.94507631722263e-05, + "loss": 1.2803, + "step": 4452 + }, + { + "epoch": 1.3263091271244811, + "grad_norm": 0.21804852783679962, + "learning_rate": 1.9450447859993423e-05, + "loss": 1.2908, + "step": 4453 + }, + { + "epoch": 1.3266069733241497, + "grad_norm": 0.21226952970027924, + "learning_rate": 1.9450132459834685e-05, + "loss": 1.2989, + "step": 4454 + }, + { + "epoch": 1.3269048195238184, + "grad_norm": 0.21352677047252655, + "learning_rate": 1.9449816971753025e-05, + "loss": 1.2802, + "step": 4455 + }, + { + "epoch": 1.327202665723487, + "grad_norm": 0.21268369257450104, + "learning_rate": 1.9449501395751374e-05, + "loss": 1.2769, + "step": 4456 + }, + { + "epoch": 1.3275005119231555, + "grad_norm": 0.21231991052627563, + "learning_rate": 1.9449185731832667e-05, + "loss": 1.2847, + "step": 4457 + }, + { + "epoch": 1.3277983581228243, + "grad_norm": 0.21618475019931793, + "learning_rate": 1.944886997999985e-05, + "loss": 1.2858, + "step": 4458 + }, + { + "epoch": 1.328096204322493, + "grad_norm": 0.21233153343200684, + "learning_rate": 1.9448554140255852e-05, + "loss": 1.2788, + "step": 4459 + }, + { + "epoch": 1.3283940505221616, + "grad_norm": 0.23762664198875427, + "learning_rate": 1.944823821260361e-05, + "loss": 1.2966, + "step": 4460 + }, + { + "epoch": 1.3286918967218302, + "grad_norm": 0.2210892140865326, + "learning_rate": 1.9447922197046075e-05, + "loss": 1.2867, + "step": 4461 + }, + { + "epoch": 1.328989742921499, + "grad_norm": 0.21532797813415527, + "learning_rate": 1.9447606093586176e-05, + "loss": 1.2907, + "step": 4462 + }, + { + "epoch": 1.3292875891211675, + "grad_norm": 0.211831733584404, + "learning_rate": 1.9447289902226857e-05, + "loss": 1.2854, + "step": 4463 + }, + { + "epoch": 1.3295854353208363, + "grad_norm": 0.22693310678005219, + "learning_rate": 1.9446973622971064e-05, + "loss": 1.2737, + "step": 4464 + }, + { + "epoch": 1.3298832815205048, + "grad_norm": 0.24312475323677063, + "learning_rate": 1.9446657255821735e-05, + "loss": 1.2788, + "step": 4465 + }, + { + "epoch": 1.3301811277201736, + "grad_norm": 0.21440081298351288, + "learning_rate": 1.9446340800781814e-05, + "loss": 1.2891, + "step": 4466 + }, + { + "epoch": 1.3304789739198422, + "grad_norm": 0.21881331503391266, + "learning_rate": 1.9446024257854246e-05, + "loss": 1.2914, + "step": 4467 + }, + { + "epoch": 1.3307768201195107, + "grad_norm": 0.22630274295806885, + "learning_rate": 1.9445707627041975e-05, + "loss": 1.2779, + "step": 4468 + }, + { + "epoch": 1.3310746663191795, + "grad_norm": 0.22332996129989624, + "learning_rate": 1.944539090834795e-05, + "loss": 1.3013, + "step": 4469 + }, + { + "epoch": 1.331372512518848, + "grad_norm": 0.23849384486675262, + "learning_rate": 1.9445074101775115e-05, + "loss": 1.3143, + "step": 4470 + }, + { + "epoch": 1.3316703587185168, + "grad_norm": 0.22371995449066162, + "learning_rate": 1.9444757207326414e-05, + "loss": 1.2949, + "step": 4471 + }, + { + "epoch": 1.3319682049181854, + "grad_norm": 0.21483688056468964, + "learning_rate": 1.9444440225004805e-05, + "loss": 1.2912, + "step": 4472 + }, + { + "epoch": 1.3322660511178541, + "grad_norm": 0.21644139289855957, + "learning_rate": 1.9444123154813227e-05, + "loss": 1.2831, + "step": 4473 + }, + { + "epoch": 1.3325638973175227, + "grad_norm": 0.2105797380208969, + "learning_rate": 1.944380599675464e-05, + "loss": 1.2849, + "step": 4474 + }, + { + "epoch": 1.3328617435171912, + "grad_norm": 0.21353058516979218, + "learning_rate": 1.9443488750831988e-05, + "loss": 1.2697, + "step": 4475 + }, + { + "epoch": 1.33315958971686, + "grad_norm": 0.2092098444700241, + "learning_rate": 1.944317141704822e-05, + "loss": 1.2927, + "step": 4476 + }, + { + "epoch": 1.3334574359165285, + "grad_norm": 0.21660619974136353, + "learning_rate": 1.9442853995406297e-05, + "loss": 1.2914, + "step": 4477 + }, + { + "epoch": 1.3337552821161973, + "grad_norm": 0.2247992902994156, + "learning_rate": 1.9442536485909165e-05, + "loss": 1.3109, + "step": 4478 + }, + { + "epoch": 1.3340531283158659, + "grad_norm": 0.20930930972099304, + "learning_rate": 1.9442218888559782e-05, + "loss": 1.2843, + "step": 4479 + }, + { + "epoch": 1.3343509745155346, + "grad_norm": 0.20886363089084625, + "learning_rate": 1.9441901203361105e-05, + "loss": 1.2605, + "step": 4480 + }, + { + "epoch": 1.3346488207152032, + "grad_norm": 0.21827760338783264, + "learning_rate": 1.944158343031608e-05, + "loss": 1.2791, + "step": 4481 + }, + { + "epoch": 1.3349466669148717, + "grad_norm": 0.2067975103855133, + "learning_rate": 1.9441265569427674e-05, + "loss": 1.2843, + "step": 4482 + }, + { + "epoch": 1.3352445131145405, + "grad_norm": 0.2142345905303955, + "learning_rate": 1.944094762069884e-05, + "loss": 1.2874, + "step": 4483 + }, + { + "epoch": 1.335542359314209, + "grad_norm": 0.21202720701694489, + "learning_rate": 1.9440629584132536e-05, + "loss": 1.2677, + "step": 4484 + }, + { + "epoch": 1.3358402055138778, + "grad_norm": 0.20592474937438965, + "learning_rate": 1.944031145973172e-05, + "loss": 1.3011, + "step": 4485 + }, + { + "epoch": 1.3361380517135464, + "grad_norm": 0.21177034080028534, + "learning_rate": 1.9439993247499352e-05, + "loss": 1.2803, + "step": 4486 + }, + { + "epoch": 1.3364358979132152, + "grad_norm": 0.2227989137172699, + "learning_rate": 1.9439674947438398e-05, + "loss": 1.2789, + "step": 4487 + }, + { + "epoch": 1.3367337441128837, + "grad_norm": 0.22396139800548553, + "learning_rate": 1.9439356559551813e-05, + "loss": 1.2653, + "step": 4488 + }, + { + "epoch": 1.3370315903125523, + "grad_norm": 0.22306722402572632, + "learning_rate": 1.9439038083842562e-05, + "loss": 1.2709, + "step": 4489 + }, + { + "epoch": 1.337329436512221, + "grad_norm": 0.22983412444591522, + "learning_rate": 1.9438719520313606e-05, + "loss": 1.2839, + "step": 4490 + }, + { + "epoch": 1.3376272827118896, + "grad_norm": 0.21686509251594543, + "learning_rate": 1.9438400868967916e-05, + "loss": 1.2873, + "step": 4491 + }, + { + "epoch": 1.3379251289115583, + "grad_norm": 0.21184255182743073, + "learning_rate": 1.9438082129808443e-05, + "loss": 1.2843, + "step": 4492 + }, + { + "epoch": 1.338222975111227, + "grad_norm": 0.2111564576625824, + "learning_rate": 1.9437763302838166e-05, + "loss": 1.2969, + "step": 4493 + }, + { + "epoch": 1.3385208213108957, + "grad_norm": 0.25139352679252625, + "learning_rate": 1.9437444388060045e-05, + "loss": 1.3023, + "step": 4494 + }, + { + "epoch": 1.3388186675105642, + "grad_norm": 0.2681314945220947, + "learning_rate": 1.9437125385477046e-05, + "loss": 1.2981, + "step": 4495 + }, + { + "epoch": 1.3391165137102328, + "grad_norm": 0.2067553848028183, + "learning_rate": 1.9436806295092143e-05, + "loss": 1.2917, + "step": 4496 + }, + { + "epoch": 1.3394143599099015, + "grad_norm": 0.34385374188423157, + "learning_rate": 1.9436487116908294e-05, + "loss": 1.2805, + "step": 4497 + }, + { + "epoch": 1.33971220610957, + "grad_norm": 0.2229735404253006, + "learning_rate": 1.943616785092848e-05, + "loss": 1.2938, + "step": 4498 + }, + { + "epoch": 1.3400100523092389, + "grad_norm": 0.22187785804271698, + "learning_rate": 1.943584849715567e-05, + "loss": 1.2912, + "step": 4499 + }, + { + "epoch": 1.3403078985089074, + "grad_norm": 0.22360198199748993, + "learning_rate": 1.943552905559283e-05, + "loss": 1.3031, + "step": 4500 + }, + { + "epoch": 1.3403078985089074, + "eval_loss": 1.3569642305374146, + "eval_runtime": 21.1279, + "eval_samples_per_second": 82.072, + "eval_steps_per_second": 5.159, + "step": 4500 + }, + { + "epoch": 1.3406057447085762, + "grad_norm": 0.22713731229305267, + "learning_rate": 1.9435209526242933e-05, + "loss": 1.2966, + "step": 4501 + }, + { + "epoch": 1.3409035909082447, + "grad_norm": 0.2253570407629013, + "learning_rate": 1.9434889909108952e-05, + "loss": 1.2839, + "step": 4502 + }, + { + "epoch": 1.3412014371079133, + "grad_norm": 0.2101002037525177, + "learning_rate": 1.9434570204193863e-05, + "loss": 1.2798, + "step": 4503 + }, + { + "epoch": 1.341499283307582, + "grad_norm": 0.2120308130979538, + "learning_rate": 1.9434250411500638e-05, + "loss": 1.2644, + "step": 4504 + }, + { + "epoch": 1.3417971295072508, + "grad_norm": 0.22912685573101044, + "learning_rate": 1.9433930531032255e-05, + "loss": 1.2782, + "step": 4505 + }, + { + "epoch": 1.3420949757069194, + "grad_norm": 0.22854988276958466, + "learning_rate": 1.943361056279169e-05, + "loss": 1.2619, + "step": 4506 + }, + { + "epoch": 1.342392821906588, + "grad_norm": 0.22772414982318878, + "learning_rate": 1.9433290506781915e-05, + "loss": 1.3025, + "step": 4507 + }, + { + "epoch": 1.3426906681062567, + "grad_norm": 0.21109721064567566, + "learning_rate": 1.9432970363005913e-05, + "loss": 1.2858, + "step": 4508 + }, + { + "epoch": 1.3429885143059253, + "grad_norm": 0.2292889803647995, + "learning_rate": 1.943265013146666e-05, + "loss": 1.2839, + "step": 4509 + }, + { + "epoch": 1.3432863605055938, + "grad_norm": 0.23116926848888397, + "learning_rate": 1.943232981216714e-05, + "loss": 1.2975, + "step": 4510 + }, + { + "epoch": 1.3435842067052626, + "grad_norm": 0.21780000627040863, + "learning_rate": 1.9432009405110323e-05, + "loss": 1.2765, + "step": 4511 + }, + { + "epoch": 1.3438820529049313, + "grad_norm": 0.22388587892055511, + "learning_rate": 1.9431688910299203e-05, + "loss": 1.2856, + "step": 4512 + }, + { + "epoch": 1.3441798991046, + "grad_norm": 0.231467604637146, + "learning_rate": 1.943136832773675e-05, + "loss": 1.283, + "step": 4513 + }, + { + "epoch": 1.3444777453042684, + "grad_norm": 0.22428205609321594, + "learning_rate": 1.9431047657425956e-05, + "loss": 1.2697, + "step": 4514 + }, + { + "epoch": 1.3447755915039372, + "grad_norm": 0.2137261927127838, + "learning_rate": 1.94307268993698e-05, + "loss": 1.2915, + "step": 4515 + }, + { + "epoch": 1.3450734377036058, + "grad_norm": 0.2115289568901062, + "learning_rate": 1.9430406053571265e-05, + "loss": 1.274, + "step": 4516 + }, + { + "epoch": 1.3453712839032743, + "grad_norm": 0.23327216506004333, + "learning_rate": 1.9430085120033338e-05, + "loss": 1.2839, + "step": 4517 + }, + { + "epoch": 1.345669130102943, + "grad_norm": 0.22178760170936584, + "learning_rate": 1.9429764098759007e-05, + "loss": 1.2852, + "step": 4518 + }, + { + "epoch": 1.3459669763026119, + "grad_norm": 0.21289120614528656, + "learning_rate": 1.9429442989751255e-05, + "loss": 1.2921, + "step": 4519 + }, + { + "epoch": 1.3462648225022804, + "grad_norm": 0.22279572486877441, + "learning_rate": 1.942912179301307e-05, + "loss": 1.2878, + "step": 4520 + }, + { + "epoch": 1.346562668701949, + "grad_norm": 0.21776123344898224, + "learning_rate": 1.9428800508547444e-05, + "loss": 1.282, + "step": 4521 + }, + { + "epoch": 1.3468605149016177, + "grad_norm": 0.22095216810703278, + "learning_rate": 1.9428479136357364e-05, + "loss": 1.2896, + "step": 4522 + }, + { + "epoch": 1.3471583611012863, + "grad_norm": 0.21695873141288757, + "learning_rate": 1.9428157676445818e-05, + "loss": 1.2797, + "step": 4523 + }, + { + "epoch": 1.3474562073009548, + "grad_norm": 0.21062494814395905, + "learning_rate": 1.9427836128815797e-05, + "loss": 1.2861, + "step": 4524 + }, + { + "epoch": 1.3477540535006236, + "grad_norm": 0.21845440566539764, + "learning_rate": 1.9427514493470297e-05, + "loss": 1.2738, + "step": 4525 + }, + { + "epoch": 1.3480518997002924, + "grad_norm": 0.21795763075351715, + "learning_rate": 1.942719277041231e-05, + "loss": 1.2662, + "step": 4526 + }, + { + "epoch": 1.348349745899961, + "grad_norm": 0.22721393406391144, + "learning_rate": 1.9426870959644822e-05, + "loss": 1.2803, + "step": 4527 + }, + { + "epoch": 1.3486475920996295, + "grad_norm": 0.2117440104484558, + "learning_rate": 1.9426549061170834e-05, + "loss": 1.2939, + "step": 4528 + }, + { + "epoch": 1.3489454382992982, + "grad_norm": 0.21315455436706543, + "learning_rate": 1.942622707499334e-05, + "loss": 1.2853, + "step": 4529 + }, + { + "epoch": 1.3492432844989668, + "grad_norm": 0.21192117035388947, + "learning_rate": 1.9425905001115332e-05, + "loss": 1.2913, + "step": 4530 + }, + { + "epoch": 1.3495411306986356, + "grad_norm": 0.215905100107193, + "learning_rate": 1.9425582839539813e-05, + "loss": 1.2862, + "step": 4531 + }, + { + "epoch": 1.3498389768983041, + "grad_norm": 0.21604588627815247, + "learning_rate": 1.9425260590269775e-05, + "loss": 1.2671, + "step": 4532 + }, + { + "epoch": 1.350136823097973, + "grad_norm": 0.20911955833435059, + "learning_rate": 1.9424938253308217e-05, + "loss": 1.2856, + "step": 4533 + }, + { + "epoch": 1.3504346692976414, + "grad_norm": 0.20078156888484955, + "learning_rate": 1.9424615828658138e-05, + "loss": 1.2826, + "step": 4534 + }, + { + "epoch": 1.35073251549731, + "grad_norm": 0.21429602801799774, + "learning_rate": 1.942429331632254e-05, + "loss": 1.2742, + "step": 4535 + }, + { + "epoch": 1.3510303616969788, + "grad_norm": 0.2233225554227829, + "learning_rate": 1.942397071630442e-05, + "loss": 1.2664, + "step": 4536 + }, + { + "epoch": 1.3513282078966473, + "grad_norm": 0.22216635942459106, + "learning_rate": 1.9423648028606786e-05, + "loss": 1.2922, + "step": 4537 + }, + { + "epoch": 1.351626054096316, + "grad_norm": 0.2184571623802185, + "learning_rate": 1.9423325253232632e-05, + "loss": 1.2873, + "step": 4538 + }, + { + "epoch": 1.3519239002959846, + "grad_norm": 0.20877744257450104, + "learning_rate": 1.9423002390184967e-05, + "loss": 1.2935, + "step": 4539 + }, + { + "epoch": 1.3522217464956534, + "grad_norm": 0.21714043617248535, + "learning_rate": 1.942267943946679e-05, + "loss": 1.2808, + "step": 4540 + }, + { + "epoch": 1.352519592695322, + "grad_norm": 0.21297597885131836, + "learning_rate": 1.9422356401081107e-05, + "loss": 1.2765, + "step": 4541 + }, + { + "epoch": 1.3528174388949905, + "grad_norm": 0.2160167247056961, + "learning_rate": 1.942203327503093e-05, + "loss": 1.2935, + "step": 4542 + }, + { + "epoch": 1.3531152850946593, + "grad_norm": 0.21878854930400848, + "learning_rate": 1.9421710061319258e-05, + "loss": 1.2857, + "step": 4543 + }, + { + "epoch": 1.3534131312943278, + "grad_norm": 0.21440044045448303, + "learning_rate": 1.9421386759949102e-05, + "loss": 1.2822, + "step": 4544 + }, + { + "epoch": 1.3537109774939966, + "grad_norm": 0.20837941765785217, + "learning_rate": 1.9421063370923464e-05, + "loss": 1.2863, + "step": 4545 + }, + { + "epoch": 1.3540088236936652, + "grad_norm": 0.20805011689662933, + "learning_rate": 1.9420739894245363e-05, + "loss": 1.2725, + "step": 4546 + }, + { + "epoch": 1.354306669893334, + "grad_norm": 0.23626692593097687, + "learning_rate": 1.94204163299178e-05, + "loss": 1.2867, + "step": 4547 + }, + { + "epoch": 1.3546045160930025, + "grad_norm": 0.22463680803775787, + "learning_rate": 1.942009267794379e-05, + "loss": 1.2947, + "step": 4548 + }, + { + "epoch": 1.354902362292671, + "grad_norm": 0.2230614572763443, + "learning_rate": 1.9419768938326337e-05, + "loss": 1.2658, + "step": 4549 + }, + { + "epoch": 1.3552002084923398, + "grad_norm": 0.22742493450641632, + "learning_rate": 1.941944511106846e-05, + "loss": 1.3002, + "step": 4550 + }, + { + "epoch": 1.3554980546920083, + "grad_norm": 0.21157263219356537, + "learning_rate": 1.9419121196173175e-05, + "loss": 1.2811, + "step": 4551 + }, + { + "epoch": 1.3557959008916771, + "grad_norm": 0.23207604885101318, + "learning_rate": 1.9418797193643488e-05, + "loss": 1.2803, + "step": 4552 + }, + { + "epoch": 1.3560937470913457, + "grad_norm": 0.22329147160053253, + "learning_rate": 1.941847310348242e-05, + "loss": 1.2926, + "step": 4553 + }, + { + "epoch": 1.3563915932910144, + "grad_norm": 0.21609167754650116, + "learning_rate": 1.9418148925692978e-05, + "loss": 1.2772, + "step": 4554 + }, + { + "epoch": 1.356689439490683, + "grad_norm": 0.21065406501293182, + "learning_rate": 1.9417824660278182e-05, + "loss": 1.2991, + "step": 4555 + }, + { + "epoch": 1.3569872856903515, + "grad_norm": 0.21100889146327972, + "learning_rate": 1.9417500307241054e-05, + "loss": 1.2858, + "step": 4556 + }, + { + "epoch": 1.3572851318900203, + "grad_norm": 0.22105230391025543, + "learning_rate": 1.9417175866584605e-05, + "loss": 1.2834, + "step": 4557 + }, + { + "epoch": 1.3575829780896889, + "grad_norm": 0.2316344827413559, + "learning_rate": 1.9416851338311854e-05, + "loss": 1.2787, + "step": 4558 + }, + { + "epoch": 1.3578808242893576, + "grad_norm": 0.21157117187976837, + "learning_rate": 1.9416526722425826e-05, + "loss": 1.2716, + "step": 4559 + }, + { + "epoch": 1.3581786704890262, + "grad_norm": 0.21879585087299347, + "learning_rate": 1.9416202018929537e-05, + "loss": 1.2773, + "step": 4560 + }, + { + "epoch": 1.358476516688695, + "grad_norm": 0.2180936336517334, + "learning_rate": 1.9415877227826007e-05, + "loss": 1.3113, + "step": 4561 + }, + { + "epoch": 1.3587743628883635, + "grad_norm": 0.21200427412986755, + "learning_rate": 1.9415552349118263e-05, + "loss": 1.2925, + "step": 4562 + }, + { + "epoch": 1.359072209088032, + "grad_norm": 0.22136510908603668, + "learning_rate": 1.941522738280932e-05, + "loss": 1.2855, + "step": 4563 + }, + { + "epoch": 1.3593700552877008, + "grad_norm": 0.2183605581521988, + "learning_rate": 1.9414902328902207e-05, + "loss": 1.2752, + "step": 4564 + }, + { + "epoch": 1.3596679014873694, + "grad_norm": 0.21506759524345398, + "learning_rate": 1.9414577187399947e-05, + "loss": 1.29, + "step": 4565 + }, + { + "epoch": 1.3599657476870382, + "grad_norm": 0.215349942445755, + "learning_rate": 1.9414251958305566e-05, + "loss": 1.2857, + "step": 4566 + }, + { + "epoch": 1.3602635938867067, + "grad_norm": 0.2266671359539032, + "learning_rate": 1.9413926641622086e-05, + "loss": 1.293, + "step": 4567 + }, + { + "epoch": 1.3605614400863755, + "grad_norm": 0.218679279088974, + "learning_rate": 1.9413601237352536e-05, + "loss": 1.2942, + "step": 4568 + }, + { + "epoch": 1.360859286286044, + "grad_norm": 0.2285957783460617, + "learning_rate": 1.941327574549995e-05, + "loss": 1.2833, + "step": 4569 + }, + { + "epoch": 1.3611571324857126, + "grad_norm": 0.2153051793575287, + "learning_rate": 1.9412950166067347e-05, + "loss": 1.2817, + "step": 4570 + }, + { + "epoch": 1.3614549786853813, + "grad_norm": 0.20250266790390015, + "learning_rate": 1.9412624499057755e-05, + "loss": 1.3068, + "step": 4571 + }, + { + "epoch": 1.3617528248850501, + "grad_norm": 0.21537740528583527, + "learning_rate": 1.9412298744474213e-05, + "loss": 1.3089, + "step": 4572 + }, + { + "epoch": 1.3620506710847187, + "grad_norm": 0.21063105762004852, + "learning_rate": 1.9411972902319746e-05, + "loss": 1.2839, + "step": 4573 + }, + { + "epoch": 1.3623485172843872, + "grad_norm": 0.22086693346500397, + "learning_rate": 1.9411646972597387e-05, + "loss": 1.2801, + "step": 4574 + }, + { + "epoch": 1.362646363484056, + "grad_norm": 0.22338207066059113, + "learning_rate": 1.941132095531017e-05, + "loss": 1.2862, + "step": 4575 + }, + { + "epoch": 1.3629442096837245, + "grad_norm": 0.20457307994365692, + "learning_rate": 1.9410994850461125e-05, + "loss": 1.2701, + "step": 4576 + }, + { + "epoch": 1.363242055883393, + "grad_norm": 0.20595508813858032, + "learning_rate": 1.9410668658053286e-05, + "loss": 1.273, + "step": 4577 + }, + { + "epoch": 1.3635399020830619, + "grad_norm": 0.2182329148054123, + "learning_rate": 1.941034237808969e-05, + "loss": 1.2864, + "step": 4578 + }, + { + "epoch": 1.3638377482827306, + "grad_norm": 0.21785813570022583, + "learning_rate": 1.9410016010573373e-05, + "loss": 1.2858, + "step": 4579 + }, + { + "epoch": 1.3641355944823992, + "grad_norm": 0.21548466384410858, + "learning_rate": 1.9409689555507373e-05, + "loss": 1.2748, + "step": 4580 + }, + { + "epoch": 1.3644334406820677, + "grad_norm": 0.2248479425907135, + "learning_rate": 1.940936301289472e-05, + "loss": 1.2742, + "step": 4581 + }, + { + "epoch": 1.3647312868817365, + "grad_norm": 0.21730519831180573, + "learning_rate": 1.940903638273846e-05, + "loss": 1.2864, + "step": 4582 + }, + { + "epoch": 1.365029133081405, + "grad_norm": 0.2234262079000473, + "learning_rate": 1.9408709665041627e-05, + "loss": 1.2886, + "step": 4583 + }, + { + "epoch": 1.3653269792810736, + "grad_norm": 0.21113720536231995, + "learning_rate": 1.9408382859807264e-05, + "loss": 1.2748, + "step": 4584 + }, + { + "epoch": 1.3656248254807424, + "grad_norm": 0.21111853420734406, + "learning_rate": 1.940805596703841e-05, + "loss": 1.2809, + "step": 4585 + }, + { + "epoch": 1.3659226716804111, + "grad_norm": 0.21144357323646545, + "learning_rate": 1.9407728986738107e-05, + "loss": 1.2863, + "step": 4586 + }, + { + "epoch": 1.3662205178800797, + "grad_norm": 0.20854952931404114, + "learning_rate": 1.9407401918909394e-05, + "loss": 1.279, + "step": 4587 + }, + { + "epoch": 1.3665183640797482, + "grad_norm": 0.206075519323349, + "learning_rate": 1.9407074763555317e-05, + "loss": 1.2763, + "step": 4588 + }, + { + "epoch": 1.366816210279417, + "grad_norm": 0.21549838781356812, + "learning_rate": 1.9406747520678922e-05, + "loss": 1.2895, + "step": 4589 + }, + { + "epoch": 1.3671140564790856, + "grad_norm": 0.21204976737499237, + "learning_rate": 1.9406420190283254e-05, + "loss": 1.2996, + "step": 4590 + }, + { + "epoch": 1.3674119026787541, + "grad_norm": 0.19928468763828278, + "learning_rate": 1.940609277237135e-05, + "loss": 1.2868, + "step": 4591 + }, + { + "epoch": 1.367709748878423, + "grad_norm": 0.22234809398651123, + "learning_rate": 1.9405765266946263e-05, + "loss": 1.2798, + "step": 4592 + }, + { + "epoch": 1.3680075950780917, + "grad_norm": 0.21840263903141022, + "learning_rate": 1.9405437674011042e-05, + "loss": 1.2761, + "step": 4593 + }, + { + "epoch": 1.3683054412777602, + "grad_norm": 0.19964468479156494, + "learning_rate": 1.940510999356873e-05, + "loss": 1.2694, + "step": 4594 + }, + { + "epoch": 1.3686032874774288, + "grad_norm": 0.21210229396820068, + "learning_rate": 1.9404782225622376e-05, + "loss": 1.3048, + "step": 4595 + }, + { + "epoch": 1.3689011336770975, + "grad_norm": 0.20749431848526, + "learning_rate": 1.940445437017503e-05, + "loss": 1.2658, + "step": 4596 + }, + { + "epoch": 1.369198979876766, + "grad_norm": 0.21786624193191528, + "learning_rate": 1.9404126427229745e-05, + "loss": 1.3007, + "step": 4597 + }, + { + "epoch": 1.3694968260764349, + "grad_norm": 0.2164333164691925, + "learning_rate": 1.9403798396789572e-05, + "loss": 1.267, + "step": 4598 + }, + { + "epoch": 1.3697946722761034, + "grad_norm": 0.22057907283306122, + "learning_rate": 1.940347027885756e-05, + "loss": 1.2628, + "step": 4599 + }, + { + "epoch": 1.3700925184757722, + "grad_norm": 0.21018175780773163, + "learning_rate": 1.9403142073436766e-05, + "loss": 1.301, + "step": 4600 + }, + { + "epoch": 1.3703903646754407, + "grad_norm": 0.224778413772583, + "learning_rate": 1.9402813780530235e-05, + "loss": 1.2823, + "step": 4601 + }, + { + "epoch": 1.3706882108751093, + "grad_norm": 0.22259201109409332, + "learning_rate": 1.9402485400141032e-05, + "loss": 1.2918, + "step": 4602 + }, + { + "epoch": 1.370986057074778, + "grad_norm": 0.2236838936805725, + "learning_rate": 1.9402156932272205e-05, + "loss": 1.2994, + "step": 4603 + }, + { + "epoch": 1.3712839032744466, + "grad_norm": 0.22335247695446014, + "learning_rate": 1.9401828376926813e-05, + "loss": 1.286, + "step": 4604 + }, + { + "epoch": 1.3715817494741154, + "grad_norm": 0.20863713324069977, + "learning_rate": 1.9401499734107915e-05, + "loss": 1.2834, + "step": 4605 + }, + { + "epoch": 1.371879595673784, + "grad_norm": 0.205750972032547, + "learning_rate": 1.940117100381856e-05, + "loss": 1.2711, + "step": 4606 + }, + { + "epoch": 1.3721774418734527, + "grad_norm": 0.21060913801193237, + "learning_rate": 1.940084218606182e-05, + "loss": 1.2817, + "step": 4607 + }, + { + "epoch": 1.3724752880731212, + "grad_norm": 0.22086259722709656, + "learning_rate": 1.9400513280840744e-05, + "loss": 1.2882, + "step": 4608 + }, + { + "epoch": 1.3727731342727898, + "grad_norm": 0.2112884372472763, + "learning_rate": 1.9400184288158393e-05, + "loss": 1.2789, + "step": 4609 + }, + { + "epoch": 1.3730709804724586, + "grad_norm": 0.21684275567531586, + "learning_rate": 1.9399855208017828e-05, + "loss": 1.2947, + "step": 4610 + }, + { + "epoch": 1.3733688266721271, + "grad_norm": 0.21600863337516785, + "learning_rate": 1.9399526040422114e-05, + "loss": 1.2779, + "step": 4611 + }, + { + "epoch": 1.373666672871796, + "grad_norm": 0.20554274320602417, + "learning_rate": 1.9399196785374313e-05, + "loss": 1.2645, + "step": 4612 + }, + { + "epoch": 1.3739645190714644, + "grad_norm": 0.21336773037910461, + "learning_rate": 1.939886744287749e-05, + "loss": 1.2764, + "step": 4613 + }, + { + "epoch": 1.3742623652711332, + "grad_norm": 0.22521883249282837, + "learning_rate": 1.9398538012934703e-05, + "loss": 1.2894, + "step": 4614 + }, + { + "epoch": 1.3745602114708018, + "grad_norm": 0.22535529732704163, + "learning_rate": 1.9398208495549018e-05, + "loss": 1.2914, + "step": 4615 + }, + { + "epoch": 1.3748580576704703, + "grad_norm": 0.20711123943328857, + "learning_rate": 1.9397878890723506e-05, + "loss": 1.2747, + "step": 4616 + }, + { + "epoch": 1.375155903870139, + "grad_norm": 0.21560992300510406, + "learning_rate": 1.939754919846123e-05, + "loss": 1.2903, + "step": 4617 + }, + { + "epoch": 1.3754537500698076, + "grad_norm": 0.21865800023078918, + "learning_rate": 1.9397219418765262e-05, + "loss": 1.2694, + "step": 4618 + }, + { + "epoch": 1.3757515962694764, + "grad_norm": 0.21390901505947113, + "learning_rate": 1.939688955163866e-05, + "loss": 1.2873, + "step": 4619 + }, + { + "epoch": 1.376049442469145, + "grad_norm": 0.21455232799053192, + "learning_rate": 1.9396559597084507e-05, + "loss": 1.2844, + "step": 4620 + }, + { + "epoch": 1.3763472886688137, + "grad_norm": 0.2182733714580536, + "learning_rate": 1.9396229555105863e-05, + "loss": 1.2884, + "step": 4621 + }, + { + "epoch": 1.3766451348684823, + "grad_norm": 0.2096615880727768, + "learning_rate": 1.93958994257058e-05, + "loss": 1.2687, + "step": 4622 + }, + { + "epoch": 1.3769429810681508, + "grad_norm": 0.21721862256526947, + "learning_rate": 1.9395569208887388e-05, + "loss": 1.2844, + "step": 4623 + }, + { + "epoch": 1.3772408272678196, + "grad_norm": 0.22202299535274506, + "learning_rate": 1.9395238904653706e-05, + "loss": 1.2824, + "step": 4624 + }, + { + "epoch": 1.3775386734674882, + "grad_norm": 0.21815629303455353, + "learning_rate": 1.9394908513007823e-05, + "loss": 1.2884, + "step": 4625 + }, + { + "epoch": 1.377836519667157, + "grad_norm": 0.23223276436328888, + "learning_rate": 1.939457803395281e-05, + "loss": 1.3044, + "step": 4626 + }, + { + "epoch": 1.3781343658668255, + "grad_norm": 0.22536700963974, + "learning_rate": 1.9394247467491744e-05, + "loss": 1.2686, + "step": 4627 + }, + { + "epoch": 1.3784322120664942, + "grad_norm": 0.23456823825836182, + "learning_rate": 1.9393916813627704e-05, + "loss": 1.2788, + "step": 4628 + }, + { + "epoch": 1.3787300582661628, + "grad_norm": 0.22492651641368866, + "learning_rate": 1.9393586072363765e-05, + "loss": 1.2945, + "step": 4629 + }, + { + "epoch": 1.3790279044658313, + "grad_norm": 0.21123461425304413, + "learning_rate": 1.9393255243702997e-05, + "loss": 1.2868, + "step": 4630 + }, + { + "epoch": 1.3793257506655001, + "grad_norm": 0.2123865783214569, + "learning_rate": 1.9392924327648486e-05, + "loss": 1.2751, + "step": 4631 + }, + { + "epoch": 1.3796235968651687, + "grad_norm": 0.21001622080802917, + "learning_rate": 1.939259332420331e-05, + "loss": 1.2917, + "step": 4632 + }, + { + "epoch": 1.3799214430648374, + "grad_norm": 0.21975718438625336, + "learning_rate": 1.939226223337055e-05, + "loss": 1.2732, + "step": 4633 + }, + { + "epoch": 1.380219289264506, + "grad_norm": 0.22425499558448792, + "learning_rate": 1.9391931055153278e-05, + "loss": 1.264, + "step": 4634 + }, + { + "epoch": 1.3805171354641748, + "grad_norm": 0.21482954919338226, + "learning_rate": 1.9391599789554582e-05, + "loss": 1.2774, + "step": 4635 + }, + { + "epoch": 1.3808149816638433, + "grad_norm": 0.20815294981002808, + "learning_rate": 1.9391268436577543e-05, + "loss": 1.2884, + "step": 4636 + }, + { + "epoch": 1.3811128278635119, + "grad_norm": 0.2155391275882721, + "learning_rate": 1.9390936996225247e-05, + "loss": 1.2755, + "step": 4637 + }, + { + "epoch": 1.3814106740631806, + "grad_norm": 0.22542138397693634, + "learning_rate": 1.939060546850077e-05, + "loss": 1.2994, + "step": 4638 + }, + { + "epoch": 1.3817085202628494, + "grad_norm": 0.22146451473236084, + "learning_rate": 1.9390273853407205e-05, + "loss": 1.2763, + "step": 4639 + }, + { + "epoch": 1.382006366462518, + "grad_norm": 0.2203773856163025, + "learning_rate": 1.938994215094763e-05, + "loss": 1.2944, + "step": 4640 + }, + { + "epoch": 1.3823042126621865, + "grad_norm": 0.21654027700424194, + "learning_rate": 1.9389610361125133e-05, + "loss": 1.2726, + "step": 4641 + }, + { + "epoch": 1.3826020588618553, + "grad_norm": 0.22399553656578064, + "learning_rate": 1.938927848394281e-05, + "loss": 1.2695, + "step": 4642 + }, + { + "epoch": 1.3828999050615238, + "grad_norm": 0.22837962210178375, + "learning_rate": 1.9388946519403733e-05, + "loss": 1.3121, + "step": 4643 + }, + { + "epoch": 1.3831977512611924, + "grad_norm": 0.22870467603206635, + "learning_rate": 1.9388614467511003e-05, + "loss": 1.2727, + "step": 4644 + }, + { + "epoch": 1.3834955974608611, + "grad_norm": 0.23329536616802216, + "learning_rate": 1.9388282328267703e-05, + "loss": 1.277, + "step": 4645 + }, + { + "epoch": 1.38379344366053, + "grad_norm": 0.22212761640548706, + "learning_rate": 1.9387950101676925e-05, + "loss": 1.2938, + "step": 4646 + }, + { + "epoch": 1.3840912898601985, + "grad_norm": 0.21830856800079346, + "learning_rate": 1.938761778774176e-05, + "loss": 1.2708, + "step": 4647 + }, + { + "epoch": 1.384389136059867, + "grad_norm": 0.38767769932746887, + "learning_rate": 1.93872853864653e-05, + "loss": 1.2788, + "step": 4648 + }, + { + "epoch": 1.3846869822595358, + "grad_norm": 0.22776472568511963, + "learning_rate": 1.938695289785064e-05, + "loss": 1.2966, + "step": 4649 + }, + { + "epoch": 1.3849848284592043, + "grad_norm": 0.2177894413471222, + "learning_rate": 1.9386620321900868e-05, + "loss": 1.2894, + "step": 4650 + }, + { + "epoch": 1.385282674658873, + "grad_norm": 0.21724140644073486, + "learning_rate": 1.9386287658619083e-05, + "loss": 1.2736, + "step": 4651 + }, + { + "epoch": 1.3855805208585417, + "grad_norm": 0.22220289707183838, + "learning_rate": 1.9385954908008377e-05, + "loss": 1.3074, + "step": 4652 + }, + { + "epoch": 1.3858783670582104, + "grad_norm": 0.23475584387779236, + "learning_rate": 1.938562207007185e-05, + "loss": 1.2722, + "step": 4653 + }, + { + "epoch": 1.386176213257879, + "grad_norm": 0.2162562906742096, + "learning_rate": 1.938528914481259e-05, + "loss": 1.283, + "step": 4654 + }, + { + "epoch": 1.3864740594575475, + "grad_norm": 0.21465593576431274, + "learning_rate": 1.9384956132233706e-05, + "loss": 1.2622, + "step": 4655 + }, + { + "epoch": 1.3867719056572163, + "grad_norm": 0.2208995521068573, + "learning_rate": 1.938462303233829e-05, + "loss": 1.2848, + "step": 4656 + }, + { + "epoch": 1.3870697518568849, + "grad_norm": 0.21462784707546234, + "learning_rate": 1.938428984512944e-05, + "loss": 1.284, + "step": 4657 + }, + { + "epoch": 1.3873675980565536, + "grad_norm": 0.20261240005493164, + "learning_rate": 1.938395657061026e-05, + "loss": 1.2906, + "step": 4658 + }, + { + "epoch": 1.3876654442562222, + "grad_norm": 0.2288421094417572, + "learning_rate": 1.9383623208783845e-05, + "loss": 1.2687, + "step": 4659 + }, + { + "epoch": 1.387963290455891, + "grad_norm": 0.21072624623775482, + "learning_rate": 1.9383289759653304e-05, + "loss": 1.296, + "step": 4660 + }, + { + "epoch": 1.3882611366555595, + "grad_norm": 0.22929874062538147, + "learning_rate": 1.938295622322173e-05, + "loss": 1.287, + "step": 4661 + }, + { + "epoch": 1.388558982855228, + "grad_norm": 0.22117933630943298, + "learning_rate": 1.9382622599492237e-05, + "loss": 1.2825, + "step": 4662 + }, + { + "epoch": 1.3888568290548968, + "grad_norm": 0.2204248011112213, + "learning_rate": 1.938228888846792e-05, + "loss": 1.2946, + "step": 4663 + }, + { + "epoch": 1.3891546752545654, + "grad_norm": 0.21365541219711304, + "learning_rate": 1.938195509015189e-05, + "loss": 1.2806, + "step": 4664 + }, + { + "epoch": 1.3894525214542341, + "grad_norm": 0.22503085434436798, + "learning_rate": 1.938162120454725e-05, + "loss": 1.2987, + "step": 4665 + }, + { + "epoch": 1.3897503676539027, + "grad_norm": 0.22421522438526154, + "learning_rate": 1.9381287231657105e-05, + "loss": 1.2927, + "step": 4666 + }, + { + "epoch": 1.3900482138535715, + "grad_norm": 0.21837452054023743, + "learning_rate": 1.9380953171484566e-05, + "loss": 1.2814, + "step": 4667 + }, + { + "epoch": 1.39034606005324, + "grad_norm": 0.21142376959323883, + "learning_rate": 1.9380619024032734e-05, + "loss": 1.2951, + "step": 4668 + }, + { + "epoch": 1.3906439062529086, + "grad_norm": 0.2263711541891098, + "learning_rate": 1.938028478930473e-05, + "loss": 1.2858, + "step": 4669 + }, + { + "epoch": 1.3909417524525773, + "grad_norm": 0.21461977064609528, + "learning_rate": 1.937995046730365e-05, + "loss": 1.2786, + "step": 4670 + }, + { + "epoch": 1.391239598652246, + "grad_norm": 0.21937595307826996, + "learning_rate": 1.937961605803261e-05, + "loss": 1.2999, + "step": 4671 + }, + { + "epoch": 1.3915374448519147, + "grad_norm": 0.205740287899971, + "learning_rate": 1.9379281561494726e-05, + "loss": 1.294, + "step": 4672 + }, + { + "epoch": 1.3918352910515832, + "grad_norm": 0.22866885364055634, + "learning_rate": 1.9378946977693106e-05, + "loss": 1.2815, + "step": 4673 + }, + { + "epoch": 1.392133137251252, + "grad_norm": 0.21737025678157806, + "learning_rate": 1.937861230663086e-05, + "loss": 1.2841, + "step": 4674 + }, + { + "epoch": 1.3924309834509205, + "grad_norm": 0.21780544519424438, + "learning_rate": 1.937827754831111e-05, + "loss": 1.2705, + "step": 4675 + }, + { + "epoch": 1.392728829650589, + "grad_norm": 0.21779504418373108, + "learning_rate": 1.937794270273696e-05, + "loss": 1.2892, + "step": 4676 + }, + { + "epoch": 1.3930266758502579, + "grad_norm": 0.21866978704929352, + "learning_rate": 1.9377607769911534e-05, + "loss": 1.2663, + "step": 4677 + }, + { + "epoch": 1.3933245220499264, + "grad_norm": 0.22655676305294037, + "learning_rate": 1.9377272749837944e-05, + "loss": 1.2791, + "step": 4678 + }, + { + "epoch": 1.3936223682495952, + "grad_norm": 0.20959815382957458, + "learning_rate": 1.9376937642519307e-05, + "loss": 1.2854, + "step": 4679 + }, + { + "epoch": 1.3939202144492637, + "grad_norm": 0.2104579508304596, + "learning_rate": 1.9376602447958747e-05, + "loss": 1.2846, + "step": 4680 + }, + { + "epoch": 1.3942180606489325, + "grad_norm": 0.21385684609413147, + "learning_rate": 1.937626716615937e-05, + "loss": 1.2871, + "step": 4681 + }, + { + "epoch": 1.394515906848601, + "grad_norm": 0.20603463053703308, + "learning_rate": 1.9375931797124306e-05, + "loss": 1.2704, + "step": 4682 + }, + { + "epoch": 1.3948137530482696, + "grad_norm": 0.22123649716377258, + "learning_rate": 1.9375596340856673e-05, + "loss": 1.2796, + "step": 4683 + }, + { + "epoch": 1.3951115992479384, + "grad_norm": 0.21366514265537262, + "learning_rate": 1.937526079735959e-05, + "loss": 1.2814, + "step": 4684 + }, + { + "epoch": 1.395409445447607, + "grad_norm": 0.2190808802843094, + "learning_rate": 1.937492516663618e-05, + "loss": 1.2726, + "step": 4685 + }, + { + "epoch": 1.3957072916472757, + "grad_norm": 0.21271266043186188, + "learning_rate": 1.9374589448689567e-05, + "loss": 1.2728, + "step": 4686 + }, + { + "epoch": 1.3960051378469442, + "grad_norm": 0.22519513964653015, + "learning_rate": 1.937425364352287e-05, + "loss": 1.2888, + "step": 4687 + }, + { + "epoch": 1.396302984046613, + "grad_norm": 0.21729159355163574, + "learning_rate": 1.937391775113922e-05, + "loss": 1.2858, + "step": 4688 + }, + { + "epoch": 1.3966008302462816, + "grad_norm": 0.2038884460926056, + "learning_rate": 1.9373581771541737e-05, + "loss": 1.2831, + "step": 4689 + }, + { + "epoch": 1.3968986764459501, + "grad_norm": 0.20512959361076355, + "learning_rate": 1.937324570473355e-05, + "loss": 1.2724, + "step": 4690 + }, + { + "epoch": 1.3971965226456189, + "grad_norm": 0.22382956743240356, + "learning_rate": 1.937290955071778e-05, + "loss": 1.3006, + "step": 4691 + }, + { + "epoch": 1.3974943688452874, + "grad_norm": 0.21378615498542786, + "learning_rate": 1.937257330949756e-05, + "loss": 1.2821, + "step": 4692 + }, + { + "epoch": 1.3977922150449562, + "grad_norm": 0.22013157606124878, + "learning_rate": 1.937223698107602e-05, + "loss": 1.2914, + "step": 4693 + }, + { + "epoch": 1.3980900612446248, + "grad_norm": 0.21234650909900665, + "learning_rate": 1.937190056545628e-05, + "loss": 1.274, + "step": 4694 + }, + { + "epoch": 1.3983879074442935, + "grad_norm": 0.20825199782848358, + "learning_rate": 1.9371564062641482e-05, + "loss": 1.2729, + "step": 4695 + }, + { + "epoch": 1.398685753643962, + "grad_norm": 0.2126312106847763, + "learning_rate": 1.937122747263475e-05, + "loss": 1.2862, + "step": 4696 + }, + { + "epoch": 1.3989835998436306, + "grad_norm": 0.22088484466075897, + "learning_rate": 1.9370890795439215e-05, + "loss": 1.2713, + "step": 4697 + }, + { + "epoch": 1.3992814460432994, + "grad_norm": 0.2233424335718155, + "learning_rate": 1.9370554031058013e-05, + "loss": 1.2773, + "step": 4698 + }, + { + "epoch": 1.3995792922429682, + "grad_norm": 0.21901319921016693, + "learning_rate": 1.9370217179494274e-05, + "loss": 1.2768, + "step": 4699 + }, + { + "epoch": 1.3998771384426367, + "grad_norm": 0.2273193895816803, + "learning_rate": 1.9369880240751132e-05, + "loss": 1.2725, + "step": 4700 + }, + { + "epoch": 1.4001749846423053, + "grad_norm": 0.2120569795370102, + "learning_rate": 1.9369543214831725e-05, + "loss": 1.2689, + "step": 4701 + }, + { + "epoch": 1.400472830841974, + "grad_norm": 0.2282368242740631, + "learning_rate": 1.9369206101739184e-05, + "loss": 1.2655, + "step": 4702 + }, + { + "epoch": 1.4007706770416426, + "grad_norm": 0.21750496327877045, + "learning_rate": 1.936886890147665e-05, + "loss": 1.3106, + "step": 4703 + }, + { + "epoch": 1.4010685232413111, + "grad_norm": 0.21052533388137817, + "learning_rate": 1.936853161404726e-05, + "loss": 1.2659, + "step": 4704 + }, + { + "epoch": 1.40136636944098, + "grad_norm": 0.21183964610099792, + "learning_rate": 1.9368194239454146e-05, + "loss": 1.2824, + "step": 4705 + }, + { + "epoch": 1.4016642156406487, + "grad_norm": 0.20848579704761505, + "learning_rate": 1.9367856777700455e-05, + "loss": 1.2781, + "step": 4706 + }, + { + "epoch": 1.4019620618403172, + "grad_norm": 0.2138061374425888, + "learning_rate": 1.936751922878932e-05, + "loss": 1.2687, + "step": 4707 + }, + { + "epoch": 1.4022599080399858, + "grad_norm": 0.2122078537940979, + "learning_rate": 1.936718159272389e-05, + "loss": 1.2923, + "step": 4708 + }, + { + "epoch": 1.4025577542396546, + "grad_norm": 0.2128915786743164, + "learning_rate": 1.9366843869507296e-05, + "loss": 1.2808, + "step": 4709 + }, + { + "epoch": 1.4028556004393231, + "grad_norm": 0.22548538446426392, + "learning_rate": 1.9366506059142688e-05, + "loss": 1.2771, + "step": 4710 + }, + { + "epoch": 1.4031534466389917, + "grad_norm": 0.21388880908489227, + "learning_rate": 1.9366168161633206e-05, + "loss": 1.284, + "step": 4711 + }, + { + "epoch": 1.4034512928386604, + "grad_norm": 0.21711203455924988, + "learning_rate": 1.9365830176981994e-05, + "loss": 1.2683, + "step": 4712 + }, + { + "epoch": 1.4037491390383292, + "grad_norm": 0.22926999628543854, + "learning_rate": 1.9365492105192193e-05, + "loss": 1.2879, + "step": 4713 + }, + { + "epoch": 1.4040469852379978, + "grad_norm": 0.22222432494163513, + "learning_rate": 1.936515394626695e-05, + "loss": 1.2813, + "step": 4714 + }, + { + "epoch": 1.4043448314376663, + "grad_norm": 0.3137796223163605, + "learning_rate": 1.9364815700209417e-05, + "loss": 1.2679, + "step": 4715 + }, + { + "epoch": 1.404642677637335, + "grad_norm": 0.26504454016685486, + "learning_rate": 1.9364477367022738e-05, + "loss": 1.2887, + "step": 4716 + }, + { + "epoch": 1.4049405238370036, + "grad_norm": 0.23855175077915192, + "learning_rate": 1.9364138946710057e-05, + "loss": 1.2706, + "step": 4717 + }, + { + "epoch": 1.4052383700366722, + "grad_norm": 0.2073981761932373, + "learning_rate": 1.9363800439274528e-05, + "loss": 1.2718, + "step": 4718 + }, + { + "epoch": 1.405536216236341, + "grad_norm": 0.23830677568912506, + "learning_rate": 1.9363461844719292e-05, + "loss": 1.2737, + "step": 4719 + }, + { + "epoch": 1.4058340624360097, + "grad_norm": 0.24615485966205597, + "learning_rate": 1.936312316304751e-05, + "loss": 1.283, + "step": 4720 + }, + { + "epoch": 1.4061319086356783, + "grad_norm": 0.2343778759241104, + "learning_rate": 1.9362784394262327e-05, + "loss": 1.2938, + "step": 4721 + }, + { + "epoch": 1.4064297548353468, + "grad_norm": 0.2170053869485855, + "learning_rate": 1.93624455383669e-05, + "loss": 1.2877, + "step": 4722 + }, + { + "epoch": 1.4067276010350156, + "grad_norm": 0.23598410189151764, + "learning_rate": 1.9362106595364373e-05, + "loss": 1.3068, + "step": 4723 + }, + { + "epoch": 1.4070254472346841, + "grad_norm": 0.20887908339500427, + "learning_rate": 1.9361767565257904e-05, + "loss": 1.2805, + "step": 4724 + }, + { + "epoch": 1.407323293434353, + "grad_norm": 0.20937970280647278, + "learning_rate": 1.9361428448050645e-05, + "loss": 1.3008, + "step": 4725 + }, + { + "epoch": 1.4076211396340215, + "grad_norm": 0.2293359786272049, + "learning_rate": 1.9361089243745755e-05, + "loss": 1.2878, + "step": 4726 + }, + { + "epoch": 1.4079189858336902, + "grad_norm": 0.22540301084518433, + "learning_rate": 1.9360749952346393e-05, + "loss": 1.256, + "step": 4727 + }, + { + "epoch": 1.4082168320333588, + "grad_norm": 0.22422142326831818, + "learning_rate": 1.9360410573855707e-05, + "loss": 1.2824, + "step": 4728 + }, + { + "epoch": 1.4085146782330273, + "grad_norm": 0.21460947394371033, + "learning_rate": 1.936007110827686e-05, + "loss": 1.2958, + "step": 4729 + }, + { + "epoch": 1.4088125244326961, + "grad_norm": 0.2221807837486267, + "learning_rate": 1.9359731555613012e-05, + "loss": 1.2928, + "step": 4730 + }, + { + "epoch": 1.4091103706323647, + "grad_norm": 0.23458969593048096, + "learning_rate": 1.9359391915867315e-05, + "loss": 1.2903, + "step": 4731 + }, + { + "epoch": 1.4094082168320334, + "grad_norm": 0.22301706671714783, + "learning_rate": 1.9359052189042932e-05, + "loss": 1.2735, + "step": 4732 + }, + { + "epoch": 1.409706063031702, + "grad_norm": 0.21750102937221527, + "learning_rate": 1.9358712375143026e-05, + "loss": 1.2828, + "step": 4733 + }, + { + "epoch": 1.4100039092313708, + "grad_norm": 0.24715521931648254, + "learning_rate": 1.9358372474170763e-05, + "loss": 1.2917, + "step": 4734 + }, + { + "epoch": 1.4103017554310393, + "grad_norm": 0.21691952645778656, + "learning_rate": 1.9358032486129296e-05, + "loss": 1.2782, + "step": 4735 + }, + { + "epoch": 1.4105996016307079, + "grad_norm": 0.23581166565418243, + "learning_rate": 1.935769241102179e-05, + "loss": 1.2858, + "step": 4736 + }, + { + "epoch": 1.4108974478303766, + "grad_norm": 0.221417635679245, + "learning_rate": 1.935735224885141e-05, + "loss": 1.3084, + "step": 4737 + }, + { + "epoch": 1.4111952940300452, + "grad_norm": 0.21256232261657715, + "learning_rate": 1.9357011999621326e-05, + "loss": 1.2831, + "step": 4738 + }, + { + "epoch": 1.411493140229714, + "grad_norm": 0.21626178920269012, + "learning_rate": 1.9356671663334697e-05, + "loss": 1.2882, + "step": 4739 + }, + { + "epoch": 1.4117909864293825, + "grad_norm": 0.21899373829364777, + "learning_rate": 1.9356331239994698e-05, + "loss": 1.2654, + "step": 4740 + }, + { + "epoch": 1.4120888326290513, + "grad_norm": 0.23099561035633087, + "learning_rate": 1.9355990729604482e-05, + "loss": 1.2698, + "step": 4741 + }, + { + "epoch": 1.4123866788287198, + "grad_norm": 0.21576914191246033, + "learning_rate": 1.9355650132167228e-05, + "loss": 1.2881, + "step": 4742 + }, + { + "epoch": 1.4126845250283884, + "grad_norm": 0.21933506429195404, + "learning_rate": 1.9355309447686107e-05, + "loss": 1.2817, + "step": 4743 + }, + { + "epoch": 1.4129823712280571, + "grad_norm": 0.23537559807300568, + "learning_rate": 1.935496867616428e-05, + "loss": 1.2902, + "step": 4744 + }, + { + "epoch": 1.4132802174277257, + "grad_norm": 0.21506574749946594, + "learning_rate": 1.9354627817604922e-05, + "loss": 1.2804, + "step": 4745 + }, + { + "epoch": 1.4135780636273945, + "grad_norm": 0.20947204530239105, + "learning_rate": 1.93542868720112e-05, + "loss": 1.272, + "step": 4746 + }, + { + "epoch": 1.413875909827063, + "grad_norm": 0.21577829122543335, + "learning_rate": 1.9353945839386297e-05, + "loss": 1.294, + "step": 4747 + }, + { + "epoch": 1.4141737560267318, + "grad_norm": 0.22076302766799927, + "learning_rate": 1.9353604719733373e-05, + "loss": 1.2829, + "step": 4748 + }, + { + "epoch": 1.4144716022264003, + "grad_norm": 0.2247573882341385, + "learning_rate": 1.935326351305561e-05, + "loss": 1.273, + "step": 4749 + }, + { + "epoch": 1.4147694484260689, + "grad_norm": 0.2100122720003128, + "learning_rate": 1.935292221935618e-05, + "loss": 1.2947, + "step": 4750 + }, + { + "epoch": 1.4150672946257377, + "grad_norm": 0.2099503129720688, + "learning_rate": 1.9352580838638258e-05, + "loss": 1.2953, + "step": 4751 + }, + { + "epoch": 1.4153651408254062, + "grad_norm": 0.21044223010540009, + "learning_rate": 1.9352239370905025e-05, + "loss": 1.2674, + "step": 4752 + }, + { + "epoch": 1.415662987025075, + "grad_norm": 0.2194964736700058, + "learning_rate": 1.935189781615965e-05, + "loss": 1.2783, + "step": 4753 + }, + { + "epoch": 1.4159608332247435, + "grad_norm": 0.21921008825302124, + "learning_rate": 1.935155617440531e-05, + "loss": 1.2807, + "step": 4754 + }, + { + "epoch": 1.4162586794244123, + "grad_norm": 0.2217089831829071, + "learning_rate": 1.9351214445645193e-05, + "loss": 1.2916, + "step": 4755 + }, + { + "epoch": 1.4165565256240809, + "grad_norm": 0.2105427384376526, + "learning_rate": 1.935087262988247e-05, + "loss": 1.2699, + "step": 4756 + }, + { + "epoch": 1.4168543718237494, + "grad_norm": 0.2272225320339203, + "learning_rate": 1.935053072712033e-05, + "loss": 1.2817, + "step": 4757 + }, + { + "epoch": 1.4171522180234182, + "grad_norm": 0.22575689852237701, + "learning_rate": 1.9350188737361947e-05, + "loss": 1.2978, + "step": 4758 + }, + { + "epoch": 1.4174500642230867, + "grad_norm": 0.2152029424905777, + "learning_rate": 1.93498466606105e-05, + "loss": 1.2723, + "step": 4759 + }, + { + "epoch": 1.4177479104227555, + "grad_norm": 0.22033043205738068, + "learning_rate": 1.9349504496869177e-05, + "loss": 1.2911, + "step": 4760 + }, + { + "epoch": 1.418045756622424, + "grad_norm": 0.21459448337554932, + "learning_rate": 1.9349162246141165e-05, + "loss": 1.282, + "step": 4761 + }, + { + "epoch": 1.4183436028220928, + "grad_norm": 0.2216905653476715, + "learning_rate": 1.934881990842964e-05, + "loss": 1.2976, + "step": 4762 + }, + { + "epoch": 1.4186414490217614, + "grad_norm": 0.21749120950698853, + "learning_rate": 1.9348477483737792e-05, + "loss": 1.2664, + "step": 4763 + }, + { + "epoch": 1.41893929522143, + "grad_norm": 0.2163579910993576, + "learning_rate": 1.934813497206881e-05, + "loss": 1.2757, + "step": 4764 + }, + { + "epoch": 1.4192371414210987, + "grad_norm": 0.2084561586380005, + "learning_rate": 1.934779237342587e-05, + "loss": 1.2762, + "step": 4765 + }, + { + "epoch": 1.4195349876207675, + "grad_norm": 0.22256968915462494, + "learning_rate": 1.934744968781217e-05, + "loss": 1.2815, + "step": 4766 + }, + { + "epoch": 1.419832833820436, + "grad_norm": 0.2200809270143509, + "learning_rate": 1.934710691523089e-05, + "loss": 1.3122, + "step": 4767 + }, + { + "epoch": 1.4201306800201046, + "grad_norm": 0.23718221485614777, + "learning_rate": 1.9346764055685224e-05, + "loss": 1.2699, + "step": 4768 + }, + { + "epoch": 1.4204285262197733, + "grad_norm": 0.20865055918693542, + "learning_rate": 1.9346421109178365e-05, + "loss": 1.2726, + "step": 4769 + }, + { + "epoch": 1.4207263724194419, + "grad_norm": 0.21131010353565216, + "learning_rate": 1.9346078075713498e-05, + "loss": 1.2908, + "step": 4770 + }, + { + "epoch": 1.4210242186191104, + "grad_norm": 0.22388875484466553, + "learning_rate": 1.9345734955293817e-05, + "loss": 1.2942, + "step": 4771 + }, + { + "epoch": 1.4213220648187792, + "grad_norm": 0.2043249011039734, + "learning_rate": 1.9345391747922515e-05, + "loss": 1.2744, + "step": 4772 + }, + { + "epoch": 1.421619911018448, + "grad_norm": 0.22854197025299072, + "learning_rate": 1.9345048453602782e-05, + "loss": 1.2865, + "step": 4773 + }, + { + "epoch": 1.4219177572181165, + "grad_norm": 0.20839452743530273, + "learning_rate": 1.9344705072337815e-05, + "loss": 1.2639, + "step": 4774 + }, + { + "epoch": 1.422215603417785, + "grad_norm": 0.21528539061546326, + "learning_rate": 1.9344361604130807e-05, + "loss": 1.2805, + "step": 4775 + }, + { + "epoch": 1.4225134496174539, + "grad_norm": 0.20581011474132538, + "learning_rate": 1.9344018048984955e-05, + "loss": 1.2673, + "step": 4776 + }, + { + "epoch": 1.4228112958171224, + "grad_norm": 0.22256039083003998, + "learning_rate": 1.9343674406903455e-05, + "loss": 1.2769, + "step": 4777 + }, + { + "epoch": 1.423109142016791, + "grad_norm": 0.22472919523715973, + "learning_rate": 1.9343330677889504e-05, + "loss": 1.2981, + "step": 4778 + }, + { + "epoch": 1.4234069882164597, + "grad_norm": 0.21348482370376587, + "learning_rate": 1.9342986861946303e-05, + "loss": 1.278, + "step": 4779 + }, + { + "epoch": 1.4237048344161285, + "grad_norm": 0.21071229875087738, + "learning_rate": 1.9342642959077044e-05, + "loss": 1.2877, + "step": 4780 + }, + { + "epoch": 1.424002680615797, + "grad_norm": 0.2112603634595871, + "learning_rate": 1.9342298969284932e-05, + "loss": 1.2761, + "step": 4781 + }, + { + "epoch": 1.4243005268154656, + "grad_norm": 0.21463853120803833, + "learning_rate": 1.9341954892573165e-05, + "loss": 1.2662, + "step": 4782 + }, + { + "epoch": 1.4245983730151344, + "grad_norm": 0.20784740149974823, + "learning_rate": 1.9341610728944945e-05, + "loss": 1.2803, + "step": 4783 + }, + { + "epoch": 1.424896219214803, + "grad_norm": 0.21706542372703552, + "learning_rate": 1.9341266478403474e-05, + "loss": 1.2885, + "step": 4784 + }, + { + "epoch": 1.4251940654144715, + "grad_norm": 0.2044822871685028, + "learning_rate": 1.9340922140951954e-05, + "loss": 1.2825, + "step": 4785 + }, + { + "epoch": 1.4254919116141402, + "grad_norm": 0.21804824471473694, + "learning_rate": 1.9340577716593593e-05, + "loss": 1.2635, + "step": 4786 + }, + { + "epoch": 1.425789757813809, + "grad_norm": 0.2229154109954834, + "learning_rate": 1.934023320533159e-05, + "loss": 1.2888, + "step": 4787 + }, + { + "epoch": 1.4260876040134776, + "grad_norm": 0.23313355445861816, + "learning_rate": 1.9339888607169152e-05, + "loss": 1.2746, + "step": 4788 + }, + { + "epoch": 1.4263854502131461, + "grad_norm": 0.2325923591852188, + "learning_rate": 1.9339543922109487e-05, + "loss": 1.2875, + "step": 4789 + }, + { + "epoch": 1.4266832964128149, + "grad_norm": 0.21830318868160248, + "learning_rate": 1.9339199150155804e-05, + "loss": 1.2793, + "step": 4790 + }, + { + "epoch": 1.4269811426124834, + "grad_norm": 0.21931092441082, + "learning_rate": 1.93388542913113e-05, + "loss": 1.2778, + "step": 4791 + }, + { + "epoch": 1.4272789888121522, + "grad_norm": 0.2196022868156433, + "learning_rate": 1.9338509345579196e-05, + "loss": 1.2791, + "step": 4792 + }, + { + "epoch": 1.4275768350118208, + "grad_norm": 0.2201681137084961, + "learning_rate": 1.9338164312962694e-05, + "loss": 1.2953, + "step": 4793 + }, + { + "epoch": 1.4278746812114895, + "grad_norm": 0.2099919468164444, + "learning_rate": 1.9337819193465007e-05, + "loss": 1.2716, + "step": 4794 + }, + { + "epoch": 1.428172527411158, + "grad_norm": 0.21924568712711334, + "learning_rate": 1.9337473987089346e-05, + "loss": 1.2851, + "step": 4795 + }, + { + "epoch": 1.4284703736108266, + "grad_norm": 0.21668729186058044, + "learning_rate": 1.933712869383892e-05, + "loss": 1.2812, + "step": 4796 + }, + { + "epoch": 1.4287682198104954, + "grad_norm": 0.20736163854599, + "learning_rate": 1.9336783313716946e-05, + "loss": 1.2678, + "step": 4797 + }, + { + "epoch": 1.429066066010164, + "grad_norm": 0.20503593981266022, + "learning_rate": 1.9336437846726634e-05, + "loss": 1.2802, + "step": 4798 + }, + { + "epoch": 1.4293639122098327, + "grad_norm": 0.23220981657505035, + "learning_rate": 1.9336092292871197e-05, + "loss": 1.2623, + "step": 4799 + }, + { + "epoch": 1.4296617584095013, + "grad_norm": 0.22154389321804047, + "learning_rate": 1.9335746652153856e-05, + "loss": 1.2771, + "step": 4800 + }, + { + "epoch": 1.42995960460917, + "grad_norm": 0.2188727706670761, + "learning_rate": 1.933540092457782e-05, + "loss": 1.2755, + "step": 4801 + }, + { + "epoch": 1.4302574508088386, + "grad_norm": 0.22202670574188232, + "learning_rate": 1.933505511014631e-05, + "loss": 1.2695, + "step": 4802 + }, + { + "epoch": 1.4305552970085071, + "grad_norm": 0.21409845352172852, + "learning_rate": 1.9334709208862537e-05, + "loss": 1.2808, + "step": 4803 + }, + { + "epoch": 1.430853143208176, + "grad_norm": 0.21846888959407806, + "learning_rate": 1.9334363220729733e-05, + "loss": 1.268, + "step": 4804 + }, + { + "epoch": 1.4311509894078445, + "grad_norm": 0.22371357679367065, + "learning_rate": 1.9334017145751102e-05, + "loss": 1.2848, + "step": 4805 + }, + { + "epoch": 1.4314488356075132, + "grad_norm": 0.21155185997486115, + "learning_rate": 1.9333670983929872e-05, + "loss": 1.2774, + "step": 4806 + }, + { + "epoch": 1.4317466818071818, + "grad_norm": 0.21358896791934967, + "learning_rate": 1.933332473526926e-05, + "loss": 1.2747, + "step": 4807 + }, + { + "epoch": 1.4320445280068506, + "grad_norm": 0.21508902311325073, + "learning_rate": 1.9332978399772493e-05, + "loss": 1.2974, + "step": 4808 + }, + { + "epoch": 1.432342374206519, + "grad_norm": 0.22894176840782166, + "learning_rate": 1.9332631977442787e-05, + "loss": 1.275, + "step": 4809 + }, + { + "epoch": 1.4326402204061877, + "grad_norm": 0.21540142595767975, + "learning_rate": 1.9332285468283368e-05, + "loss": 1.2794, + "step": 4810 + }, + { + "epoch": 1.4329380666058564, + "grad_norm": 0.21178120374679565, + "learning_rate": 1.9331938872297454e-05, + "loss": 1.2854, + "step": 4811 + }, + { + "epoch": 1.433235912805525, + "grad_norm": 0.21985602378845215, + "learning_rate": 1.9331592189488285e-05, + "loss": 1.2958, + "step": 4812 + }, + { + "epoch": 1.4335337590051938, + "grad_norm": 0.21311499178409576, + "learning_rate": 1.933124541985907e-05, + "loss": 1.2702, + "step": 4813 + }, + { + "epoch": 1.4338316052048623, + "grad_norm": 0.21507811546325684, + "learning_rate": 1.933089856341304e-05, + "loss": 1.2707, + "step": 4814 + }, + { + "epoch": 1.434129451404531, + "grad_norm": 0.22037610411643982, + "learning_rate": 1.933055162015343e-05, + "loss": 1.2806, + "step": 4815 + }, + { + "epoch": 1.4344272976041996, + "grad_norm": 0.21492549777030945, + "learning_rate": 1.9330204590083457e-05, + "loss": 1.2773, + "step": 4816 + }, + { + "epoch": 1.4347251438038682, + "grad_norm": 0.21301521360874176, + "learning_rate": 1.9329857473206355e-05, + "loss": 1.2853, + "step": 4817 + }, + { + "epoch": 1.435022990003537, + "grad_norm": 0.2319442629814148, + "learning_rate": 1.9329510269525358e-05, + "loss": 1.2878, + "step": 4818 + }, + { + "epoch": 1.4353208362032055, + "grad_norm": 0.21806570887565613, + "learning_rate": 1.9329162979043687e-05, + "loss": 1.2735, + "step": 4819 + }, + { + "epoch": 1.4356186824028743, + "grad_norm": 0.21726150810718536, + "learning_rate": 1.9328815601764577e-05, + "loss": 1.3067, + "step": 4820 + }, + { + "epoch": 1.4359165286025428, + "grad_norm": 0.2200784981250763, + "learning_rate": 1.9328468137691266e-05, + "loss": 1.303, + "step": 4821 + }, + { + "epoch": 1.4362143748022116, + "grad_norm": 0.21538648009300232, + "learning_rate": 1.9328120586826977e-05, + "loss": 1.2801, + "step": 4822 + }, + { + "epoch": 1.4365122210018801, + "grad_norm": 0.22035753726959229, + "learning_rate": 1.9327772949174948e-05, + "loss": 1.2773, + "step": 4823 + }, + { + "epoch": 1.4368100672015487, + "grad_norm": 0.21646147966384888, + "learning_rate": 1.9327425224738413e-05, + "loss": 1.2709, + "step": 4824 + }, + { + "epoch": 1.4371079134012175, + "grad_norm": 0.22500768303871155, + "learning_rate": 1.932707741352061e-05, + "loss": 1.2805, + "step": 4825 + }, + { + "epoch": 1.437405759600886, + "grad_norm": 0.2312770038843155, + "learning_rate": 1.9326729515524772e-05, + "loss": 1.2925, + "step": 4826 + }, + { + "epoch": 1.4377036058005548, + "grad_norm": 0.22772246599197388, + "learning_rate": 1.9326381530754134e-05, + "loss": 1.2726, + "step": 4827 + }, + { + "epoch": 1.4380014520002233, + "grad_norm": 0.2224140763282776, + "learning_rate": 1.932603345921194e-05, + "loss": 1.2772, + "step": 4828 + }, + { + "epoch": 1.438299298199892, + "grad_norm": 0.22427061200141907, + "learning_rate": 1.932568530090142e-05, + "loss": 1.3016, + "step": 4829 + }, + { + "epoch": 1.4385971443995607, + "grad_norm": 0.21568401157855988, + "learning_rate": 1.9325337055825818e-05, + "loss": 1.2926, + "step": 4830 + }, + { + "epoch": 1.4388949905992292, + "grad_norm": 0.22633923590183258, + "learning_rate": 1.9324988723988377e-05, + "loss": 1.2857, + "step": 4831 + }, + { + "epoch": 1.439192836798898, + "grad_norm": 0.22848960757255554, + "learning_rate": 1.932464030539233e-05, + "loss": 1.2615, + "step": 4832 + }, + { + "epoch": 1.4394906829985668, + "grad_norm": 0.21749244630336761, + "learning_rate": 1.9324291800040927e-05, + "loss": 1.2802, + "step": 4833 + }, + { + "epoch": 1.4397885291982353, + "grad_norm": 0.2067568451166153, + "learning_rate": 1.9323943207937404e-05, + "loss": 1.2853, + "step": 4834 + }, + { + "epoch": 1.4400863753979039, + "grad_norm": 0.21011200547218323, + "learning_rate": 1.9323594529085005e-05, + "loss": 1.2806, + "step": 4835 + }, + { + "epoch": 1.4403842215975726, + "grad_norm": 0.21135860681533813, + "learning_rate": 1.9323245763486982e-05, + "loss": 1.2732, + "step": 4836 + }, + { + "epoch": 1.4406820677972412, + "grad_norm": 0.21184229850769043, + "learning_rate": 1.932289691114657e-05, + "loss": 1.2818, + "step": 4837 + }, + { + "epoch": 1.4409799139969097, + "grad_norm": 0.22322779893875122, + "learning_rate": 1.9322547972067016e-05, + "loss": 1.2677, + "step": 4838 + }, + { + "epoch": 1.4412777601965785, + "grad_norm": 0.21357326209545135, + "learning_rate": 1.9322198946251572e-05, + "loss": 1.2705, + "step": 4839 + }, + { + "epoch": 1.4415756063962473, + "grad_norm": 0.21662479639053345, + "learning_rate": 1.9321849833703484e-05, + "loss": 1.2731, + "step": 4840 + }, + { + "epoch": 1.4418734525959158, + "grad_norm": 0.22164317965507507, + "learning_rate": 1.9321500634425995e-05, + "loss": 1.2734, + "step": 4841 + }, + { + "epoch": 1.4421712987955844, + "grad_norm": 0.273386687040329, + "learning_rate": 1.9321151348422358e-05, + "loss": 1.289, + "step": 4842 + }, + { + "epoch": 1.4424691449952531, + "grad_norm": 0.22557945549488068, + "learning_rate": 1.932080197569582e-05, + "loss": 1.2689, + "step": 4843 + }, + { + "epoch": 1.4427669911949217, + "grad_norm": 0.214786559343338, + "learning_rate": 1.932045251624964e-05, + "loss": 1.2862, + "step": 4844 + }, + { + "epoch": 1.4430648373945902, + "grad_norm": 0.2258441299200058, + "learning_rate": 1.9320102970087055e-05, + "loss": 1.286, + "step": 4845 + }, + { + "epoch": 1.443362683594259, + "grad_norm": 0.21301913261413574, + "learning_rate": 1.9319753337211327e-05, + "loss": 1.2759, + "step": 4846 + }, + { + "epoch": 1.4436605297939278, + "grad_norm": 0.21130412817001343, + "learning_rate": 1.9319403617625707e-05, + "loss": 1.2989, + "step": 4847 + }, + { + "epoch": 1.4439583759935963, + "grad_norm": 0.2133743315935135, + "learning_rate": 1.931905381133345e-05, + "loss": 1.2857, + "step": 4848 + }, + { + "epoch": 1.4442562221932649, + "grad_norm": 0.21618391573429108, + "learning_rate": 1.9318703918337807e-05, + "loss": 1.2911, + "step": 4849 + }, + { + "epoch": 1.4445540683929337, + "grad_norm": 0.21383893489837646, + "learning_rate": 1.9318353938642037e-05, + "loss": 1.2873, + "step": 4850 + }, + { + "epoch": 1.4448519145926022, + "grad_norm": 0.21067416667938232, + "learning_rate": 1.9318003872249398e-05, + "loss": 1.2893, + "step": 4851 + }, + { + "epoch": 1.4451497607922708, + "grad_norm": 0.21318969130516052, + "learning_rate": 1.9317653719163137e-05, + "loss": 1.2833, + "step": 4852 + }, + { + "epoch": 1.4454476069919395, + "grad_norm": 0.21748173236846924, + "learning_rate": 1.9317303479386523e-05, + "loss": 1.2693, + "step": 4853 + }, + { + "epoch": 1.4457454531916083, + "grad_norm": 0.22137188911437988, + "learning_rate": 1.931695315292281e-05, + "loss": 1.2914, + "step": 4854 + }, + { + "epoch": 1.4460432993912768, + "grad_norm": 0.22523652017116547, + "learning_rate": 1.9316602739775255e-05, + "loss": 1.2787, + "step": 4855 + }, + { + "epoch": 1.4463411455909454, + "grad_norm": 0.2076699584722519, + "learning_rate": 1.9316252239947123e-05, + "loss": 1.265, + "step": 4856 + }, + { + "epoch": 1.4466389917906142, + "grad_norm": 0.2129327356815338, + "learning_rate": 1.9315901653441672e-05, + "loss": 1.271, + "step": 4857 + }, + { + "epoch": 1.4469368379902827, + "grad_norm": 0.21853697299957275, + "learning_rate": 1.931555098026216e-05, + "loss": 1.2663, + "step": 4858 + }, + { + "epoch": 1.4472346841899515, + "grad_norm": 0.25575652718544006, + "learning_rate": 1.9315200220411862e-05, + "loss": 1.2753, + "step": 4859 + }, + { + "epoch": 1.44753253038962, + "grad_norm": 0.2197348028421402, + "learning_rate": 1.931484937389403e-05, + "loss": 1.2602, + "step": 4860 + }, + { + "epoch": 1.4478303765892888, + "grad_norm": 0.23306891322135925, + "learning_rate": 1.9314498440711928e-05, + "loss": 1.2772, + "step": 4861 + }, + { + "epoch": 1.4481282227889574, + "grad_norm": 0.2237478792667389, + "learning_rate": 1.931414742086883e-05, + "loss": 1.283, + "step": 4862 + }, + { + "epoch": 1.448426068988626, + "grad_norm": 0.21376194059848785, + "learning_rate": 1.9313796314367995e-05, + "loss": 1.2757, + "step": 4863 + }, + { + "epoch": 1.4487239151882947, + "grad_norm": 0.2342880517244339, + "learning_rate": 1.9313445121212692e-05, + "loss": 1.2813, + "step": 4864 + }, + { + "epoch": 1.4490217613879632, + "grad_norm": 0.21437346935272217, + "learning_rate": 1.9313093841406186e-05, + "loss": 1.2851, + "step": 4865 + }, + { + "epoch": 1.449319607587632, + "grad_norm": 0.2158830463886261, + "learning_rate": 1.9312742474951747e-05, + "loss": 1.2785, + "step": 4866 + }, + { + "epoch": 1.4496174537873006, + "grad_norm": 0.20946374535560608, + "learning_rate": 1.9312391021852644e-05, + "loss": 1.28, + "step": 4867 + }, + { + "epoch": 1.4499152999869693, + "grad_norm": 0.21851618587970734, + "learning_rate": 1.9312039482112147e-05, + "loss": 1.2771, + "step": 4868 + }, + { + "epoch": 1.4502131461866379, + "grad_norm": 0.23499397933483124, + "learning_rate": 1.931168785573353e-05, + "loss": 1.3018, + "step": 4869 + }, + { + "epoch": 1.4505109923863064, + "grad_norm": 0.2345985770225525, + "learning_rate": 1.931133614272006e-05, + "loss": 1.2812, + "step": 4870 + }, + { + "epoch": 1.4508088385859752, + "grad_norm": 0.21713724732398987, + "learning_rate": 1.9310984343075006e-05, + "loss": 1.2889, + "step": 4871 + }, + { + "epoch": 1.4511066847856438, + "grad_norm": 0.2307348996400833, + "learning_rate": 1.931063245680165e-05, + "loss": 1.2784, + "step": 4872 + }, + { + "epoch": 1.4514045309853125, + "grad_norm": 0.22369886934757233, + "learning_rate": 1.931028048390326e-05, + "loss": 1.2824, + "step": 4873 + }, + { + "epoch": 1.451702377184981, + "grad_norm": 0.2125367820262909, + "learning_rate": 1.930992842438311e-05, + "loss": 1.2762, + "step": 4874 + }, + { + "epoch": 1.4520002233846498, + "grad_norm": 0.23226135969161987, + "learning_rate": 1.930957627824448e-05, + "loss": 1.2888, + "step": 4875 + }, + { + "epoch": 1.4522980695843184, + "grad_norm": 0.22335903346538544, + "learning_rate": 1.9309224045490643e-05, + "loss": 1.2783, + "step": 4876 + }, + { + "epoch": 1.452595915783987, + "grad_norm": 0.21399128437042236, + "learning_rate": 1.9308871726124877e-05, + "loss": 1.2811, + "step": 4877 + }, + { + "epoch": 1.4528937619836557, + "grad_norm": 0.2137995958328247, + "learning_rate": 1.9308519320150463e-05, + "loss": 1.283, + "step": 4878 + }, + { + "epoch": 1.4531916081833243, + "grad_norm": 0.21936951577663422, + "learning_rate": 1.9308166827570674e-05, + "loss": 1.2782, + "step": 4879 + }, + { + "epoch": 1.453489454382993, + "grad_norm": 0.22406361997127533, + "learning_rate": 1.9307814248388793e-05, + "loss": 1.293, + "step": 4880 + }, + { + "epoch": 1.4537873005826616, + "grad_norm": 0.21134215593338013, + "learning_rate": 1.93074615826081e-05, + "loss": 1.2849, + "step": 4881 + }, + { + "epoch": 1.4540851467823304, + "grad_norm": 0.2193523794412613, + "learning_rate": 1.9307108830231878e-05, + "loss": 1.2924, + "step": 4882 + }, + { + "epoch": 1.454382992981999, + "grad_norm": 0.21330083906650543, + "learning_rate": 1.9306755991263403e-05, + "loss": 1.2797, + "step": 4883 + }, + { + "epoch": 1.4546808391816675, + "grad_norm": 0.21060821413993835, + "learning_rate": 1.9306403065705965e-05, + "loss": 1.288, + "step": 4884 + }, + { + "epoch": 1.4549786853813362, + "grad_norm": 0.25724872946739197, + "learning_rate": 1.930605005356284e-05, + "loss": 1.2907, + "step": 4885 + }, + { + "epoch": 1.4552765315810048, + "grad_norm": 0.20756061375141144, + "learning_rate": 1.930569695483732e-05, + "loss": 1.2775, + "step": 4886 + }, + { + "epoch": 1.4555743777806736, + "grad_norm": 0.21263720095157623, + "learning_rate": 1.9305343769532686e-05, + "loss": 1.2834, + "step": 4887 + }, + { + "epoch": 1.455872223980342, + "grad_norm": 0.21779103577136993, + "learning_rate": 1.9304990497652224e-05, + "loss": 1.2674, + "step": 4888 + }, + { + "epoch": 1.4561700701800109, + "grad_norm": 0.21574203670024872, + "learning_rate": 1.9304637139199225e-05, + "loss": 1.2655, + "step": 4889 + }, + { + "epoch": 1.4564679163796794, + "grad_norm": 0.21916961669921875, + "learning_rate": 1.930428369417697e-05, + "loss": 1.2925, + "step": 4890 + }, + { + "epoch": 1.456765762579348, + "grad_norm": 0.21140176057815552, + "learning_rate": 1.930393016258875e-05, + "loss": 1.288, + "step": 4891 + }, + { + "epoch": 1.4570636087790168, + "grad_norm": 0.2188664972782135, + "learning_rate": 1.9303576544437854e-05, + "loss": 1.2685, + "step": 4892 + }, + { + "epoch": 1.4573614549786853, + "grad_norm": 0.22733062505722046, + "learning_rate": 1.9303222839727575e-05, + "loss": 1.2931, + "step": 4893 + }, + { + "epoch": 1.457659301178354, + "grad_norm": 0.21536576747894287, + "learning_rate": 1.93028690484612e-05, + "loss": 1.296, + "step": 4894 + }, + { + "epoch": 1.4579571473780226, + "grad_norm": 0.2092432826757431, + "learning_rate": 1.9302515170642026e-05, + "loss": 1.2639, + "step": 4895 + }, + { + "epoch": 1.4582549935776914, + "grad_norm": 0.22613051533699036, + "learning_rate": 1.930216120627334e-05, + "loss": 1.2741, + "step": 4896 + }, + { + "epoch": 1.45855283977736, + "grad_norm": 0.22264140844345093, + "learning_rate": 1.9301807155358437e-05, + "loss": 1.2829, + "step": 4897 + }, + { + "epoch": 1.4588506859770285, + "grad_norm": 0.23095308244228363, + "learning_rate": 1.9301453017900608e-05, + "loss": 1.279, + "step": 4898 + }, + { + "epoch": 1.4591485321766973, + "grad_norm": 0.2456885427236557, + "learning_rate": 1.9301098793903153e-05, + "loss": 1.277, + "step": 4899 + }, + { + "epoch": 1.459446378376366, + "grad_norm": 0.2145146131515503, + "learning_rate": 1.9300744483369363e-05, + "loss": 1.2852, + "step": 4900 + }, + { + "epoch": 1.4597442245760346, + "grad_norm": 0.2277066856622696, + "learning_rate": 1.9300390086302542e-05, + "loss": 1.2805, + "step": 4901 + }, + { + "epoch": 1.4600420707757031, + "grad_norm": 0.2226993590593338, + "learning_rate": 1.930003560270598e-05, + "loss": 1.2858, + "step": 4902 + }, + { + "epoch": 1.460339916975372, + "grad_norm": 0.2158169001340866, + "learning_rate": 1.9299681032582978e-05, + "loss": 1.2857, + "step": 4903 + }, + { + "epoch": 1.4606377631750405, + "grad_norm": 0.22127355635166168, + "learning_rate": 1.929932637593683e-05, + "loss": 1.2768, + "step": 4904 + }, + { + "epoch": 1.460935609374709, + "grad_norm": 0.2199540138244629, + "learning_rate": 1.9298971632770844e-05, + "loss": 1.2587, + "step": 4905 + }, + { + "epoch": 1.4612334555743778, + "grad_norm": 0.2244499772787094, + "learning_rate": 1.9298616803088318e-05, + "loss": 1.2593, + "step": 4906 + }, + { + "epoch": 1.4615313017740466, + "grad_norm": 0.2019004076719284, + "learning_rate": 1.9298261886892547e-05, + "loss": 1.2638, + "step": 4907 + }, + { + "epoch": 1.461829147973715, + "grad_norm": 0.2197563350200653, + "learning_rate": 1.929790688418684e-05, + "loss": 1.2976, + "step": 4908 + }, + { + "epoch": 1.4621269941733837, + "grad_norm": 0.229153573513031, + "learning_rate": 1.92975517949745e-05, + "loss": 1.2667, + "step": 4909 + }, + { + "epoch": 1.4624248403730524, + "grad_norm": 0.2259681075811386, + "learning_rate": 1.9297196619258826e-05, + "loss": 1.2761, + "step": 4910 + }, + { + "epoch": 1.462722686572721, + "grad_norm": 0.22611136734485626, + "learning_rate": 1.9296841357043124e-05, + "loss": 1.2703, + "step": 4911 + }, + { + "epoch": 1.4630205327723895, + "grad_norm": 0.2154085338115692, + "learning_rate": 1.92964860083307e-05, + "loss": 1.2735, + "step": 4912 + }, + { + "epoch": 1.4633183789720583, + "grad_norm": 0.2122606784105301, + "learning_rate": 1.9296130573124862e-05, + "loss": 1.2843, + "step": 4913 + }, + { + "epoch": 1.463616225171727, + "grad_norm": 0.23453332483768463, + "learning_rate": 1.9295775051428914e-05, + "loss": 1.2977, + "step": 4914 + }, + { + "epoch": 1.4639140713713956, + "grad_norm": 0.2286519557237625, + "learning_rate": 1.929541944324617e-05, + "loss": 1.2814, + "step": 4915 + }, + { + "epoch": 1.4642119175710642, + "grad_norm": 0.23382240533828735, + "learning_rate": 1.929506374857993e-05, + "loss": 1.2839, + "step": 4916 + }, + { + "epoch": 1.464509763770733, + "grad_norm": 0.21530047059059143, + "learning_rate": 1.9294707967433503e-05, + "loss": 1.2797, + "step": 4917 + }, + { + "epoch": 1.4648076099704015, + "grad_norm": 0.22245480120182037, + "learning_rate": 1.9294352099810207e-05, + "loss": 1.2932, + "step": 4918 + }, + { + "epoch": 1.46510545617007, + "grad_norm": 0.23918482661247253, + "learning_rate": 1.9293996145713348e-05, + "loss": 1.2843, + "step": 4919 + }, + { + "epoch": 1.4654033023697388, + "grad_norm": 0.2126484215259552, + "learning_rate": 1.929364010514624e-05, + "loss": 1.2931, + "step": 4920 + }, + { + "epoch": 1.4657011485694076, + "grad_norm": 0.2203528881072998, + "learning_rate": 1.929328397811219e-05, + "loss": 1.263, + "step": 4921 + }, + { + "epoch": 1.4659989947690761, + "grad_norm": 0.21826790273189545, + "learning_rate": 1.929292776461452e-05, + "loss": 1.2683, + "step": 4922 + }, + { + "epoch": 1.4662968409687447, + "grad_norm": 0.2075348049402237, + "learning_rate": 1.9292571464656538e-05, + "loss": 1.271, + "step": 4923 + }, + { + "epoch": 1.4665946871684135, + "grad_norm": 0.23244334757328033, + "learning_rate": 1.9292215078241564e-05, + "loss": 1.3111, + "step": 4924 + }, + { + "epoch": 1.466892533368082, + "grad_norm": 0.2200518697500229, + "learning_rate": 1.929185860537291e-05, + "loss": 1.2752, + "step": 4925 + }, + { + "epoch": 1.4671903795677508, + "grad_norm": 0.2329683154821396, + "learning_rate": 1.9291502046053888e-05, + "loss": 1.2694, + "step": 4926 + }, + { + "epoch": 1.4674882257674193, + "grad_norm": 0.22009992599487305, + "learning_rate": 1.9291145400287824e-05, + "loss": 1.2882, + "step": 4927 + }, + { + "epoch": 1.467786071967088, + "grad_norm": 0.21958021819591522, + "learning_rate": 1.9290788668078032e-05, + "loss": 1.3023, + "step": 4928 + }, + { + "epoch": 1.4680839181667567, + "grad_norm": 0.22004231810569763, + "learning_rate": 1.9290431849427834e-05, + "loss": 1.2749, + "step": 4929 + }, + { + "epoch": 1.4683817643664252, + "grad_norm": 0.21612748503684998, + "learning_rate": 1.9290074944340545e-05, + "loss": 1.2767, + "step": 4930 + }, + { + "epoch": 1.468679610566094, + "grad_norm": 0.21939951181411743, + "learning_rate": 1.9289717952819487e-05, + "loss": 1.2978, + "step": 4931 + }, + { + "epoch": 1.4689774567657625, + "grad_norm": 0.21950466930866241, + "learning_rate": 1.9289360874867987e-05, + "loss": 1.2717, + "step": 4932 + }, + { + "epoch": 1.4692753029654313, + "grad_norm": 0.21353593468666077, + "learning_rate": 1.928900371048936e-05, + "loss": 1.2974, + "step": 4933 + }, + { + "epoch": 1.4695731491650998, + "grad_norm": 0.22219714522361755, + "learning_rate": 1.928864645968693e-05, + "loss": 1.2641, + "step": 4934 + }, + { + "epoch": 1.4698709953647686, + "grad_norm": 0.21758060157299042, + "learning_rate": 1.9288289122464026e-05, + "loss": 1.2755, + "step": 4935 + }, + { + "epoch": 1.4701688415644372, + "grad_norm": 0.2182830423116684, + "learning_rate": 1.9287931698823964e-05, + "loss": 1.2664, + "step": 4936 + }, + { + "epoch": 1.4704666877641057, + "grad_norm": 0.22116537392139435, + "learning_rate": 1.9287574188770078e-05, + "loss": 1.2765, + "step": 4937 + }, + { + "epoch": 1.4707645339637745, + "grad_norm": 0.19969645142555237, + "learning_rate": 1.9287216592305692e-05, + "loss": 1.2877, + "step": 4938 + }, + { + "epoch": 1.471062380163443, + "grad_norm": 0.22789418697357178, + "learning_rate": 1.928685890943413e-05, + "loss": 1.2806, + "step": 4939 + }, + { + "epoch": 1.4713602263631118, + "grad_norm": 0.21748170256614685, + "learning_rate": 1.9286501140158727e-05, + "loss": 1.2716, + "step": 4940 + }, + { + "epoch": 1.4716580725627804, + "grad_norm": 0.23064807057380676, + "learning_rate": 1.92861432844828e-05, + "loss": 1.2832, + "step": 4941 + }, + { + "epoch": 1.4719559187624491, + "grad_norm": 0.25822216272354126, + "learning_rate": 1.9285785342409687e-05, + "loss": 1.2671, + "step": 4942 + }, + { + "epoch": 1.4722537649621177, + "grad_norm": 0.24876217544078827, + "learning_rate": 1.9285427313942717e-05, + "loss": 1.299, + "step": 4943 + }, + { + "epoch": 1.4725516111617862, + "grad_norm": 0.22561317682266235, + "learning_rate": 1.928506919908522e-05, + "loss": 1.2657, + "step": 4944 + }, + { + "epoch": 1.472849457361455, + "grad_norm": 0.2996424436569214, + "learning_rate": 1.928471099784053e-05, + "loss": 1.295, + "step": 4945 + }, + { + "epoch": 1.4731473035611236, + "grad_norm": 0.23014278709888458, + "learning_rate": 1.928435271021197e-05, + "loss": 1.2757, + "step": 4946 + }, + { + "epoch": 1.4734451497607923, + "grad_norm": 0.23713409900665283, + "learning_rate": 1.9283994336202888e-05, + "loss": 1.2852, + "step": 4947 + }, + { + "epoch": 1.4737429959604609, + "grad_norm": 0.22407498955726624, + "learning_rate": 1.928363587581661e-05, + "loss": 1.2806, + "step": 4948 + }, + { + "epoch": 1.4740408421601296, + "grad_norm": 0.21705381572246552, + "learning_rate": 1.928327732905647e-05, + "loss": 1.2663, + "step": 4949 + }, + { + "epoch": 1.4743386883597982, + "grad_norm": 0.23391960561275482, + "learning_rate": 1.928291869592581e-05, + "loss": 1.2617, + "step": 4950 + }, + { + "epoch": 1.4746365345594667, + "grad_norm": 0.20994536578655243, + "learning_rate": 1.9282559976427962e-05, + "loss": 1.2909, + "step": 4951 + }, + { + "epoch": 1.4749343807591355, + "grad_norm": 0.2166874259710312, + "learning_rate": 1.9282201170566265e-05, + "loss": 1.2705, + "step": 4952 + }, + { + "epoch": 1.475232226958804, + "grad_norm": 0.21621519327163696, + "learning_rate": 1.9281842278344053e-05, + "loss": 1.2761, + "step": 4953 + }, + { + "epoch": 1.4755300731584728, + "grad_norm": 0.22492529451847076, + "learning_rate": 1.928148329976467e-05, + "loss": 1.2766, + "step": 4954 + }, + { + "epoch": 1.4758279193581414, + "grad_norm": 0.22125613689422607, + "learning_rate": 1.9281124234831458e-05, + "loss": 1.2636, + "step": 4955 + }, + { + "epoch": 1.4761257655578102, + "grad_norm": 0.22913573682308197, + "learning_rate": 1.9280765083547753e-05, + "loss": 1.2956, + "step": 4956 + }, + { + "epoch": 1.4764236117574787, + "grad_norm": 0.21469125151634216, + "learning_rate": 1.9280405845916896e-05, + "loss": 1.2854, + "step": 4957 + }, + { + "epoch": 1.4767214579571473, + "grad_norm": 0.20909914374351501, + "learning_rate": 1.928004652194223e-05, + "loss": 1.2871, + "step": 4958 + }, + { + "epoch": 1.477019304156816, + "grad_norm": 0.2232624590396881, + "learning_rate": 1.9279687111627107e-05, + "loss": 1.2907, + "step": 4959 + }, + { + "epoch": 1.4773171503564846, + "grad_norm": 0.22202903032302856, + "learning_rate": 1.927932761497486e-05, + "loss": 1.2755, + "step": 4960 + }, + { + "epoch": 1.4776149965561534, + "grad_norm": 0.2193203866481781, + "learning_rate": 1.9278968031988835e-05, + "loss": 1.2709, + "step": 4961 + }, + { + "epoch": 1.477912842755822, + "grad_norm": 0.22183677554130554, + "learning_rate": 1.9278608362672376e-05, + "loss": 1.2912, + "step": 4962 + }, + { + "epoch": 1.4782106889554907, + "grad_norm": 0.22596612572669983, + "learning_rate": 1.927824860702884e-05, + "loss": 1.2716, + "step": 4963 + }, + { + "epoch": 1.4785085351551592, + "grad_norm": 0.21625077724456787, + "learning_rate": 1.9277888765061562e-05, + "loss": 1.2833, + "step": 4964 + }, + { + "epoch": 1.4788063813548278, + "grad_norm": 0.241022989153862, + "learning_rate": 1.92775288367739e-05, + "loss": 1.2767, + "step": 4965 + }, + { + "epoch": 1.4791042275544966, + "grad_norm": 0.21953821182250977, + "learning_rate": 1.9277168822169194e-05, + "loss": 1.2712, + "step": 4966 + }, + { + "epoch": 1.4794020737541653, + "grad_norm": 0.21410316228866577, + "learning_rate": 1.9276808721250798e-05, + "loss": 1.2912, + "step": 4967 + }, + { + "epoch": 1.4796999199538339, + "grad_norm": 0.21265076100826263, + "learning_rate": 1.9276448534022057e-05, + "loss": 1.2726, + "step": 4968 + }, + { + "epoch": 1.4799977661535024, + "grad_norm": 0.22445210814476013, + "learning_rate": 1.9276088260486335e-05, + "loss": 1.2824, + "step": 4969 + }, + { + "epoch": 1.4802956123531712, + "grad_norm": 0.21892309188842773, + "learning_rate": 1.927572790064697e-05, + "loss": 1.2789, + "step": 4970 + }, + { + "epoch": 1.4805934585528397, + "grad_norm": 0.2171856313943863, + "learning_rate": 1.9275367454507324e-05, + "loss": 1.2932, + "step": 4971 + }, + { + "epoch": 1.4808913047525083, + "grad_norm": 0.21604934334754944, + "learning_rate": 1.9275006922070743e-05, + "loss": 1.2669, + "step": 4972 + }, + { + "epoch": 1.481189150952177, + "grad_norm": 0.2263178676366806, + "learning_rate": 1.9274646303340587e-05, + "loss": 1.278, + "step": 4973 + }, + { + "epoch": 1.4814869971518458, + "grad_norm": 0.22390708327293396, + "learning_rate": 1.927428559832021e-05, + "loss": 1.2886, + "step": 4974 + }, + { + "epoch": 1.4817848433515144, + "grad_norm": 0.21440237760543823, + "learning_rate": 1.9273924807012966e-05, + "loss": 1.2955, + "step": 4975 + }, + { + "epoch": 1.482082689551183, + "grad_norm": 0.215138778090477, + "learning_rate": 1.9273563929422216e-05, + "loss": 1.2796, + "step": 4976 + }, + { + "epoch": 1.4823805357508517, + "grad_norm": 0.21976342797279358, + "learning_rate": 1.927320296555131e-05, + "loss": 1.2593, + "step": 4977 + }, + { + "epoch": 1.4826783819505203, + "grad_norm": 0.21736851334571838, + "learning_rate": 1.9272841915403612e-05, + "loss": 1.28, + "step": 4978 + }, + { + "epoch": 1.4829762281501888, + "grad_norm": 0.21572822332382202, + "learning_rate": 1.9272480778982484e-05, + "loss": 1.2929, + "step": 4979 + }, + { + "epoch": 1.4832740743498576, + "grad_norm": 0.22615979611873627, + "learning_rate": 1.927211955629128e-05, + "loss": 1.2785, + "step": 4980 + }, + { + "epoch": 1.4835719205495264, + "grad_norm": 0.2202790528535843, + "learning_rate": 1.9271758247333362e-05, + "loss": 1.2781, + "step": 4981 + }, + { + "epoch": 1.483869766749195, + "grad_norm": 0.22541899979114532, + "learning_rate": 1.9271396852112094e-05, + "loss": 1.2638, + "step": 4982 + }, + { + "epoch": 1.4841676129488635, + "grad_norm": 0.22249038517475128, + "learning_rate": 1.9271035370630838e-05, + "loss": 1.2735, + "step": 4983 + }, + { + "epoch": 1.4844654591485322, + "grad_norm": 0.25325918197631836, + "learning_rate": 1.9270673802892954e-05, + "loss": 1.2839, + "step": 4984 + }, + { + "epoch": 1.4847633053482008, + "grad_norm": 0.21583291888237, + "learning_rate": 1.927031214890181e-05, + "loss": 1.2605, + "step": 4985 + }, + { + "epoch": 1.4850611515478696, + "grad_norm": 0.23504725098609924, + "learning_rate": 1.9269950408660766e-05, + "loss": 1.2756, + "step": 4986 + }, + { + "epoch": 1.485358997747538, + "grad_norm": 0.2224356085062027, + "learning_rate": 1.926958858217319e-05, + "loss": 1.2851, + "step": 4987 + }, + { + "epoch": 1.4856568439472069, + "grad_norm": 0.23366491496562958, + "learning_rate": 1.926922666944245e-05, + "loss": 1.2689, + "step": 4988 + }, + { + "epoch": 1.4859546901468754, + "grad_norm": 0.2041642814874649, + "learning_rate": 1.9268864670471914e-05, + "loss": 1.2746, + "step": 4989 + }, + { + "epoch": 1.486252536346544, + "grad_norm": 0.21593394875526428, + "learning_rate": 1.9268502585264946e-05, + "loss": 1.2777, + "step": 4990 + }, + { + "epoch": 1.4865503825462127, + "grad_norm": 0.21973749995231628, + "learning_rate": 1.9268140413824915e-05, + "loss": 1.2757, + "step": 4991 + }, + { + "epoch": 1.4868482287458813, + "grad_norm": 0.22880889475345612, + "learning_rate": 1.9267778156155198e-05, + "loss": 1.2888, + "step": 4992 + }, + { + "epoch": 1.48714607494555, + "grad_norm": 0.2150540053844452, + "learning_rate": 1.9267415812259157e-05, + "loss": 1.2677, + "step": 4993 + }, + { + "epoch": 1.4874439211452186, + "grad_norm": 0.23506714403629303, + "learning_rate": 1.9267053382140166e-05, + "loss": 1.2886, + "step": 4994 + }, + { + "epoch": 1.4877417673448874, + "grad_norm": 0.22009918093681335, + "learning_rate": 1.9266690865801597e-05, + "loss": 1.2688, + "step": 4995 + }, + { + "epoch": 1.488039613544556, + "grad_norm": 0.229282408952713, + "learning_rate": 1.9266328263246824e-05, + "loss": 1.2643, + "step": 4996 + }, + { + "epoch": 1.4883374597442245, + "grad_norm": 0.21965311467647552, + "learning_rate": 1.9265965574479218e-05, + "loss": 1.2814, + "step": 4997 + }, + { + "epoch": 1.4886353059438933, + "grad_norm": 0.2273985892534256, + "learning_rate": 1.9265602799502154e-05, + "loss": 1.293, + "step": 4998 + }, + { + "epoch": 1.4889331521435618, + "grad_norm": 0.22290019690990448, + "learning_rate": 1.926523993831901e-05, + "loss": 1.2965, + "step": 4999 + }, + { + "epoch": 1.4892309983432306, + "grad_norm": 0.20859937369823456, + "learning_rate": 1.9264876990933156e-05, + "loss": 1.284, + "step": 5000 + }, + { + "epoch": 1.4892309983432306, + "eval_loss": 1.350777506828308, + "eval_runtime": 21.2077, + "eval_samples_per_second": 81.763, + "eval_steps_per_second": 5.14, + "step": 5000 + }, + { + "epoch": 1.4895288445428991, + "grad_norm": 0.2520102262496948, + "learning_rate": 1.9264513957347978e-05, + "loss": 1.2765, + "step": 5001 + }, + { + "epoch": 1.489826690742568, + "grad_norm": 0.23392906785011292, + "learning_rate": 1.9264150837566847e-05, + "loss": 1.2827, + "step": 5002 + }, + { + "epoch": 1.4901245369422365, + "grad_norm": 0.22240546345710754, + "learning_rate": 1.9263787631593144e-05, + "loss": 1.2861, + "step": 5003 + }, + { + "epoch": 1.490422383141905, + "grad_norm": 0.2256360948085785, + "learning_rate": 1.9263424339430244e-05, + "loss": 1.2811, + "step": 5004 + }, + { + "epoch": 1.4907202293415738, + "grad_norm": 0.2463768869638443, + "learning_rate": 1.926306096108153e-05, + "loss": 1.2646, + "step": 5005 + }, + { + "epoch": 1.4910180755412423, + "grad_norm": 0.22218027710914612, + "learning_rate": 1.9262697496550388e-05, + "loss": 1.2816, + "step": 5006 + }, + { + "epoch": 1.491315921740911, + "grad_norm": 0.2227209061384201, + "learning_rate": 1.926233394584019e-05, + "loss": 1.2874, + "step": 5007 + }, + { + "epoch": 1.4916137679405796, + "grad_norm": 0.22535954415798187, + "learning_rate": 1.9261970308954326e-05, + "loss": 1.2805, + "step": 5008 + }, + { + "epoch": 1.4919116141402484, + "grad_norm": 0.21909268200397491, + "learning_rate": 1.9261606585896174e-05, + "loss": 1.2841, + "step": 5009 + }, + { + "epoch": 1.492209460339917, + "grad_norm": 0.22318336367607117, + "learning_rate": 1.9261242776669123e-05, + "loss": 1.2798, + "step": 5010 + }, + { + "epoch": 1.4925073065395855, + "grad_norm": 0.2382373809814453, + "learning_rate": 1.9260878881276555e-05, + "loss": 1.2983, + "step": 5011 + }, + { + "epoch": 1.4928051527392543, + "grad_norm": 0.21087752282619476, + "learning_rate": 1.9260514899721854e-05, + "loss": 1.2786, + "step": 5012 + }, + { + "epoch": 1.4931029989389228, + "grad_norm": 0.2340303212404251, + "learning_rate": 1.9260150832008408e-05, + "loss": 1.2782, + "step": 5013 + }, + { + "epoch": 1.4934008451385916, + "grad_norm": 0.21488603949546814, + "learning_rate": 1.9259786678139605e-05, + "loss": 1.3042, + "step": 5014 + }, + { + "epoch": 1.4936986913382602, + "grad_norm": 0.23216792941093445, + "learning_rate": 1.9259422438118835e-05, + "loss": 1.2624, + "step": 5015 + }, + { + "epoch": 1.493996537537929, + "grad_norm": 0.221286341547966, + "learning_rate": 1.9259058111949483e-05, + "loss": 1.286, + "step": 5016 + }, + { + "epoch": 1.4942943837375975, + "grad_norm": 0.22693414986133575, + "learning_rate": 1.9258693699634937e-05, + "loss": 1.275, + "step": 5017 + }, + { + "epoch": 1.494592229937266, + "grad_norm": 0.2372429221868515, + "learning_rate": 1.9258329201178596e-05, + "loss": 1.2769, + "step": 5018 + }, + { + "epoch": 1.4948900761369348, + "grad_norm": 0.23893238604068756, + "learning_rate": 1.9257964616583843e-05, + "loss": 1.2803, + "step": 5019 + }, + { + "epoch": 1.4951879223366034, + "grad_norm": 0.21460749208927155, + "learning_rate": 1.9257599945854073e-05, + "loss": 1.2807, + "step": 5020 + }, + { + "epoch": 1.4954857685362721, + "grad_norm": 0.23852066695690155, + "learning_rate": 1.9257235188992676e-05, + "loss": 1.2846, + "step": 5021 + }, + { + "epoch": 1.4957836147359407, + "grad_norm": 0.22247886657714844, + "learning_rate": 1.9256870346003055e-05, + "loss": 1.2744, + "step": 5022 + }, + { + "epoch": 1.4960814609356095, + "grad_norm": 0.22894200682640076, + "learning_rate": 1.9256505416888595e-05, + "loss": 1.2793, + "step": 5023 + }, + { + "epoch": 1.496379307135278, + "grad_norm": 0.2209431231021881, + "learning_rate": 1.925614040165269e-05, + "loss": 1.2655, + "step": 5024 + }, + { + "epoch": 1.4966771533349466, + "grad_norm": 0.23314248025417328, + "learning_rate": 1.9255775300298744e-05, + "loss": 1.2803, + "step": 5025 + }, + { + "epoch": 1.4969749995346153, + "grad_norm": 0.22377417981624603, + "learning_rate": 1.9255410112830148e-05, + "loss": 1.2884, + "step": 5026 + }, + { + "epoch": 1.497272845734284, + "grad_norm": 0.22946637868881226, + "learning_rate": 1.92550448392503e-05, + "loss": 1.2714, + "step": 5027 + }, + { + "epoch": 1.4975706919339526, + "grad_norm": 0.21838387846946716, + "learning_rate": 1.9254679479562607e-05, + "loss": 1.2738, + "step": 5028 + }, + { + "epoch": 1.4978685381336212, + "grad_norm": 0.2359665036201477, + "learning_rate": 1.9254314033770456e-05, + "loss": 1.272, + "step": 5029 + }, + { + "epoch": 1.49816638433329, + "grad_norm": 0.22120609879493713, + "learning_rate": 1.9253948501877257e-05, + "loss": 1.2799, + "step": 5030 + }, + { + "epoch": 1.4984642305329585, + "grad_norm": 0.22768308222293854, + "learning_rate": 1.9253582883886398e-05, + "loss": 1.2829, + "step": 5031 + }, + { + "epoch": 1.498762076732627, + "grad_norm": 0.23316188156604767, + "learning_rate": 1.9253217179801297e-05, + "loss": 1.2654, + "step": 5032 + }, + { + "epoch": 1.4990599229322958, + "grad_norm": 0.22771909832954407, + "learning_rate": 1.9252851389625343e-05, + "loss": 1.2654, + "step": 5033 + }, + { + "epoch": 1.4993577691319646, + "grad_norm": 0.24694979190826416, + "learning_rate": 1.925248551336195e-05, + "loss": 1.2577, + "step": 5034 + }, + { + "epoch": 1.4996556153316332, + "grad_norm": 0.2150806188583374, + "learning_rate": 1.9252119551014516e-05, + "loss": 1.2957, + "step": 5035 + }, + { + "epoch": 1.4999534615313017, + "grad_norm": 0.22292621433734894, + "learning_rate": 1.9251753502586443e-05, + "loss": 1.2721, + "step": 5036 + }, + { + "epoch": 1.5002513077309705, + "grad_norm": 0.22536718845367432, + "learning_rate": 1.925138736808114e-05, + "loss": 1.2795, + "step": 5037 + }, + { + "epoch": 1.500549153930639, + "grad_norm": 0.226253941655159, + "learning_rate": 1.9251021147502016e-05, + "loss": 1.2814, + "step": 5038 + }, + { + "epoch": 1.5008470001303076, + "grad_norm": 0.23530824482440948, + "learning_rate": 1.9250654840852476e-05, + "loss": 1.2819, + "step": 5039 + }, + { + "epoch": 1.5011448463299764, + "grad_norm": 0.22629491984844208, + "learning_rate": 1.9250288448135928e-05, + "loss": 1.2728, + "step": 5040 + }, + { + "epoch": 1.5014426925296451, + "grad_norm": 0.2234068661928177, + "learning_rate": 1.924992196935578e-05, + "loss": 1.2817, + "step": 5041 + }, + { + "epoch": 1.5017405387293137, + "grad_norm": 0.21893714368343353, + "learning_rate": 1.9249555404515444e-05, + "loss": 1.286, + "step": 5042 + }, + { + "epoch": 1.5020383849289822, + "grad_norm": 0.21677549183368683, + "learning_rate": 1.9249188753618328e-05, + "loss": 1.2842, + "step": 5043 + }, + { + "epoch": 1.502336231128651, + "grad_norm": 0.2146213948726654, + "learning_rate": 1.9248822016667844e-05, + "loss": 1.2838, + "step": 5044 + }, + { + "epoch": 1.5026340773283196, + "grad_norm": 0.2185962051153183, + "learning_rate": 1.924845519366741e-05, + "loss": 1.2771, + "step": 5045 + }, + { + "epoch": 1.502931923527988, + "grad_norm": 0.24388039112091064, + "learning_rate": 1.9248088284620428e-05, + "loss": 1.2765, + "step": 5046 + }, + { + "epoch": 1.5032297697276569, + "grad_norm": 0.2095615118741989, + "learning_rate": 1.9247721289530318e-05, + "loss": 1.2818, + "step": 5047 + }, + { + "epoch": 1.5035276159273256, + "grad_norm": 0.23394814133644104, + "learning_rate": 1.9247354208400492e-05, + "loss": 1.2735, + "step": 5048 + }, + { + "epoch": 1.5038254621269942, + "grad_norm": 0.21744844317436218, + "learning_rate": 1.9246987041234372e-05, + "loss": 1.2812, + "step": 5049 + }, + { + "epoch": 1.5041233083266627, + "grad_norm": 0.23882654309272766, + "learning_rate": 1.9246619788035363e-05, + "loss": 1.2803, + "step": 5050 + }, + { + "epoch": 1.5044211545263315, + "grad_norm": 0.21487607061862946, + "learning_rate": 1.924625244880689e-05, + "loss": 1.2784, + "step": 5051 + }, + { + "epoch": 1.504719000726, + "grad_norm": 0.24575792253017426, + "learning_rate": 1.924588502355237e-05, + "loss": 1.3009, + "step": 5052 + }, + { + "epoch": 1.5050168469256686, + "grad_norm": 0.2168632447719574, + "learning_rate": 1.9245517512275217e-05, + "loss": 1.2894, + "step": 5053 + }, + { + "epoch": 1.5053146931253374, + "grad_norm": 0.2268664389848709, + "learning_rate": 1.9245149914978854e-05, + "loss": 1.2835, + "step": 5054 + }, + { + "epoch": 1.5056125393250062, + "grad_norm": 0.2149057686328888, + "learning_rate": 1.9244782231666703e-05, + "loss": 1.2715, + "step": 5055 + }, + { + "epoch": 1.5059103855246747, + "grad_norm": 0.22716905176639557, + "learning_rate": 1.9244414462342184e-05, + "loss": 1.2766, + "step": 5056 + }, + { + "epoch": 1.5062082317243433, + "grad_norm": 0.22673822939395905, + "learning_rate": 1.924404660700871e-05, + "loss": 1.2717, + "step": 5057 + }, + { + "epoch": 1.506506077924012, + "grad_norm": 0.21987012028694153, + "learning_rate": 1.9243678665669715e-05, + "loss": 1.2825, + "step": 5058 + }, + { + "epoch": 1.5068039241236806, + "grad_norm": 0.22434185445308685, + "learning_rate": 1.924331063832862e-05, + "loss": 1.2698, + "step": 5059 + }, + { + "epoch": 1.5071017703233491, + "grad_norm": 0.23307372629642487, + "learning_rate": 1.9242942524988842e-05, + "loss": 1.2887, + "step": 5060 + }, + { + "epoch": 1.507399616523018, + "grad_norm": 0.2240176945924759, + "learning_rate": 1.924257432565381e-05, + "loss": 1.2878, + "step": 5061 + }, + { + "epoch": 1.5076974627226867, + "grad_norm": 0.23216986656188965, + "learning_rate": 1.924220604032695e-05, + "loss": 1.29, + "step": 5062 + }, + { + "epoch": 1.5079953089223552, + "grad_norm": 0.22601667046546936, + "learning_rate": 1.9241837669011694e-05, + "loss": 1.2878, + "step": 5063 + }, + { + "epoch": 1.5082931551220238, + "grad_norm": 0.21720650792121887, + "learning_rate": 1.924146921171146e-05, + "loss": 1.2929, + "step": 5064 + }, + { + "epoch": 1.5085910013216925, + "grad_norm": 0.22596383094787598, + "learning_rate": 1.9241100668429685e-05, + "loss": 1.2751, + "step": 5065 + }, + { + "epoch": 1.5088888475213613, + "grad_norm": 0.2102445662021637, + "learning_rate": 1.9240732039169786e-05, + "loss": 1.2879, + "step": 5066 + }, + { + "epoch": 1.5091866937210296, + "grad_norm": 0.24772226810455322, + "learning_rate": 1.9240363323935206e-05, + "loss": 1.2576, + "step": 5067 + }, + { + "epoch": 1.5094845399206984, + "grad_norm": 0.217487633228302, + "learning_rate": 1.9239994522729364e-05, + "loss": 1.2735, + "step": 5068 + }, + { + "epoch": 1.5097823861203672, + "grad_norm": 0.2197350114583969, + "learning_rate": 1.9239625635555698e-05, + "loss": 1.2637, + "step": 5069 + }, + { + "epoch": 1.5100802323200357, + "grad_norm": 0.22306449711322784, + "learning_rate": 1.923925666241764e-05, + "loss": 1.2759, + "step": 5070 + }, + { + "epoch": 1.5103780785197043, + "grad_norm": 0.2273397296667099, + "learning_rate": 1.9238887603318625e-05, + "loss": 1.2785, + "step": 5071 + }, + { + "epoch": 1.510675924719373, + "grad_norm": 0.21052822470664978, + "learning_rate": 1.9238518458262075e-05, + "loss": 1.2786, + "step": 5072 + }, + { + "epoch": 1.5109737709190418, + "grad_norm": 0.21630816161632538, + "learning_rate": 1.9238149227251437e-05, + "loss": 1.2927, + "step": 5073 + }, + { + "epoch": 1.5112716171187102, + "grad_norm": 0.20805077254772186, + "learning_rate": 1.9237779910290144e-05, + "loss": 1.285, + "step": 5074 + }, + { + "epoch": 1.511569463318379, + "grad_norm": 0.22114484012126923, + "learning_rate": 1.9237410507381623e-05, + "loss": 1.2876, + "step": 5075 + }, + { + "epoch": 1.5118673095180477, + "grad_norm": 0.22389502823352814, + "learning_rate": 1.9237041018529325e-05, + "loss": 1.2795, + "step": 5076 + }, + { + "epoch": 1.5121651557177163, + "grad_norm": 0.21567100286483765, + "learning_rate": 1.9236671443736677e-05, + "loss": 1.2854, + "step": 5077 + }, + { + "epoch": 1.5124630019173848, + "grad_norm": 0.22371292114257812, + "learning_rate": 1.9236301783007123e-05, + "loss": 1.2769, + "step": 5078 + }, + { + "epoch": 1.5127608481170536, + "grad_norm": 0.22854968905448914, + "learning_rate": 1.9235932036344097e-05, + "loss": 1.2843, + "step": 5079 + }, + { + "epoch": 1.5130586943167224, + "grad_norm": 0.21412812173366547, + "learning_rate": 1.9235562203751047e-05, + "loss": 1.2641, + "step": 5080 + }, + { + "epoch": 1.513356540516391, + "grad_norm": 0.22908322513103485, + "learning_rate": 1.923519228523141e-05, + "loss": 1.2798, + "step": 5081 + }, + { + "epoch": 1.5136543867160595, + "grad_norm": 0.21367821097373962, + "learning_rate": 1.923482228078862e-05, + "loss": 1.2798, + "step": 5082 + }, + { + "epoch": 1.5139522329157282, + "grad_norm": 0.33910495042800903, + "learning_rate": 1.923445219042613e-05, + "loss": 1.2733, + "step": 5083 + }, + { + "epoch": 1.5142500791153968, + "grad_norm": 0.23164407908916473, + "learning_rate": 1.923408201414738e-05, + "loss": 1.2749, + "step": 5084 + }, + { + "epoch": 1.5145479253150653, + "grad_norm": 0.23809514939785004, + "learning_rate": 1.9233711751955815e-05, + "loss": 1.2724, + "step": 5085 + }, + { + "epoch": 1.514845771514734, + "grad_norm": 0.20901332795619965, + "learning_rate": 1.9233341403854877e-05, + "loss": 1.2691, + "step": 5086 + }, + { + "epoch": 1.5151436177144029, + "grad_norm": 0.22775524854660034, + "learning_rate": 1.9232970969848013e-05, + "loss": 1.2832, + "step": 5087 + }, + { + "epoch": 1.5154414639140714, + "grad_norm": 0.21273073554039001, + "learning_rate": 1.9232600449938673e-05, + "loss": 1.2783, + "step": 5088 + }, + { + "epoch": 1.51573931011374, + "grad_norm": 0.22500090301036835, + "learning_rate": 1.92322298441303e-05, + "loss": 1.3053, + "step": 5089 + }, + { + "epoch": 1.5160371563134087, + "grad_norm": 0.20830382406711578, + "learning_rate": 1.923185915242634e-05, + "loss": 1.2751, + "step": 5090 + }, + { + "epoch": 1.5163350025130773, + "grad_norm": 0.21803244948387146, + "learning_rate": 1.9231488374830247e-05, + "loss": 1.2693, + "step": 5091 + }, + { + "epoch": 1.5166328487127458, + "grad_norm": 0.21517494320869446, + "learning_rate": 1.923111751134547e-05, + "loss": 1.2797, + "step": 5092 + }, + { + "epoch": 1.5169306949124146, + "grad_norm": 0.2202211320400238, + "learning_rate": 1.9230746561975455e-05, + "loss": 1.2711, + "step": 5093 + }, + { + "epoch": 1.5172285411120834, + "grad_norm": 0.22223502397537231, + "learning_rate": 1.9230375526723657e-05, + "loss": 1.2846, + "step": 5094 + }, + { + "epoch": 1.517526387311752, + "grad_norm": 0.21127375960350037, + "learning_rate": 1.923000440559353e-05, + "loss": 1.2882, + "step": 5095 + }, + { + "epoch": 1.5178242335114205, + "grad_norm": 0.21672438085079193, + "learning_rate": 1.922963319858852e-05, + "loss": 1.281, + "step": 5096 + }, + { + "epoch": 1.5181220797110893, + "grad_norm": 0.2225533276796341, + "learning_rate": 1.9229261905712092e-05, + "loss": 1.2891, + "step": 5097 + }, + { + "epoch": 1.5184199259107578, + "grad_norm": 0.22136686742305756, + "learning_rate": 1.9228890526967688e-05, + "loss": 1.299, + "step": 5098 + }, + { + "epoch": 1.5187177721104264, + "grad_norm": 0.22122547030448914, + "learning_rate": 1.922851906235877e-05, + "loss": 1.2862, + "step": 5099 + }, + { + "epoch": 1.5190156183100951, + "grad_norm": 0.2353043109178543, + "learning_rate": 1.9228147511888795e-05, + "loss": 1.2834, + "step": 5100 + }, + { + "epoch": 1.519313464509764, + "grad_norm": 0.21503135561943054, + "learning_rate": 1.9227775875561218e-05, + "loss": 1.2634, + "step": 5101 + }, + { + "epoch": 1.5196113107094325, + "grad_norm": 0.21588893234729767, + "learning_rate": 1.9227404153379492e-05, + "loss": 1.2919, + "step": 5102 + }, + { + "epoch": 1.519909156909101, + "grad_norm": 0.2221689373254776, + "learning_rate": 1.922703234534708e-05, + "loss": 1.2896, + "step": 5103 + }, + { + "epoch": 1.5202070031087698, + "grad_norm": 0.2100355178117752, + "learning_rate": 1.9226660451467443e-05, + "loss": 1.2818, + "step": 5104 + }, + { + "epoch": 1.5205048493084383, + "grad_norm": 0.2249361276626587, + "learning_rate": 1.9226288471744042e-05, + "loss": 1.2886, + "step": 5105 + }, + { + "epoch": 1.5208026955081069, + "grad_norm": 0.22217456996440887, + "learning_rate": 1.922591640618033e-05, + "loss": 1.2585, + "step": 5106 + }, + { + "epoch": 1.5211005417077756, + "grad_norm": 0.2258172482252121, + "learning_rate": 1.9225544254779777e-05, + "loss": 1.2908, + "step": 5107 + }, + { + "epoch": 1.5213983879074444, + "grad_norm": 0.21065488457679749, + "learning_rate": 1.922517201754584e-05, + "loss": 1.289, + "step": 5108 + }, + { + "epoch": 1.521696234107113, + "grad_norm": 0.22901864349842072, + "learning_rate": 1.9224799694481988e-05, + "loss": 1.2786, + "step": 5109 + }, + { + "epoch": 1.5219940803067815, + "grad_norm": 0.2129736691713333, + "learning_rate": 1.922442728559168e-05, + "loss": 1.2937, + "step": 5110 + }, + { + "epoch": 1.5222919265064503, + "grad_norm": 0.21151354908943176, + "learning_rate": 1.9224054790878378e-05, + "loss": 1.2686, + "step": 5111 + }, + { + "epoch": 1.5225897727061188, + "grad_norm": 0.21267589926719666, + "learning_rate": 1.9223682210345556e-05, + "loss": 1.2684, + "step": 5112 + }, + { + "epoch": 1.5228876189057874, + "grad_norm": 0.22577396035194397, + "learning_rate": 1.9223309543996676e-05, + "loss": 1.278, + "step": 5113 + }, + { + "epoch": 1.5231854651054562, + "grad_norm": 0.21641625463962555, + "learning_rate": 1.9222936791835205e-05, + "loss": 1.2671, + "step": 5114 + }, + { + "epoch": 1.523483311305125, + "grad_norm": 0.23217715322971344, + "learning_rate": 1.9222563953864612e-05, + "loss": 1.2881, + "step": 5115 + }, + { + "epoch": 1.5237811575047935, + "grad_norm": 0.22642222046852112, + "learning_rate": 1.9222191030088364e-05, + "loss": 1.2736, + "step": 5116 + }, + { + "epoch": 1.524079003704462, + "grad_norm": 0.2142854481935501, + "learning_rate": 1.9221818020509933e-05, + "loss": 1.2636, + "step": 5117 + }, + { + "epoch": 1.5243768499041308, + "grad_norm": 0.22878050804138184, + "learning_rate": 1.922144492513279e-05, + "loss": 1.2855, + "step": 5118 + }, + { + "epoch": 1.5246746961037994, + "grad_norm": 0.21787415444850922, + "learning_rate": 1.92210717439604e-05, + "loss": 1.2707, + "step": 5119 + }, + { + "epoch": 1.524972542303468, + "grad_norm": 0.23277103900909424, + "learning_rate": 1.9220698476996245e-05, + "loss": 1.2876, + "step": 5120 + }, + { + "epoch": 1.5252703885031367, + "grad_norm": 0.21832115948200226, + "learning_rate": 1.922032512424379e-05, + "loss": 1.2736, + "step": 5121 + }, + { + "epoch": 1.5255682347028054, + "grad_norm": 0.2242104411125183, + "learning_rate": 1.921995168570651e-05, + "loss": 1.2613, + "step": 5122 + }, + { + "epoch": 1.525866080902474, + "grad_norm": 0.21479088068008423, + "learning_rate": 1.9219578161387886e-05, + "loss": 1.2975, + "step": 5123 + }, + { + "epoch": 1.5261639271021425, + "grad_norm": 0.21542489528656006, + "learning_rate": 1.9219204551291385e-05, + "loss": 1.2766, + "step": 5124 + }, + { + "epoch": 1.5264617733018113, + "grad_norm": 0.21532559394836426, + "learning_rate": 1.9218830855420486e-05, + "loss": 1.2779, + "step": 5125 + }, + { + "epoch": 1.5267596195014799, + "grad_norm": 0.23046466708183289, + "learning_rate": 1.9218457073778665e-05, + "loss": 1.2746, + "step": 5126 + }, + { + "epoch": 1.5270574657011484, + "grad_norm": 0.21354719996452332, + "learning_rate": 1.92180832063694e-05, + "loss": 1.2745, + "step": 5127 + }, + { + "epoch": 1.5273553119008172, + "grad_norm": 0.22858357429504395, + "learning_rate": 1.921770925319617e-05, + "loss": 1.2891, + "step": 5128 + }, + { + "epoch": 1.527653158100486, + "grad_norm": 0.21283267438411713, + "learning_rate": 1.9217335214262455e-05, + "loss": 1.2865, + "step": 5129 + }, + { + "epoch": 1.5279510043001545, + "grad_norm": 0.2285892367362976, + "learning_rate": 1.9216961089571734e-05, + "loss": 1.2906, + "step": 5130 + }, + { + "epoch": 1.528248850499823, + "grad_norm": 0.22981026768684387, + "learning_rate": 1.9216586879127486e-05, + "loss": 1.2773, + "step": 5131 + }, + { + "epoch": 1.5285466966994918, + "grad_norm": 0.22597664594650269, + "learning_rate": 1.9216212582933197e-05, + "loss": 1.2813, + "step": 5132 + }, + { + "epoch": 1.5288445428991606, + "grad_norm": 0.23075662553310394, + "learning_rate": 1.9215838200992344e-05, + "loss": 1.2803, + "step": 5133 + }, + { + "epoch": 1.529142389098829, + "grad_norm": 0.21592989563941956, + "learning_rate": 1.9215463733308418e-05, + "loss": 1.2742, + "step": 5134 + }, + { + "epoch": 1.5294402352984977, + "grad_norm": 0.2478303462266922, + "learning_rate": 1.9215089179884897e-05, + "loss": 1.2767, + "step": 5135 + }, + { + "epoch": 1.5297380814981665, + "grad_norm": 0.2218201756477356, + "learning_rate": 1.9214714540725263e-05, + "loss": 1.2676, + "step": 5136 + }, + { + "epoch": 1.530035927697835, + "grad_norm": 0.22012294828891754, + "learning_rate": 1.9214339815833004e-05, + "loss": 1.2742, + "step": 5137 + }, + { + "epoch": 1.5303337738975036, + "grad_norm": 0.21948669850826263, + "learning_rate": 1.9213965005211614e-05, + "loss": 1.2904, + "step": 5138 + }, + { + "epoch": 1.5306316200971724, + "grad_norm": 0.20946471393108368, + "learning_rate": 1.9213590108864572e-05, + "loss": 1.2808, + "step": 5139 + }, + { + "epoch": 1.5309294662968411, + "grad_norm": 0.2280375212430954, + "learning_rate": 1.9213215126795366e-05, + "loss": 1.2849, + "step": 5140 + }, + { + "epoch": 1.5312273124965095, + "grad_norm": 0.2271459996700287, + "learning_rate": 1.921284005900749e-05, + "loss": 1.2733, + "step": 5141 + }, + { + "epoch": 1.5315251586961782, + "grad_norm": 0.22264908254146576, + "learning_rate": 1.921246490550443e-05, + "loss": 1.2628, + "step": 5142 + }, + { + "epoch": 1.531823004895847, + "grad_norm": 0.20613276958465576, + "learning_rate": 1.9212089666289674e-05, + "loss": 1.2704, + "step": 5143 + }, + { + "epoch": 1.5321208510955155, + "grad_norm": 0.28468140959739685, + "learning_rate": 1.9211714341366718e-05, + "loss": 1.2673, + "step": 5144 + }, + { + "epoch": 1.532418697295184, + "grad_norm": 0.24806201457977295, + "learning_rate": 1.921133893073905e-05, + "loss": 1.2897, + "step": 5145 + }, + { + "epoch": 1.5327165434948529, + "grad_norm": 0.24836388230323792, + "learning_rate": 1.9210963434410166e-05, + "loss": 1.2792, + "step": 5146 + }, + { + "epoch": 1.5330143896945216, + "grad_norm": 0.21674907207489014, + "learning_rate": 1.921058785238356e-05, + "loss": 1.2847, + "step": 5147 + }, + { + "epoch": 1.5333122358941902, + "grad_norm": 0.30787208676338196, + "learning_rate": 1.9210212184662724e-05, + "loss": 1.2909, + "step": 5148 + }, + { + "epoch": 1.5336100820938587, + "grad_norm": 0.24113978445529938, + "learning_rate": 1.9209836431251154e-05, + "loss": 1.2796, + "step": 5149 + }, + { + "epoch": 1.5339079282935275, + "grad_norm": 0.22804510593414307, + "learning_rate": 1.9209460592152345e-05, + "loss": 1.2737, + "step": 5150 + }, + { + "epoch": 1.534205774493196, + "grad_norm": 0.2252933233976364, + "learning_rate": 1.9209084667369793e-05, + "loss": 1.2851, + "step": 5151 + }, + { + "epoch": 1.5345036206928646, + "grad_norm": 0.21543268859386444, + "learning_rate": 1.9208708656907e-05, + "loss": 1.2846, + "step": 5152 + }, + { + "epoch": 1.5348014668925334, + "grad_norm": 0.21873699128627777, + "learning_rate": 1.920833256076746e-05, + "loss": 1.2744, + "step": 5153 + }, + { + "epoch": 1.5350993130922022, + "grad_norm": 0.23945359885692596, + "learning_rate": 1.9207956378954673e-05, + "loss": 1.2795, + "step": 5154 + }, + { + "epoch": 1.5353971592918707, + "grad_norm": 0.2227172702550888, + "learning_rate": 1.9207580111472142e-05, + "loss": 1.2978, + "step": 5155 + }, + { + "epoch": 1.5356950054915393, + "grad_norm": 0.212611123919487, + "learning_rate": 1.9207203758323362e-05, + "loss": 1.2945, + "step": 5156 + }, + { + "epoch": 1.535992851691208, + "grad_norm": 0.21947996318340302, + "learning_rate": 1.920682731951184e-05, + "loss": 1.2602, + "step": 5157 + }, + { + "epoch": 1.5362906978908766, + "grad_norm": 0.23065443336963654, + "learning_rate": 1.920645079504108e-05, + "loss": 1.2713, + "step": 5158 + }, + { + "epoch": 1.5365885440905451, + "grad_norm": 0.2252213954925537, + "learning_rate": 1.9206074184914575e-05, + "loss": 1.2844, + "step": 5159 + }, + { + "epoch": 1.536886390290214, + "grad_norm": 0.21542248129844666, + "learning_rate": 1.9205697489135838e-05, + "loss": 1.2723, + "step": 5160 + }, + { + "epoch": 1.5371842364898827, + "grad_norm": 0.20689259469509125, + "learning_rate": 1.9205320707708372e-05, + "loss": 1.2522, + "step": 5161 + }, + { + "epoch": 1.5374820826895512, + "grad_norm": 0.22795073688030243, + "learning_rate": 1.920494384063568e-05, + "loss": 1.2672, + "step": 5162 + }, + { + "epoch": 1.5377799288892198, + "grad_norm": 0.23105382919311523, + "learning_rate": 1.9204566887921273e-05, + "loss": 1.2841, + "step": 5163 + }, + { + "epoch": 1.5380777750888885, + "grad_norm": 0.2340937703847885, + "learning_rate": 1.9204189849568654e-05, + "loss": 1.2813, + "step": 5164 + }, + { + "epoch": 1.538375621288557, + "grad_norm": 0.21824911236763, + "learning_rate": 1.9203812725581328e-05, + "loss": 1.2751, + "step": 5165 + }, + { + "epoch": 1.5386734674882256, + "grad_norm": 0.21339337527751923, + "learning_rate": 1.920343551596281e-05, + "loss": 1.2677, + "step": 5166 + }, + { + "epoch": 1.5389713136878944, + "grad_norm": 0.22229644656181335, + "learning_rate": 1.9203058220716607e-05, + "loss": 1.2571, + "step": 5167 + }, + { + "epoch": 1.5392691598875632, + "grad_norm": 0.22805114090442657, + "learning_rate": 1.9202680839846232e-05, + "loss": 1.2818, + "step": 5168 + }, + { + "epoch": 1.5395670060872317, + "grad_norm": 0.2152119129896164, + "learning_rate": 1.920230337335519e-05, + "loss": 1.2931, + "step": 5169 + }, + { + "epoch": 1.5398648522869003, + "grad_norm": 0.22499510645866394, + "learning_rate": 1.9201925821247e-05, + "loss": 1.2619, + "step": 5170 + }, + { + "epoch": 1.540162698486569, + "grad_norm": 0.21436291933059692, + "learning_rate": 1.920154818352517e-05, + "loss": 1.2774, + "step": 5171 + }, + { + "epoch": 1.5404605446862376, + "grad_norm": 0.2081919014453888, + "learning_rate": 1.9201170460193213e-05, + "loss": 1.295, + "step": 5172 + }, + { + "epoch": 1.5407583908859062, + "grad_norm": 0.22008489072322845, + "learning_rate": 1.9200792651254647e-05, + "loss": 1.278, + "step": 5173 + }, + { + "epoch": 1.541056237085575, + "grad_norm": 0.21873022615909576, + "learning_rate": 1.9200414756712985e-05, + "loss": 1.2692, + "step": 5174 + }, + { + "epoch": 1.5413540832852437, + "grad_norm": 0.22118264436721802, + "learning_rate": 1.920003677657174e-05, + "loss": 1.2963, + "step": 5175 + }, + { + "epoch": 1.5416519294849123, + "grad_norm": 0.21683140099048615, + "learning_rate": 1.9199658710834434e-05, + "loss": 1.2995, + "step": 5176 + }, + { + "epoch": 1.5419497756845808, + "grad_norm": 0.21878816187381744, + "learning_rate": 1.9199280559504584e-05, + "loss": 1.2843, + "step": 5177 + }, + { + "epoch": 1.5422476218842496, + "grad_norm": 0.22942353785037994, + "learning_rate": 1.9198902322585704e-05, + "loss": 1.2767, + "step": 5178 + }, + { + "epoch": 1.5425454680839181, + "grad_norm": 0.23138710856437683, + "learning_rate": 1.9198524000081317e-05, + "loss": 1.301, + "step": 5179 + }, + { + "epoch": 1.5428433142835867, + "grad_norm": 0.21942220628261566, + "learning_rate": 1.919814559199494e-05, + "loss": 1.2846, + "step": 5180 + }, + { + "epoch": 1.5431411604832554, + "grad_norm": 0.21210066974163055, + "learning_rate": 1.9197767098330097e-05, + "loss": 1.2972, + "step": 5181 + }, + { + "epoch": 1.5434390066829242, + "grad_norm": 0.219122976064682, + "learning_rate": 1.9197388519090302e-05, + "loss": 1.2776, + "step": 5182 + }, + { + "epoch": 1.5437368528825928, + "grad_norm": 0.2294389009475708, + "learning_rate": 1.919700985427909e-05, + "loss": 1.2833, + "step": 5183 + }, + { + "epoch": 1.5440346990822613, + "grad_norm": 0.2336971014738083, + "learning_rate": 1.919663110389997e-05, + "loss": 1.2677, + "step": 5184 + }, + { + "epoch": 1.54433254528193, + "grad_norm": 0.2087344527244568, + "learning_rate": 1.9196252267956477e-05, + "loss": 1.2745, + "step": 5185 + }, + { + "epoch": 1.5446303914815986, + "grad_norm": 0.22128574550151825, + "learning_rate": 1.9195873346452132e-05, + "loss": 1.2865, + "step": 5186 + }, + { + "epoch": 1.5449282376812672, + "grad_norm": 0.2278864085674286, + "learning_rate": 1.9195494339390455e-05, + "loss": 1.2697, + "step": 5187 + }, + { + "epoch": 1.545226083880936, + "grad_norm": 0.2292591631412506, + "learning_rate": 1.9195115246774985e-05, + "loss": 1.2819, + "step": 5188 + }, + { + "epoch": 1.5455239300806047, + "grad_norm": 0.21845091879367828, + "learning_rate": 1.9194736068609235e-05, + "loss": 1.2734, + "step": 5189 + }, + { + "epoch": 1.5458217762802733, + "grad_norm": 0.2252739667892456, + "learning_rate": 1.919435680489674e-05, + "loss": 1.2897, + "step": 5190 + }, + { + "epoch": 1.5461196224799418, + "grad_norm": 0.21534298360347748, + "learning_rate": 1.9193977455641025e-05, + "loss": 1.2684, + "step": 5191 + }, + { + "epoch": 1.5464174686796106, + "grad_norm": 0.22747106850147247, + "learning_rate": 1.9193598020845626e-05, + "loss": 1.2735, + "step": 5192 + }, + { + "epoch": 1.5467153148792792, + "grad_norm": 0.2073824554681778, + "learning_rate": 1.919321850051407e-05, + "loss": 1.2672, + "step": 5193 + }, + { + "epoch": 1.5470131610789477, + "grad_norm": 0.2117050439119339, + "learning_rate": 1.9192838894649884e-05, + "loss": 1.2871, + "step": 5194 + }, + { + "epoch": 1.5473110072786165, + "grad_norm": 0.2082703709602356, + "learning_rate": 1.9192459203256605e-05, + "loss": 1.281, + "step": 5195 + }, + { + "epoch": 1.5476088534782853, + "grad_norm": 0.21297723054885864, + "learning_rate": 1.9192079426337762e-05, + "loss": 1.2987, + "step": 5196 + }, + { + "epoch": 1.5479066996779538, + "grad_norm": 0.21057502925395966, + "learning_rate": 1.919169956389689e-05, + "loss": 1.2651, + "step": 5197 + }, + { + "epoch": 1.5482045458776224, + "grad_norm": 0.21725547313690186, + "learning_rate": 1.9191319615937523e-05, + "loss": 1.2815, + "step": 5198 + }, + { + "epoch": 1.5485023920772911, + "grad_norm": 0.23164552450180054, + "learning_rate": 1.9190939582463195e-05, + "loss": 1.2888, + "step": 5199 + }, + { + "epoch": 1.54880023827696, + "grad_norm": 0.21210889518260956, + "learning_rate": 1.9190559463477445e-05, + "loss": 1.277, + "step": 5200 + }, + { + "epoch": 1.5490980844766282, + "grad_norm": 0.22068975865840912, + "learning_rate": 1.9190179258983804e-05, + "loss": 1.2701, + "step": 5201 + }, + { + "epoch": 1.549395930676297, + "grad_norm": 0.22226393222808838, + "learning_rate": 1.918979896898582e-05, + "loss": 1.2522, + "step": 5202 + }, + { + "epoch": 1.5496937768759658, + "grad_norm": 0.2246939092874527, + "learning_rate": 1.918941859348702e-05, + "loss": 1.2786, + "step": 5203 + }, + { + "epoch": 1.5499916230756343, + "grad_norm": 0.2530314028263092, + "learning_rate": 1.9189038132490945e-05, + "loss": 1.2862, + "step": 5204 + }, + { + "epoch": 1.5502894692753029, + "grad_norm": 0.20884862542152405, + "learning_rate": 1.9188657586001137e-05, + "loss": 1.2658, + "step": 5205 + }, + { + "epoch": 1.5505873154749716, + "grad_norm": 0.22472144663333893, + "learning_rate": 1.918827695402114e-05, + "loss": 1.2804, + "step": 5206 + }, + { + "epoch": 1.5508851616746404, + "grad_norm": 0.2202090620994568, + "learning_rate": 1.9187896236554488e-05, + "loss": 1.2542, + "step": 5207 + }, + { + "epoch": 1.5511830078743087, + "grad_norm": 0.2156151682138443, + "learning_rate": 1.918751543360473e-05, + "loss": 1.2698, + "step": 5208 + }, + { + "epoch": 1.5514808540739775, + "grad_norm": 0.22296875715255737, + "learning_rate": 1.9187134545175403e-05, + "loss": 1.2626, + "step": 5209 + }, + { + "epoch": 1.5517787002736463, + "grad_norm": 0.22329306602478027, + "learning_rate": 1.9186753571270054e-05, + "loss": 1.2853, + "step": 5210 + }, + { + "epoch": 1.5520765464733148, + "grad_norm": 0.22098888456821442, + "learning_rate": 1.918637251189223e-05, + "loss": 1.2864, + "step": 5211 + }, + { + "epoch": 1.5523743926729834, + "grad_norm": 0.23140911757946014, + "learning_rate": 1.918599136704547e-05, + "loss": 1.2957, + "step": 5212 + }, + { + "epoch": 1.5526722388726522, + "grad_norm": 0.21242539584636688, + "learning_rate": 1.9185610136733322e-05, + "loss": 1.2855, + "step": 5213 + }, + { + "epoch": 1.552970085072321, + "grad_norm": 0.21810154616832733, + "learning_rate": 1.918522882095934e-05, + "loss": 1.2815, + "step": 5214 + }, + { + "epoch": 1.5532679312719895, + "grad_norm": 0.2197161167860031, + "learning_rate": 1.9184847419727063e-05, + "loss": 1.2769, + "step": 5215 + }, + { + "epoch": 1.553565777471658, + "grad_norm": 0.21979984641075134, + "learning_rate": 1.9184465933040042e-05, + "loss": 1.2682, + "step": 5216 + }, + { + "epoch": 1.5538636236713268, + "grad_norm": 0.224289208650589, + "learning_rate": 1.9184084360901827e-05, + "loss": 1.2852, + "step": 5217 + }, + { + "epoch": 1.5541614698709953, + "grad_norm": 0.22888165712356567, + "learning_rate": 1.9183702703315972e-05, + "loss": 1.2904, + "step": 5218 + }, + { + "epoch": 1.554459316070664, + "grad_norm": 0.21977858245372772, + "learning_rate": 1.918332096028602e-05, + "loss": 1.2669, + "step": 5219 + }, + { + "epoch": 1.5547571622703327, + "grad_norm": 0.22267065942287445, + "learning_rate": 1.9182939131815527e-05, + "loss": 1.2853, + "step": 5220 + }, + { + "epoch": 1.5550550084700014, + "grad_norm": 0.21301211416721344, + "learning_rate": 1.9182557217908046e-05, + "loss": 1.2736, + "step": 5221 + }, + { + "epoch": 1.55535285466967, + "grad_norm": 0.21961882710456848, + "learning_rate": 1.918217521856713e-05, + "loss": 1.284, + "step": 5222 + }, + { + "epoch": 1.5556507008693385, + "grad_norm": 0.21925969421863556, + "learning_rate": 1.9181793133796332e-05, + "loss": 1.2796, + "step": 5223 + }, + { + "epoch": 1.5559485470690073, + "grad_norm": 0.2270103543996811, + "learning_rate": 1.918141096359921e-05, + "loss": 1.2936, + "step": 5224 + }, + { + "epoch": 1.5562463932686759, + "grad_norm": 0.24820318818092346, + "learning_rate": 1.918102870797931e-05, + "loss": 1.2593, + "step": 5225 + }, + { + "epoch": 1.5565442394683444, + "grad_norm": 0.21502672135829926, + "learning_rate": 1.9180646366940202e-05, + "loss": 1.2757, + "step": 5226 + }, + { + "epoch": 1.5568420856680132, + "grad_norm": 0.216362863779068, + "learning_rate": 1.9180263940485434e-05, + "loss": 1.2978, + "step": 5227 + }, + { + "epoch": 1.557139931867682, + "grad_norm": 0.21232004463672638, + "learning_rate": 1.9179881428618567e-05, + "loss": 1.266, + "step": 5228 + }, + { + "epoch": 1.5574377780673505, + "grad_norm": 0.22477225959300995, + "learning_rate": 1.9179498831343162e-05, + "loss": 1.2728, + "step": 5229 + }, + { + "epoch": 1.557735624267019, + "grad_norm": 0.2526955306529999, + "learning_rate": 1.9179116148662774e-05, + "loss": 1.2824, + "step": 5230 + }, + { + "epoch": 1.5580334704666878, + "grad_norm": 0.22520771622657776, + "learning_rate": 1.9178733380580967e-05, + "loss": 1.2768, + "step": 5231 + }, + { + "epoch": 1.5583313166663564, + "grad_norm": 0.22579000890254974, + "learning_rate": 1.91783505271013e-05, + "loss": 1.2743, + "step": 5232 + }, + { + "epoch": 1.558629162866025, + "grad_norm": 0.2351585179567337, + "learning_rate": 1.9177967588227334e-05, + "loss": 1.2989, + "step": 5233 + }, + { + "epoch": 1.5589270090656937, + "grad_norm": 0.23377393186092377, + "learning_rate": 1.9177584563962638e-05, + "loss": 1.2784, + "step": 5234 + }, + { + "epoch": 1.5592248552653625, + "grad_norm": 0.23362720012664795, + "learning_rate": 1.917720145431077e-05, + "loss": 1.2926, + "step": 5235 + }, + { + "epoch": 1.559522701465031, + "grad_norm": 0.21404823660850525, + "learning_rate": 1.9176818259275293e-05, + "loss": 1.2722, + "step": 5236 + }, + { + "epoch": 1.5598205476646996, + "grad_norm": 0.217382550239563, + "learning_rate": 1.9176434978859776e-05, + "loss": 1.2655, + "step": 5237 + }, + { + "epoch": 1.5601183938643683, + "grad_norm": 0.2208022028207779, + "learning_rate": 1.9176051613067787e-05, + "loss": 1.2751, + "step": 5238 + }, + { + "epoch": 1.560416240064037, + "grad_norm": 0.21439604461193085, + "learning_rate": 1.9175668161902886e-05, + "loss": 1.2818, + "step": 5239 + }, + { + "epoch": 1.5607140862637054, + "grad_norm": 0.24205389618873596, + "learning_rate": 1.917528462536865e-05, + "loss": 1.2943, + "step": 5240 + }, + { + "epoch": 1.5610119324633742, + "grad_norm": 0.21855804324150085, + "learning_rate": 1.9174901003468638e-05, + "loss": 1.2813, + "step": 5241 + }, + { + "epoch": 1.561309778663043, + "grad_norm": 0.22007018327713013, + "learning_rate": 1.917451729620642e-05, + "loss": 1.2668, + "step": 5242 + }, + { + "epoch": 1.5616076248627115, + "grad_norm": 0.2210848033428192, + "learning_rate": 1.9174133503585573e-05, + "loss": 1.2733, + "step": 5243 + }, + { + "epoch": 1.56190547106238, + "grad_norm": 0.2746601402759552, + "learning_rate": 1.9173749625609664e-05, + "loss": 1.2664, + "step": 5244 + }, + { + "epoch": 1.5622033172620489, + "grad_norm": 0.23702725768089294, + "learning_rate": 1.9173365662282264e-05, + "loss": 1.2771, + "step": 5245 + }, + { + "epoch": 1.5625011634617174, + "grad_norm": 0.2350674420595169, + "learning_rate": 1.9172981613606946e-05, + "loss": 1.275, + "step": 5246 + }, + { + "epoch": 1.562799009661386, + "grad_norm": 0.22329159080982208, + "learning_rate": 1.9172597479587282e-05, + "loss": 1.2868, + "step": 5247 + }, + { + "epoch": 1.5630968558610547, + "grad_norm": 0.22355787456035614, + "learning_rate": 1.9172213260226842e-05, + "loss": 1.2586, + "step": 5248 + }, + { + "epoch": 1.5633947020607235, + "grad_norm": 0.2204594910144806, + "learning_rate": 1.9171828955529213e-05, + "loss": 1.2765, + "step": 5249 + }, + { + "epoch": 1.563692548260392, + "grad_norm": 0.2157185971736908, + "learning_rate": 1.917144456549796e-05, + "loss": 1.2695, + "step": 5250 + }, + { + "epoch": 1.5639903944600606, + "grad_norm": 0.22540999948978424, + "learning_rate": 1.917106009013666e-05, + "loss": 1.2733, + "step": 5251 + }, + { + "epoch": 1.5642882406597294, + "grad_norm": 0.22991201281547546, + "learning_rate": 1.9170675529448895e-05, + "loss": 1.278, + "step": 5252 + }, + { + "epoch": 1.564586086859398, + "grad_norm": 0.2228827178478241, + "learning_rate": 1.9170290883438238e-05, + "loss": 1.2795, + "step": 5253 + }, + { + "epoch": 1.5648839330590665, + "grad_norm": 0.21096153557300568, + "learning_rate": 1.916990615210827e-05, + "loss": 1.2773, + "step": 5254 + }, + { + "epoch": 1.5651817792587353, + "grad_norm": 0.21494264900684357, + "learning_rate": 1.916952133546257e-05, + "loss": 1.2758, + "step": 5255 + }, + { + "epoch": 1.565479625458404, + "grad_norm": 0.22339381277561188, + "learning_rate": 1.9169136433504724e-05, + "loss": 1.2691, + "step": 5256 + }, + { + "epoch": 1.5657774716580726, + "grad_norm": 0.21925175189971924, + "learning_rate": 1.9168751446238306e-05, + "loss": 1.2597, + "step": 5257 + }, + { + "epoch": 1.5660753178577411, + "grad_norm": 0.22400447726249695, + "learning_rate": 1.91683663736669e-05, + "loss": 1.2653, + "step": 5258 + }, + { + "epoch": 1.56637316405741, + "grad_norm": 0.25193363428115845, + "learning_rate": 1.9167981215794086e-05, + "loss": 1.26, + "step": 5259 + }, + { + "epoch": 1.5666710102570787, + "grad_norm": 0.2602287232875824, + "learning_rate": 1.916759597262345e-05, + "loss": 1.2733, + "step": 5260 + }, + { + "epoch": 1.566968856456747, + "grad_norm": 0.2307026982307434, + "learning_rate": 1.9167210644158577e-05, + "loss": 1.2741, + "step": 5261 + }, + { + "epoch": 1.5672667026564158, + "grad_norm": 0.22563683986663818, + "learning_rate": 1.9166825230403047e-05, + "loss": 1.2717, + "step": 5262 + }, + { + "epoch": 1.5675645488560845, + "grad_norm": 0.3646620512008667, + "learning_rate": 1.9166439731360454e-05, + "loss": 1.2654, + "step": 5263 + }, + { + "epoch": 1.567862395055753, + "grad_norm": 0.25848421454429626, + "learning_rate": 1.916605414703438e-05, + "loss": 1.2853, + "step": 5264 + }, + { + "epoch": 1.5681602412554216, + "grad_norm": 0.24908843636512756, + "learning_rate": 1.9165668477428414e-05, + "loss": 1.2734, + "step": 5265 + }, + { + "epoch": 1.5684580874550904, + "grad_norm": 0.2249835878610611, + "learning_rate": 1.9165282722546146e-05, + "loss": 1.2919, + "step": 5266 + }, + { + "epoch": 1.5687559336547592, + "grad_norm": 0.254253089427948, + "learning_rate": 1.9164896882391158e-05, + "loss": 1.2778, + "step": 5267 + }, + { + "epoch": 1.5690537798544275, + "grad_norm": 0.24973627924919128, + "learning_rate": 1.9164510956967043e-05, + "loss": 1.2732, + "step": 5268 + }, + { + "epoch": 1.5693516260540963, + "grad_norm": 0.22042717039585114, + "learning_rate": 1.9164124946277396e-05, + "loss": 1.2998, + "step": 5269 + }, + { + "epoch": 1.569649472253765, + "grad_norm": 0.22058488428592682, + "learning_rate": 1.9163738850325806e-05, + "loss": 1.2594, + "step": 5270 + }, + { + "epoch": 1.5699473184534336, + "grad_norm": 0.2343423217535019, + "learning_rate": 1.916335266911586e-05, + "loss": 1.2808, + "step": 5271 + }, + { + "epoch": 1.5702451646531022, + "grad_norm": 0.2160743623971939, + "learning_rate": 1.916296640265116e-05, + "loss": 1.2746, + "step": 5272 + }, + { + "epoch": 1.570543010852771, + "grad_norm": 0.21926793456077576, + "learning_rate": 1.9162580050935293e-05, + "loss": 1.2807, + "step": 5273 + }, + { + "epoch": 1.5708408570524397, + "grad_norm": 0.21966791152954102, + "learning_rate": 1.9162193613971854e-05, + "loss": 1.2708, + "step": 5274 + }, + { + "epoch": 1.571138703252108, + "grad_norm": 0.21941600739955902, + "learning_rate": 1.9161807091764442e-05, + "loss": 1.2507, + "step": 5275 + }, + { + "epoch": 1.5714365494517768, + "grad_norm": 0.23557275533676147, + "learning_rate": 1.916142048431665e-05, + "loss": 1.2921, + "step": 5276 + }, + { + "epoch": 1.5717343956514456, + "grad_norm": 0.2285400778055191, + "learning_rate": 1.916103379163208e-05, + "loss": 1.2745, + "step": 5277 + }, + { + "epoch": 1.5720322418511141, + "grad_norm": 0.2255667895078659, + "learning_rate": 1.9160647013714323e-05, + "loss": 1.2766, + "step": 5278 + }, + { + "epoch": 1.5723300880507827, + "grad_norm": 0.21725843846797943, + "learning_rate": 1.9160260150566978e-05, + "loss": 1.2735, + "step": 5279 + }, + { + "epoch": 1.5726279342504514, + "grad_norm": 0.2289770096540451, + "learning_rate": 1.9159873202193648e-05, + "loss": 1.2771, + "step": 5280 + }, + { + "epoch": 1.5729257804501202, + "grad_norm": 0.21951895952224731, + "learning_rate": 1.9159486168597934e-05, + "loss": 1.2729, + "step": 5281 + }, + { + "epoch": 1.5732236266497888, + "grad_norm": 0.21915505826473236, + "learning_rate": 1.9159099049783435e-05, + "loss": 1.2707, + "step": 5282 + }, + { + "epoch": 1.5735214728494573, + "grad_norm": 0.22136370837688446, + "learning_rate": 1.915871184575375e-05, + "loss": 1.2576, + "step": 5283 + }, + { + "epoch": 1.573819319049126, + "grad_norm": 0.2282637357711792, + "learning_rate": 1.9158324556512483e-05, + "loss": 1.2801, + "step": 5284 + }, + { + "epoch": 1.5741171652487946, + "grad_norm": 0.22993646562099457, + "learning_rate": 1.9157937182063243e-05, + "loss": 1.2758, + "step": 5285 + }, + { + "epoch": 1.5744150114484632, + "grad_norm": 0.21509593725204468, + "learning_rate": 1.9157549722409628e-05, + "loss": 1.2826, + "step": 5286 + }, + { + "epoch": 1.574712857648132, + "grad_norm": 0.22301043570041656, + "learning_rate": 1.9157162177555242e-05, + "loss": 1.2861, + "step": 5287 + }, + { + "epoch": 1.5750107038478007, + "grad_norm": 0.22497759759426117, + "learning_rate": 1.9156774547503694e-05, + "loss": 1.264, + "step": 5288 + }, + { + "epoch": 1.5753085500474693, + "grad_norm": 0.22139903903007507, + "learning_rate": 1.915638683225859e-05, + "loss": 1.2825, + "step": 5289 + }, + { + "epoch": 1.5756063962471378, + "grad_norm": 0.22024013102054596, + "learning_rate": 1.9155999031823534e-05, + "loss": 1.2887, + "step": 5290 + }, + { + "epoch": 1.5759042424468066, + "grad_norm": 0.2222883701324463, + "learning_rate": 1.915561114620214e-05, + "loss": 1.2856, + "step": 5291 + }, + { + "epoch": 1.5762020886464752, + "grad_norm": 0.22331392765045166, + "learning_rate": 1.915522317539801e-05, + "loss": 1.2867, + "step": 5292 + }, + { + "epoch": 1.5764999348461437, + "grad_norm": 0.22336934506893158, + "learning_rate": 1.915483511941476e-05, + "loss": 1.2722, + "step": 5293 + }, + { + "epoch": 1.5767977810458125, + "grad_norm": 0.20129472017288208, + "learning_rate": 1.9154446978256e-05, + "loss": 1.257, + "step": 5294 + }, + { + "epoch": 1.5770956272454812, + "grad_norm": 0.2179460972547531, + "learning_rate": 1.9154058751925335e-05, + "loss": 1.2743, + "step": 5295 + }, + { + "epoch": 1.5773934734451498, + "grad_norm": 0.21097329258918762, + "learning_rate": 1.915367044042638e-05, + "loss": 1.2743, + "step": 5296 + }, + { + "epoch": 1.5776913196448183, + "grad_norm": 0.21877439320087433, + "learning_rate": 1.9153282043762753e-05, + "loss": 1.2652, + "step": 5297 + }, + { + "epoch": 1.5779891658444871, + "grad_norm": 0.21587024629116058, + "learning_rate": 1.9152893561938058e-05, + "loss": 1.2801, + "step": 5298 + }, + { + "epoch": 1.5782870120441557, + "grad_norm": 0.2207348793745041, + "learning_rate": 1.915250499495592e-05, + "loss": 1.269, + "step": 5299 + }, + { + "epoch": 1.5785848582438242, + "grad_norm": 0.21685463190078735, + "learning_rate": 1.9152116342819942e-05, + "loss": 1.2649, + "step": 5300 + }, + { + "epoch": 1.578882704443493, + "grad_norm": 0.2112230360507965, + "learning_rate": 1.9151727605533753e-05, + "loss": 1.256, + "step": 5301 + }, + { + "epoch": 1.5791805506431618, + "grad_norm": 0.20439797639846802, + "learning_rate": 1.9151338783100962e-05, + "loss": 1.2688, + "step": 5302 + }, + { + "epoch": 1.5794783968428303, + "grad_norm": 0.21740297973155975, + "learning_rate": 1.9150949875525185e-05, + "loss": 1.2851, + "step": 5303 + }, + { + "epoch": 1.5797762430424989, + "grad_norm": 0.2155197411775589, + "learning_rate": 1.9150560882810047e-05, + "loss": 1.2613, + "step": 5304 + }, + { + "epoch": 1.5800740892421676, + "grad_norm": 0.2102223038673401, + "learning_rate": 1.9150171804959163e-05, + "loss": 1.2895, + "step": 5305 + }, + { + "epoch": 1.5803719354418362, + "grad_norm": 0.22810573875904083, + "learning_rate": 1.9149782641976152e-05, + "loss": 1.2826, + "step": 5306 + }, + { + "epoch": 1.5806697816415047, + "grad_norm": 0.2198859602212906, + "learning_rate": 1.9149393393864636e-05, + "loss": 1.2809, + "step": 5307 + }, + { + "epoch": 1.5809676278411735, + "grad_norm": 0.22619320452213287, + "learning_rate": 1.9149004060628237e-05, + "loss": 1.2689, + "step": 5308 + }, + { + "epoch": 1.5812654740408423, + "grad_norm": 0.2240327000617981, + "learning_rate": 1.914861464227058e-05, + "loss": 1.2979, + "step": 5309 + }, + { + "epoch": 1.5815633202405108, + "grad_norm": 0.217616006731987, + "learning_rate": 1.9148225138795285e-05, + "loss": 1.2831, + "step": 5310 + }, + { + "epoch": 1.5818611664401794, + "grad_norm": 0.23388800024986267, + "learning_rate": 1.9147835550205974e-05, + "loss": 1.2849, + "step": 5311 + }, + { + "epoch": 1.5821590126398482, + "grad_norm": 0.22546786069869995, + "learning_rate": 1.914744587650627e-05, + "loss": 1.2875, + "step": 5312 + }, + { + "epoch": 1.5824568588395167, + "grad_norm": 0.22454583644866943, + "learning_rate": 1.914705611769981e-05, + "loss": 1.2806, + "step": 5313 + }, + { + "epoch": 1.5827547050391853, + "grad_norm": 0.21439944207668304, + "learning_rate": 1.9146666273790208e-05, + "loss": 1.261, + "step": 5314 + }, + { + "epoch": 1.583052551238854, + "grad_norm": 0.2200402319431305, + "learning_rate": 1.9146276344781096e-05, + "loss": 1.2879, + "step": 5315 + }, + { + "epoch": 1.5833503974385228, + "grad_norm": 0.22738364338874817, + "learning_rate": 1.9145886330676104e-05, + "loss": 1.2963, + "step": 5316 + }, + { + "epoch": 1.5836482436381913, + "grad_norm": 0.2065533846616745, + "learning_rate": 1.9145496231478855e-05, + "loss": 1.2747, + "step": 5317 + }, + { + "epoch": 1.58394608983786, + "grad_norm": 0.21450136601924896, + "learning_rate": 1.9145106047192983e-05, + "loss": 1.2746, + "step": 5318 + }, + { + "epoch": 1.5842439360375287, + "grad_norm": 0.2229100465774536, + "learning_rate": 1.9144715777822113e-05, + "loss": 1.2716, + "step": 5319 + }, + { + "epoch": 1.5845417822371972, + "grad_norm": 0.21140901744365692, + "learning_rate": 1.9144325423369883e-05, + "loss": 1.2734, + "step": 5320 + }, + { + "epoch": 1.5848396284368658, + "grad_norm": 0.21822868287563324, + "learning_rate": 1.9143934983839923e-05, + "loss": 1.2735, + "step": 5321 + }, + { + "epoch": 1.5851374746365345, + "grad_norm": 0.23037533462047577, + "learning_rate": 1.9143544459235864e-05, + "loss": 1.2803, + "step": 5322 + }, + { + "epoch": 1.5854353208362033, + "grad_norm": 0.2217172235250473, + "learning_rate": 1.914315384956134e-05, + "loss": 1.2738, + "step": 5323 + }, + { + "epoch": 1.5857331670358719, + "grad_norm": 0.2185138761997223, + "learning_rate": 1.9142763154819982e-05, + "loss": 1.2855, + "step": 5324 + }, + { + "epoch": 1.5860310132355404, + "grad_norm": 0.2042170614004135, + "learning_rate": 1.9142372375015426e-05, + "loss": 1.271, + "step": 5325 + }, + { + "epoch": 1.5863288594352092, + "grad_norm": 0.23105546832084656, + "learning_rate": 1.9141981510151314e-05, + "loss": 1.2881, + "step": 5326 + }, + { + "epoch": 1.586626705634878, + "grad_norm": 0.2183239758014679, + "learning_rate": 1.9141590560231277e-05, + "loss": 1.2661, + "step": 5327 + }, + { + "epoch": 1.5869245518345463, + "grad_norm": 0.2241998314857483, + "learning_rate": 1.914119952525895e-05, + "loss": 1.2898, + "step": 5328 + }, + { + "epoch": 1.587222398034215, + "grad_norm": 0.22711949050426483, + "learning_rate": 1.914080840523798e-05, + "loss": 1.2874, + "step": 5329 + }, + { + "epoch": 1.5875202442338838, + "grad_norm": 0.21991921961307526, + "learning_rate": 1.9140417200171995e-05, + "loss": 1.2931, + "step": 5330 + }, + { + "epoch": 1.5878180904335524, + "grad_norm": 0.23235772550106049, + "learning_rate": 1.9140025910064645e-05, + "loss": 1.2815, + "step": 5331 + }, + { + "epoch": 1.588115936633221, + "grad_norm": 0.22265605628490448, + "learning_rate": 1.9139634534919563e-05, + "loss": 1.2811, + "step": 5332 + }, + { + "epoch": 1.5884137828328897, + "grad_norm": 0.23455749452114105, + "learning_rate": 1.9139243074740394e-05, + "loss": 1.2648, + "step": 5333 + }, + { + "epoch": 1.5887116290325585, + "grad_norm": 0.21858735382556915, + "learning_rate": 1.913885152953078e-05, + "loss": 1.2931, + "step": 5334 + }, + { + "epoch": 1.5890094752322268, + "grad_norm": 0.2214006781578064, + "learning_rate": 1.9138459899294363e-05, + "loss": 1.2714, + "step": 5335 + }, + { + "epoch": 1.5893073214318956, + "grad_norm": 0.22406496107578278, + "learning_rate": 1.9138068184034786e-05, + "loss": 1.2749, + "step": 5336 + }, + { + "epoch": 1.5896051676315643, + "grad_norm": 0.22526974976062775, + "learning_rate": 1.9137676383755696e-05, + "loss": 1.2689, + "step": 5337 + }, + { + "epoch": 1.589903013831233, + "grad_norm": 0.2210036814212799, + "learning_rate": 1.913728449846073e-05, + "loss": 1.2855, + "step": 5338 + }, + { + "epoch": 1.5902008600309014, + "grad_norm": 0.21717329323291779, + "learning_rate": 1.9136892528153548e-05, + "loss": 1.2754, + "step": 5339 + }, + { + "epoch": 1.5904987062305702, + "grad_norm": 0.22751939296722412, + "learning_rate": 1.9136500472837785e-05, + "loss": 1.294, + "step": 5340 + }, + { + "epoch": 1.590796552430239, + "grad_norm": 0.21087098121643066, + "learning_rate": 1.9136108332517095e-05, + "loss": 1.2682, + "step": 5341 + }, + { + "epoch": 1.5910943986299075, + "grad_norm": 0.21704800426959991, + "learning_rate": 1.913571610719512e-05, + "loss": 1.2899, + "step": 5342 + }, + { + "epoch": 1.591392244829576, + "grad_norm": 0.22615809738636017, + "learning_rate": 1.913532379687552e-05, + "loss": 1.2861, + "step": 5343 + }, + { + "epoch": 1.5916900910292449, + "grad_norm": 0.2318710833787918, + "learning_rate": 1.9134931401561935e-05, + "loss": 1.2758, + "step": 5344 + }, + { + "epoch": 1.5919879372289134, + "grad_norm": 0.22961898148059845, + "learning_rate": 1.9134538921258023e-05, + "loss": 1.2824, + "step": 5345 + }, + { + "epoch": 1.592285783428582, + "grad_norm": 0.22651560604572296, + "learning_rate": 1.913414635596743e-05, + "loss": 1.2778, + "step": 5346 + }, + { + "epoch": 1.5925836296282507, + "grad_norm": 0.2375839352607727, + "learning_rate": 1.913375370569381e-05, + "loss": 1.2988, + "step": 5347 + }, + { + "epoch": 1.5928814758279195, + "grad_norm": 0.24498562514781952, + "learning_rate": 1.9133360970440816e-05, + "loss": 1.2686, + "step": 5348 + }, + { + "epoch": 1.593179322027588, + "grad_norm": 0.21912312507629395, + "learning_rate": 1.91329681502121e-05, + "loss": 1.269, + "step": 5349 + }, + { + "epoch": 1.5934771682272566, + "grad_norm": 0.22842130064964294, + "learning_rate": 1.9132575245011326e-05, + "loss": 1.2766, + "step": 5350 + }, + { + "epoch": 1.5937750144269254, + "grad_norm": 0.25212031602859497, + "learning_rate": 1.9132182254842138e-05, + "loss": 1.2784, + "step": 5351 + }, + { + "epoch": 1.594072860626594, + "grad_norm": 0.2245430201292038, + "learning_rate": 1.91317891797082e-05, + "loss": 1.2805, + "step": 5352 + }, + { + "epoch": 1.5943707068262625, + "grad_norm": 0.2176782637834549, + "learning_rate": 1.9131396019613163e-05, + "loss": 1.286, + "step": 5353 + }, + { + "epoch": 1.5946685530259312, + "grad_norm": 0.22105389833450317, + "learning_rate": 1.913100277456069e-05, + "loss": 1.2804, + "step": 5354 + }, + { + "epoch": 1.5949663992256, + "grad_norm": 0.20954161882400513, + "learning_rate": 1.913060944455444e-05, + "loss": 1.2603, + "step": 5355 + }, + { + "epoch": 1.5952642454252686, + "grad_norm": 0.2434096336364746, + "learning_rate": 1.9130216029598068e-05, + "loss": 1.283, + "step": 5356 + }, + { + "epoch": 1.5955620916249371, + "grad_norm": 0.25870367884635925, + "learning_rate": 1.9129822529695236e-05, + "loss": 1.2779, + "step": 5357 + }, + { + "epoch": 1.595859937824606, + "grad_norm": 0.24894842505455017, + "learning_rate": 1.912942894484961e-05, + "loss": 1.2618, + "step": 5358 + }, + { + "epoch": 1.5961577840242744, + "grad_norm": 0.2341824173927307, + "learning_rate": 1.912903527506484e-05, + "loss": 1.2819, + "step": 5359 + }, + { + "epoch": 1.596455630223943, + "grad_norm": 0.3381546437740326, + "learning_rate": 1.9128641520344602e-05, + "loss": 1.2792, + "step": 5360 + }, + { + "epoch": 1.5967534764236118, + "grad_norm": 0.25865137577056885, + "learning_rate": 1.9128247680692556e-05, + "loss": 1.2574, + "step": 5361 + }, + { + "epoch": 1.5970513226232805, + "grad_norm": 0.23169195652008057, + "learning_rate": 1.912785375611236e-05, + "loss": 1.2807, + "step": 5362 + }, + { + "epoch": 1.597349168822949, + "grad_norm": 0.21453149616718292, + "learning_rate": 1.9127459746607685e-05, + "loss": 1.2875, + "step": 5363 + }, + { + "epoch": 1.5976470150226176, + "grad_norm": 0.210612952709198, + "learning_rate": 1.9127065652182192e-05, + "loss": 1.2623, + "step": 5364 + }, + { + "epoch": 1.5979448612222864, + "grad_norm": 0.23503714799880981, + "learning_rate": 1.9126671472839553e-05, + "loss": 1.2784, + "step": 5365 + }, + { + "epoch": 1.598242707421955, + "grad_norm": 0.2105993777513504, + "learning_rate": 1.912627720858343e-05, + "loss": 1.2633, + "step": 5366 + }, + { + "epoch": 1.5985405536216235, + "grad_norm": 0.21981845796108246, + "learning_rate": 1.9125882859417497e-05, + "loss": 1.2679, + "step": 5367 + }, + { + "epoch": 1.5988383998212923, + "grad_norm": 0.22042503952980042, + "learning_rate": 1.912548842534542e-05, + "loss": 1.2765, + "step": 5368 + }, + { + "epoch": 1.599136246020961, + "grad_norm": 0.22172664105892181, + "learning_rate": 1.9125093906370866e-05, + "loss": 1.2834, + "step": 5369 + }, + { + "epoch": 1.5994340922206296, + "grad_norm": 0.21650701761245728, + "learning_rate": 1.9124699302497513e-05, + "loss": 1.2773, + "step": 5370 + }, + { + "epoch": 1.5997319384202981, + "grad_norm": 0.2130105346441269, + "learning_rate": 1.9124304613729025e-05, + "loss": 1.2701, + "step": 5371 + }, + { + "epoch": 1.600029784619967, + "grad_norm": 0.22273163497447968, + "learning_rate": 1.9123909840069076e-05, + "loss": 1.2689, + "step": 5372 + }, + { + "epoch": 1.6003276308196355, + "grad_norm": 0.2219657152891159, + "learning_rate": 1.912351498152134e-05, + "loss": 1.2946, + "step": 5373 + }, + { + "epoch": 1.600625477019304, + "grad_norm": 0.22636285424232483, + "learning_rate": 1.9123120038089487e-05, + "loss": 1.2741, + "step": 5374 + }, + { + "epoch": 1.6009233232189728, + "grad_norm": 0.2128158062696457, + "learning_rate": 1.91227250097772e-05, + "loss": 1.2898, + "step": 5375 + }, + { + "epoch": 1.6012211694186416, + "grad_norm": 0.2251681089401245, + "learning_rate": 1.912232989658815e-05, + "loss": 1.2741, + "step": 5376 + }, + { + "epoch": 1.6015190156183101, + "grad_norm": 0.2215256243944168, + "learning_rate": 1.9121934698526012e-05, + "loss": 1.2867, + "step": 5377 + }, + { + "epoch": 1.6018168618179787, + "grad_norm": 0.22041627764701843, + "learning_rate": 1.9121539415594462e-05, + "loss": 1.2929, + "step": 5378 + }, + { + "epoch": 1.6021147080176474, + "grad_norm": 0.2318020761013031, + "learning_rate": 1.9121144047797177e-05, + "loss": 1.2863, + "step": 5379 + }, + { + "epoch": 1.602412554217316, + "grad_norm": 0.23751366138458252, + "learning_rate": 1.912074859513784e-05, + "loss": 1.2709, + "step": 5380 + }, + { + "epoch": 1.6027104004169845, + "grad_norm": 0.22492028772830963, + "learning_rate": 1.9120353057620127e-05, + "loss": 1.2606, + "step": 5381 + }, + { + "epoch": 1.6030082466166533, + "grad_norm": 0.22688385844230652, + "learning_rate": 1.911995743524772e-05, + "loss": 1.2795, + "step": 5382 + }, + { + "epoch": 1.603306092816322, + "grad_norm": 0.2196384072303772, + "learning_rate": 1.9119561728024295e-05, + "loss": 1.2696, + "step": 5383 + }, + { + "epoch": 1.6036039390159906, + "grad_norm": 0.21395279467105865, + "learning_rate": 1.911916593595354e-05, + "loss": 1.2871, + "step": 5384 + }, + { + "epoch": 1.6039017852156592, + "grad_norm": 0.21836192905902863, + "learning_rate": 1.9118770059039133e-05, + "loss": 1.2748, + "step": 5385 + }, + { + "epoch": 1.604199631415328, + "grad_norm": 0.21494531631469727, + "learning_rate": 1.9118374097284758e-05, + "loss": 1.2866, + "step": 5386 + }, + { + "epoch": 1.6044974776149965, + "grad_norm": 0.23001225292682648, + "learning_rate": 1.9117978050694104e-05, + "loss": 1.2732, + "step": 5387 + }, + { + "epoch": 1.604795323814665, + "grad_norm": 0.2237677127122879, + "learning_rate": 1.911758191927085e-05, + "loss": 1.2887, + "step": 5388 + }, + { + "epoch": 1.6050931700143338, + "grad_norm": 0.21367740631103516, + "learning_rate": 1.911718570301868e-05, + "loss": 1.2662, + "step": 5389 + }, + { + "epoch": 1.6053910162140026, + "grad_norm": 0.2201278805732727, + "learning_rate": 1.9116789401941284e-05, + "loss": 1.2671, + "step": 5390 + }, + { + "epoch": 1.6056888624136711, + "grad_norm": 0.21882885694503784, + "learning_rate": 1.911639301604235e-05, + "loss": 1.2796, + "step": 5391 + }, + { + "epoch": 1.6059867086133397, + "grad_norm": 0.25696447491645813, + "learning_rate": 1.911599654532556e-05, + "loss": 1.2805, + "step": 5392 + }, + { + "epoch": 1.6062845548130085, + "grad_norm": 0.2266143560409546, + "learning_rate": 1.9115599989794615e-05, + "loss": 1.2896, + "step": 5393 + }, + { + "epoch": 1.6065824010126772, + "grad_norm": 0.24198952317237854, + "learning_rate": 1.911520334945319e-05, + "loss": 1.2588, + "step": 5394 + }, + { + "epoch": 1.6068802472123456, + "grad_norm": 0.23393751680850983, + "learning_rate": 1.911480662430499e-05, + "loss": 1.2726, + "step": 5395 + }, + { + "epoch": 1.6071780934120143, + "grad_norm": 0.2551330626010895, + "learning_rate": 1.911440981435369e-05, + "loss": 1.2901, + "step": 5396 + }, + { + "epoch": 1.6074759396116831, + "grad_norm": 0.21567265689373016, + "learning_rate": 1.911401291960299e-05, + "loss": 1.2778, + "step": 5397 + }, + { + "epoch": 1.6077737858113517, + "grad_norm": 0.24062630534172058, + "learning_rate": 1.9113615940056586e-05, + "loss": 1.2726, + "step": 5398 + }, + { + "epoch": 1.6080716320110202, + "grad_norm": 0.2298053652048111, + "learning_rate": 1.9113218875718166e-05, + "loss": 1.27, + "step": 5399 + }, + { + "epoch": 1.608369478210689, + "grad_norm": 0.22689305245876312, + "learning_rate": 1.9112821726591427e-05, + "loss": 1.2807, + "step": 5400 + }, + { + "epoch": 1.6086673244103578, + "grad_norm": 0.2515493333339691, + "learning_rate": 1.9112424492680064e-05, + "loss": 1.2797, + "step": 5401 + }, + { + "epoch": 1.608965170610026, + "grad_norm": 0.22785691916942596, + "learning_rate": 1.9112027173987774e-05, + "loss": 1.2781, + "step": 5402 + }, + { + "epoch": 1.6092630168096949, + "grad_norm": 0.21993227303028107, + "learning_rate": 1.9111629770518246e-05, + "loss": 1.2627, + "step": 5403 + }, + { + "epoch": 1.6095608630093636, + "grad_norm": 0.23092100024223328, + "learning_rate": 1.911123228227519e-05, + "loss": 1.2953, + "step": 5404 + }, + { + "epoch": 1.6098587092090322, + "grad_norm": 0.22650252282619476, + "learning_rate": 1.911083470926229e-05, + "loss": 1.2655, + "step": 5405 + }, + { + "epoch": 1.6101565554087007, + "grad_norm": 0.2112005203962326, + "learning_rate": 1.9110437051483256e-05, + "loss": 1.2698, + "step": 5406 + }, + { + "epoch": 1.6104544016083695, + "grad_norm": 0.2540183961391449, + "learning_rate": 1.9110039308941784e-05, + "loss": 1.2653, + "step": 5407 + }, + { + "epoch": 1.6107522478080383, + "grad_norm": 0.21758641302585602, + "learning_rate": 1.9109641481641575e-05, + "loss": 1.2744, + "step": 5408 + }, + { + "epoch": 1.6110500940077068, + "grad_norm": 0.21442918479442596, + "learning_rate": 1.9109243569586327e-05, + "loss": 1.2684, + "step": 5409 + }, + { + "epoch": 1.6113479402073754, + "grad_norm": 0.22990816831588745, + "learning_rate": 1.9108845572779748e-05, + "loss": 1.2761, + "step": 5410 + }, + { + "epoch": 1.6116457864070441, + "grad_norm": 0.22343116998672485, + "learning_rate": 1.910844749122554e-05, + "loss": 1.2808, + "step": 5411 + }, + { + "epoch": 1.6119436326067127, + "grad_norm": 0.22033905982971191, + "learning_rate": 1.9108049324927403e-05, + "loss": 1.2882, + "step": 5412 + }, + { + "epoch": 1.6122414788063812, + "grad_norm": 0.22083336114883423, + "learning_rate": 1.9107651073889038e-05, + "loss": 1.2621, + "step": 5413 + }, + { + "epoch": 1.61253932500605, + "grad_norm": 0.20866520702838898, + "learning_rate": 1.910725273811416e-05, + "loss": 1.2732, + "step": 5414 + }, + { + "epoch": 1.6128371712057188, + "grad_norm": 0.22707264125347137, + "learning_rate": 1.910685431760647e-05, + "loss": 1.2922, + "step": 5415 + }, + { + "epoch": 1.6131350174053873, + "grad_norm": 0.2184765338897705, + "learning_rate": 1.9106455812369677e-05, + "loss": 1.263, + "step": 5416 + }, + { + "epoch": 1.613432863605056, + "grad_norm": 0.23042844235897064, + "learning_rate": 1.9106057222407486e-05, + "loss": 1.2661, + "step": 5417 + }, + { + "epoch": 1.6137307098047247, + "grad_norm": 0.22253674268722534, + "learning_rate": 1.9105658547723607e-05, + "loss": 1.2839, + "step": 5418 + }, + { + "epoch": 1.6140285560043932, + "grad_norm": 0.22392839193344116, + "learning_rate": 1.9105259788321746e-05, + "loss": 1.2831, + "step": 5419 + }, + { + "epoch": 1.6143264022040618, + "grad_norm": 0.21724386513233185, + "learning_rate": 1.9104860944205615e-05, + "loss": 1.2766, + "step": 5420 + }, + { + "epoch": 1.6146242484037305, + "grad_norm": 0.2196635752916336, + "learning_rate": 1.910446201537893e-05, + "loss": 1.2854, + "step": 5421 + }, + { + "epoch": 1.6149220946033993, + "grad_norm": 0.2713010609149933, + "learning_rate": 1.9104063001845398e-05, + "loss": 1.2843, + "step": 5422 + }, + { + "epoch": 1.6152199408030679, + "grad_norm": 0.22543871402740479, + "learning_rate": 1.9103663903608728e-05, + "loss": 1.2726, + "step": 5423 + }, + { + "epoch": 1.6155177870027364, + "grad_norm": 0.2172880917787552, + "learning_rate": 1.9103264720672635e-05, + "loss": 1.2741, + "step": 5424 + }, + { + "epoch": 1.6158156332024052, + "grad_norm": 0.23078133165836334, + "learning_rate": 1.910286545304084e-05, + "loss": 1.2789, + "step": 5425 + }, + { + "epoch": 1.6161134794020737, + "grad_norm": 0.22541062533855438, + "learning_rate": 1.9102466100717048e-05, + "loss": 1.2723, + "step": 5426 + }, + { + "epoch": 1.6164113256017423, + "grad_norm": 0.23872913420200348, + "learning_rate": 1.910206666370498e-05, + "loss": 1.2655, + "step": 5427 + }, + { + "epoch": 1.616709171801411, + "grad_norm": 0.24908356368541718, + "learning_rate": 1.9101667142008353e-05, + "loss": 1.2914, + "step": 5428 + }, + { + "epoch": 1.6170070180010798, + "grad_norm": 0.2230306714773178, + "learning_rate": 1.9101267535630882e-05, + "loss": 1.2852, + "step": 5429 + }, + { + "epoch": 1.6173048642007484, + "grad_norm": 0.21355973184108734, + "learning_rate": 1.910086784457628e-05, + "loss": 1.2782, + "step": 5430 + }, + { + "epoch": 1.617602710400417, + "grad_norm": 0.2299388200044632, + "learning_rate": 1.9100468068848275e-05, + "loss": 1.2765, + "step": 5431 + }, + { + "epoch": 1.6179005566000857, + "grad_norm": 0.23457807302474976, + "learning_rate": 1.9100068208450583e-05, + "loss": 1.2703, + "step": 5432 + }, + { + "epoch": 1.6181984027997542, + "grad_norm": 0.2189137488603592, + "learning_rate": 1.909966826338692e-05, + "loss": 1.2696, + "step": 5433 + }, + { + "epoch": 1.6184962489994228, + "grad_norm": 0.24128244817256927, + "learning_rate": 1.9099268233661016e-05, + "loss": 1.2749, + "step": 5434 + }, + { + "epoch": 1.6187940951990916, + "grad_norm": 0.2196899801492691, + "learning_rate": 1.9098868119276585e-05, + "loss": 1.2658, + "step": 5435 + }, + { + "epoch": 1.6190919413987603, + "grad_norm": 0.2194526493549347, + "learning_rate": 1.909846792023735e-05, + "loss": 1.2688, + "step": 5436 + }, + { + "epoch": 1.6193897875984289, + "grad_norm": 0.22628456354141235, + "learning_rate": 1.9098067636547038e-05, + "loss": 1.2692, + "step": 5437 + }, + { + "epoch": 1.6196876337980974, + "grad_norm": 0.2313448041677475, + "learning_rate": 1.909766726820937e-05, + "loss": 1.2814, + "step": 5438 + }, + { + "epoch": 1.6199854799977662, + "grad_norm": 0.24621717631816864, + "learning_rate": 1.9097266815228074e-05, + "loss": 1.2846, + "step": 5439 + }, + { + "epoch": 1.6202833261974348, + "grad_norm": 0.22433489561080933, + "learning_rate": 1.9096866277606873e-05, + "loss": 1.2617, + "step": 5440 + }, + { + "epoch": 1.6205811723971033, + "grad_norm": 0.2487044781446457, + "learning_rate": 1.9096465655349496e-05, + "loss": 1.2588, + "step": 5441 + }, + { + "epoch": 1.620879018596772, + "grad_norm": 0.21417568624019623, + "learning_rate": 1.909606494845967e-05, + "loss": 1.2712, + "step": 5442 + }, + { + "epoch": 1.6211768647964409, + "grad_norm": 0.23787125945091248, + "learning_rate": 1.9095664156941123e-05, + "loss": 1.2558, + "step": 5443 + }, + { + "epoch": 1.6214747109961094, + "grad_norm": 0.243920236825943, + "learning_rate": 1.9095263280797584e-05, + "loss": 1.2645, + "step": 5444 + }, + { + "epoch": 1.621772557195778, + "grad_norm": 0.24374152719974518, + "learning_rate": 1.909486232003278e-05, + "loss": 1.2905, + "step": 5445 + }, + { + "epoch": 1.6220704033954467, + "grad_norm": 0.23641370236873627, + "learning_rate": 1.909446127465044e-05, + "loss": 1.265, + "step": 5446 + }, + { + "epoch": 1.6223682495951153, + "grad_norm": 0.21501170098781586, + "learning_rate": 1.9094060144654306e-05, + "loss": 1.2737, + "step": 5447 + }, + { + "epoch": 1.6226660957947838, + "grad_norm": 0.2302922159433365, + "learning_rate": 1.9093658930048097e-05, + "loss": 1.2672, + "step": 5448 + }, + { + "epoch": 1.6229639419944526, + "grad_norm": 0.24240373075008392, + "learning_rate": 1.9093257630835554e-05, + "loss": 1.2801, + "step": 5449 + }, + { + "epoch": 1.6232617881941214, + "grad_norm": 0.22276267409324646, + "learning_rate": 1.909285624702041e-05, + "loss": 1.2984, + "step": 5450 + }, + { + "epoch": 1.62355963439379, + "grad_norm": 0.2276402711868286, + "learning_rate": 1.9092454778606395e-05, + "loss": 1.2742, + "step": 5451 + }, + { + "epoch": 1.6238574805934585, + "grad_norm": 0.23029804229736328, + "learning_rate": 1.9092053225597245e-05, + "loss": 1.2524, + "step": 5452 + }, + { + "epoch": 1.6241553267931272, + "grad_norm": 0.22955401241779327, + "learning_rate": 1.9091651587996704e-05, + "loss": 1.261, + "step": 5453 + }, + { + "epoch": 1.6244531729927958, + "grad_norm": 0.22547177970409393, + "learning_rate": 1.9091249865808498e-05, + "loss": 1.2697, + "step": 5454 + }, + { + "epoch": 1.6247510191924643, + "grad_norm": 0.23433789610862732, + "learning_rate": 1.9090848059036372e-05, + "loss": 1.276, + "step": 5455 + }, + { + "epoch": 1.6250488653921331, + "grad_norm": 0.22249680757522583, + "learning_rate": 1.9090446167684062e-05, + "loss": 1.2781, + "step": 5456 + }, + { + "epoch": 1.6253467115918019, + "grad_norm": 0.23831702768802643, + "learning_rate": 1.9090044191755305e-05, + "loss": 1.2812, + "step": 5457 + }, + { + "epoch": 1.6256445577914704, + "grad_norm": 0.21816882491111755, + "learning_rate": 1.9089642131253843e-05, + "loss": 1.2686, + "step": 5458 + }, + { + "epoch": 1.625942403991139, + "grad_norm": 0.2549525201320648, + "learning_rate": 1.9089239986183415e-05, + "loss": 1.2765, + "step": 5459 + }, + { + "epoch": 1.6262402501908078, + "grad_norm": 0.2201966941356659, + "learning_rate": 1.9088837756547765e-05, + "loss": 1.2544, + "step": 5460 + }, + { + "epoch": 1.6265380963904765, + "grad_norm": 0.22870147228240967, + "learning_rate": 1.9088435442350638e-05, + "loss": 1.2857, + "step": 5461 + }, + { + "epoch": 1.6268359425901449, + "grad_norm": 0.2498757541179657, + "learning_rate": 1.908803304359577e-05, + "loss": 1.2667, + "step": 5462 + }, + { + "epoch": 1.6271337887898136, + "grad_norm": 0.21831880509853363, + "learning_rate": 1.9087630560286906e-05, + "loss": 1.2645, + "step": 5463 + }, + { + "epoch": 1.6274316349894824, + "grad_norm": 0.229718878865242, + "learning_rate": 1.9087227992427796e-05, + "loss": 1.2711, + "step": 5464 + }, + { + "epoch": 1.627729481189151, + "grad_norm": 0.22852644324302673, + "learning_rate": 1.9086825340022183e-05, + "loss": 1.2653, + "step": 5465 + }, + { + "epoch": 1.6280273273888195, + "grad_norm": 0.21467527747154236, + "learning_rate": 1.9086422603073812e-05, + "loss": 1.286, + "step": 5466 + }, + { + "epoch": 1.6283251735884883, + "grad_norm": 0.2270808070898056, + "learning_rate": 1.908601978158643e-05, + "loss": 1.2717, + "step": 5467 + }, + { + "epoch": 1.628623019788157, + "grad_norm": 0.23573139309883118, + "learning_rate": 1.9085616875563788e-05, + "loss": 1.2708, + "step": 5468 + }, + { + "epoch": 1.6289208659878254, + "grad_norm": 0.22102656960487366, + "learning_rate": 1.908521388500963e-05, + "loss": 1.2806, + "step": 5469 + }, + { + "epoch": 1.6292187121874941, + "grad_norm": 0.223962664604187, + "learning_rate": 1.9084810809927708e-05, + "loss": 1.2603, + "step": 5470 + }, + { + "epoch": 1.629516558387163, + "grad_norm": 0.24056734144687653, + "learning_rate": 1.908440765032177e-05, + "loss": 1.2711, + "step": 5471 + }, + { + "epoch": 1.6298144045868315, + "grad_norm": 0.23388129472732544, + "learning_rate": 1.9084004406195568e-05, + "loss": 1.2893, + "step": 5472 + }, + { + "epoch": 1.6301122507865, + "grad_norm": 0.21446532011032104, + "learning_rate": 1.9083601077552857e-05, + "loss": 1.2731, + "step": 5473 + }, + { + "epoch": 1.6304100969861688, + "grad_norm": 0.23439285159111023, + "learning_rate": 1.9083197664397386e-05, + "loss": 1.2745, + "step": 5474 + }, + { + "epoch": 1.6307079431858376, + "grad_norm": 0.24088014662265778, + "learning_rate": 1.9082794166732906e-05, + "loss": 1.2797, + "step": 5475 + }, + { + "epoch": 1.631005789385506, + "grad_norm": 0.21608074009418488, + "learning_rate": 1.9082390584563178e-05, + "loss": 1.2807, + "step": 5476 + }, + { + "epoch": 1.6313036355851747, + "grad_norm": 0.228294238448143, + "learning_rate": 1.9081986917891952e-05, + "loss": 1.2894, + "step": 5477 + }, + { + "epoch": 1.6316014817848434, + "grad_norm": 0.2253601998090744, + "learning_rate": 1.9081583166722986e-05, + "loss": 1.2725, + "step": 5478 + }, + { + "epoch": 1.631899327984512, + "grad_norm": 0.2199680656194687, + "learning_rate": 1.908117933106003e-05, + "loss": 1.272, + "step": 5479 + }, + { + "epoch": 1.6321971741841805, + "grad_norm": 0.22012652456760406, + "learning_rate": 1.9080775410906854e-05, + "loss": 1.2612, + "step": 5480 + }, + { + "epoch": 1.6324950203838493, + "grad_norm": 0.2143363207578659, + "learning_rate": 1.9080371406267205e-05, + "loss": 1.2676, + "step": 5481 + }, + { + "epoch": 1.632792866583518, + "grad_norm": 0.23318170011043549, + "learning_rate": 1.907996731714484e-05, + "loss": 1.2822, + "step": 5482 + }, + { + "epoch": 1.6330907127831866, + "grad_norm": 0.2162562608718872, + "learning_rate": 1.907956314354353e-05, + "loss": 1.2734, + "step": 5483 + }, + { + "epoch": 1.6333885589828552, + "grad_norm": 0.22819747030735016, + "learning_rate": 1.9079158885467027e-05, + "loss": 1.2855, + "step": 5484 + }, + { + "epoch": 1.633686405182524, + "grad_norm": 0.25651898980140686, + "learning_rate": 1.90787545429191e-05, + "loss": 1.2655, + "step": 5485 + }, + { + "epoch": 1.6339842513821925, + "grad_norm": 0.22409702837467194, + "learning_rate": 1.9078350115903496e-05, + "loss": 1.2695, + "step": 5486 + }, + { + "epoch": 1.634282097581861, + "grad_norm": 0.23563243448734283, + "learning_rate": 1.9077945604423994e-05, + "loss": 1.2765, + "step": 5487 + }, + { + "epoch": 1.6345799437815298, + "grad_norm": 0.2208978831768036, + "learning_rate": 1.9077541008484347e-05, + "loss": 1.2881, + "step": 5488 + }, + { + "epoch": 1.6348777899811986, + "grad_norm": 0.23069295287132263, + "learning_rate": 1.9077136328088325e-05, + "loss": 1.2965, + "step": 5489 + }, + { + "epoch": 1.6351756361808671, + "grad_norm": 0.2202451229095459, + "learning_rate": 1.907673156323969e-05, + "loss": 1.27, + "step": 5490 + }, + { + "epoch": 1.6354734823805357, + "grad_norm": 0.21095435321331024, + "learning_rate": 1.9076326713942208e-05, + "loss": 1.2546, + "step": 5491 + }, + { + "epoch": 1.6357713285802045, + "grad_norm": 0.2126925140619278, + "learning_rate": 1.9075921780199648e-05, + "loss": 1.2693, + "step": 5492 + }, + { + "epoch": 1.636069174779873, + "grad_norm": 0.21833592653274536, + "learning_rate": 1.9075516762015777e-05, + "loss": 1.2755, + "step": 5493 + }, + { + "epoch": 1.6363670209795416, + "grad_norm": 0.22189980745315552, + "learning_rate": 1.907511165939436e-05, + "loss": 1.2861, + "step": 5494 + }, + { + "epoch": 1.6366648671792103, + "grad_norm": 0.21767203509807587, + "learning_rate": 1.907470647233917e-05, + "loss": 1.2742, + "step": 5495 + }, + { + "epoch": 1.636962713378879, + "grad_norm": 0.21431377530097961, + "learning_rate": 1.9074301200853976e-05, + "loss": 1.2577, + "step": 5496 + }, + { + "epoch": 1.6372605595785477, + "grad_norm": 0.2187560349702835, + "learning_rate": 1.9073895844942548e-05, + "loss": 1.2672, + "step": 5497 + }, + { + "epoch": 1.6375584057782162, + "grad_norm": 0.22370322048664093, + "learning_rate": 1.9073490404608654e-05, + "loss": 1.2764, + "step": 5498 + }, + { + "epoch": 1.637856251977885, + "grad_norm": 0.2242758572101593, + "learning_rate": 1.907308487985607e-05, + "loss": 1.2857, + "step": 5499 + }, + { + "epoch": 1.6381540981775535, + "grad_norm": 0.20985734462738037, + "learning_rate": 1.907267927068857e-05, + "loss": 1.2697, + "step": 5500 + }, + { + "epoch": 1.6381540981775535, + "eval_loss": 1.3477442264556885, + "eval_runtime": 21.5816, + "eval_samples_per_second": 80.346, + "eval_steps_per_second": 5.051, + "step": 5500 + }, + { + "epoch": 1.638451944377222, + "grad_norm": 0.22465425729751587, + "learning_rate": 1.9072273577109923e-05, + "loss": 1.2718, + "step": 5501 + }, + { + "epoch": 1.6387497905768909, + "grad_norm": 0.23549233376979828, + "learning_rate": 1.9071867799123913e-05, + "loss": 1.2783, + "step": 5502 + }, + { + "epoch": 1.6390476367765596, + "grad_norm": 0.22173607349395752, + "learning_rate": 1.90714619367343e-05, + "loss": 1.2814, + "step": 5503 + }, + { + "epoch": 1.6393454829762282, + "grad_norm": 0.24669671058654785, + "learning_rate": 1.9071055989944873e-05, + "loss": 1.2796, + "step": 5504 + }, + { + "epoch": 1.6396433291758967, + "grad_norm": 0.2293134182691574, + "learning_rate": 1.9070649958759406e-05, + "loss": 1.2793, + "step": 5505 + }, + { + "epoch": 1.6399411753755655, + "grad_norm": 0.23103909194469452, + "learning_rate": 1.9070243843181675e-05, + "loss": 1.2993, + "step": 5506 + }, + { + "epoch": 1.640239021575234, + "grad_norm": 0.21258032321929932, + "learning_rate": 1.9069837643215457e-05, + "loss": 1.2903, + "step": 5507 + }, + { + "epoch": 1.6405368677749026, + "grad_norm": 0.2531338334083557, + "learning_rate": 1.9069431358864535e-05, + "loss": 1.2822, + "step": 5508 + }, + { + "epoch": 1.6408347139745714, + "grad_norm": 0.21448421478271484, + "learning_rate": 1.9069024990132688e-05, + "loss": 1.269, + "step": 5509 + }, + { + "epoch": 1.6411325601742401, + "grad_norm": 0.21856717765331268, + "learning_rate": 1.9068618537023695e-05, + "loss": 1.2749, + "step": 5510 + }, + { + "epoch": 1.6414304063739087, + "grad_norm": 0.23338472843170166, + "learning_rate": 1.9068211999541336e-05, + "loss": 1.2791, + "step": 5511 + }, + { + "epoch": 1.6417282525735772, + "grad_norm": 0.22949013113975525, + "learning_rate": 1.90678053776894e-05, + "loss": 1.2751, + "step": 5512 + }, + { + "epoch": 1.642026098773246, + "grad_norm": 0.25006377696990967, + "learning_rate": 1.906739867147166e-05, + "loss": 1.2781, + "step": 5513 + }, + { + "epoch": 1.6423239449729146, + "grad_norm": 0.23023587465286255, + "learning_rate": 1.906699188089191e-05, + "loss": 1.2832, + "step": 5514 + }, + { + "epoch": 1.6426217911725831, + "grad_norm": 0.2393902689218521, + "learning_rate": 1.9066585005953934e-05, + "loss": 1.2796, + "step": 5515 + }, + { + "epoch": 1.6429196373722519, + "grad_norm": 0.22710615396499634, + "learning_rate": 1.906617804666151e-05, + "loss": 1.2771, + "step": 5516 + }, + { + "epoch": 1.6432174835719207, + "grad_norm": 0.3001965582370758, + "learning_rate": 1.9065771003018433e-05, + "loss": 1.2713, + "step": 5517 + }, + { + "epoch": 1.6435153297715892, + "grad_norm": 0.2223108857870102, + "learning_rate": 1.906536387502848e-05, + "loss": 1.2755, + "step": 5518 + }, + { + "epoch": 1.6438131759712578, + "grad_norm": 0.2387503832578659, + "learning_rate": 1.906495666269545e-05, + "loss": 1.2922, + "step": 5519 + }, + { + "epoch": 1.6441110221709265, + "grad_norm": 0.23094727098941803, + "learning_rate": 1.9064549366023124e-05, + "loss": 1.2711, + "step": 5520 + }, + { + "epoch": 1.6444088683705953, + "grad_norm": 0.22692003846168518, + "learning_rate": 1.9064141985015293e-05, + "loss": 1.2765, + "step": 5521 + }, + { + "epoch": 1.6447067145702636, + "grad_norm": 0.2327008694410324, + "learning_rate": 1.9063734519675748e-05, + "loss": 1.2811, + "step": 5522 + }, + { + "epoch": 1.6450045607699324, + "grad_norm": 0.23037296533584595, + "learning_rate": 1.906332697000828e-05, + "loss": 1.286, + "step": 5523 + }, + { + "epoch": 1.6453024069696012, + "grad_norm": 0.22375428676605225, + "learning_rate": 1.906291933601668e-05, + "loss": 1.2741, + "step": 5524 + }, + { + "epoch": 1.6456002531692697, + "grad_norm": 0.22224177420139313, + "learning_rate": 1.9062511617704743e-05, + "loss": 1.2722, + "step": 5525 + }, + { + "epoch": 1.6458980993689383, + "grad_norm": 0.22655761241912842, + "learning_rate": 1.9062103815076257e-05, + "loss": 1.2644, + "step": 5526 + }, + { + "epoch": 1.646195945568607, + "grad_norm": 0.22255229949951172, + "learning_rate": 1.906169592813502e-05, + "loss": 1.271, + "step": 5527 + }, + { + "epoch": 1.6464937917682758, + "grad_norm": 0.2182021141052246, + "learning_rate": 1.9061287956884834e-05, + "loss": 1.2652, + "step": 5528 + }, + { + "epoch": 1.6467916379679441, + "grad_norm": 0.23365797102451324, + "learning_rate": 1.9060879901329482e-05, + "loss": 1.2886, + "step": 5529 + }, + { + "epoch": 1.647089484167613, + "grad_norm": 0.2474723905324936, + "learning_rate": 1.9060471761472766e-05, + "loss": 1.2832, + "step": 5530 + }, + { + "epoch": 1.6473873303672817, + "grad_norm": 0.20822176337242126, + "learning_rate": 1.9060063537318484e-05, + "loss": 1.2621, + "step": 5531 + }, + { + "epoch": 1.6476851765669502, + "grad_norm": 0.22447814047336578, + "learning_rate": 1.9059655228870434e-05, + "loss": 1.276, + "step": 5532 + }, + { + "epoch": 1.6479830227666188, + "grad_norm": 0.22813937067985535, + "learning_rate": 1.905924683613241e-05, + "loss": 1.2783, + "step": 5533 + }, + { + "epoch": 1.6482808689662876, + "grad_norm": 0.22755442559719086, + "learning_rate": 1.905883835910822e-05, + "loss": 1.2882, + "step": 5534 + }, + { + "epoch": 1.6485787151659563, + "grad_norm": 0.22414061427116394, + "learning_rate": 1.9058429797801657e-05, + "loss": 1.2656, + "step": 5535 + }, + { + "epoch": 1.6488765613656247, + "grad_norm": 0.22432328760623932, + "learning_rate": 1.9058021152216527e-05, + "loss": 1.2664, + "step": 5536 + }, + { + "epoch": 1.6491744075652934, + "grad_norm": 0.2137758731842041, + "learning_rate": 1.9057612422356633e-05, + "loss": 1.266, + "step": 5537 + }, + { + "epoch": 1.6494722537649622, + "grad_norm": 0.22579067945480347, + "learning_rate": 1.9057203608225773e-05, + "loss": 1.2476, + "step": 5538 + }, + { + "epoch": 1.6497700999646308, + "grad_norm": 0.2272815704345703, + "learning_rate": 1.9056794709827752e-05, + "loss": 1.2715, + "step": 5539 + }, + { + "epoch": 1.6500679461642993, + "grad_norm": 0.22341889142990112, + "learning_rate": 1.9056385727166376e-05, + "loss": 1.2826, + "step": 5540 + }, + { + "epoch": 1.650365792363968, + "grad_norm": 0.24031281471252441, + "learning_rate": 1.905597666024545e-05, + "loss": 1.268, + "step": 5541 + }, + { + "epoch": 1.6506636385636368, + "grad_norm": 0.24679391086101532, + "learning_rate": 1.9055567509068777e-05, + "loss": 1.3016, + "step": 5542 + }, + { + "epoch": 1.6509614847633054, + "grad_norm": 0.23233523964881897, + "learning_rate": 1.905515827364017e-05, + "loss": 1.2812, + "step": 5543 + }, + { + "epoch": 1.651259330962974, + "grad_norm": 0.21395809948444366, + "learning_rate": 1.9054748953963427e-05, + "loss": 1.2742, + "step": 5544 + }, + { + "epoch": 1.6515571771626427, + "grad_norm": 0.2266373187303543, + "learning_rate": 1.9054339550042364e-05, + "loss": 1.2536, + "step": 5545 + }, + { + "epoch": 1.6518550233623113, + "grad_norm": 0.2295445203781128, + "learning_rate": 1.9053930061880788e-05, + "loss": 1.2784, + "step": 5546 + }, + { + "epoch": 1.6521528695619798, + "grad_norm": 0.22353899478912354, + "learning_rate": 1.9053520489482506e-05, + "loss": 1.2576, + "step": 5547 + }, + { + "epoch": 1.6524507157616486, + "grad_norm": 0.22203253209590912, + "learning_rate": 1.9053110832851335e-05, + "loss": 1.2885, + "step": 5548 + }, + { + "epoch": 1.6527485619613174, + "grad_norm": 0.2155544012784958, + "learning_rate": 1.905270109199108e-05, + "loss": 1.2789, + "step": 5549 + }, + { + "epoch": 1.653046408160986, + "grad_norm": 0.22099432349205017, + "learning_rate": 1.9052291266905553e-05, + "loss": 1.2626, + "step": 5550 + }, + { + "epoch": 1.6533442543606545, + "grad_norm": 0.22569872438907623, + "learning_rate": 1.9051881357598575e-05, + "loss": 1.2599, + "step": 5551 + }, + { + "epoch": 1.6536421005603232, + "grad_norm": 0.23275689780712128, + "learning_rate": 1.9051471364073954e-05, + "loss": 1.2762, + "step": 5552 + }, + { + "epoch": 1.6539399467599918, + "grad_norm": 0.22771941125392914, + "learning_rate": 1.9051061286335498e-05, + "loss": 1.2906, + "step": 5553 + }, + { + "epoch": 1.6542377929596603, + "grad_norm": 0.21359166502952576, + "learning_rate": 1.9050651124387035e-05, + "loss": 1.261, + "step": 5554 + }, + { + "epoch": 1.654535639159329, + "grad_norm": 0.2313777059316635, + "learning_rate": 1.9050240878232375e-05, + "loss": 1.2778, + "step": 5555 + }, + { + "epoch": 1.6548334853589979, + "grad_norm": 0.2221272587776184, + "learning_rate": 1.9049830547875334e-05, + "loss": 1.2712, + "step": 5556 + }, + { + "epoch": 1.6551313315586664, + "grad_norm": 0.20936013758182526, + "learning_rate": 1.9049420133319732e-05, + "loss": 1.2664, + "step": 5557 + }, + { + "epoch": 1.655429177758335, + "grad_norm": 0.2277703881263733, + "learning_rate": 1.9049009634569384e-05, + "loss": 1.2766, + "step": 5558 + }, + { + "epoch": 1.6557270239580038, + "grad_norm": 0.2331382781267166, + "learning_rate": 1.904859905162811e-05, + "loss": 1.2557, + "step": 5559 + }, + { + "epoch": 1.6560248701576723, + "grad_norm": 0.2420787364244461, + "learning_rate": 1.9048188384499736e-05, + "loss": 1.2863, + "step": 5560 + }, + { + "epoch": 1.6563227163573409, + "grad_norm": 0.21740663051605225, + "learning_rate": 1.9047777633188077e-05, + "loss": 1.2483, + "step": 5561 + }, + { + "epoch": 1.6566205625570096, + "grad_norm": 0.23249472677707672, + "learning_rate": 1.9047366797696954e-05, + "loss": 1.2588, + "step": 5562 + }, + { + "epoch": 1.6569184087566784, + "grad_norm": 0.21488188207149506, + "learning_rate": 1.9046955878030195e-05, + "loss": 1.2798, + "step": 5563 + }, + { + "epoch": 1.657216254956347, + "grad_norm": 0.24667470157146454, + "learning_rate": 1.9046544874191617e-05, + "loss": 1.2621, + "step": 5564 + }, + { + "epoch": 1.6575141011560155, + "grad_norm": 0.23690661787986755, + "learning_rate": 1.9046133786185045e-05, + "loss": 1.2844, + "step": 5565 + }, + { + "epoch": 1.6578119473556843, + "grad_norm": 0.23039014637470245, + "learning_rate": 1.904572261401431e-05, + "loss": 1.2688, + "step": 5566 + }, + { + "epoch": 1.6581097935553528, + "grad_norm": 0.24273911118507385, + "learning_rate": 1.904531135768323e-05, + "loss": 1.2863, + "step": 5567 + }, + { + "epoch": 1.6584076397550214, + "grad_norm": 0.21636506915092468, + "learning_rate": 1.9044900017195627e-05, + "loss": 1.2591, + "step": 5568 + }, + { + "epoch": 1.6587054859546901, + "grad_norm": 0.22625398635864258, + "learning_rate": 1.9044488592555343e-05, + "loss": 1.2691, + "step": 5569 + }, + { + "epoch": 1.659003332154359, + "grad_norm": 0.23376969993114471, + "learning_rate": 1.90440770837662e-05, + "loss": 1.2652, + "step": 5570 + }, + { + "epoch": 1.6593011783540275, + "grad_norm": 0.21269488334655762, + "learning_rate": 1.9043665490832018e-05, + "loss": 1.2799, + "step": 5571 + }, + { + "epoch": 1.659599024553696, + "grad_norm": 0.22330698370933533, + "learning_rate": 1.9043253813756634e-05, + "loss": 1.2787, + "step": 5572 + }, + { + "epoch": 1.6598968707533648, + "grad_norm": 0.21178990602493286, + "learning_rate": 1.9042842052543878e-05, + "loss": 1.2599, + "step": 5573 + }, + { + "epoch": 1.6601947169530333, + "grad_norm": 0.21995270252227783, + "learning_rate": 1.904243020719758e-05, + "loss": 1.2697, + "step": 5574 + }, + { + "epoch": 1.6604925631527019, + "grad_norm": 0.25758859515190125, + "learning_rate": 1.904201827772157e-05, + "loss": 1.2784, + "step": 5575 + }, + { + "epoch": 1.6607904093523707, + "grad_norm": 0.3004457652568817, + "learning_rate": 1.9041606264119683e-05, + "loss": 1.2754, + "step": 5576 + }, + { + "epoch": 1.6610882555520394, + "grad_norm": 0.27928078174591064, + "learning_rate": 1.9041194166395753e-05, + "loss": 1.2803, + "step": 5577 + }, + { + "epoch": 1.661386101751708, + "grad_norm": 0.22976548969745636, + "learning_rate": 1.9040781984553612e-05, + "loss": 1.2912, + "step": 5578 + }, + { + "epoch": 1.6616839479513765, + "grad_norm": 0.3062029480934143, + "learning_rate": 1.9040369718597098e-05, + "loss": 1.2855, + "step": 5579 + }, + { + "epoch": 1.6619817941510453, + "grad_norm": 0.2543890178203583, + "learning_rate": 1.903995736853004e-05, + "loss": 1.2666, + "step": 5580 + }, + { + "epoch": 1.6622796403507138, + "grad_norm": 0.22848451137542725, + "learning_rate": 1.903954493435628e-05, + "loss": 1.2911, + "step": 5581 + }, + { + "epoch": 1.6625774865503824, + "grad_norm": 0.22023822367191315, + "learning_rate": 1.9039132416079656e-05, + "loss": 1.2682, + "step": 5582 + }, + { + "epoch": 1.6628753327500512, + "grad_norm": 0.24857406318187714, + "learning_rate": 1.9038719813704e-05, + "loss": 1.264, + "step": 5583 + }, + { + "epoch": 1.66317317894972, + "grad_norm": 0.24410122632980347, + "learning_rate": 1.9038307127233157e-05, + "loss": 1.2985, + "step": 5584 + }, + { + "epoch": 1.6634710251493885, + "grad_norm": 0.2163051813840866, + "learning_rate": 1.9037894356670964e-05, + "loss": 1.2765, + "step": 5585 + }, + { + "epoch": 1.663768871349057, + "grad_norm": 0.22405213117599487, + "learning_rate": 1.9037481502021266e-05, + "loss": 1.2696, + "step": 5586 + }, + { + "epoch": 1.6640667175487258, + "grad_norm": 0.2406502664089203, + "learning_rate": 1.9037068563287894e-05, + "loss": 1.2746, + "step": 5587 + }, + { + "epoch": 1.6643645637483946, + "grad_norm": 0.23189301788806915, + "learning_rate": 1.90366555404747e-05, + "loss": 1.2684, + "step": 5588 + }, + { + "epoch": 1.664662409948063, + "grad_norm": 0.21800807118415833, + "learning_rate": 1.9036242433585517e-05, + "loss": 1.2672, + "step": 5589 + }, + { + "epoch": 1.6649602561477317, + "grad_norm": 0.2630351781845093, + "learning_rate": 1.90358292426242e-05, + "loss": 1.2678, + "step": 5590 + }, + { + "epoch": 1.6652581023474005, + "grad_norm": 0.22028449177742004, + "learning_rate": 1.9035415967594587e-05, + "loss": 1.2878, + "step": 5591 + }, + { + "epoch": 1.665555948547069, + "grad_norm": 0.22171668708324432, + "learning_rate": 1.903500260850052e-05, + "loss": 1.2765, + "step": 5592 + }, + { + "epoch": 1.6658537947467376, + "grad_norm": 0.22152946889400482, + "learning_rate": 1.9034589165345848e-05, + "loss": 1.2744, + "step": 5593 + }, + { + "epoch": 1.6661516409464063, + "grad_norm": 0.2194407433271408, + "learning_rate": 1.903417563813442e-05, + "loss": 1.2786, + "step": 5594 + }, + { + "epoch": 1.666449487146075, + "grad_norm": 0.22076888382434845, + "learning_rate": 1.903376202687008e-05, + "loss": 1.2613, + "step": 5595 + }, + { + "epoch": 1.6667473333457434, + "grad_norm": 0.22642038762569427, + "learning_rate": 1.903334833155668e-05, + "loss": 1.2887, + "step": 5596 + }, + { + "epoch": 1.6670451795454122, + "grad_norm": 0.23118306696414948, + "learning_rate": 1.9032934552198066e-05, + "loss": 1.2798, + "step": 5597 + }, + { + "epoch": 1.667343025745081, + "grad_norm": 0.22751539945602417, + "learning_rate": 1.9032520688798085e-05, + "loss": 1.2701, + "step": 5598 + }, + { + "epoch": 1.6676408719447495, + "grad_norm": 0.21959461271762848, + "learning_rate": 1.9032106741360593e-05, + "loss": 1.2881, + "step": 5599 + }, + { + "epoch": 1.667938718144418, + "grad_norm": 0.23361878097057343, + "learning_rate": 1.9031692709889437e-05, + "loss": 1.2481, + "step": 5600 + }, + { + "epoch": 1.6682365643440868, + "grad_norm": 0.21968644857406616, + "learning_rate": 1.9031278594388472e-05, + "loss": 1.289, + "step": 5601 + }, + { + "epoch": 1.6685344105437556, + "grad_norm": 0.21639177203178406, + "learning_rate": 1.903086439486155e-05, + "loss": 1.2789, + "step": 5602 + }, + { + "epoch": 1.6688322567434242, + "grad_norm": 0.22558465600013733, + "learning_rate": 1.9030450111312526e-05, + "loss": 1.2613, + "step": 5603 + }, + { + "epoch": 1.6691301029430927, + "grad_norm": 0.2187129706144333, + "learning_rate": 1.9030035743745253e-05, + "loss": 1.284, + "step": 5604 + }, + { + "epoch": 1.6694279491427615, + "grad_norm": 0.23081013560295105, + "learning_rate": 1.9029621292163587e-05, + "loss": 1.2707, + "step": 5605 + }, + { + "epoch": 1.66972579534243, + "grad_norm": 0.22469882667064667, + "learning_rate": 1.9029206756571384e-05, + "loss": 1.2798, + "step": 5606 + }, + { + "epoch": 1.6700236415420986, + "grad_norm": 0.21861881017684937, + "learning_rate": 1.90287921369725e-05, + "loss": 1.2668, + "step": 5607 + }, + { + "epoch": 1.6703214877417674, + "grad_norm": 0.22377395629882812, + "learning_rate": 1.9028377433370787e-05, + "loss": 1.2768, + "step": 5608 + }, + { + "epoch": 1.6706193339414361, + "grad_norm": 0.22810766100883484, + "learning_rate": 1.9027962645770114e-05, + "loss": 1.2574, + "step": 5609 + }, + { + "epoch": 1.6709171801411047, + "grad_norm": 0.2153186798095703, + "learning_rate": 1.9027547774174337e-05, + "loss": 1.2804, + "step": 5610 + }, + { + "epoch": 1.6712150263407732, + "grad_norm": 0.21898439526557922, + "learning_rate": 1.9027132818587308e-05, + "loss": 1.2558, + "step": 5611 + }, + { + "epoch": 1.671512872540442, + "grad_norm": 0.21713757514953613, + "learning_rate": 1.9026717779012897e-05, + "loss": 1.2703, + "step": 5612 + }, + { + "epoch": 1.6718107187401106, + "grad_norm": 0.22190316021442413, + "learning_rate": 1.9026302655454962e-05, + "loss": 1.2768, + "step": 5613 + }, + { + "epoch": 1.672108564939779, + "grad_norm": 0.21604092419147491, + "learning_rate": 1.902588744791737e-05, + "loss": 1.2789, + "step": 5614 + }, + { + "epoch": 1.6724064111394479, + "grad_norm": 0.21985818445682526, + "learning_rate": 1.9025472156403976e-05, + "loss": 1.2885, + "step": 5615 + }, + { + "epoch": 1.6727042573391167, + "grad_norm": 0.22372137010097504, + "learning_rate": 1.9025056780918646e-05, + "loss": 1.2875, + "step": 5616 + }, + { + "epoch": 1.6730021035387852, + "grad_norm": 0.22135815024375916, + "learning_rate": 1.9024641321465246e-05, + "loss": 1.2765, + "step": 5617 + }, + { + "epoch": 1.6732999497384538, + "grad_norm": 0.22023016214370728, + "learning_rate": 1.902422577804764e-05, + "loss": 1.2767, + "step": 5618 + }, + { + "epoch": 1.6735977959381225, + "grad_norm": 0.24457132816314697, + "learning_rate": 1.9023810150669702e-05, + "loss": 1.2516, + "step": 5619 + }, + { + "epoch": 1.673895642137791, + "grad_norm": 0.22207579016685486, + "learning_rate": 1.9023394439335287e-05, + "loss": 1.2702, + "step": 5620 + }, + { + "epoch": 1.6741934883374596, + "grad_norm": 0.22879260778427124, + "learning_rate": 1.9022978644048275e-05, + "loss": 1.2526, + "step": 5621 + }, + { + "epoch": 1.6744913345371284, + "grad_norm": 0.22566340863704681, + "learning_rate": 1.902256276481252e-05, + "loss": 1.2749, + "step": 5622 + }, + { + "epoch": 1.6747891807367972, + "grad_norm": 0.2589960992336273, + "learning_rate": 1.90221468016319e-05, + "loss": 1.2624, + "step": 5623 + }, + { + "epoch": 1.6750870269364657, + "grad_norm": 0.24110759794712067, + "learning_rate": 1.9021730754510288e-05, + "loss": 1.2804, + "step": 5624 + }, + { + "epoch": 1.6753848731361343, + "grad_norm": 0.24634456634521484, + "learning_rate": 1.902131462345155e-05, + "loss": 1.2848, + "step": 5625 + }, + { + "epoch": 1.675682719335803, + "grad_norm": 0.23470130562782288, + "learning_rate": 1.902089840845956e-05, + "loss": 1.2949, + "step": 5626 + }, + { + "epoch": 1.6759805655354716, + "grad_norm": 0.32685351371765137, + "learning_rate": 1.902048210953818e-05, + "loss": 1.2794, + "step": 5627 + }, + { + "epoch": 1.6762784117351401, + "grad_norm": 0.23882338404655457, + "learning_rate": 1.9020065726691302e-05, + "loss": 1.275, + "step": 5628 + }, + { + "epoch": 1.676576257934809, + "grad_norm": 0.24273385107517242, + "learning_rate": 1.9019649259922787e-05, + "loss": 1.2843, + "step": 5629 + }, + { + "epoch": 1.6768741041344777, + "grad_norm": 0.23705027997493744, + "learning_rate": 1.9019232709236516e-05, + "loss": 1.2632, + "step": 5630 + }, + { + "epoch": 1.6771719503341462, + "grad_norm": 0.2242920696735382, + "learning_rate": 1.9018816074636358e-05, + "loss": 1.2862, + "step": 5631 + }, + { + "epoch": 1.6774697965338148, + "grad_norm": 0.23454338312149048, + "learning_rate": 1.9018399356126195e-05, + "loss": 1.2653, + "step": 5632 + }, + { + "epoch": 1.6777676427334836, + "grad_norm": 0.25819316506385803, + "learning_rate": 1.90179825537099e-05, + "loss": 1.2643, + "step": 5633 + }, + { + "epoch": 1.678065488933152, + "grad_norm": 0.2194097489118576, + "learning_rate": 1.901756566739135e-05, + "loss": 1.2764, + "step": 5634 + }, + { + "epoch": 1.6783633351328207, + "grad_norm": 0.23922835290431976, + "learning_rate": 1.901714869717443e-05, + "loss": 1.2685, + "step": 5635 + }, + { + "epoch": 1.6786611813324894, + "grad_norm": 0.24081382155418396, + "learning_rate": 1.9016731643063018e-05, + "loss": 1.2866, + "step": 5636 + }, + { + "epoch": 1.6789590275321582, + "grad_norm": 0.23041003942489624, + "learning_rate": 1.9016314505060987e-05, + "loss": 1.2781, + "step": 5637 + }, + { + "epoch": 1.6792568737318267, + "grad_norm": 0.21383997797966003, + "learning_rate": 1.9015897283172232e-05, + "loss": 1.262, + "step": 5638 + }, + { + "epoch": 1.6795547199314953, + "grad_norm": 0.2681867182254791, + "learning_rate": 1.901547997740062e-05, + "loss": 1.2789, + "step": 5639 + }, + { + "epoch": 1.679852566131164, + "grad_norm": 0.23258072137832642, + "learning_rate": 1.9015062587750036e-05, + "loss": 1.2826, + "step": 5640 + }, + { + "epoch": 1.6801504123308326, + "grad_norm": 0.23819170892238617, + "learning_rate": 1.9014645114224374e-05, + "loss": 1.2643, + "step": 5641 + }, + { + "epoch": 1.6804482585305012, + "grad_norm": 0.2216070592403412, + "learning_rate": 1.9014227556827505e-05, + "loss": 1.2629, + "step": 5642 + }, + { + "epoch": 1.68074610473017, + "grad_norm": 0.23336848616600037, + "learning_rate": 1.9013809915563322e-05, + "loss": 1.2682, + "step": 5643 + }, + { + "epoch": 1.6810439509298387, + "grad_norm": 0.24089714884757996, + "learning_rate": 1.901339219043571e-05, + "loss": 1.2665, + "step": 5644 + }, + { + "epoch": 1.6813417971295073, + "grad_norm": 0.2426704615354538, + "learning_rate": 1.9012974381448553e-05, + "loss": 1.2665, + "step": 5645 + }, + { + "epoch": 1.6816396433291758, + "grad_norm": 0.22386778891086578, + "learning_rate": 1.901255648860574e-05, + "loss": 1.2746, + "step": 5646 + }, + { + "epoch": 1.6819374895288446, + "grad_norm": 0.23168200254440308, + "learning_rate": 1.9012138511911156e-05, + "loss": 1.2739, + "step": 5647 + }, + { + "epoch": 1.6822353357285131, + "grad_norm": 0.2353355884552002, + "learning_rate": 1.9011720451368693e-05, + "loss": 1.2616, + "step": 5648 + }, + { + "epoch": 1.6825331819281817, + "grad_norm": 0.22925683856010437, + "learning_rate": 1.901130230698224e-05, + "loss": 1.2615, + "step": 5649 + }, + { + "epoch": 1.6828310281278505, + "grad_norm": 0.23452536761760712, + "learning_rate": 1.9010884078755688e-05, + "loss": 1.2678, + "step": 5650 + }, + { + "epoch": 1.6831288743275192, + "grad_norm": 0.22609202563762665, + "learning_rate": 1.9010465766692924e-05, + "loss": 1.281, + "step": 5651 + }, + { + "epoch": 1.6834267205271878, + "grad_norm": 0.22634583711624146, + "learning_rate": 1.901004737079785e-05, + "loss": 1.2736, + "step": 5652 + }, + { + "epoch": 1.6837245667268563, + "grad_norm": 0.22738151252269745, + "learning_rate": 1.9009628891074346e-05, + "loss": 1.286, + "step": 5653 + }, + { + "epoch": 1.684022412926525, + "grad_norm": 0.22332048416137695, + "learning_rate": 1.9009210327526314e-05, + "loss": 1.2824, + "step": 5654 + }, + { + "epoch": 1.6843202591261939, + "grad_norm": 0.2146749645471573, + "learning_rate": 1.9008791680157643e-05, + "loss": 1.2672, + "step": 5655 + }, + { + "epoch": 1.6846181053258622, + "grad_norm": 0.28092333674430847, + "learning_rate": 1.900837294897223e-05, + "loss": 1.2681, + "step": 5656 + }, + { + "epoch": 1.684915951525531, + "grad_norm": 0.21779432892799377, + "learning_rate": 1.9007954133973974e-05, + "loss": 1.2946, + "step": 5657 + }, + { + "epoch": 1.6852137977251997, + "grad_norm": 0.23540149629116058, + "learning_rate": 1.9007535235166768e-05, + "loss": 1.2891, + "step": 5658 + }, + { + "epoch": 1.6855116439248683, + "grad_norm": 0.23689579963684082, + "learning_rate": 1.900711625255451e-05, + "loss": 1.2635, + "step": 5659 + }, + { + "epoch": 1.6858094901245368, + "grad_norm": 0.2266598492860794, + "learning_rate": 1.90066971861411e-05, + "loss": 1.2671, + "step": 5660 + }, + { + "epoch": 1.6861073363242056, + "grad_norm": 0.21305103600025177, + "learning_rate": 1.9006278035930434e-05, + "loss": 1.2679, + "step": 5661 + }, + { + "epoch": 1.6864051825238744, + "grad_norm": 0.2813588082790375, + "learning_rate": 1.9005858801926416e-05, + "loss": 1.2576, + "step": 5662 + }, + { + "epoch": 1.6867030287235427, + "grad_norm": 0.21802647411823273, + "learning_rate": 1.9005439484132943e-05, + "loss": 1.2721, + "step": 5663 + }, + { + "epoch": 1.6870008749232115, + "grad_norm": 0.22629772126674652, + "learning_rate": 1.9005020082553915e-05, + "loss": 1.2528, + "step": 5664 + }, + { + "epoch": 1.6872987211228803, + "grad_norm": 0.2283496856689453, + "learning_rate": 1.900460059719324e-05, + "loss": 1.2587, + "step": 5665 + }, + { + "epoch": 1.6875965673225488, + "grad_norm": 0.22529184818267822, + "learning_rate": 1.9004181028054813e-05, + "loss": 1.2721, + "step": 5666 + }, + { + "epoch": 1.6878944135222174, + "grad_norm": 0.2219720184803009, + "learning_rate": 1.9003761375142543e-05, + "loss": 1.259, + "step": 5667 + }, + { + "epoch": 1.6881922597218861, + "grad_norm": 0.22435903549194336, + "learning_rate": 1.9003341638460334e-05, + "loss": 1.266, + "step": 5668 + }, + { + "epoch": 1.688490105921555, + "grad_norm": 0.23676352202892303, + "learning_rate": 1.900292181801209e-05, + "loss": 1.2651, + "step": 5669 + }, + { + "epoch": 1.6887879521212235, + "grad_norm": 0.21435624361038208, + "learning_rate": 1.900250191380171e-05, + "loss": 1.2782, + "step": 5670 + }, + { + "epoch": 1.689085798320892, + "grad_norm": 0.22388912737369537, + "learning_rate": 1.9002081925833117e-05, + "loss": 1.2797, + "step": 5671 + }, + { + "epoch": 1.6893836445205608, + "grad_norm": 0.23809774219989777, + "learning_rate": 1.9001661854110207e-05, + "loss": 1.279, + "step": 5672 + }, + { + "epoch": 1.6896814907202293, + "grad_norm": 0.22046956419944763, + "learning_rate": 1.900124169863689e-05, + "loss": 1.2641, + "step": 5673 + }, + { + "epoch": 1.6899793369198979, + "grad_norm": 0.22175170481204987, + "learning_rate": 1.9000821459417074e-05, + "loss": 1.2809, + "step": 5674 + }, + { + "epoch": 1.6902771831195667, + "grad_norm": 0.25351229310035706, + "learning_rate": 1.9000401136454675e-05, + "loss": 1.2721, + "step": 5675 + }, + { + "epoch": 1.6905750293192354, + "grad_norm": 0.21347540616989136, + "learning_rate": 1.89999807297536e-05, + "loss": 1.2853, + "step": 5676 + }, + { + "epoch": 1.690872875518904, + "grad_norm": 0.22976148128509521, + "learning_rate": 1.8999560239317752e-05, + "loss": 1.2745, + "step": 5677 + }, + { + "epoch": 1.6911707217185725, + "grad_norm": 0.22690346837043762, + "learning_rate": 1.8999139665151056e-05, + "loss": 1.2798, + "step": 5678 + }, + { + "epoch": 1.6914685679182413, + "grad_norm": 0.2327873408794403, + "learning_rate": 1.899871900725742e-05, + "loss": 1.2544, + "step": 5679 + }, + { + "epoch": 1.6917664141179098, + "grad_norm": 0.22844178974628448, + "learning_rate": 1.899829826564076e-05, + "loss": 1.2614, + "step": 5680 + }, + { + "epoch": 1.6920642603175784, + "grad_norm": 0.21966998279094696, + "learning_rate": 1.8997877440304982e-05, + "loss": 1.2887, + "step": 5681 + }, + { + "epoch": 1.6923621065172472, + "grad_norm": 0.2034863829612732, + "learning_rate": 1.899745653125401e-05, + "loss": 1.2698, + "step": 5682 + }, + { + "epoch": 1.692659952716916, + "grad_norm": 0.21531030535697937, + "learning_rate": 1.899703553849176e-05, + "loss": 1.2692, + "step": 5683 + }, + { + "epoch": 1.6929577989165845, + "grad_norm": 0.22363252937793732, + "learning_rate": 1.8996614462022146e-05, + "loss": 1.2721, + "step": 5684 + }, + { + "epoch": 1.693255645116253, + "grad_norm": 0.22647421061992645, + "learning_rate": 1.899619330184908e-05, + "loss": 1.2728, + "step": 5685 + }, + { + "epoch": 1.6935534913159218, + "grad_norm": 0.22268177568912506, + "learning_rate": 1.899577205797649e-05, + "loss": 1.2978, + "step": 5686 + }, + { + "epoch": 1.6938513375155904, + "grad_norm": 0.22444170713424683, + "learning_rate": 1.8995350730408295e-05, + "loss": 1.2535, + "step": 5687 + }, + { + "epoch": 1.694149183715259, + "grad_norm": 0.2262217104434967, + "learning_rate": 1.899492931914841e-05, + "loss": 1.2644, + "step": 5688 + }, + { + "epoch": 1.6944470299149277, + "grad_norm": 0.22429290413856506, + "learning_rate": 1.8994507824200756e-05, + "loss": 1.2883, + "step": 5689 + }, + { + "epoch": 1.6947448761145965, + "grad_norm": 0.22514869272708893, + "learning_rate": 1.8994086245569258e-05, + "loss": 1.2642, + "step": 5690 + }, + { + "epoch": 1.695042722314265, + "grad_norm": 0.24482673406600952, + "learning_rate": 1.8993664583257833e-05, + "loss": 1.2783, + "step": 5691 + }, + { + "epoch": 1.6953405685139336, + "grad_norm": 0.24208861589431763, + "learning_rate": 1.899324283727041e-05, + "loss": 1.2807, + "step": 5692 + }, + { + "epoch": 1.6956384147136023, + "grad_norm": 0.23001578450202942, + "learning_rate": 1.899282100761091e-05, + "loss": 1.2644, + "step": 5693 + }, + { + "epoch": 1.6959362609132709, + "grad_norm": 0.21940132975578308, + "learning_rate": 1.899239909428326e-05, + "loss": 1.2892, + "step": 5694 + }, + { + "epoch": 1.6962341071129394, + "grad_norm": 0.22105544805526733, + "learning_rate": 1.8991977097291377e-05, + "loss": 1.2838, + "step": 5695 + }, + { + "epoch": 1.6965319533126082, + "grad_norm": 0.21267855167388916, + "learning_rate": 1.89915550166392e-05, + "loss": 1.2605, + "step": 5696 + }, + { + "epoch": 1.696829799512277, + "grad_norm": 0.23278282582759857, + "learning_rate": 1.8991132852330643e-05, + "loss": 1.2889, + "step": 5697 + }, + { + "epoch": 1.6971276457119455, + "grad_norm": 0.22636377811431885, + "learning_rate": 1.8990710604369648e-05, + "loss": 1.2899, + "step": 5698 + }, + { + "epoch": 1.697425491911614, + "grad_norm": 0.22682049870491028, + "learning_rate": 1.899028827276013e-05, + "loss": 1.2736, + "step": 5699 + }, + { + "epoch": 1.6977233381112828, + "grad_norm": 0.22536416351795197, + "learning_rate": 1.8989865857506025e-05, + "loss": 1.2765, + "step": 5700 + }, + { + "epoch": 1.6980211843109514, + "grad_norm": 0.23606960475444794, + "learning_rate": 1.8989443358611265e-05, + "loss": 1.2651, + "step": 5701 + }, + { + "epoch": 1.69831903051062, + "grad_norm": 0.25915801525115967, + "learning_rate": 1.8989020776079778e-05, + "loss": 1.2807, + "step": 5702 + }, + { + "epoch": 1.6986168767102887, + "grad_norm": 0.24296355247497559, + "learning_rate": 1.898859810991549e-05, + "loss": 1.2767, + "step": 5703 + }, + { + "epoch": 1.6989147229099575, + "grad_norm": 0.21679812669754028, + "learning_rate": 1.8988175360122346e-05, + "loss": 1.2886, + "step": 5704 + }, + { + "epoch": 1.699212569109626, + "grad_norm": 0.22106091678142548, + "learning_rate": 1.8987752526704267e-05, + "loss": 1.2668, + "step": 5705 + }, + { + "epoch": 1.6995104153092946, + "grad_norm": 0.21959993243217468, + "learning_rate": 1.8987329609665197e-05, + "loss": 1.2683, + "step": 5706 + }, + { + "epoch": 1.6998082615089634, + "grad_norm": 0.23039567470550537, + "learning_rate": 1.8986906609009065e-05, + "loss": 1.2553, + "step": 5707 + }, + { + "epoch": 1.700106107708632, + "grad_norm": 0.2438964694738388, + "learning_rate": 1.8986483524739806e-05, + "loss": 1.2739, + "step": 5708 + }, + { + "epoch": 1.7004039539083005, + "grad_norm": 0.21674993634223938, + "learning_rate": 1.898606035686136e-05, + "loss": 1.2762, + "step": 5709 + }, + { + "epoch": 1.7007018001079692, + "grad_norm": 0.31936731934547424, + "learning_rate": 1.8985637105377663e-05, + "loss": 1.2676, + "step": 5710 + }, + { + "epoch": 1.700999646307638, + "grad_norm": 0.2805282771587372, + "learning_rate": 1.898521377029265e-05, + "loss": 1.2837, + "step": 5711 + }, + { + "epoch": 1.7012974925073066, + "grad_norm": 0.25507208704948425, + "learning_rate": 1.8984790351610262e-05, + "loss": 1.2639, + "step": 5712 + }, + { + "epoch": 1.701595338706975, + "grad_norm": 0.22662752866744995, + "learning_rate": 1.898436684933444e-05, + "loss": 1.2698, + "step": 5713 + }, + { + "epoch": 1.7018931849066439, + "grad_norm": 0.24285344779491425, + "learning_rate": 1.898394326346912e-05, + "loss": 1.2742, + "step": 5714 + }, + { + "epoch": 1.7021910311063124, + "grad_norm": 0.23638954758644104, + "learning_rate": 1.8983519594018247e-05, + "loss": 1.2848, + "step": 5715 + }, + { + "epoch": 1.702488877305981, + "grad_norm": 0.22729748487472534, + "learning_rate": 1.898309584098576e-05, + "loss": 1.2802, + "step": 5716 + }, + { + "epoch": 1.7027867235056497, + "grad_norm": 0.21747556328773499, + "learning_rate": 1.8982672004375603e-05, + "loss": 1.2615, + "step": 5717 + }, + { + "epoch": 1.7030845697053185, + "grad_norm": 0.22167259454727173, + "learning_rate": 1.898224808419172e-05, + "loss": 1.2657, + "step": 5718 + }, + { + "epoch": 1.703382415904987, + "grad_norm": 0.22318674623966217, + "learning_rate": 1.8981824080438054e-05, + "loss": 1.272, + "step": 5719 + }, + { + "epoch": 1.7036802621046556, + "grad_norm": 0.21981275081634521, + "learning_rate": 1.898139999311855e-05, + "loss": 1.2678, + "step": 5720 + }, + { + "epoch": 1.7039781083043244, + "grad_norm": 0.2351217269897461, + "learning_rate": 1.898097582223715e-05, + "loss": 1.2922, + "step": 5721 + }, + { + "epoch": 1.7042759545039932, + "grad_norm": 0.22308693826198578, + "learning_rate": 1.8980551567797808e-05, + "loss": 1.2752, + "step": 5722 + }, + { + "epoch": 1.7045738007036615, + "grad_norm": 0.22976164519786835, + "learning_rate": 1.898012722980447e-05, + "loss": 1.2582, + "step": 5723 + }, + { + "epoch": 1.7048716469033303, + "grad_norm": 0.2237618863582611, + "learning_rate": 1.8979702808261077e-05, + "loss": 1.2789, + "step": 5724 + }, + { + "epoch": 1.705169493102999, + "grad_norm": 0.21517914533615112, + "learning_rate": 1.8979278303171584e-05, + "loss": 1.2639, + "step": 5725 + }, + { + "epoch": 1.7054673393026676, + "grad_norm": 0.21843688189983368, + "learning_rate": 1.8978853714539935e-05, + "loss": 1.2656, + "step": 5726 + }, + { + "epoch": 1.7057651855023361, + "grad_norm": 0.2246447652578354, + "learning_rate": 1.897842904237009e-05, + "loss": 1.2754, + "step": 5727 + }, + { + "epoch": 1.706063031702005, + "grad_norm": 0.22413119673728943, + "learning_rate": 1.8978004286665986e-05, + "loss": 1.2816, + "step": 5728 + }, + { + "epoch": 1.7063608779016737, + "grad_norm": 0.22636112570762634, + "learning_rate": 1.897757944743159e-05, + "loss": 1.273, + "step": 5729 + }, + { + "epoch": 1.706658724101342, + "grad_norm": 0.2248656004667282, + "learning_rate": 1.8977154524670845e-05, + "loss": 1.281, + "step": 5730 + }, + { + "epoch": 1.7069565703010108, + "grad_norm": 0.2276724874973297, + "learning_rate": 1.8976729518387708e-05, + "loss": 1.2885, + "step": 5731 + }, + { + "epoch": 1.7072544165006796, + "grad_norm": 0.24606406688690186, + "learning_rate": 1.897630442858613e-05, + "loss": 1.286, + "step": 5732 + }, + { + "epoch": 1.707552262700348, + "grad_norm": 0.2130737155675888, + "learning_rate": 1.897587925527007e-05, + "loss": 1.2562, + "step": 5733 + }, + { + "epoch": 1.7078501089000167, + "grad_norm": 0.21556521952152252, + "learning_rate": 1.897545399844348e-05, + "loss": 1.2737, + "step": 5734 + }, + { + "epoch": 1.7081479550996854, + "grad_norm": 0.2331210970878601, + "learning_rate": 1.8975028658110323e-05, + "loss": 1.2682, + "step": 5735 + }, + { + "epoch": 1.7084458012993542, + "grad_norm": 0.2184014916419983, + "learning_rate": 1.897460323427455e-05, + "loss": 1.2624, + "step": 5736 + }, + { + "epoch": 1.7087436474990227, + "grad_norm": 0.22465340793132782, + "learning_rate": 1.8974177726940117e-05, + "loss": 1.2729, + "step": 5737 + }, + { + "epoch": 1.7090414936986913, + "grad_norm": 0.22078350186347961, + "learning_rate": 1.897375213611099e-05, + "loss": 1.2585, + "step": 5738 + }, + { + "epoch": 1.70933933989836, + "grad_norm": 0.2483685463666916, + "learning_rate": 1.8973326461791126e-05, + "loss": 1.2659, + "step": 5739 + }, + { + "epoch": 1.7096371860980286, + "grad_norm": 0.21920223534107208, + "learning_rate": 1.8972900703984484e-05, + "loss": 1.2585, + "step": 5740 + }, + { + "epoch": 1.7099350322976972, + "grad_norm": 0.22167551517486572, + "learning_rate": 1.8972474862695024e-05, + "loss": 1.2615, + "step": 5741 + }, + { + "epoch": 1.710232878497366, + "grad_norm": 0.22482536733150482, + "learning_rate": 1.8972048937926713e-05, + "loss": 1.265, + "step": 5742 + }, + { + "epoch": 1.7105307246970347, + "grad_norm": 0.21366822719573975, + "learning_rate": 1.897162292968351e-05, + "loss": 1.2745, + "step": 5743 + }, + { + "epoch": 1.7108285708967033, + "grad_norm": 0.24560382962226868, + "learning_rate": 1.8971196837969378e-05, + "loss": 1.2714, + "step": 5744 + }, + { + "epoch": 1.7111264170963718, + "grad_norm": 0.22219218313694, + "learning_rate": 1.8970770662788285e-05, + "loss": 1.2911, + "step": 5745 + }, + { + "epoch": 1.7114242632960406, + "grad_norm": 0.22853435575962067, + "learning_rate": 1.897034440414419e-05, + "loss": 1.2725, + "step": 5746 + }, + { + "epoch": 1.7117221094957091, + "grad_norm": 0.21046991646289825, + "learning_rate": 1.896991806204107e-05, + "loss": 1.2789, + "step": 5747 + }, + { + "epoch": 1.7120199556953777, + "grad_norm": 0.2427307814359665, + "learning_rate": 1.8969491636482878e-05, + "loss": 1.2907, + "step": 5748 + }, + { + "epoch": 1.7123178018950465, + "grad_norm": 0.2283223271369934, + "learning_rate": 1.896906512747359e-05, + "loss": 1.2799, + "step": 5749 + }, + { + "epoch": 1.7126156480947152, + "grad_norm": 0.2287558764219284, + "learning_rate": 1.8968638535017167e-05, + "loss": 1.2705, + "step": 5750 + }, + { + "epoch": 1.7129134942943838, + "grad_norm": 0.22238564491271973, + "learning_rate": 1.8968211859117586e-05, + "loss": 1.2601, + "step": 5751 + }, + { + "epoch": 1.7132113404940523, + "grad_norm": 0.23806633055210114, + "learning_rate": 1.8967785099778818e-05, + "loss": 1.2689, + "step": 5752 + }, + { + "epoch": 1.713509186693721, + "grad_norm": 0.2507690489292145, + "learning_rate": 1.8967358257004825e-05, + "loss": 1.2793, + "step": 5753 + }, + { + "epoch": 1.7138070328933896, + "grad_norm": 0.23303022980690002, + "learning_rate": 1.896693133079958e-05, + "loss": 1.2722, + "step": 5754 + }, + { + "epoch": 1.7141048790930582, + "grad_norm": 0.2564085125923157, + "learning_rate": 1.896650432116706e-05, + "loss": 1.2659, + "step": 5755 + }, + { + "epoch": 1.714402725292727, + "grad_norm": 0.23091602325439453, + "learning_rate": 1.8966077228111236e-05, + "loss": 1.2664, + "step": 5756 + }, + { + "epoch": 1.7147005714923957, + "grad_norm": 0.23390372097492218, + "learning_rate": 1.896565005163608e-05, + "loss": 1.2728, + "step": 5757 + }, + { + "epoch": 1.7149984176920643, + "grad_norm": 0.21289603412151337, + "learning_rate": 1.8965222791745568e-05, + "loss": 1.2745, + "step": 5758 + }, + { + "epoch": 1.7152962638917328, + "grad_norm": 0.22639428079128265, + "learning_rate": 1.8964795448443672e-05, + "loss": 1.2932, + "step": 5759 + }, + { + "epoch": 1.7155941100914016, + "grad_norm": 0.2241027057170868, + "learning_rate": 1.896436802173437e-05, + "loss": 1.2693, + "step": 5760 + }, + { + "epoch": 1.7158919562910702, + "grad_norm": 0.2254457324743271, + "learning_rate": 1.8963940511621645e-05, + "loss": 1.2649, + "step": 5761 + }, + { + "epoch": 1.7161898024907387, + "grad_norm": 0.2425682544708252, + "learning_rate": 1.8963512918109462e-05, + "loss": 1.2729, + "step": 5762 + }, + { + "epoch": 1.7164876486904075, + "grad_norm": 0.25813499093055725, + "learning_rate": 1.896308524120181e-05, + "loss": 1.2625, + "step": 5763 + }, + { + "epoch": 1.7167854948900763, + "grad_norm": 0.222854882478714, + "learning_rate": 1.8962657480902664e-05, + "loss": 1.277, + "step": 5764 + }, + { + "epoch": 1.7170833410897448, + "grad_norm": 0.25514060258865356, + "learning_rate": 1.8962229637216004e-05, + "loss": 1.2709, + "step": 5765 + }, + { + "epoch": 1.7173811872894134, + "grad_norm": 0.2230352759361267, + "learning_rate": 1.8961801710145807e-05, + "loss": 1.2725, + "step": 5766 + }, + { + "epoch": 1.7176790334890821, + "grad_norm": 0.24023233354091644, + "learning_rate": 1.8961373699696063e-05, + "loss": 1.2764, + "step": 5767 + }, + { + "epoch": 1.7179768796887507, + "grad_norm": 0.2421426922082901, + "learning_rate": 1.8960945605870744e-05, + "loss": 1.2625, + "step": 5768 + }, + { + "epoch": 1.7182747258884192, + "grad_norm": 0.22517035901546478, + "learning_rate": 1.8960517428673843e-05, + "loss": 1.2519, + "step": 5769 + }, + { + "epoch": 1.718572572088088, + "grad_norm": 0.22730892896652222, + "learning_rate": 1.8960089168109333e-05, + "loss": 1.2722, + "step": 5770 + }, + { + "epoch": 1.7188704182877568, + "grad_norm": 0.21366451680660248, + "learning_rate": 1.8959660824181206e-05, + "loss": 1.2722, + "step": 5771 + }, + { + "epoch": 1.7191682644874253, + "grad_norm": 0.22521375119686127, + "learning_rate": 1.8959232396893446e-05, + "loss": 1.267, + "step": 5772 + }, + { + "epoch": 1.7194661106870939, + "grad_norm": 0.21800497174263, + "learning_rate": 1.8958803886250038e-05, + "loss": 1.268, + "step": 5773 + }, + { + "epoch": 1.7197639568867626, + "grad_norm": 0.23374035954475403, + "learning_rate": 1.895837529225497e-05, + "loss": 1.2784, + "step": 5774 + }, + { + "epoch": 1.7200618030864312, + "grad_norm": 0.22994539141654968, + "learning_rate": 1.8957946614912227e-05, + "loss": 1.2686, + "step": 5775 + }, + { + "epoch": 1.7203596492860997, + "grad_norm": 0.22978772222995758, + "learning_rate": 1.89575178542258e-05, + "loss": 1.2733, + "step": 5776 + }, + { + "epoch": 1.7206574954857685, + "grad_norm": 0.23379528522491455, + "learning_rate": 1.8957089010199678e-05, + "loss": 1.2754, + "step": 5777 + }, + { + "epoch": 1.7209553416854373, + "grad_norm": 0.24608808755874634, + "learning_rate": 1.895666008283785e-05, + "loss": 1.2859, + "step": 5778 + }, + { + "epoch": 1.7212531878851058, + "grad_norm": 0.22173528373241425, + "learning_rate": 1.8956231072144308e-05, + "loss": 1.2824, + "step": 5779 + }, + { + "epoch": 1.7215510340847744, + "grad_norm": 0.26645976305007935, + "learning_rate": 1.8955801978123037e-05, + "loss": 1.278, + "step": 5780 + }, + { + "epoch": 1.7218488802844432, + "grad_norm": 0.26630792021751404, + "learning_rate": 1.895537280077804e-05, + "loss": 1.2726, + "step": 5781 + }, + { + "epoch": 1.7221467264841117, + "grad_norm": 0.2354707568883896, + "learning_rate": 1.8954943540113305e-05, + "loss": 1.266, + "step": 5782 + }, + { + "epoch": 1.7224445726837803, + "grad_norm": 0.2792515754699707, + "learning_rate": 1.8954514196132825e-05, + "loss": 1.2721, + "step": 5783 + }, + { + "epoch": 1.722742418883449, + "grad_norm": 0.2280869483947754, + "learning_rate": 1.8954084768840593e-05, + "loss": 1.2803, + "step": 5784 + }, + { + "epoch": 1.7230402650831178, + "grad_norm": 0.2183830738067627, + "learning_rate": 1.8953655258240608e-05, + "loss": 1.281, + "step": 5785 + }, + { + "epoch": 1.7233381112827864, + "grad_norm": 0.21691355109214783, + "learning_rate": 1.895322566433686e-05, + "loss": 1.2719, + "step": 5786 + }, + { + "epoch": 1.723635957482455, + "grad_norm": 0.22357288002967834, + "learning_rate": 1.8952795987133354e-05, + "loss": 1.267, + "step": 5787 + }, + { + "epoch": 1.7239338036821237, + "grad_norm": 0.23809656500816345, + "learning_rate": 1.8952366226634086e-05, + "loss": 1.2514, + "step": 5788 + }, + { + "epoch": 1.7242316498817924, + "grad_norm": 0.22516104578971863, + "learning_rate": 1.8951936382843052e-05, + "loss": 1.291, + "step": 5789 + }, + { + "epoch": 1.7245294960814608, + "grad_norm": 0.21597300469875336, + "learning_rate": 1.8951506455764247e-05, + "loss": 1.2602, + "step": 5790 + }, + { + "epoch": 1.7248273422811295, + "grad_norm": 0.2319001704454422, + "learning_rate": 1.895107644540168e-05, + "loss": 1.2872, + "step": 5791 + }, + { + "epoch": 1.7251251884807983, + "grad_norm": 0.23265139758586884, + "learning_rate": 1.8950646351759346e-05, + "loss": 1.2777, + "step": 5792 + }, + { + "epoch": 1.7254230346804669, + "grad_norm": 0.22303816676139832, + "learning_rate": 1.895021617484125e-05, + "loss": 1.2735, + "step": 5793 + }, + { + "epoch": 1.7257208808801354, + "grad_norm": 0.22523610293865204, + "learning_rate": 1.894978591465139e-05, + "loss": 1.2815, + "step": 5794 + }, + { + "epoch": 1.7260187270798042, + "grad_norm": 0.3229084014892578, + "learning_rate": 1.894935557119377e-05, + "loss": 1.258, + "step": 5795 + }, + { + "epoch": 1.726316573279473, + "grad_norm": 0.28496530652046204, + "learning_rate": 1.8948925144472395e-05, + "loss": 1.28, + "step": 5796 + }, + { + "epoch": 1.7266144194791413, + "grad_norm": 0.2660283148288727, + "learning_rate": 1.8948494634491273e-05, + "loss": 1.282, + "step": 5797 + }, + { + "epoch": 1.72691226567881, + "grad_norm": 0.22563010454177856, + "learning_rate": 1.8948064041254405e-05, + "loss": 1.2706, + "step": 5798 + }, + { + "epoch": 1.7272101118784788, + "grad_norm": 0.2682509422302246, + "learning_rate": 1.89476333647658e-05, + "loss": 1.263, + "step": 5799 + }, + { + "epoch": 1.7275079580781474, + "grad_norm": 0.27108532190322876, + "learning_rate": 1.894720260502946e-05, + "loss": 1.2838, + "step": 5800 + }, + { + "epoch": 1.727805804277816, + "grad_norm": 0.24455232918262482, + "learning_rate": 1.89467717620494e-05, + "loss": 1.2903, + "step": 5801 + }, + { + "epoch": 1.7281036504774847, + "grad_norm": 0.22039395570755005, + "learning_rate": 1.894634083582962e-05, + "loss": 1.2701, + "step": 5802 + }, + { + "epoch": 1.7284014966771535, + "grad_norm": 0.22624199092388153, + "learning_rate": 1.8945909826374134e-05, + "loss": 1.2706, + "step": 5803 + }, + { + "epoch": 1.728699342876822, + "grad_norm": 0.23424167931079865, + "learning_rate": 1.8945478733686956e-05, + "loss": 1.276, + "step": 5804 + }, + { + "epoch": 1.7289971890764906, + "grad_norm": 0.22864419221878052, + "learning_rate": 1.8945047557772095e-05, + "loss": 1.2696, + "step": 5805 + }, + { + "epoch": 1.7292950352761594, + "grad_norm": 0.22675685584545135, + "learning_rate": 1.8944616298633556e-05, + "loss": 1.2791, + "step": 5806 + }, + { + "epoch": 1.729592881475828, + "grad_norm": 0.22480760514736176, + "learning_rate": 1.8944184956275356e-05, + "loss": 1.2534, + "step": 5807 + }, + { + "epoch": 1.7298907276754965, + "grad_norm": 0.30962008237838745, + "learning_rate": 1.8943753530701508e-05, + "loss": 1.2785, + "step": 5808 + }, + { + "epoch": 1.7301885738751652, + "grad_norm": 0.24147537350654602, + "learning_rate": 1.8943322021916028e-05, + "loss": 1.2602, + "step": 5809 + }, + { + "epoch": 1.730486420074834, + "grad_norm": 0.22445976734161377, + "learning_rate": 1.8942890429922927e-05, + "loss": 1.2734, + "step": 5810 + }, + { + "epoch": 1.7307842662745025, + "grad_norm": 0.21948538720607758, + "learning_rate": 1.8942458754726223e-05, + "loss": 1.2628, + "step": 5811 + }, + { + "epoch": 1.731082112474171, + "grad_norm": 0.2405489981174469, + "learning_rate": 1.894202699632993e-05, + "loss": 1.2878, + "step": 5812 + }, + { + "epoch": 1.7313799586738399, + "grad_norm": 0.22903725504875183, + "learning_rate": 1.8941595154738067e-05, + "loss": 1.2749, + "step": 5813 + }, + { + "epoch": 1.7316778048735084, + "grad_norm": 0.22374002635478973, + "learning_rate": 1.8941163229954652e-05, + "loss": 1.2691, + "step": 5814 + }, + { + "epoch": 1.731975651073177, + "grad_norm": 0.2340724915266037, + "learning_rate": 1.89407312219837e-05, + "loss": 1.2703, + "step": 5815 + }, + { + "epoch": 1.7322734972728457, + "grad_norm": 0.21973590552806854, + "learning_rate": 1.894029913082924e-05, + "loss": 1.2837, + "step": 5816 + }, + { + "epoch": 1.7325713434725145, + "grad_norm": 0.23547907173633575, + "learning_rate": 1.8939866956495278e-05, + "loss": 1.2656, + "step": 5817 + }, + { + "epoch": 1.732869189672183, + "grad_norm": 0.22913868725299835, + "learning_rate": 1.8939434698985842e-05, + "loss": 1.268, + "step": 5818 + }, + { + "epoch": 1.7331670358718516, + "grad_norm": 0.23191285133361816, + "learning_rate": 1.893900235830496e-05, + "loss": 1.2481, + "step": 5819 + }, + { + "epoch": 1.7334648820715204, + "grad_norm": 0.24176964163780212, + "learning_rate": 1.8938569934456645e-05, + "loss": 1.2914, + "step": 5820 + }, + { + "epoch": 1.733762728271189, + "grad_norm": 0.22274360060691833, + "learning_rate": 1.8938137427444926e-05, + "loss": 1.2792, + "step": 5821 + }, + { + "epoch": 1.7340605744708575, + "grad_norm": 0.24947451055049896, + "learning_rate": 1.8937704837273817e-05, + "loss": 1.2547, + "step": 5822 + }, + { + "epoch": 1.7343584206705263, + "grad_norm": 0.22832182049751282, + "learning_rate": 1.8937272163947356e-05, + "loss": 1.2718, + "step": 5823 + }, + { + "epoch": 1.734656266870195, + "grad_norm": 0.22086654603481293, + "learning_rate": 1.893683940746956e-05, + "loss": 1.2781, + "step": 5824 + }, + { + "epoch": 1.7349541130698636, + "grad_norm": 0.23569034039974213, + "learning_rate": 1.8936406567844464e-05, + "loss": 1.2669, + "step": 5825 + }, + { + "epoch": 1.7352519592695321, + "grad_norm": 0.2154817283153534, + "learning_rate": 1.8935973645076087e-05, + "loss": 1.2679, + "step": 5826 + }, + { + "epoch": 1.735549805469201, + "grad_norm": 0.24194398522377014, + "learning_rate": 1.8935540639168458e-05, + "loss": 1.2664, + "step": 5827 + }, + { + "epoch": 1.7358476516688695, + "grad_norm": 0.23178546130657196, + "learning_rate": 1.8935107550125606e-05, + "loss": 1.2703, + "step": 5828 + }, + { + "epoch": 1.736145497868538, + "grad_norm": 0.22296249866485596, + "learning_rate": 1.8934674377951562e-05, + "loss": 1.2711, + "step": 5829 + }, + { + "epoch": 1.7364433440682068, + "grad_norm": 0.23277190327644348, + "learning_rate": 1.8934241122650355e-05, + "loss": 1.2765, + "step": 5830 + }, + { + "epoch": 1.7367411902678755, + "grad_norm": 0.22290737926959991, + "learning_rate": 1.8933807784226014e-05, + "loss": 1.2785, + "step": 5831 + }, + { + "epoch": 1.737039036467544, + "grad_norm": 0.21856017410755157, + "learning_rate": 1.8933374362682574e-05, + "loss": 1.2878, + "step": 5832 + }, + { + "epoch": 1.7373368826672126, + "grad_norm": 0.23294885456562042, + "learning_rate": 1.8932940858024073e-05, + "loss": 1.2905, + "step": 5833 + }, + { + "epoch": 1.7376347288668814, + "grad_norm": 0.22651009261608124, + "learning_rate": 1.893250727025453e-05, + "loss": 1.2739, + "step": 5834 + }, + { + "epoch": 1.73793257506655, + "grad_norm": 0.23007501661777496, + "learning_rate": 1.893207359937799e-05, + "loss": 1.2755, + "step": 5835 + }, + { + "epoch": 1.7382304212662185, + "grad_norm": 0.22142191231250763, + "learning_rate": 1.8931639845398486e-05, + "loss": 1.2853, + "step": 5836 + }, + { + "epoch": 1.7385282674658873, + "grad_norm": 0.21657319366931915, + "learning_rate": 1.8931206008320048e-05, + "loss": 1.2577, + "step": 5837 + }, + { + "epoch": 1.738826113665556, + "grad_norm": 0.2249903827905655, + "learning_rate": 1.893077208814672e-05, + "loss": 1.2808, + "step": 5838 + }, + { + "epoch": 1.7391239598652246, + "grad_norm": 0.22896753251552582, + "learning_rate": 1.8930338084882535e-05, + "loss": 1.2823, + "step": 5839 + }, + { + "epoch": 1.7394218060648932, + "grad_norm": 0.23422539234161377, + "learning_rate": 1.8929903998531527e-05, + "loss": 1.2683, + "step": 5840 + }, + { + "epoch": 1.739719652264562, + "grad_norm": 0.24125337600708008, + "learning_rate": 1.8929469829097744e-05, + "loss": 1.2699, + "step": 5841 + }, + { + "epoch": 1.7400174984642305, + "grad_norm": 0.2250746190547943, + "learning_rate": 1.892903557658522e-05, + "loss": 1.2704, + "step": 5842 + }, + { + "epoch": 1.740315344663899, + "grad_norm": 0.23315951228141785, + "learning_rate": 1.8928601240997997e-05, + "loss": 1.2914, + "step": 5843 + }, + { + "epoch": 1.7406131908635678, + "grad_norm": 0.22729183733463287, + "learning_rate": 1.8928166822340114e-05, + "loss": 1.2746, + "step": 5844 + }, + { + "epoch": 1.7409110370632366, + "grad_norm": 0.21855691075325012, + "learning_rate": 1.8927732320615617e-05, + "loss": 1.2745, + "step": 5845 + }, + { + "epoch": 1.7412088832629051, + "grad_norm": 0.21247674524784088, + "learning_rate": 1.8927297735828542e-05, + "loss": 1.2718, + "step": 5846 + }, + { + "epoch": 1.7415067294625737, + "grad_norm": 0.2240409553050995, + "learning_rate": 1.8926863067982938e-05, + "loss": 1.2597, + "step": 5847 + }, + { + "epoch": 1.7418045756622424, + "grad_norm": 0.21879000961780548, + "learning_rate": 1.8926428317082846e-05, + "loss": 1.2859, + "step": 5848 + }, + { + "epoch": 1.7421024218619112, + "grad_norm": 0.22439351677894592, + "learning_rate": 1.892599348313231e-05, + "loss": 1.2642, + "step": 5849 + }, + { + "epoch": 1.7424002680615795, + "grad_norm": 0.21980057656764984, + "learning_rate": 1.8925558566135377e-05, + "loss": 1.256, + "step": 5850 + }, + { + "epoch": 1.7426981142612483, + "grad_norm": 0.21809187531471252, + "learning_rate": 1.8925123566096095e-05, + "loss": 1.2719, + "step": 5851 + }, + { + "epoch": 1.742995960460917, + "grad_norm": 0.21508194506168365, + "learning_rate": 1.892468848301851e-05, + "loss": 1.2876, + "step": 5852 + }, + { + "epoch": 1.7432938066605856, + "grad_norm": 0.2727757692337036, + "learning_rate": 1.892425331690667e-05, + "loss": 1.2659, + "step": 5853 + }, + { + "epoch": 1.7435916528602542, + "grad_norm": 0.25025418400764465, + "learning_rate": 1.8923818067764624e-05, + "loss": 1.2769, + "step": 5854 + }, + { + "epoch": 1.743889499059923, + "grad_norm": 0.25716692209243774, + "learning_rate": 1.8923382735596422e-05, + "loss": 1.266, + "step": 5855 + }, + { + "epoch": 1.7441873452595917, + "grad_norm": 0.22011108696460724, + "learning_rate": 1.8922947320406113e-05, + "loss": 1.2651, + "step": 5856 + }, + { + "epoch": 1.74448519145926, + "grad_norm": 0.45085522532463074, + "learning_rate": 1.892251182219775e-05, + "loss": 1.275, + "step": 5857 + }, + { + "epoch": 1.7447830376589288, + "grad_norm": 0.21316717565059662, + "learning_rate": 1.892207624097538e-05, + "loss": 1.2488, + "step": 5858 + }, + { + "epoch": 1.7450808838585976, + "grad_norm": 0.22340159118175507, + "learning_rate": 1.8921640576743058e-05, + "loss": 1.2693, + "step": 5859 + }, + { + "epoch": 1.7453787300582662, + "grad_norm": 0.24735619127750397, + "learning_rate": 1.892120482950484e-05, + "loss": 1.2788, + "step": 5860 + }, + { + "epoch": 1.7456765762579347, + "grad_norm": 0.2385658472776413, + "learning_rate": 1.892076899926478e-05, + "loss": 1.2886, + "step": 5861 + }, + { + "epoch": 1.7459744224576035, + "grad_norm": 0.21086426079273224, + "learning_rate": 1.892033308602693e-05, + "loss": 1.2661, + "step": 5862 + }, + { + "epoch": 1.7462722686572723, + "grad_norm": 0.22353291511535645, + "learning_rate": 1.891989708979535e-05, + "loss": 1.2966, + "step": 5863 + }, + { + "epoch": 1.7465701148569406, + "grad_norm": 0.2436157613992691, + "learning_rate": 1.8919461010574086e-05, + "loss": 1.273, + "step": 5864 + }, + { + "epoch": 1.7468679610566094, + "grad_norm": 0.22707544267177582, + "learning_rate": 1.891902484836721e-05, + "loss": 1.266, + "step": 5865 + }, + { + "epoch": 1.7471658072562781, + "grad_norm": 0.21741141378879547, + "learning_rate": 1.8918588603178768e-05, + "loss": 1.277, + "step": 5866 + }, + { + "epoch": 1.7474636534559467, + "grad_norm": 0.22682414948940277, + "learning_rate": 1.8918152275012828e-05, + "loss": 1.2706, + "step": 5867 + }, + { + "epoch": 1.7477614996556152, + "grad_norm": 0.2459396868944168, + "learning_rate": 1.891771586387344e-05, + "loss": 1.2613, + "step": 5868 + }, + { + "epoch": 1.748059345855284, + "grad_norm": 0.23079337179660797, + "learning_rate": 1.8917279369764673e-05, + "loss": 1.2719, + "step": 5869 + }, + { + "epoch": 1.7483571920549528, + "grad_norm": 0.21382039785385132, + "learning_rate": 1.8916842792690583e-05, + "loss": 1.2711, + "step": 5870 + }, + { + "epoch": 1.7486550382546213, + "grad_norm": 0.21237541735172272, + "learning_rate": 1.8916406132655233e-05, + "loss": 1.2748, + "step": 5871 + }, + { + "epoch": 1.7489528844542899, + "grad_norm": 0.234297513961792, + "learning_rate": 1.8915969389662686e-05, + "loss": 1.2761, + "step": 5872 + }, + { + "epoch": 1.7492507306539586, + "grad_norm": 0.22806483507156372, + "learning_rate": 1.8915532563717008e-05, + "loss": 1.2742, + "step": 5873 + }, + { + "epoch": 1.7495485768536272, + "grad_norm": 0.22773486375808716, + "learning_rate": 1.8915095654822256e-05, + "loss": 1.2631, + "step": 5874 + }, + { + "epoch": 1.7498464230532957, + "grad_norm": 0.21805758774280548, + "learning_rate": 1.8914658662982502e-05, + "loss": 1.2566, + "step": 5875 + }, + { + "epoch": 1.7501442692529645, + "grad_norm": 0.22487327456474304, + "learning_rate": 1.891422158820181e-05, + "loss": 1.2862, + "step": 5876 + }, + { + "epoch": 1.7504421154526333, + "grad_norm": 0.24331773817539215, + "learning_rate": 1.8913784430484246e-05, + "loss": 1.296, + "step": 5877 + }, + { + "epoch": 1.7507399616523018, + "grad_norm": 0.243910551071167, + "learning_rate": 1.8913347189833876e-05, + "loss": 1.2776, + "step": 5878 + }, + { + "epoch": 1.7510378078519704, + "grad_norm": 0.21218135952949524, + "learning_rate": 1.891290986625477e-05, + "loss": 1.2821, + "step": 5879 + }, + { + "epoch": 1.7513356540516392, + "grad_norm": 0.2238103151321411, + "learning_rate": 1.8912472459750994e-05, + "loss": 1.2799, + "step": 5880 + }, + { + "epoch": 1.7516335002513077, + "grad_norm": 0.21185216307640076, + "learning_rate": 1.8912034970326617e-05, + "loss": 1.2613, + "step": 5881 + }, + { + "epoch": 1.7519313464509763, + "grad_norm": 0.22607696056365967, + "learning_rate": 1.8911597397985714e-05, + "loss": 1.2659, + "step": 5882 + }, + { + "epoch": 1.752229192650645, + "grad_norm": 0.22042007744312286, + "learning_rate": 1.8911159742732357e-05, + "loss": 1.2667, + "step": 5883 + }, + { + "epoch": 1.7525270388503138, + "grad_norm": 0.2211650311946869, + "learning_rate": 1.8910722004570612e-05, + "loss": 1.2725, + "step": 5884 + }, + { + "epoch": 1.7528248850499824, + "grad_norm": 0.21940304338932037, + "learning_rate": 1.8910284183504552e-05, + "loss": 1.2713, + "step": 5885 + }, + { + "epoch": 1.753122731249651, + "grad_norm": 0.23272915184497833, + "learning_rate": 1.8909846279538256e-05, + "loss": 1.274, + "step": 5886 + }, + { + "epoch": 1.7534205774493197, + "grad_norm": 0.216901957988739, + "learning_rate": 1.8909408292675797e-05, + "loss": 1.2692, + "step": 5887 + }, + { + "epoch": 1.7537184236489882, + "grad_norm": 0.23030175268650055, + "learning_rate": 1.8908970222921243e-05, + "loss": 1.2913, + "step": 5888 + }, + { + "epoch": 1.7540162698486568, + "grad_norm": 0.23435144126415253, + "learning_rate": 1.8908532070278677e-05, + "loss": 1.2575, + "step": 5889 + }, + { + "epoch": 1.7543141160483255, + "grad_norm": 0.22257715463638306, + "learning_rate": 1.8908093834752176e-05, + "loss": 1.2525, + "step": 5890 + }, + { + "epoch": 1.7546119622479943, + "grad_norm": 0.22075094282627106, + "learning_rate": 1.8907655516345808e-05, + "loss": 1.2593, + "step": 5891 + }, + { + "epoch": 1.7549098084476629, + "grad_norm": 0.22186720371246338, + "learning_rate": 1.8907217115063663e-05, + "loss": 1.2783, + "step": 5892 + }, + { + "epoch": 1.7552076546473314, + "grad_norm": 0.22689685225486755, + "learning_rate": 1.8906778630909814e-05, + "loss": 1.2973, + "step": 5893 + }, + { + "epoch": 1.7555055008470002, + "grad_norm": 0.21796102821826935, + "learning_rate": 1.890634006388834e-05, + "loss": 1.2591, + "step": 5894 + }, + { + "epoch": 1.7558033470466687, + "grad_norm": 0.2169775366783142, + "learning_rate": 1.8905901414003322e-05, + "loss": 1.2691, + "step": 5895 + }, + { + "epoch": 1.7561011932463373, + "grad_norm": 0.2183646261692047, + "learning_rate": 1.8905462681258845e-05, + "loss": 1.2689, + "step": 5896 + }, + { + "epoch": 1.756399039446006, + "grad_norm": 0.22654369473457336, + "learning_rate": 1.8905023865658984e-05, + "loss": 1.2911, + "step": 5897 + }, + { + "epoch": 1.7566968856456748, + "grad_norm": 0.22667363286018372, + "learning_rate": 1.8904584967207825e-05, + "loss": 1.2637, + "step": 5898 + }, + { + "epoch": 1.7569947318453434, + "grad_norm": 0.21776628494262695, + "learning_rate": 1.890414598590945e-05, + "loss": 1.2585, + "step": 5899 + }, + { + "epoch": 1.757292578045012, + "grad_norm": 0.2251240313053131, + "learning_rate": 1.890370692176795e-05, + "loss": 1.2642, + "step": 5900 + }, + { + "epoch": 1.7575904242446807, + "grad_norm": 0.2164892703294754, + "learning_rate": 1.8903267774787402e-05, + "loss": 1.2602, + "step": 5901 + }, + { + "epoch": 1.7578882704443493, + "grad_norm": 0.22920046746730804, + "learning_rate": 1.8902828544971896e-05, + "loss": 1.2565, + "step": 5902 + }, + { + "epoch": 1.7581861166440178, + "grad_norm": 0.20191632211208344, + "learning_rate": 1.8902389232325515e-05, + "loss": 1.2731, + "step": 5903 + }, + { + "epoch": 1.7584839628436866, + "grad_norm": 0.2176978886127472, + "learning_rate": 1.890194983685235e-05, + "loss": 1.2786, + "step": 5904 + }, + { + "epoch": 1.7587818090433553, + "grad_norm": 0.21307040750980377, + "learning_rate": 1.8901510358556486e-05, + "loss": 1.2699, + "step": 5905 + }, + { + "epoch": 1.759079655243024, + "grad_norm": 0.21998471021652222, + "learning_rate": 1.8901070797442017e-05, + "loss": 1.2714, + "step": 5906 + }, + { + "epoch": 1.7593775014426924, + "grad_norm": 0.2163301706314087, + "learning_rate": 1.8900631153513024e-05, + "loss": 1.27, + "step": 5907 + }, + { + "epoch": 1.7596753476423612, + "grad_norm": 0.2146802842617035, + "learning_rate": 1.8900191426773608e-05, + "loss": 1.2677, + "step": 5908 + }, + { + "epoch": 1.7599731938420298, + "grad_norm": 0.21443694829940796, + "learning_rate": 1.8899751617227854e-05, + "loss": 1.2646, + "step": 5909 + }, + { + "epoch": 1.7602710400416983, + "grad_norm": 0.231244295835495, + "learning_rate": 1.8899311724879853e-05, + "loss": 1.2732, + "step": 5910 + }, + { + "epoch": 1.760568886241367, + "grad_norm": 0.2077794224023819, + "learning_rate": 1.8898871749733697e-05, + "loss": 1.2414, + "step": 5911 + }, + { + "epoch": 1.7608667324410359, + "grad_norm": 0.2148183137178421, + "learning_rate": 1.8898431691793485e-05, + "loss": 1.2753, + "step": 5912 + }, + { + "epoch": 1.7611645786407044, + "grad_norm": 0.21195048093795776, + "learning_rate": 1.8897991551063304e-05, + "loss": 1.2664, + "step": 5913 + }, + { + "epoch": 1.761462424840373, + "grad_norm": 0.2143191397190094, + "learning_rate": 1.8897551327547257e-05, + "loss": 1.2615, + "step": 5914 + }, + { + "epoch": 1.7617602710400417, + "grad_norm": 0.220404714345932, + "learning_rate": 1.8897111021249433e-05, + "loss": 1.2815, + "step": 5915 + }, + { + "epoch": 1.7620581172397105, + "grad_norm": 0.22406984865665436, + "learning_rate": 1.8896670632173936e-05, + "loss": 1.2687, + "step": 5916 + }, + { + "epoch": 1.7623559634393788, + "grad_norm": 0.2176969051361084, + "learning_rate": 1.8896230160324854e-05, + "loss": 1.2726, + "step": 5917 + }, + { + "epoch": 1.7626538096390476, + "grad_norm": 0.21982218325138092, + "learning_rate": 1.889578960570629e-05, + "loss": 1.2676, + "step": 5918 + }, + { + "epoch": 1.7629516558387164, + "grad_norm": 0.2140255570411682, + "learning_rate": 1.8895348968322346e-05, + "loss": 1.2717, + "step": 5919 + }, + { + "epoch": 1.763249502038385, + "grad_norm": 0.21119576692581177, + "learning_rate": 1.8894908248177116e-05, + "loss": 1.2695, + "step": 5920 + }, + { + "epoch": 1.7635473482380535, + "grad_norm": 0.21961624920368195, + "learning_rate": 1.8894467445274703e-05, + "loss": 1.2634, + "step": 5921 + }, + { + "epoch": 1.7638451944377223, + "grad_norm": 0.22621069848537445, + "learning_rate": 1.889402655961921e-05, + "loss": 1.2667, + "step": 5922 + }, + { + "epoch": 1.764143040637391, + "grad_norm": 0.21127013862133026, + "learning_rate": 1.8893585591214734e-05, + "loss": 1.2783, + "step": 5923 + }, + { + "epoch": 1.7644408868370594, + "grad_norm": 0.21664802730083466, + "learning_rate": 1.889314454006538e-05, + "loss": 1.2698, + "step": 5924 + }, + { + "epoch": 1.7647387330367281, + "grad_norm": 0.22002741694450378, + "learning_rate": 1.8892703406175257e-05, + "loss": 1.2673, + "step": 5925 + }, + { + "epoch": 1.765036579236397, + "grad_norm": 0.2275383025407791, + "learning_rate": 1.8892262189548462e-05, + "loss": 1.2581, + "step": 5926 + }, + { + "epoch": 1.7653344254360654, + "grad_norm": 0.21121346950531006, + "learning_rate": 1.88918208901891e-05, + "loss": 1.275, + "step": 5927 + }, + { + "epoch": 1.765632271635734, + "grad_norm": 0.21943354606628418, + "learning_rate": 1.889137950810128e-05, + "loss": 1.2757, + "step": 5928 + }, + { + "epoch": 1.7659301178354028, + "grad_norm": 0.2101525366306305, + "learning_rate": 1.8890938043289106e-05, + "loss": 1.2684, + "step": 5929 + }, + { + "epoch": 1.7662279640350715, + "grad_norm": 0.22099937498569489, + "learning_rate": 1.8890496495756692e-05, + "loss": 1.2712, + "step": 5930 + }, + { + "epoch": 1.76652581023474, + "grad_norm": 0.23087163269519806, + "learning_rate": 1.8890054865508137e-05, + "loss": 1.273, + "step": 5931 + }, + { + "epoch": 1.7668236564344086, + "grad_norm": 0.21452589333057404, + "learning_rate": 1.8889613152547556e-05, + "loss": 1.2845, + "step": 5932 + }, + { + "epoch": 1.7671215026340774, + "grad_norm": 0.21434038877487183, + "learning_rate": 1.8889171356879056e-05, + "loss": 1.2606, + "step": 5933 + }, + { + "epoch": 1.767419348833746, + "grad_norm": 0.22064903378486633, + "learning_rate": 1.8888729478506747e-05, + "loss": 1.2735, + "step": 5934 + }, + { + "epoch": 1.7677171950334145, + "grad_norm": 0.2192641794681549, + "learning_rate": 1.8888287517434746e-05, + "loss": 1.2742, + "step": 5935 + }, + { + "epoch": 1.7680150412330833, + "grad_norm": 0.21743933856487274, + "learning_rate": 1.8887845473667155e-05, + "loss": 1.2558, + "step": 5936 + }, + { + "epoch": 1.768312887432752, + "grad_norm": 0.22595295310020447, + "learning_rate": 1.888740334720809e-05, + "loss": 1.2664, + "step": 5937 + }, + { + "epoch": 1.7686107336324206, + "grad_norm": 0.21765536069869995, + "learning_rate": 1.888696113806167e-05, + "loss": 1.2871, + "step": 5938 + }, + { + "epoch": 1.7689085798320892, + "grad_norm": 0.2340780645608902, + "learning_rate": 1.888651884623201e-05, + "loss": 1.2686, + "step": 5939 + }, + { + "epoch": 1.769206426031758, + "grad_norm": 0.2243206799030304, + "learning_rate": 1.8886076471723216e-05, + "loss": 1.2568, + "step": 5940 + }, + { + "epoch": 1.7695042722314265, + "grad_norm": 0.2287902683019638, + "learning_rate": 1.888563401453941e-05, + "loss": 1.2711, + "step": 5941 + }, + { + "epoch": 1.769802118431095, + "grad_norm": 0.23100508749485016, + "learning_rate": 1.8885191474684706e-05, + "loss": 1.2832, + "step": 5942 + }, + { + "epoch": 1.7700999646307638, + "grad_norm": 0.21319617331027985, + "learning_rate": 1.8884748852163223e-05, + "loss": 1.2907, + "step": 5943 + }, + { + "epoch": 1.7703978108304326, + "grad_norm": 0.2116445004940033, + "learning_rate": 1.888430614697908e-05, + "loss": 1.2801, + "step": 5944 + }, + { + "epoch": 1.7706956570301011, + "grad_norm": 0.22597062587738037, + "learning_rate": 1.888386335913639e-05, + "loss": 1.2662, + "step": 5945 + }, + { + "epoch": 1.7709935032297697, + "grad_norm": 0.23162300884723663, + "learning_rate": 1.8883420488639278e-05, + "loss": 1.284, + "step": 5946 + }, + { + "epoch": 1.7712913494294384, + "grad_norm": 0.21227452158927917, + "learning_rate": 1.8882977535491867e-05, + "loss": 1.2554, + "step": 5947 + }, + { + "epoch": 1.771589195629107, + "grad_norm": 0.2118864804506302, + "learning_rate": 1.8882534499698272e-05, + "loss": 1.2644, + "step": 5948 + }, + { + "epoch": 1.7718870418287755, + "grad_norm": 0.22540844976902008, + "learning_rate": 1.888209138126262e-05, + "loss": 1.2756, + "step": 5949 + }, + { + "epoch": 1.7721848880284443, + "grad_norm": 0.2292068600654602, + "learning_rate": 1.888164818018903e-05, + "loss": 1.282, + "step": 5950 + }, + { + "epoch": 1.772482734228113, + "grad_norm": 0.2195514440536499, + "learning_rate": 1.8881204896481625e-05, + "loss": 1.2861, + "step": 5951 + }, + { + "epoch": 1.7727805804277816, + "grad_norm": 0.22650204598903656, + "learning_rate": 1.8880761530144536e-05, + "loss": 1.2663, + "step": 5952 + }, + { + "epoch": 1.7730784266274502, + "grad_norm": 0.21833503246307373, + "learning_rate": 1.888031808118188e-05, + "loss": 1.2659, + "step": 5953 + }, + { + "epoch": 1.773376272827119, + "grad_norm": 0.21655771136283875, + "learning_rate": 1.8879874549597783e-05, + "loss": 1.2811, + "step": 5954 + }, + { + "epoch": 1.7736741190267875, + "grad_norm": 0.2311340719461441, + "learning_rate": 1.887943093539638e-05, + "loss": 1.2689, + "step": 5955 + }, + { + "epoch": 1.773971965226456, + "grad_norm": 0.21346105635166168, + "learning_rate": 1.8878987238581786e-05, + "loss": 1.2483, + "step": 5956 + }, + { + "epoch": 1.7742698114261248, + "grad_norm": 0.220946803689003, + "learning_rate": 1.8878543459158143e-05, + "loss": 1.2655, + "step": 5957 + }, + { + "epoch": 1.7745676576257936, + "grad_norm": 0.2175082564353943, + "learning_rate": 1.887809959712957e-05, + "loss": 1.269, + "step": 5958 + }, + { + "epoch": 1.7748655038254622, + "grad_norm": 0.21422596275806427, + "learning_rate": 1.88776556525002e-05, + "loss": 1.2663, + "step": 5959 + }, + { + "epoch": 1.7751633500251307, + "grad_norm": 0.2292601764202118, + "learning_rate": 1.887721162527416e-05, + "loss": 1.2774, + "step": 5960 + }, + { + "epoch": 1.7754611962247995, + "grad_norm": 0.23035894334316254, + "learning_rate": 1.8876767515455586e-05, + "loss": 1.2631, + "step": 5961 + }, + { + "epoch": 1.775759042424468, + "grad_norm": 0.22301332652568817, + "learning_rate": 1.8876323323048612e-05, + "loss": 1.2722, + "step": 5962 + }, + { + "epoch": 1.7760568886241366, + "grad_norm": 0.22273315489292145, + "learning_rate": 1.8875879048057362e-05, + "loss": 1.2859, + "step": 5963 + }, + { + "epoch": 1.7763547348238053, + "grad_norm": 0.2204713076353073, + "learning_rate": 1.8875434690485973e-05, + "loss": 1.2511, + "step": 5964 + }, + { + "epoch": 1.7766525810234741, + "grad_norm": 0.21425484120845795, + "learning_rate": 1.8874990250338582e-05, + "loss": 1.2644, + "step": 5965 + }, + { + "epoch": 1.7769504272231427, + "grad_norm": 0.22761522233486176, + "learning_rate": 1.8874545727619327e-05, + "loss": 1.2703, + "step": 5966 + }, + { + "epoch": 1.7772482734228112, + "grad_norm": 0.23269104957580566, + "learning_rate": 1.8874101122332332e-05, + "loss": 1.2499, + "step": 5967 + }, + { + "epoch": 1.77754611962248, + "grad_norm": 0.2156340777873993, + "learning_rate": 1.8873656434481744e-05, + "loss": 1.288, + "step": 5968 + }, + { + "epoch": 1.7778439658221485, + "grad_norm": 0.21951022744178772, + "learning_rate": 1.88732116640717e-05, + "loss": 1.2656, + "step": 5969 + }, + { + "epoch": 1.778141812021817, + "grad_norm": 0.23585747182369232, + "learning_rate": 1.887276681110633e-05, + "loss": 1.2686, + "step": 5970 + }, + { + "epoch": 1.7784396582214859, + "grad_norm": 0.2331666797399521, + "learning_rate": 1.8872321875589782e-05, + "loss": 1.2705, + "step": 5971 + }, + { + "epoch": 1.7787375044211546, + "grad_norm": 0.2160840481519699, + "learning_rate": 1.887187685752619e-05, + "loss": 1.2581, + "step": 5972 + }, + { + "epoch": 1.7790353506208232, + "grad_norm": 0.21729782223701477, + "learning_rate": 1.8871431756919696e-05, + "loss": 1.2701, + "step": 5973 + }, + { + "epoch": 1.7793331968204917, + "grad_norm": 0.22713887691497803, + "learning_rate": 1.887098657377444e-05, + "loss": 1.2599, + "step": 5974 + }, + { + "epoch": 1.7796310430201605, + "grad_norm": 0.22662410140037537, + "learning_rate": 1.8870541308094567e-05, + "loss": 1.2516, + "step": 5975 + }, + { + "epoch": 1.779928889219829, + "grad_norm": 0.21475499868392944, + "learning_rate": 1.8870095959884218e-05, + "loss": 1.2561, + "step": 5976 + }, + { + "epoch": 1.7802267354194976, + "grad_norm": 0.2308451533317566, + "learning_rate": 1.8869650529147537e-05, + "loss": 1.2583, + "step": 5977 + }, + { + "epoch": 1.7805245816191664, + "grad_norm": 0.22059254348278046, + "learning_rate": 1.8869205015888667e-05, + "loss": 1.2695, + "step": 5978 + }, + { + "epoch": 1.7808224278188352, + "grad_norm": 0.22483398020267487, + "learning_rate": 1.8868759420111753e-05, + "loss": 1.2714, + "step": 5979 + }, + { + "epoch": 1.7811202740185037, + "grad_norm": 0.22063012421131134, + "learning_rate": 1.886831374182094e-05, + "loss": 1.266, + "step": 5980 + }, + { + "epoch": 1.7814181202181723, + "grad_norm": 0.2195800244808197, + "learning_rate": 1.886786798102038e-05, + "loss": 1.2838, + "step": 5981 + }, + { + "epoch": 1.781715966417841, + "grad_norm": 0.22838428616523743, + "learning_rate": 1.886742213771421e-05, + "loss": 1.2724, + "step": 5982 + }, + { + "epoch": 1.7820138126175098, + "grad_norm": 0.2218025028705597, + "learning_rate": 1.8866976211906588e-05, + "loss": 1.2689, + "step": 5983 + }, + { + "epoch": 1.7823116588171781, + "grad_norm": 0.21701163053512573, + "learning_rate": 1.886653020360166e-05, + "loss": 1.2733, + "step": 5984 + }, + { + "epoch": 1.782609505016847, + "grad_norm": 0.24475444853305817, + "learning_rate": 1.8866084112803574e-05, + "loss": 1.2692, + "step": 5985 + }, + { + "epoch": 1.7829073512165157, + "grad_norm": 0.2269645631313324, + "learning_rate": 1.886563793951648e-05, + "loss": 1.2877, + "step": 5986 + }, + { + "epoch": 1.7832051974161842, + "grad_norm": 0.2226429432630539, + "learning_rate": 1.886519168374453e-05, + "loss": 1.2505, + "step": 5987 + }, + { + "epoch": 1.7835030436158528, + "grad_norm": 0.21845246851444244, + "learning_rate": 1.886474534549188e-05, + "loss": 1.2501, + "step": 5988 + }, + { + "epoch": 1.7838008898155215, + "grad_norm": 0.21259817481040955, + "learning_rate": 1.8864298924762673e-05, + "loss": 1.2859, + "step": 5989 + }, + { + "epoch": 1.7840987360151903, + "grad_norm": 0.21492299437522888, + "learning_rate": 1.886385242156107e-05, + "loss": 1.2627, + "step": 5990 + }, + { + "epoch": 1.7843965822148586, + "grad_norm": 0.22381509840488434, + "learning_rate": 1.8863405835891225e-05, + "loss": 1.278, + "step": 5991 + }, + { + "epoch": 1.7846944284145274, + "grad_norm": 0.22142374515533447, + "learning_rate": 1.8862959167757294e-05, + "loss": 1.2726, + "step": 5992 + }, + { + "epoch": 1.7849922746141962, + "grad_norm": 0.22224482893943787, + "learning_rate": 1.886251241716343e-05, + "loss": 1.2676, + "step": 5993 + }, + { + "epoch": 1.7852901208138647, + "grad_norm": 0.21727298200130463, + "learning_rate": 1.8862065584113783e-05, + "loss": 1.2637, + "step": 5994 + }, + { + "epoch": 1.7855879670135333, + "grad_norm": 0.21374741196632385, + "learning_rate": 1.8861618668612523e-05, + "loss": 1.2681, + "step": 5995 + }, + { + "epoch": 1.785885813213202, + "grad_norm": 0.22146710753440857, + "learning_rate": 1.88611716706638e-05, + "loss": 1.2724, + "step": 5996 + }, + { + "epoch": 1.7861836594128708, + "grad_norm": 0.22528040409088135, + "learning_rate": 1.8860724590271775e-05, + "loss": 1.2778, + "step": 5997 + }, + { + "epoch": 1.7864815056125394, + "grad_norm": 0.22059322893619537, + "learning_rate": 1.8860277427440608e-05, + "loss": 1.2506, + "step": 5998 + }, + { + "epoch": 1.786779351812208, + "grad_norm": 0.22748437523841858, + "learning_rate": 1.885983018217446e-05, + "loss": 1.2794, + "step": 5999 + }, + { + "epoch": 1.7870771980118767, + "grad_norm": 0.2144092172384262, + "learning_rate": 1.8859382854477484e-05, + "loss": 1.2594, + "step": 6000 + }, + { + "epoch": 1.7870771980118767, + "eval_loss": 1.3452653884887695, + "eval_runtime": 23.4453, + "eval_samples_per_second": 73.959, + "eval_steps_per_second": 4.649, + "step": 6000 + }, + { + "epoch": 1.7873750442115452, + "grad_norm": 0.228704035282135, + "learning_rate": 1.8858935444353856e-05, + "loss": 1.2717, + "step": 6001 + }, + { + "epoch": 1.7876728904112138, + "grad_norm": 0.21824924647808075, + "learning_rate": 1.885848795180773e-05, + "loss": 1.2632, + "step": 6002 + }, + { + "epoch": 1.7879707366108826, + "grad_norm": 0.23707593977451324, + "learning_rate": 1.8858040376843272e-05, + "loss": 1.2665, + "step": 6003 + }, + { + "epoch": 1.7882685828105513, + "grad_norm": 0.23337838053703308, + "learning_rate": 1.8857592719464644e-05, + "loss": 1.2768, + "step": 6004 + }, + { + "epoch": 1.78856642901022, + "grad_norm": 0.23033075034618378, + "learning_rate": 1.8857144979676013e-05, + "loss": 1.2716, + "step": 6005 + }, + { + "epoch": 1.7888642752098884, + "grad_norm": 0.2230740189552307, + "learning_rate": 1.885669715748154e-05, + "loss": 1.2663, + "step": 6006 + }, + { + "epoch": 1.7891621214095572, + "grad_norm": 0.22400899231433868, + "learning_rate": 1.8856249252885398e-05, + "loss": 1.2693, + "step": 6007 + }, + { + "epoch": 1.7894599676092258, + "grad_norm": 0.2205016165971756, + "learning_rate": 1.8855801265891753e-05, + "loss": 1.2592, + "step": 6008 + }, + { + "epoch": 1.7897578138088943, + "grad_norm": 0.220457524061203, + "learning_rate": 1.885535319650477e-05, + "loss": 1.2593, + "step": 6009 + }, + { + "epoch": 1.790055660008563, + "grad_norm": 0.22970102727413177, + "learning_rate": 1.8854905044728617e-05, + "loss": 1.2751, + "step": 6010 + }, + { + "epoch": 1.7903535062082319, + "grad_norm": 0.22625747323036194, + "learning_rate": 1.8854456810567467e-05, + "loss": 1.286, + "step": 6011 + }, + { + "epoch": 1.7906513524079004, + "grad_norm": 0.2235589474439621, + "learning_rate": 1.885400849402549e-05, + "loss": 1.2779, + "step": 6012 + }, + { + "epoch": 1.790949198607569, + "grad_norm": 0.22546285390853882, + "learning_rate": 1.8853560095106857e-05, + "loss": 1.2683, + "step": 6013 + }, + { + "epoch": 1.7912470448072377, + "grad_norm": 0.22701458632946014, + "learning_rate": 1.8853111613815738e-05, + "loss": 1.2497, + "step": 6014 + }, + { + "epoch": 1.7915448910069063, + "grad_norm": 0.22597983479499817, + "learning_rate": 1.885266305015631e-05, + "loss": 1.2692, + "step": 6015 + }, + { + "epoch": 1.7918427372065748, + "grad_norm": 0.22108972072601318, + "learning_rate": 1.885221440413274e-05, + "loss": 1.2684, + "step": 6016 + }, + { + "epoch": 1.7921405834062436, + "grad_norm": 0.21375659108161926, + "learning_rate": 1.8851765675749203e-05, + "loss": 1.2581, + "step": 6017 + }, + { + "epoch": 1.7924384296059124, + "grad_norm": 0.2176457941532135, + "learning_rate": 1.8851316865009877e-05, + "loss": 1.274, + "step": 6018 + }, + { + "epoch": 1.792736275805581, + "grad_norm": 0.23012582957744598, + "learning_rate": 1.885086797191894e-05, + "loss": 1.2551, + "step": 6019 + }, + { + "epoch": 1.7930341220052495, + "grad_norm": 0.22473666071891785, + "learning_rate": 1.8850418996480565e-05, + "loss": 1.2645, + "step": 6020 + }, + { + "epoch": 1.7933319682049182, + "grad_norm": 0.22484521567821503, + "learning_rate": 1.8849969938698927e-05, + "loss": 1.2701, + "step": 6021 + }, + { + "epoch": 1.7936298144045868, + "grad_norm": 0.211605042219162, + "learning_rate": 1.884952079857821e-05, + "loss": 1.2729, + "step": 6022 + }, + { + "epoch": 1.7939276606042553, + "grad_norm": 0.2143273800611496, + "learning_rate": 1.8849071576122584e-05, + "loss": 1.2842, + "step": 6023 + }, + { + "epoch": 1.7942255068039241, + "grad_norm": 0.21585173904895782, + "learning_rate": 1.8848622271336235e-05, + "loss": 1.2512, + "step": 6024 + }, + { + "epoch": 1.794523353003593, + "grad_norm": 0.2228710949420929, + "learning_rate": 1.8848172884223345e-05, + "loss": 1.2592, + "step": 6025 + }, + { + "epoch": 1.7948211992032614, + "grad_norm": 0.23387007415294647, + "learning_rate": 1.8847723414788093e-05, + "loss": 1.2594, + "step": 6026 + }, + { + "epoch": 1.79511904540293, + "grad_norm": 0.22081926465034485, + "learning_rate": 1.8847273863034658e-05, + "loss": 1.2696, + "step": 6027 + }, + { + "epoch": 1.7954168916025988, + "grad_norm": 0.21065343916416168, + "learning_rate": 1.8846824228967222e-05, + "loss": 1.2764, + "step": 6028 + }, + { + "epoch": 1.7957147378022673, + "grad_norm": 0.2157675176858902, + "learning_rate": 1.8846374512589972e-05, + "loss": 1.2786, + "step": 6029 + }, + { + "epoch": 1.7960125840019359, + "grad_norm": 0.2320483773946762, + "learning_rate": 1.8845924713907094e-05, + "loss": 1.2599, + "step": 6030 + }, + { + "epoch": 1.7963104302016046, + "grad_norm": 0.2298630177974701, + "learning_rate": 1.8845474832922764e-05, + "loss": 1.2671, + "step": 6031 + }, + { + "epoch": 1.7966082764012734, + "grad_norm": 0.25772473216056824, + "learning_rate": 1.884502486964118e-05, + "loss": 1.2763, + "step": 6032 + }, + { + "epoch": 1.796906122600942, + "grad_norm": 0.2270805686712265, + "learning_rate": 1.884457482406652e-05, + "loss": 1.2822, + "step": 6033 + }, + { + "epoch": 1.7972039688006105, + "grad_norm": 0.24416397511959076, + "learning_rate": 1.884412469620297e-05, + "loss": 1.2835, + "step": 6034 + }, + { + "epoch": 1.7975018150002793, + "grad_norm": 0.2223682850599289, + "learning_rate": 1.884367448605472e-05, + "loss": 1.2612, + "step": 6035 + }, + { + "epoch": 1.7977996611999478, + "grad_norm": 0.26283374428749084, + "learning_rate": 1.8843224193625967e-05, + "loss": 1.2863, + "step": 6036 + }, + { + "epoch": 1.7980975073996164, + "grad_norm": 0.23235592246055603, + "learning_rate": 1.8842773818920887e-05, + "loss": 1.2712, + "step": 6037 + }, + { + "epoch": 1.7983953535992852, + "grad_norm": 0.2454880326986313, + "learning_rate": 1.884232336194368e-05, + "loss": 1.2395, + "step": 6038 + }, + { + "epoch": 1.798693199798954, + "grad_norm": 0.23668311536312103, + "learning_rate": 1.8841872822698528e-05, + "loss": 1.2666, + "step": 6039 + }, + { + "epoch": 1.7989910459986225, + "grad_norm": 0.2175203114748001, + "learning_rate": 1.8841422201189633e-05, + "loss": 1.2679, + "step": 6040 + }, + { + "epoch": 1.799288892198291, + "grad_norm": 0.22800126671791077, + "learning_rate": 1.884097149742118e-05, + "loss": 1.264, + "step": 6041 + }, + { + "epoch": 1.7995867383979598, + "grad_norm": 0.2290138304233551, + "learning_rate": 1.8840520711397367e-05, + "loss": 1.2478, + "step": 6042 + }, + { + "epoch": 1.7998845845976283, + "grad_norm": 0.24279721081256866, + "learning_rate": 1.8840069843122384e-05, + "loss": 1.2856, + "step": 6043 + }, + { + "epoch": 1.800182430797297, + "grad_norm": 0.23835624754428864, + "learning_rate": 1.8839618892600427e-05, + "loss": 1.2635, + "step": 6044 + }, + { + "epoch": 1.8004802769969657, + "grad_norm": 0.21752871572971344, + "learning_rate": 1.8839167859835695e-05, + "loss": 1.2784, + "step": 6045 + }, + { + "epoch": 1.8007781231966344, + "grad_norm": 0.23671525716781616, + "learning_rate": 1.8838716744832385e-05, + "loss": 1.2689, + "step": 6046 + }, + { + "epoch": 1.801075969396303, + "grad_norm": 0.22538290917873383, + "learning_rate": 1.8838265547594684e-05, + "loss": 1.2965, + "step": 6047 + }, + { + "epoch": 1.8013738155959715, + "grad_norm": 0.2299932986497879, + "learning_rate": 1.8837814268126798e-05, + "loss": 1.267, + "step": 6048 + }, + { + "epoch": 1.8016716617956403, + "grad_norm": 0.2351229190826416, + "learning_rate": 1.8837362906432928e-05, + "loss": 1.2631, + "step": 6049 + }, + { + "epoch": 1.801969507995309, + "grad_norm": 0.22970789670944214, + "learning_rate": 1.8836911462517268e-05, + "loss": 1.2736, + "step": 6050 + }, + { + "epoch": 1.8022673541949774, + "grad_norm": 0.22385859489440918, + "learning_rate": 1.883645993638402e-05, + "loss": 1.2694, + "step": 6051 + }, + { + "epoch": 1.8025652003946462, + "grad_norm": 0.2234676629304886, + "learning_rate": 1.8836008328037384e-05, + "loss": 1.2633, + "step": 6052 + }, + { + "epoch": 1.802863046594315, + "grad_norm": 0.2438935488462448, + "learning_rate": 1.8835556637481566e-05, + "loss": 1.276, + "step": 6053 + }, + { + "epoch": 1.8031608927939835, + "grad_norm": 0.2130393236875534, + "learning_rate": 1.883510486472076e-05, + "loss": 1.2765, + "step": 6054 + }, + { + "epoch": 1.803458738993652, + "grad_norm": 0.2313595861196518, + "learning_rate": 1.8834653009759177e-05, + "loss": 1.292, + "step": 6055 + }, + { + "epoch": 1.8037565851933208, + "grad_norm": 0.2259809374809265, + "learning_rate": 1.883420107260102e-05, + "loss": 1.2533, + "step": 6056 + }, + { + "epoch": 1.8040544313929896, + "grad_norm": 0.22493532299995422, + "learning_rate": 1.883374905325049e-05, + "loss": 1.281, + "step": 6057 + }, + { + "epoch": 1.804352277592658, + "grad_norm": 0.2182721346616745, + "learning_rate": 1.8833296951711793e-05, + "loss": 1.2652, + "step": 6058 + }, + { + "epoch": 1.8046501237923267, + "grad_norm": 0.22102218866348267, + "learning_rate": 1.883284476798914e-05, + "loss": 1.2588, + "step": 6059 + }, + { + "epoch": 1.8049479699919955, + "grad_norm": 0.2173367142677307, + "learning_rate": 1.8832392502086736e-05, + "loss": 1.2644, + "step": 6060 + }, + { + "epoch": 1.805245816191664, + "grad_norm": 0.22203823924064636, + "learning_rate": 1.8831940154008785e-05, + "loss": 1.2599, + "step": 6061 + }, + { + "epoch": 1.8055436623913326, + "grad_norm": 0.21372835338115692, + "learning_rate": 1.88314877237595e-05, + "loss": 1.2772, + "step": 6062 + }, + { + "epoch": 1.8058415085910013, + "grad_norm": 0.22985833883285522, + "learning_rate": 1.8831035211343088e-05, + "loss": 1.2799, + "step": 6063 + }, + { + "epoch": 1.8061393547906701, + "grad_norm": 0.21987077593803406, + "learning_rate": 1.883058261676376e-05, + "loss": 1.2606, + "step": 6064 + }, + { + "epoch": 1.8064372009903387, + "grad_norm": 0.22975841164588928, + "learning_rate": 1.883012994002573e-05, + "loss": 1.2736, + "step": 6065 + }, + { + "epoch": 1.8067350471900072, + "grad_norm": 0.22226479649543762, + "learning_rate": 1.8829677181133202e-05, + "loss": 1.2683, + "step": 6066 + }, + { + "epoch": 1.807032893389676, + "grad_norm": 0.22626134753227234, + "learning_rate": 1.8829224340090398e-05, + "loss": 1.2783, + "step": 6067 + }, + { + "epoch": 1.8073307395893445, + "grad_norm": 0.21422359347343445, + "learning_rate": 1.882877141690152e-05, + "loss": 1.2742, + "step": 6068 + }, + { + "epoch": 1.807628585789013, + "grad_norm": 0.23135624825954437, + "learning_rate": 1.882831841157079e-05, + "loss": 1.2614, + "step": 6069 + }, + { + "epoch": 1.8079264319886819, + "grad_norm": 0.24068643152713776, + "learning_rate": 1.8827865324102426e-05, + "loss": 1.2823, + "step": 6070 + }, + { + "epoch": 1.8082242781883506, + "grad_norm": 0.22447143495082855, + "learning_rate": 1.8827412154500634e-05, + "loss": 1.2554, + "step": 6071 + }, + { + "epoch": 1.8085221243880192, + "grad_norm": 0.22670595347881317, + "learning_rate": 1.8826958902769636e-05, + "loss": 1.2683, + "step": 6072 + }, + { + "epoch": 1.8088199705876877, + "grad_norm": 0.21980814635753632, + "learning_rate": 1.8826505568913644e-05, + "loss": 1.2773, + "step": 6073 + }, + { + "epoch": 1.8091178167873565, + "grad_norm": 0.23693813383579254, + "learning_rate": 1.882605215293688e-05, + "loss": 1.2541, + "step": 6074 + }, + { + "epoch": 1.809415662987025, + "grad_norm": 0.2204945832490921, + "learning_rate": 1.8825598654843563e-05, + "loss": 1.2771, + "step": 6075 + }, + { + "epoch": 1.8097135091866936, + "grad_norm": 0.22734081745147705, + "learning_rate": 1.882514507463791e-05, + "loss": 1.2766, + "step": 6076 + }, + { + "epoch": 1.8100113553863624, + "grad_norm": 0.2248615026473999, + "learning_rate": 1.8824691412324144e-05, + "loss": 1.2861, + "step": 6077 + }, + { + "epoch": 1.8103092015860311, + "grad_norm": 0.25749897956848145, + "learning_rate": 1.882423766790648e-05, + "loss": 1.2697, + "step": 6078 + }, + { + "epoch": 1.8106070477856997, + "grad_norm": 0.21990980207920074, + "learning_rate": 1.882378384138915e-05, + "loss": 1.2709, + "step": 6079 + }, + { + "epoch": 1.8109048939853682, + "grad_norm": 0.22228093445301056, + "learning_rate": 1.8823329932776367e-05, + "loss": 1.2844, + "step": 6080 + }, + { + "epoch": 1.811202740185037, + "grad_norm": 0.22391878068447113, + "learning_rate": 1.8822875942072355e-05, + "loss": 1.2648, + "step": 6081 + }, + { + "epoch": 1.8115005863847056, + "grad_norm": 0.22353194653987885, + "learning_rate": 1.8822421869281344e-05, + "loss": 1.2751, + "step": 6082 + }, + { + "epoch": 1.8117984325843741, + "grad_norm": 0.21903499960899353, + "learning_rate": 1.882196771440755e-05, + "loss": 1.2792, + "step": 6083 + }, + { + "epoch": 1.812096278784043, + "grad_norm": 0.22099357843399048, + "learning_rate": 1.8821513477455205e-05, + "loss": 1.2714, + "step": 6084 + }, + { + "epoch": 1.8123941249837117, + "grad_norm": 0.21618127822875977, + "learning_rate": 1.8821059158428534e-05, + "loss": 1.2577, + "step": 6085 + }, + { + "epoch": 1.8126919711833802, + "grad_norm": 0.21984921395778656, + "learning_rate": 1.8820604757331763e-05, + "loss": 1.2557, + "step": 6086 + }, + { + "epoch": 1.8129898173830488, + "grad_norm": 0.22133424878120422, + "learning_rate": 1.8820150274169115e-05, + "loss": 1.2793, + "step": 6087 + }, + { + "epoch": 1.8132876635827175, + "grad_norm": 0.22182853519916534, + "learning_rate": 1.8819695708944827e-05, + "loss": 1.279, + "step": 6088 + }, + { + "epoch": 1.813585509782386, + "grad_norm": 0.22566376626491547, + "learning_rate": 1.8819241061663124e-05, + "loss": 1.2718, + "step": 6089 + }, + { + "epoch": 1.8138833559820546, + "grad_norm": 0.2206290364265442, + "learning_rate": 1.8818786332328236e-05, + "loss": 1.2608, + "step": 6090 + }, + { + "epoch": 1.8141812021817234, + "grad_norm": 0.21377462148666382, + "learning_rate": 1.8818331520944396e-05, + "loss": 1.2419, + "step": 6091 + }, + { + "epoch": 1.8144790483813922, + "grad_norm": 0.2487218827009201, + "learning_rate": 1.8817876627515834e-05, + "loss": 1.2748, + "step": 6092 + }, + { + "epoch": 1.8147768945810607, + "grad_norm": 0.22330373525619507, + "learning_rate": 1.8817421652046777e-05, + "loss": 1.2595, + "step": 6093 + }, + { + "epoch": 1.8150747407807293, + "grad_norm": 0.23433241248130798, + "learning_rate": 1.8816966594541465e-05, + "loss": 1.2688, + "step": 6094 + }, + { + "epoch": 1.815372586980398, + "grad_norm": 0.2255052775144577, + "learning_rate": 1.8816511455004133e-05, + "loss": 1.2642, + "step": 6095 + }, + { + "epoch": 1.8156704331800666, + "grad_norm": 0.22542732954025269, + "learning_rate": 1.8816056233439008e-05, + "loss": 1.2643, + "step": 6096 + }, + { + "epoch": 1.8159682793797352, + "grad_norm": 0.23201413452625275, + "learning_rate": 1.8815600929850333e-05, + "loss": 1.2647, + "step": 6097 + }, + { + "epoch": 1.816266125579404, + "grad_norm": 0.23117925226688385, + "learning_rate": 1.881514554424234e-05, + "loss": 1.2517, + "step": 6098 + }, + { + "epoch": 1.8165639717790727, + "grad_norm": 0.22815728187561035, + "learning_rate": 1.8814690076619264e-05, + "loss": 1.2622, + "step": 6099 + }, + { + "epoch": 1.8168618179787412, + "grad_norm": 0.22319307923316956, + "learning_rate": 1.8814234526985346e-05, + "loss": 1.2719, + "step": 6100 + }, + { + "epoch": 1.8171596641784098, + "grad_norm": 0.2264396995306015, + "learning_rate": 1.8813778895344826e-05, + "loss": 1.2772, + "step": 6101 + }, + { + "epoch": 1.8174575103780786, + "grad_norm": 0.23161791265010834, + "learning_rate": 1.8813323181701938e-05, + "loss": 1.2572, + "step": 6102 + }, + { + "epoch": 1.8177553565777471, + "grad_norm": 0.228708416223526, + "learning_rate": 1.8812867386060928e-05, + "loss": 1.2874, + "step": 6103 + }, + { + "epoch": 1.8180532027774157, + "grad_norm": 0.21395297348499298, + "learning_rate": 1.881241150842603e-05, + "loss": 1.2519, + "step": 6104 + }, + { + "epoch": 1.8183510489770844, + "grad_norm": 0.22581659257411957, + "learning_rate": 1.8811955548801492e-05, + "loss": 1.2628, + "step": 6105 + }, + { + "epoch": 1.8186488951767532, + "grad_norm": 0.23664312064647675, + "learning_rate": 1.881149950719155e-05, + "loss": 1.2683, + "step": 6106 + }, + { + "epoch": 1.8189467413764218, + "grad_norm": 0.23028619587421417, + "learning_rate": 1.881104338360045e-05, + "loss": 1.2732, + "step": 6107 + }, + { + "epoch": 1.8192445875760903, + "grad_norm": 0.21617329120635986, + "learning_rate": 1.8810587178032434e-05, + "loss": 1.2808, + "step": 6108 + }, + { + "epoch": 1.819542433775759, + "grad_norm": 0.21901915967464447, + "learning_rate": 1.8810130890491755e-05, + "loss": 1.2636, + "step": 6109 + }, + { + "epoch": 1.8198402799754279, + "grad_norm": 0.23325960338115692, + "learning_rate": 1.8809674520982643e-05, + "loss": 1.281, + "step": 6110 + }, + { + "epoch": 1.8201381261750962, + "grad_norm": 0.2988772392272949, + "learning_rate": 1.880921806950936e-05, + "loss": 1.2565, + "step": 6111 + }, + { + "epoch": 1.820435972374765, + "grad_norm": 0.26141929626464844, + "learning_rate": 1.880876153607614e-05, + "loss": 1.2591, + "step": 6112 + }, + { + "epoch": 1.8207338185744337, + "grad_norm": 0.27643170952796936, + "learning_rate": 1.8808304920687238e-05, + "loss": 1.2612, + "step": 6113 + }, + { + "epoch": 1.8210316647741023, + "grad_norm": 0.3000950217247009, + "learning_rate": 1.8807848223346895e-05, + "loss": 1.2705, + "step": 6114 + }, + { + "epoch": 1.8213295109737708, + "grad_norm": 0.21227096021175385, + "learning_rate": 1.8807391444059368e-05, + "loss": 1.2747, + "step": 6115 + }, + { + "epoch": 1.8216273571734396, + "grad_norm": 0.22193902730941772, + "learning_rate": 1.8806934582828908e-05, + "loss": 1.2743, + "step": 6116 + }, + { + "epoch": 1.8219252033731084, + "grad_norm": 0.22305090725421906, + "learning_rate": 1.8806477639659757e-05, + "loss": 1.2796, + "step": 6117 + }, + { + "epoch": 1.8222230495727767, + "grad_norm": 0.22063802182674408, + "learning_rate": 1.880602061455617e-05, + "loss": 1.2766, + "step": 6118 + }, + { + "epoch": 1.8225208957724455, + "grad_norm": 0.24384748935699463, + "learning_rate": 1.8805563507522403e-05, + "loss": 1.2711, + "step": 6119 + }, + { + "epoch": 1.8228187419721142, + "grad_norm": 0.23025380074977875, + "learning_rate": 1.8805106318562702e-05, + "loss": 1.2682, + "step": 6120 + }, + { + "epoch": 1.8231165881717828, + "grad_norm": 0.233215793967247, + "learning_rate": 1.8804649047681328e-05, + "loss": 1.2661, + "step": 6121 + }, + { + "epoch": 1.8234144343714513, + "grad_norm": 0.21864233911037445, + "learning_rate": 1.880419169488253e-05, + "loss": 1.2664, + "step": 6122 + }, + { + "epoch": 1.8237122805711201, + "grad_norm": 0.23096491396427155, + "learning_rate": 1.8803734260170565e-05, + "loss": 1.2575, + "step": 6123 + }, + { + "epoch": 1.8240101267707889, + "grad_norm": 0.2311975210905075, + "learning_rate": 1.880327674354969e-05, + "loss": 1.2569, + "step": 6124 + }, + { + "epoch": 1.8243079729704572, + "grad_norm": 0.24429549276828766, + "learning_rate": 1.8802819145024157e-05, + "loss": 1.2552, + "step": 6125 + }, + { + "epoch": 1.824605819170126, + "grad_norm": 0.23141895234584808, + "learning_rate": 1.880236146459823e-05, + "loss": 1.2498, + "step": 6126 + }, + { + "epoch": 1.8249036653697948, + "grad_norm": 0.21364538371562958, + "learning_rate": 1.8801903702276164e-05, + "loss": 1.2581, + "step": 6127 + }, + { + "epoch": 1.8252015115694633, + "grad_norm": 0.21442024409770966, + "learning_rate": 1.880144585806222e-05, + "loss": 1.2776, + "step": 6128 + }, + { + "epoch": 1.8254993577691319, + "grad_norm": 0.2209845632314682, + "learning_rate": 1.880098793196065e-05, + "loss": 1.2797, + "step": 6129 + }, + { + "epoch": 1.8257972039688006, + "grad_norm": 0.20947112143039703, + "learning_rate": 1.8800529923975726e-05, + "loss": 1.2527, + "step": 6130 + }, + { + "epoch": 1.8260950501684694, + "grad_norm": 0.21361005306243896, + "learning_rate": 1.8800071834111704e-05, + "loss": 1.2467, + "step": 6131 + }, + { + "epoch": 1.826392896368138, + "grad_norm": 0.2133372575044632, + "learning_rate": 1.879961366237284e-05, + "loss": 1.2806, + "step": 6132 + }, + { + "epoch": 1.8266907425678065, + "grad_norm": 0.22230187058448792, + "learning_rate": 1.8799155408763407e-05, + "loss": 1.2621, + "step": 6133 + }, + { + "epoch": 1.8269885887674753, + "grad_norm": 0.22517280280590057, + "learning_rate": 1.879869707328766e-05, + "loss": 1.2427, + "step": 6134 + }, + { + "epoch": 1.8272864349671438, + "grad_norm": 0.2517028748989105, + "learning_rate": 1.8798238655949873e-05, + "loss": 1.2958, + "step": 6135 + }, + { + "epoch": 1.8275842811668124, + "grad_norm": 0.25225141644477844, + "learning_rate": 1.8797780156754303e-05, + "loss": 1.2589, + "step": 6136 + }, + { + "epoch": 1.8278821273664811, + "grad_norm": 0.22715818881988525, + "learning_rate": 1.8797321575705216e-05, + "loss": 1.2772, + "step": 6137 + }, + { + "epoch": 1.82817997356615, + "grad_norm": 0.23603639006614685, + "learning_rate": 1.879686291280688e-05, + "loss": 1.258, + "step": 6138 + }, + { + "epoch": 1.8284778197658185, + "grad_norm": 0.2428574413061142, + "learning_rate": 1.8796404168063564e-05, + "loss": 1.2743, + "step": 6139 + }, + { + "epoch": 1.828775665965487, + "grad_norm": 0.21798360347747803, + "learning_rate": 1.8795945341479537e-05, + "loss": 1.2665, + "step": 6140 + }, + { + "epoch": 1.8290735121651558, + "grad_norm": 0.25193336606025696, + "learning_rate": 1.8795486433059067e-05, + "loss": 1.2804, + "step": 6141 + }, + { + "epoch": 1.8293713583648243, + "grad_norm": 0.24949511885643005, + "learning_rate": 1.879502744280642e-05, + "loss": 1.2491, + "step": 6142 + }, + { + "epoch": 1.829669204564493, + "grad_norm": 0.22949185967445374, + "learning_rate": 1.8794568370725867e-05, + "loss": 1.2611, + "step": 6143 + }, + { + "epoch": 1.8299670507641617, + "grad_norm": 0.3252366781234741, + "learning_rate": 1.8794109216821686e-05, + "loss": 1.2688, + "step": 6144 + }, + { + "epoch": 1.8302648969638304, + "grad_norm": 0.2781335711479187, + "learning_rate": 1.879364998109814e-05, + "loss": 1.2864, + "step": 6145 + }, + { + "epoch": 1.830562743163499, + "grad_norm": 0.248800590634346, + "learning_rate": 1.879319066355951e-05, + "loss": 1.2746, + "step": 6146 + }, + { + "epoch": 1.8308605893631675, + "grad_norm": 0.2256658971309662, + "learning_rate": 1.8792731264210063e-05, + "loss": 1.2705, + "step": 6147 + }, + { + "epoch": 1.8311584355628363, + "grad_norm": 0.32897138595581055, + "learning_rate": 1.8792271783054072e-05, + "loss": 1.2777, + "step": 6148 + }, + { + "epoch": 1.8314562817625049, + "grad_norm": 0.22394399344921112, + "learning_rate": 1.879181222009582e-05, + "loss": 1.2753, + "step": 6149 + }, + { + "epoch": 1.8317541279621734, + "grad_norm": 0.22119250893592834, + "learning_rate": 1.8791352575339577e-05, + "loss": 1.2642, + "step": 6150 + }, + { + "epoch": 1.8320519741618422, + "grad_norm": 0.2309536635875702, + "learning_rate": 1.879089284878962e-05, + "loss": 1.2729, + "step": 6151 + }, + { + "epoch": 1.832349820361511, + "grad_norm": 0.22146302461624146, + "learning_rate": 1.8790433040450227e-05, + "loss": 1.2569, + "step": 6152 + }, + { + "epoch": 1.8326476665611795, + "grad_norm": 0.22456271946430206, + "learning_rate": 1.878997315032568e-05, + "loss": 1.2745, + "step": 6153 + }, + { + "epoch": 1.832945512760848, + "grad_norm": 0.2310352325439453, + "learning_rate": 1.8789513178420246e-05, + "loss": 1.2688, + "step": 6154 + }, + { + "epoch": 1.8332433589605168, + "grad_norm": 0.24492207169532776, + "learning_rate": 1.8789053124738213e-05, + "loss": 1.2598, + "step": 6155 + }, + { + "epoch": 1.8335412051601854, + "grad_norm": 0.2163112908601761, + "learning_rate": 1.8788592989283863e-05, + "loss": 1.2659, + "step": 6156 + }, + { + "epoch": 1.833839051359854, + "grad_norm": 0.22027339041233063, + "learning_rate": 1.8788132772061476e-05, + "loss": 1.2685, + "step": 6157 + }, + { + "epoch": 1.8341368975595227, + "grad_norm": 0.22436580061912537, + "learning_rate": 1.878767247307533e-05, + "loss": 1.2487, + "step": 6158 + }, + { + "epoch": 1.8344347437591915, + "grad_norm": 0.2317281812429428, + "learning_rate": 1.8787212092329713e-05, + "loss": 1.265, + "step": 6159 + }, + { + "epoch": 1.83473258995886, + "grad_norm": 0.2352665215730667, + "learning_rate": 1.87867516298289e-05, + "loss": 1.2836, + "step": 6160 + }, + { + "epoch": 1.8350304361585286, + "grad_norm": 0.2499164193868637, + "learning_rate": 1.878629108557718e-05, + "loss": 1.2638, + "step": 6161 + }, + { + "epoch": 1.8353282823581973, + "grad_norm": 0.2163504809141159, + "learning_rate": 1.8785830459578845e-05, + "loss": 1.2662, + "step": 6162 + }, + { + "epoch": 1.8356261285578659, + "grad_norm": 0.2350332885980606, + "learning_rate": 1.878536975183817e-05, + "loss": 1.2497, + "step": 6163 + }, + { + "epoch": 1.8359239747575344, + "grad_norm": 0.23730915784835815, + "learning_rate": 1.8784908962359443e-05, + "loss": 1.2606, + "step": 6164 + }, + { + "epoch": 1.8362218209572032, + "grad_norm": 0.23150528967380524, + "learning_rate": 1.8784448091146953e-05, + "loss": 1.2739, + "step": 6165 + }, + { + "epoch": 1.836519667156872, + "grad_norm": 0.29068508744239807, + "learning_rate": 1.8783987138204992e-05, + "loss": 1.2667, + "step": 6166 + }, + { + "epoch": 1.8368175133565405, + "grad_norm": 0.21419180929660797, + "learning_rate": 1.878352610353784e-05, + "loss": 1.2655, + "step": 6167 + }, + { + "epoch": 1.837115359556209, + "grad_norm": 0.2460484802722931, + "learning_rate": 1.8783064987149796e-05, + "loss": 1.2737, + "step": 6168 + }, + { + "epoch": 1.8374132057558779, + "grad_norm": 0.2468438595533371, + "learning_rate": 1.8782603789045143e-05, + "loss": 1.283, + "step": 6169 + }, + { + "epoch": 1.8377110519555464, + "grad_norm": 0.22701148688793182, + "learning_rate": 1.8782142509228174e-05, + "loss": 1.2797, + "step": 6170 + }, + { + "epoch": 1.838008898155215, + "grad_norm": 0.2519301772117615, + "learning_rate": 1.8781681147703178e-05, + "loss": 1.2766, + "step": 6171 + }, + { + "epoch": 1.8383067443548837, + "grad_norm": 0.2273082137107849, + "learning_rate": 1.8781219704474457e-05, + "loss": 1.2673, + "step": 6172 + }, + { + "epoch": 1.8386045905545525, + "grad_norm": 0.22925065457820892, + "learning_rate": 1.8780758179546295e-05, + "loss": 1.2771, + "step": 6173 + }, + { + "epoch": 1.838902436754221, + "grad_norm": 0.22185562551021576, + "learning_rate": 1.878029657292299e-05, + "loss": 1.2721, + "step": 6174 + }, + { + "epoch": 1.8392002829538896, + "grad_norm": 0.22061800956726074, + "learning_rate": 1.877983488460883e-05, + "loss": 1.2774, + "step": 6175 + }, + { + "epoch": 1.8394981291535584, + "grad_norm": 0.22943070530891418, + "learning_rate": 1.8779373114608116e-05, + "loss": 1.26, + "step": 6176 + }, + { + "epoch": 1.8397959753532271, + "grad_norm": 0.2462579756975174, + "learning_rate": 1.8778911262925147e-05, + "loss": 1.2783, + "step": 6177 + }, + { + "epoch": 1.8400938215528955, + "grad_norm": 0.24037913978099823, + "learning_rate": 1.877844932956422e-05, + "loss": 1.2715, + "step": 6178 + }, + { + "epoch": 1.8403916677525642, + "grad_norm": 0.2180246114730835, + "learning_rate": 1.8777987314529625e-05, + "loss": 1.2808, + "step": 6179 + }, + { + "epoch": 1.840689513952233, + "grad_norm": 0.23191234469413757, + "learning_rate": 1.877752521782567e-05, + "loss": 1.2812, + "step": 6180 + }, + { + "epoch": 1.8409873601519016, + "grad_norm": 0.22117017209529877, + "learning_rate": 1.8777063039456644e-05, + "loss": 1.2523, + "step": 6181 + }, + { + "epoch": 1.8412852063515701, + "grad_norm": 0.22852277755737305, + "learning_rate": 1.8776600779426856e-05, + "loss": 1.2662, + "step": 6182 + }, + { + "epoch": 1.8415830525512389, + "grad_norm": 0.22987455129623413, + "learning_rate": 1.87761384377406e-05, + "loss": 1.2726, + "step": 6183 + }, + { + "epoch": 1.8418808987509077, + "grad_norm": 0.21900035440921783, + "learning_rate": 1.8775676014402187e-05, + "loss": 1.2724, + "step": 6184 + }, + { + "epoch": 1.842178744950576, + "grad_norm": 0.23168663680553436, + "learning_rate": 1.8775213509415913e-05, + "loss": 1.2658, + "step": 6185 + }, + { + "epoch": 1.8424765911502448, + "grad_norm": 0.23066608607769012, + "learning_rate": 1.8774750922786077e-05, + "loss": 1.2753, + "step": 6186 + }, + { + "epoch": 1.8427744373499135, + "grad_norm": 0.23946154117584229, + "learning_rate": 1.877428825451699e-05, + "loss": 1.2471, + "step": 6187 + }, + { + "epoch": 1.843072283549582, + "grad_norm": 0.2212175577878952, + "learning_rate": 1.877382550461295e-05, + "loss": 1.2615, + "step": 6188 + }, + { + "epoch": 1.8433701297492506, + "grad_norm": 0.2367374747991562, + "learning_rate": 1.8773362673078274e-05, + "loss": 1.2637, + "step": 6189 + }, + { + "epoch": 1.8436679759489194, + "grad_norm": 0.23277902603149414, + "learning_rate": 1.8772899759917257e-05, + "loss": 1.2641, + "step": 6190 + }, + { + "epoch": 1.8439658221485882, + "grad_norm": 0.22886283695697784, + "learning_rate": 1.877243676513421e-05, + "loss": 1.273, + "step": 6191 + }, + { + "epoch": 1.8442636683482567, + "grad_norm": 0.2127494364976883, + "learning_rate": 1.877197368873344e-05, + "loss": 1.2625, + "step": 6192 + }, + { + "epoch": 1.8445615145479253, + "grad_norm": 0.21837003529071808, + "learning_rate": 1.8771510530719253e-05, + "loss": 1.2756, + "step": 6193 + }, + { + "epoch": 1.844859360747594, + "grad_norm": 0.22271862626075745, + "learning_rate": 1.877104729109596e-05, + "loss": 1.2807, + "step": 6194 + }, + { + "epoch": 1.8451572069472626, + "grad_norm": 0.2360060065984726, + "learning_rate": 1.8770583969867876e-05, + "loss": 1.2673, + "step": 6195 + }, + { + "epoch": 1.8454550531469311, + "grad_norm": 0.22648479044437408, + "learning_rate": 1.87701205670393e-05, + "loss": 1.2547, + "step": 6196 + }, + { + "epoch": 1.8457528993466, + "grad_norm": 0.2194075882434845, + "learning_rate": 1.876965708261456e-05, + "loss": 1.2599, + "step": 6197 + }, + { + "epoch": 1.8460507455462687, + "grad_norm": 0.2420044094324112, + "learning_rate": 1.876919351659795e-05, + "loss": 1.2629, + "step": 6198 + }, + { + "epoch": 1.8463485917459372, + "grad_norm": 0.22396166622638702, + "learning_rate": 1.87687298689938e-05, + "loss": 1.2693, + "step": 6199 + }, + { + "epoch": 1.8466464379456058, + "grad_norm": 0.25090858340263367, + "learning_rate": 1.876826613980641e-05, + "loss": 1.2653, + "step": 6200 + }, + { + "epoch": 1.8469442841452746, + "grad_norm": 0.22225917875766754, + "learning_rate": 1.8767802329040105e-05, + "loss": 1.2446, + "step": 6201 + }, + { + "epoch": 1.8472421303449431, + "grad_norm": 0.2735970616340637, + "learning_rate": 1.8767338436699193e-05, + "loss": 1.2748, + "step": 6202 + }, + { + "epoch": 1.8475399765446117, + "grad_norm": 0.23733359575271606, + "learning_rate": 1.876687446278799e-05, + "loss": 1.294, + "step": 6203 + }, + { + "epoch": 1.8478378227442804, + "grad_norm": 0.24605488777160645, + "learning_rate": 1.8766410407310817e-05, + "loss": 1.2947, + "step": 6204 + }, + { + "epoch": 1.8481356689439492, + "grad_norm": 0.23347657918930054, + "learning_rate": 1.8765946270271986e-05, + "loss": 1.2678, + "step": 6205 + }, + { + "epoch": 1.8484335151436178, + "grad_norm": 0.241032212972641, + "learning_rate": 1.8765482051675823e-05, + "loss": 1.2678, + "step": 6206 + }, + { + "epoch": 1.8487313613432863, + "grad_norm": 0.2312811017036438, + "learning_rate": 1.8765017751526642e-05, + "loss": 1.2826, + "step": 6207 + }, + { + "epoch": 1.849029207542955, + "grad_norm": 0.2209426313638687, + "learning_rate": 1.8764553369828765e-05, + "loss": 1.2546, + "step": 6208 + }, + { + "epoch": 1.8493270537426236, + "grad_norm": 0.22778834402561188, + "learning_rate": 1.876408890658651e-05, + "loss": 1.2604, + "step": 6209 + }, + { + "epoch": 1.8496248999422922, + "grad_norm": 0.2315870225429535, + "learning_rate": 1.8763624361804198e-05, + "loss": 1.2737, + "step": 6210 + }, + { + "epoch": 1.849922746141961, + "grad_norm": 0.2249654233455658, + "learning_rate": 1.8763159735486153e-05, + "loss": 1.292, + "step": 6211 + }, + { + "epoch": 1.8502205923416297, + "grad_norm": 0.2213476002216339, + "learning_rate": 1.87626950276367e-05, + "loss": 1.2748, + "step": 6212 + }, + { + "epoch": 1.8505184385412983, + "grad_norm": 0.22676432132720947, + "learning_rate": 1.876223023826016e-05, + "loss": 1.2649, + "step": 6213 + }, + { + "epoch": 1.8508162847409668, + "grad_norm": 0.24635079503059387, + "learning_rate": 1.876176536736085e-05, + "loss": 1.2729, + "step": 6214 + }, + { + "epoch": 1.8511141309406356, + "grad_norm": 0.2461482584476471, + "learning_rate": 1.876130041494311e-05, + "loss": 1.2471, + "step": 6215 + }, + { + "epoch": 1.8514119771403041, + "grad_norm": 0.22454310953617096, + "learning_rate": 1.876083538101126e-05, + "loss": 1.2645, + "step": 6216 + }, + { + "epoch": 1.8517098233399727, + "grad_norm": 0.26606985926628113, + "learning_rate": 1.876037026556962e-05, + "loss": 1.2822, + "step": 6217 + }, + { + "epoch": 1.8520076695396415, + "grad_norm": 0.2288210541009903, + "learning_rate": 1.8759905068622523e-05, + "loss": 1.2579, + "step": 6218 + }, + { + "epoch": 1.8523055157393102, + "grad_norm": 0.21645639836788177, + "learning_rate": 1.8759439790174297e-05, + "loss": 1.2595, + "step": 6219 + }, + { + "epoch": 1.8526033619389788, + "grad_norm": 0.22343234717845917, + "learning_rate": 1.875897443022927e-05, + "loss": 1.2691, + "step": 6220 + }, + { + "epoch": 1.8529012081386473, + "grad_norm": 0.24432212114334106, + "learning_rate": 1.8758508988791773e-05, + "loss": 1.2626, + "step": 6221 + }, + { + "epoch": 1.853199054338316, + "grad_norm": 0.22526809573173523, + "learning_rate": 1.875804346586614e-05, + "loss": 1.2552, + "step": 6222 + }, + { + "epoch": 1.8534969005379847, + "grad_norm": 0.2272035777568817, + "learning_rate": 1.8757577861456687e-05, + "loss": 1.2497, + "step": 6223 + }, + { + "epoch": 1.8537947467376532, + "grad_norm": 0.24414905905723572, + "learning_rate": 1.8757112175567765e-05, + "loss": 1.2659, + "step": 6224 + }, + { + "epoch": 1.854092592937322, + "grad_norm": 0.21001029014587402, + "learning_rate": 1.8756646408203695e-05, + "loss": 1.2712, + "step": 6225 + }, + { + "epoch": 1.8543904391369908, + "grad_norm": 0.26856371760368347, + "learning_rate": 1.8756180559368812e-05, + "loss": 1.2719, + "step": 6226 + }, + { + "epoch": 1.8546882853366593, + "grad_norm": 0.2772649824619293, + "learning_rate": 1.8755714629067455e-05, + "loss": 1.2714, + "step": 6227 + }, + { + "epoch": 1.8549861315363279, + "grad_norm": 0.24141082167625427, + "learning_rate": 1.8755248617303955e-05, + "loss": 1.2489, + "step": 6228 + }, + { + "epoch": 1.8552839777359966, + "grad_norm": 0.3939058184623718, + "learning_rate": 1.875478252408265e-05, + "loss": 1.2715, + "step": 6229 + }, + { + "epoch": 1.8555818239356652, + "grad_norm": 0.247290700674057, + "learning_rate": 1.8754316349407875e-05, + "loss": 1.2684, + "step": 6230 + }, + { + "epoch": 1.8558796701353337, + "grad_norm": 0.2563334107398987, + "learning_rate": 1.8753850093283966e-05, + "loss": 1.2531, + "step": 6231 + }, + { + "epoch": 1.8561775163350025, + "grad_norm": 0.23085132241249084, + "learning_rate": 1.875338375571526e-05, + "loss": 1.2906, + "step": 6232 + }, + { + "epoch": 1.8564753625346713, + "grad_norm": 0.21739709377288818, + "learning_rate": 1.87529173367061e-05, + "loss": 1.2734, + "step": 6233 + }, + { + "epoch": 1.8567732087343398, + "grad_norm": 0.2239406406879425, + "learning_rate": 1.8752450836260823e-05, + "loss": 1.2716, + "step": 6234 + }, + { + "epoch": 1.8570710549340084, + "grad_norm": 0.25117847323417664, + "learning_rate": 1.8751984254383773e-05, + "loss": 1.2792, + "step": 6235 + }, + { + "epoch": 1.8573689011336771, + "grad_norm": 0.2235771268606186, + "learning_rate": 1.8751517591079284e-05, + "loss": 1.268, + "step": 6236 + }, + { + "epoch": 1.8576667473333457, + "grad_norm": 0.21495643258094788, + "learning_rate": 1.8751050846351703e-05, + "loss": 1.2744, + "step": 6237 + }, + { + "epoch": 1.8579645935330142, + "grad_norm": 0.2274235486984253, + "learning_rate": 1.875058402020537e-05, + "loss": 1.2881, + "step": 6238 + }, + { + "epoch": 1.858262439732683, + "grad_norm": 0.2241227924823761, + "learning_rate": 1.875011711264463e-05, + "loss": 1.2549, + "step": 6239 + }, + { + "epoch": 1.8585602859323518, + "grad_norm": 0.22675253450870514, + "learning_rate": 1.8749650123673828e-05, + "loss": 1.2695, + "step": 6240 + }, + { + "epoch": 1.8588581321320203, + "grad_norm": 0.21627160906791687, + "learning_rate": 1.8749183053297305e-05, + "loss": 1.2546, + "step": 6241 + }, + { + "epoch": 1.8591559783316889, + "grad_norm": 0.23366791009902954, + "learning_rate": 1.874871590151941e-05, + "loss": 1.2689, + "step": 6242 + }, + { + "epoch": 1.8594538245313577, + "grad_norm": 0.23596471548080444, + "learning_rate": 1.874824866834449e-05, + "loss": 1.2607, + "step": 6243 + }, + { + "epoch": 1.8597516707310264, + "grad_norm": 0.22348742187023163, + "learning_rate": 1.8747781353776885e-05, + "loss": 1.2695, + "step": 6244 + }, + { + "epoch": 1.8600495169306948, + "grad_norm": 0.2204684019088745, + "learning_rate": 1.8747313957820955e-05, + "loss": 1.2751, + "step": 6245 + }, + { + "epoch": 1.8603473631303635, + "grad_norm": 0.22575503587722778, + "learning_rate": 1.8746846480481036e-05, + "loss": 1.264, + "step": 6246 + }, + { + "epoch": 1.8606452093300323, + "grad_norm": 0.22473333775997162, + "learning_rate": 1.8746378921761484e-05, + "loss": 1.2665, + "step": 6247 + }, + { + "epoch": 1.8609430555297009, + "grad_norm": 0.23724524676799774, + "learning_rate": 1.874591128166665e-05, + "loss": 1.265, + "step": 6248 + }, + { + "epoch": 1.8612409017293694, + "grad_norm": 0.2172134816646576, + "learning_rate": 1.8745443560200885e-05, + "loss": 1.2743, + "step": 6249 + }, + { + "epoch": 1.8615387479290382, + "grad_norm": 0.21474622189998627, + "learning_rate": 1.8744975757368532e-05, + "loss": 1.2775, + "step": 6250 + }, + { + "epoch": 1.861836594128707, + "grad_norm": 0.21865083277225494, + "learning_rate": 1.8744507873173955e-05, + "loss": 1.2752, + "step": 6251 + }, + { + "epoch": 1.8621344403283753, + "grad_norm": 0.2198323756456375, + "learning_rate": 1.8744039907621504e-05, + "loss": 1.2606, + "step": 6252 + }, + { + "epoch": 1.862432286528044, + "grad_norm": 0.22255510091781616, + "learning_rate": 1.874357186071553e-05, + "loss": 1.27, + "step": 6253 + }, + { + "epoch": 1.8627301327277128, + "grad_norm": 0.23042772710323334, + "learning_rate": 1.8743103732460382e-05, + "loss": 1.2692, + "step": 6254 + }, + { + "epoch": 1.8630279789273814, + "grad_norm": 0.21695443987846375, + "learning_rate": 1.874263552286043e-05, + "loss": 1.2658, + "step": 6255 + }, + { + "epoch": 1.86332582512705, + "grad_norm": 0.21861781179904938, + "learning_rate": 1.8742167231920018e-05, + "loss": 1.2713, + "step": 6256 + }, + { + "epoch": 1.8636236713267187, + "grad_norm": 0.22876958549022675, + "learning_rate": 1.874169885964351e-05, + "loss": 1.2901, + "step": 6257 + }, + { + "epoch": 1.8639215175263875, + "grad_norm": 0.21717725694179535, + "learning_rate": 1.874123040603526e-05, + "loss": 1.2666, + "step": 6258 + }, + { + "epoch": 1.864219363726056, + "grad_norm": 0.23413631319999695, + "learning_rate": 1.8740761871099625e-05, + "loss": 1.2719, + "step": 6259 + }, + { + "epoch": 1.8645172099257246, + "grad_norm": 0.2251828908920288, + "learning_rate": 1.874029325484097e-05, + "loss": 1.2702, + "step": 6260 + }, + { + "epoch": 1.8648150561253933, + "grad_norm": 0.22034551203250885, + "learning_rate": 1.873982455726365e-05, + "loss": 1.2629, + "step": 6261 + }, + { + "epoch": 1.8651129023250619, + "grad_norm": 0.2504265308380127, + "learning_rate": 1.8739355778372025e-05, + "loss": 1.2801, + "step": 6262 + }, + { + "epoch": 1.8654107485247304, + "grad_norm": 0.22186152637004852, + "learning_rate": 1.8738886918170463e-05, + "loss": 1.2736, + "step": 6263 + }, + { + "epoch": 1.8657085947243992, + "grad_norm": 0.22834238409996033, + "learning_rate": 1.873841797666332e-05, + "loss": 1.2809, + "step": 6264 + }, + { + "epoch": 1.866006440924068, + "grad_norm": 0.22512505948543549, + "learning_rate": 1.873794895385496e-05, + "loss": 1.2801, + "step": 6265 + }, + { + "epoch": 1.8663042871237365, + "grad_norm": 0.22817987203598022, + "learning_rate": 1.873747984974975e-05, + "loss": 1.2527, + "step": 6266 + }, + { + "epoch": 1.866602133323405, + "grad_norm": 0.22437958419322968, + "learning_rate": 1.8737010664352048e-05, + "loss": 1.2597, + "step": 6267 + }, + { + "epoch": 1.8668999795230738, + "grad_norm": 0.21601814031600952, + "learning_rate": 1.8736541397666226e-05, + "loss": 1.2644, + "step": 6268 + }, + { + "epoch": 1.8671978257227424, + "grad_norm": 0.23356673121452332, + "learning_rate": 1.8736072049696648e-05, + "loss": 1.265, + "step": 6269 + }, + { + "epoch": 1.867495671922411, + "grad_norm": 0.2248130738735199, + "learning_rate": 1.8735602620447676e-05, + "loss": 1.2676, + "step": 6270 + }, + { + "epoch": 1.8677935181220797, + "grad_norm": 0.2332877218723297, + "learning_rate": 1.873513310992369e-05, + "loss": 1.2774, + "step": 6271 + }, + { + "epoch": 1.8680913643217485, + "grad_norm": 0.23454421758651733, + "learning_rate": 1.8734663518129045e-05, + "loss": 1.2721, + "step": 6272 + }, + { + "epoch": 1.868389210521417, + "grad_norm": 0.2268686145544052, + "learning_rate": 1.873419384506811e-05, + "loss": 1.2748, + "step": 6273 + }, + { + "epoch": 1.8686870567210856, + "grad_norm": 0.23339509963989258, + "learning_rate": 1.8733724090745268e-05, + "loss": 1.2777, + "step": 6274 + }, + { + "epoch": 1.8689849029207544, + "grad_norm": 0.24625654518604279, + "learning_rate": 1.8733254255164874e-05, + "loss": 1.2638, + "step": 6275 + }, + { + "epoch": 1.869282749120423, + "grad_norm": 0.22551506757736206, + "learning_rate": 1.8732784338331313e-05, + "loss": 1.2716, + "step": 6276 + }, + { + "epoch": 1.8695805953200915, + "grad_norm": 0.25346723198890686, + "learning_rate": 1.8732314340248946e-05, + "loss": 1.2674, + "step": 6277 + }, + { + "epoch": 1.8698784415197602, + "grad_norm": 0.23279167711734772, + "learning_rate": 1.8731844260922153e-05, + "loss": 1.2584, + "step": 6278 + }, + { + "epoch": 1.870176287719429, + "grad_norm": 0.25668370723724365, + "learning_rate": 1.8731374100355302e-05, + "loss": 1.2447, + "step": 6279 + }, + { + "epoch": 1.8704741339190976, + "grad_norm": 0.25280889868736267, + "learning_rate": 1.873090385855277e-05, + "loss": 1.2716, + "step": 6280 + }, + { + "epoch": 1.870771980118766, + "grad_norm": 0.24993520975112915, + "learning_rate": 1.873043353551894e-05, + "loss": 1.2531, + "step": 6281 + }, + { + "epoch": 1.8710698263184349, + "grad_norm": 0.22116729617118835, + "learning_rate": 1.872996313125817e-05, + "loss": 1.2654, + "step": 6282 + }, + { + "epoch": 1.8713676725181034, + "grad_norm": 0.23692280054092407, + "learning_rate": 1.872949264577485e-05, + "loss": 1.2655, + "step": 6283 + }, + { + "epoch": 1.871665518717772, + "grad_norm": 0.2352658212184906, + "learning_rate": 1.8729022079073358e-05, + "loss": 1.2655, + "step": 6284 + }, + { + "epoch": 1.8719633649174408, + "grad_norm": 0.21809260547161102, + "learning_rate": 1.8728551431158068e-05, + "loss": 1.2623, + "step": 6285 + }, + { + "epoch": 1.8722612111171095, + "grad_norm": 0.23344051837921143, + "learning_rate": 1.8728080702033354e-05, + "loss": 1.2654, + "step": 6286 + }, + { + "epoch": 1.872559057316778, + "grad_norm": 0.22276152670383453, + "learning_rate": 1.8727609891703603e-05, + "loss": 1.2576, + "step": 6287 + }, + { + "epoch": 1.8728569035164466, + "grad_norm": 0.22614987194538116, + "learning_rate": 1.8727139000173197e-05, + "loss": 1.2522, + "step": 6288 + }, + { + "epoch": 1.8731547497161154, + "grad_norm": 0.21375888586044312, + "learning_rate": 1.8726668027446507e-05, + "loss": 1.2701, + "step": 6289 + }, + { + "epoch": 1.873452595915784, + "grad_norm": 0.2416151911020279, + "learning_rate": 1.8726196973527922e-05, + "loss": 1.281, + "step": 6290 + }, + { + "epoch": 1.8737504421154525, + "grad_norm": 0.24732106924057007, + "learning_rate": 1.8725725838421825e-05, + "loss": 1.269, + "step": 6291 + }, + { + "epoch": 1.8740482883151213, + "grad_norm": 0.23814208805561066, + "learning_rate": 1.8725254622132597e-05, + "loss": 1.2635, + "step": 6292 + }, + { + "epoch": 1.87434613451479, + "grad_norm": 0.2380426526069641, + "learning_rate": 1.8724783324664626e-05, + "loss": 1.2612, + "step": 6293 + }, + { + "epoch": 1.8746439807144586, + "grad_norm": 0.23904134333133698, + "learning_rate": 1.8724311946022293e-05, + "loss": 1.2597, + "step": 6294 + }, + { + "epoch": 1.8749418269141271, + "grad_norm": 0.21786151826381683, + "learning_rate": 1.8723840486209984e-05, + "loss": 1.2771, + "step": 6295 + }, + { + "epoch": 1.875239673113796, + "grad_norm": 0.2555582821369171, + "learning_rate": 1.872336894523209e-05, + "loss": 1.2563, + "step": 6296 + }, + { + "epoch": 1.8755375193134645, + "grad_norm": 0.2183697521686554, + "learning_rate": 1.8722897323092988e-05, + "loss": 1.2642, + "step": 6297 + }, + { + "epoch": 1.875835365513133, + "grad_norm": 0.22678542137145996, + "learning_rate": 1.8722425619797074e-05, + "loss": 1.2594, + "step": 6298 + }, + { + "epoch": 1.8761332117128018, + "grad_norm": 0.2133307158946991, + "learning_rate": 1.8721953835348734e-05, + "loss": 1.2621, + "step": 6299 + }, + { + "epoch": 1.8764310579124706, + "grad_norm": 0.23420582711696625, + "learning_rate": 1.8721481969752363e-05, + "loss": 1.2645, + "step": 6300 + }, + { + "epoch": 1.876728904112139, + "grad_norm": 0.2238420844078064, + "learning_rate": 1.8721010023012343e-05, + "loss": 1.2645, + "step": 6301 + }, + { + "epoch": 1.8770267503118077, + "grad_norm": 0.22619526088237762, + "learning_rate": 1.872053799513307e-05, + "loss": 1.2644, + "step": 6302 + }, + { + "epoch": 1.8773245965114764, + "grad_norm": 0.2313038855791092, + "learning_rate": 1.872006588611893e-05, + "loss": 1.2555, + "step": 6303 + }, + { + "epoch": 1.877622442711145, + "grad_norm": 0.22440804541110992, + "learning_rate": 1.871959369597432e-05, + "loss": 1.2806, + "step": 6304 + }, + { + "epoch": 1.8779202889108135, + "grad_norm": 0.2729954421520233, + "learning_rate": 1.871912142470364e-05, + "loss": 1.269, + "step": 6305 + }, + { + "epoch": 1.8782181351104823, + "grad_norm": 0.21770896017551422, + "learning_rate": 1.8718649072311272e-05, + "loss": 1.2796, + "step": 6306 + }, + { + "epoch": 1.878515981310151, + "grad_norm": 0.22921176254749298, + "learning_rate": 1.8718176638801614e-05, + "loss": 1.2685, + "step": 6307 + }, + { + "epoch": 1.8788138275098196, + "grad_norm": 0.2256852388381958, + "learning_rate": 1.8717704124179065e-05, + "loss": 1.2633, + "step": 6308 + }, + { + "epoch": 1.8791116737094882, + "grad_norm": 0.2169460952281952, + "learning_rate": 1.8717231528448015e-05, + "loss": 1.2879, + "step": 6309 + }, + { + "epoch": 1.879409519909157, + "grad_norm": 0.2836572825908661, + "learning_rate": 1.8716758851612873e-05, + "loss": 1.2723, + "step": 6310 + }, + { + "epoch": 1.8797073661088257, + "grad_norm": 0.22437360882759094, + "learning_rate": 1.8716286093678023e-05, + "loss": 1.2715, + "step": 6311 + }, + { + "epoch": 1.880005212308494, + "grad_norm": 0.22923672199249268, + "learning_rate": 1.8715813254647872e-05, + "loss": 1.2609, + "step": 6312 + }, + { + "epoch": 1.8803030585081628, + "grad_norm": 0.22592346370220184, + "learning_rate": 1.8715340334526812e-05, + "loss": 1.2837, + "step": 6313 + }, + { + "epoch": 1.8806009047078316, + "grad_norm": 0.21839545667171478, + "learning_rate": 1.871486733331925e-05, + "loss": 1.2558, + "step": 6314 + }, + { + "epoch": 1.8808987509075001, + "grad_norm": 0.22045713663101196, + "learning_rate": 1.871439425102959e-05, + "loss": 1.2804, + "step": 6315 + }, + { + "epoch": 1.8811965971071687, + "grad_norm": 0.23273544013500214, + "learning_rate": 1.8713921087662223e-05, + "loss": 1.2772, + "step": 6316 + }, + { + "epoch": 1.8814944433068375, + "grad_norm": 0.22192887961864471, + "learning_rate": 1.871344784322155e-05, + "loss": 1.2484, + "step": 6317 + }, + { + "epoch": 1.8817922895065062, + "grad_norm": 0.23370422422885895, + "learning_rate": 1.871297451771199e-05, + "loss": 1.2679, + "step": 6318 + }, + { + "epoch": 1.8820901357061746, + "grad_norm": 0.22910785675048828, + "learning_rate": 1.8712501111137933e-05, + "loss": 1.2686, + "step": 6319 + }, + { + "epoch": 1.8823879819058433, + "grad_norm": 0.22994323074817657, + "learning_rate": 1.8712027623503785e-05, + "loss": 1.2549, + "step": 6320 + }, + { + "epoch": 1.882685828105512, + "grad_norm": 0.2222004234790802, + "learning_rate": 1.8711554054813955e-05, + "loss": 1.2761, + "step": 6321 + }, + { + "epoch": 1.8829836743051807, + "grad_norm": 0.22731554508209229, + "learning_rate": 1.871108040507285e-05, + "loss": 1.2623, + "step": 6322 + }, + { + "epoch": 1.8832815205048492, + "grad_norm": 0.22629621624946594, + "learning_rate": 1.8710606674284874e-05, + "loss": 1.2814, + "step": 6323 + }, + { + "epoch": 1.883579366704518, + "grad_norm": 0.22215241193771362, + "learning_rate": 1.8710132862454432e-05, + "loss": 1.2745, + "step": 6324 + }, + { + "epoch": 1.8838772129041867, + "grad_norm": 0.2270025908946991, + "learning_rate": 1.8709658969585936e-05, + "loss": 1.2704, + "step": 6325 + }, + { + "epoch": 1.8841750591038553, + "grad_norm": 0.24180828034877777, + "learning_rate": 1.8709184995683797e-05, + "loss": 1.2672, + "step": 6326 + }, + { + "epoch": 1.8844729053035238, + "grad_norm": 0.23187457025051117, + "learning_rate": 1.870871094075242e-05, + "loss": 1.2845, + "step": 6327 + }, + { + "epoch": 1.8847707515031926, + "grad_norm": 0.2269025444984436, + "learning_rate": 1.870823680479622e-05, + "loss": 1.263, + "step": 6328 + }, + { + "epoch": 1.8850685977028612, + "grad_norm": 0.21496471762657166, + "learning_rate": 1.87077625878196e-05, + "loss": 1.2658, + "step": 6329 + }, + { + "epoch": 1.8853664439025297, + "grad_norm": 0.26027464866638184, + "learning_rate": 1.8707288289826983e-05, + "loss": 1.262, + "step": 6330 + }, + { + "epoch": 1.8856642901021985, + "grad_norm": 0.23373070359230042, + "learning_rate": 1.8706813910822776e-05, + "loss": 1.2617, + "step": 6331 + }, + { + "epoch": 1.8859621363018673, + "grad_norm": 0.23181870579719543, + "learning_rate": 1.8706339450811396e-05, + "loss": 1.2688, + "step": 6332 + }, + { + "epoch": 1.8862599825015358, + "grad_norm": 0.22112888097763062, + "learning_rate": 1.870586490979725e-05, + "loss": 1.2649, + "step": 6333 + }, + { + "epoch": 1.8865578287012044, + "grad_norm": 0.2553446590900421, + "learning_rate": 1.8705390287784758e-05, + "loss": 1.2698, + "step": 6334 + }, + { + "epoch": 1.8868556749008731, + "grad_norm": 0.22858518362045288, + "learning_rate": 1.870491558477834e-05, + "loss": 1.2796, + "step": 6335 + }, + { + "epoch": 1.8871535211005417, + "grad_norm": 0.2378801852464676, + "learning_rate": 1.8704440800782403e-05, + "loss": 1.2609, + "step": 6336 + }, + { + "epoch": 1.8874513673002102, + "grad_norm": 0.22767691314220428, + "learning_rate": 1.8703965935801373e-05, + "loss": 1.2714, + "step": 6337 + }, + { + "epoch": 1.887749213499879, + "grad_norm": 0.22173963487148285, + "learning_rate": 1.8703490989839664e-05, + "loss": 1.2564, + "step": 6338 + }, + { + "epoch": 1.8880470596995478, + "grad_norm": 0.26524287462234497, + "learning_rate": 1.8703015962901692e-05, + "loss": 1.2921, + "step": 6339 + }, + { + "epoch": 1.8883449058992163, + "grad_norm": 0.23315204679965973, + "learning_rate": 1.8702540854991884e-05, + "loss": 1.2585, + "step": 6340 + }, + { + "epoch": 1.8886427520988849, + "grad_norm": 0.22819599509239197, + "learning_rate": 1.8702065666114658e-05, + "loss": 1.2513, + "step": 6341 + }, + { + "epoch": 1.8889405982985537, + "grad_norm": 0.21942868828773499, + "learning_rate": 1.870159039627443e-05, + "loss": 1.2547, + "step": 6342 + }, + { + "epoch": 1.8892384444982222, + "grad_norm": 0.2668503522872925, + "learning_rate": 1.8701115045475627e-05, + "loss": 1.2617, + "step": 6343 + }, + { + "epoch": 1.8895362906978908, + "grad_norm": 0.2508845329284668, + "learning_rate": 1.8700639613722667e-05, + "loss": 1.2818, + "step": 6344 + }, + { + "epoch": 1.8898341368975595, + "grad_norm": 0.23692795634269714, + "learning_rate": 1.870016410101998e-05, + "loss": 1.261, + "step": 6345 + }, + { + "epoch": 1.8901319830972283, + "grad_norm": 0.30562636256217957, + "learning_rate": 1.8699688507371987e-05, + "loss": 1.2886, + "step": 6346 + }, + { + "epoch": 1.8904298292968968, + "grad_norm": 0.23118507862091064, + "learning_rate": 1.869921283278311e-05, + "loss": 1.2845, + "step": 6347 + }, + { + "epoch": 1.8907276754965654, + "grad_norm": 0.2323632538318634, + "learning_rate": 1.8698737077257776e-05, + "loss": 1.259, + "step": 6348 + }, + { + "epoch": 1.8910255216962342, + "grad_norm": 0.22015786170959473, + "learning_rate": 1.869826124080041e-05, + "loss": 1.2548, + "step": 6349 + }, + { + "epoch": 1.8913233678959027, + "grad_norm": 0.2369198501110077, + "learning_rate": 1.869778532341545e-05, + "loss": 1.2668, + "step": 6350 + }, + { + "epoch": 1.8916212140955713, + "grad_norm": 0.22735393047332764, + "learning_rate": 1.8697309325107308e-05, + "loss": 1.2682, + "step": 6351 + }, + { + "epoch": 1.89191906029524, + "grad_norm": 0.2261963188648224, + "learning_rate": 1.869683324588042e-05, + "loss": 1.2662, + "step": 6352 + }, + { + "epoch": 1.8922169064949088, + "grad_norm": 0.23152479529380798, + "learning_rate": 1.869635708573922e-05, + "loss": 1.2597, + "step": 6353 + }, + { + "epoch": 1.8925147526945774, + "grad_norm": 0.2275117188692093, + "learning_rate": 1.8695880844688133e-05, + "loss": 1.2793, + "step": 6354 + }, + { + "epoch": 1.892812598894246, + "grad_norm": 0.2778516411781311, + "learning_rate": 1.869540452273159e-05, + "loss": 1.2615, + "step": 6355 + }, + { + "epoch": 1.8931104450939147, + "grad_norm": 0.22918972373008728, + "learning_rate": 1.869492811987402e-05, + "loss": 1.2645, + "step": 6356 + }, + { + "epoch": 1.8934082912935832, + "grad_norm": 0.241749107837677, + "learning_rate": 1.8694451636119858e-05, + "loss": 1.2554, + "step": 6357 + }, + { + "epoch": 1.8937061374932518, + "grad_norm": 0.2322414517402649, + "learning_rate": 1.8693975071473537e-05, + "loss": 1.2894, + "step": 6358 + }, + { + "epoch": 1.8940039836929206, + "grad_norm": 0.297078937292099, + "learning_rate": 1.8693498425939497e-05, + "loss": 1.2644, + "step": 6359 + }, + { + "epoch": 1.8943018298925893, + "grad_norm": 0.24310941994190216, + "learning_rate": 1.8693021699522162e-05, + "loss": 1.2822, + "step": 6360 + }, + { + "epoch": 1.8945996760922579, + "grad_norm": 0.24547787010669708, + "learning_rate": 1.8692544892225975e-05, + "loss": 1.2514, + "step": 6361 + }, + { + "epoch": 1.8948975222919264, + "grad_norm": 0.23121975362300873, + "learning_rate": 1.869206800405537e-05, + "loss": 1.2825, + "step": 6362 + }, + { + "epoch": 1.8951953684915952, + "grad_norm": 0.2518481910228729, + "learning_rate": 1.869159103501478e-05, + "loss": 1.2399, + "step": 6363 + }, + { + "epoch": 1.8954932146912638, + "grad_norm": 0.2387373447418213, + "learning_rate": 1.8691113985108652e-05, + "loss": 1.2723, + "step": 6364 + }, + { + "epoch": 1.8957910608909323, + "grad_norm": 0.23863528668880463, + "learning_rate": 1.8690636854341414e-05, + "loss": 1.2809, + "step": 6365 + }, + { + "epoch": 1.896088907090601, + "grad_norm": 0.23983988165855408, + "learning_rate": 1.869015964271751e-05, + "loss": 1.2794, + "step": 6366 + }, + { + "epoch": 1.8963867532902698, + "grad_norm": 0.22375057637691498, + "learning_rate": 1.8689682350241384e-05, + "loss": 1.2825, + "step": 6367 + }, + { + "epoch": 1.8966845994899384, + "grad_norm": 0.2556236982345581, + "learning_rate": 1.868920497691747e-05, + "loss": 1.2732, + "step": 6368 + }, + { + "epoch": 1.896982445689607, + "grad_norm": 0.2415783554315567, + "learning_rate": 1.868872752275021e-05, + "loss": 1.2763, + "step": 6369 + }, + { + "epoch": 1.8972802918892757, + "grad_norm": 0.23676668107509613, + "learning_rate": 1.868824998774405e-05, + "loss": 1.2794, + "step": 6370 + }, + { + "epoch": 1.8975781380889443, + "grad_norm": 0.22184059023857117, + "learning_rate": 1.8687772371903427e-05, + "loss": 1.2596, + "step": 6371 + }, + { + "epoch": 1.8978759842886128, + "grad_norm": 0.23676683008670807, + "learning_rate": 1.8687294675232795e-05, + "loss": 1.2698, + "step": 6372 + }, + { + "epoch": 1.8981738304882816, + "grad_norm": 0.23678049445152283, + "learning_rate": 1.868681689773659e-05, + "loss": 1.2804, + "step": 6373 + }, + { + "epoch": 1.8984716766879504, + "grad_norm": 0.22727486491203308, + "learning_rate": 1.8686339039419254e-05, + "loss": 1.2666, + "step": 6374 + }, + { + "epoch": 1.898769522887619, + "grad_norm": 0.2718203663825989, + "learning_rate": 1.8685861100285242e-05, + "loss": 1.2587, + "step": 6375 + }, + { + "epoch": 1.8990673690872875, + "grad_norm": 0.2525746822357178, + "learning_rate": 1.8685383080339e-05, + "loss": 1.2758, + "step": 6376 + }, + { + "epoch": 1.8993652152869562, + "grad_norm": 0.24332743883132935, + "learning_rate": 1.868490497958497e-05, + "loss": 1.2773, + "step": 6377 + }, + { + "epoch": 1.899663061486625, + "grad_norm": 0.3178490698337555, + "learning_rate": 1.86844267980276e-05, + "loss": 1.2591, + "step": 6378 + }, + { + "epoch": 1.8999609076862933, + "grad_norm": 0.23794011771678925, + "learning_rate": 1.8683948535671344e-05, + "loss": 1.2536, + "step": 6379 + }, + { + "epoch": 1.900258753885962, + "grad_norm": 0.2222364842891693, + "learning_rate": 1.868347019252065e-05, + "loss": 1.2539, + "step": 6380 + }, + { + "epoch": 1.9005566000856309, + "grad_norm": 0.22782839834690094, + "learning_rate": 1.8682991768579965e-05, + "loss": 1.2592, + "step": 6381 + }, + { + "epoch": 1.9008544462852994, + "grad_norm": 0.2816076874732971, + "learning_rate": 1.8682513263853743e-05, + "loss": 1.2584, + "step": 6382 + }, + { + "epoch": 1.901152292484968, + "grad_norm": 0.22128011286258698, + "learning_rate": 1.868203467834644e-05, + "loss": 1.27, + "step": 6383 + }, + { + "epoch": 1.9014501386846367, + "grad_norm": 0.26034650206565857, + "learning_rate": 1.8681556012062502e-05, + "loss": 1.2779, + "step": 6384 + }, + { + "epoch": 1.9017479848843055, + "grad_norm": 0.22287800908088684, + "learning_rate": 1.868107726500638e-05, + "loss": 1.2539, + "step": 6385 + }, + { + "epoch": 1.9020458310839738, + "grad_norm": 0.2690623998641968, + "learning_rate": 1.8680598437182537e-05, + "loss": 1.2748, + "step": 6386 + }, + { + "epoch": 1.9023436772836426, + "grad_norm": 0.22985732555389404, + "learning_rate": 1.8680119528595427e-05, + "loss": 1.2684, + "step": 6387 + }, + { + "epoch": 1.9026415234833114, + "grad_norm": 0.26795485615730286, + "learning_rate": 1.8679640539249498e-05, + "loss": 1.2669, + "step": 6388 + }, + { + "epoch": 1.90293936968298, + "grad_norm": 0.23416420817375183, + "learning_rate": 1.8679161469149214e-05, + "loss": 1.2389, + "step": 6389 + }, + { + "epoch": 1.9032372158826485, + "grad_norm": 0.3261023759841919, + "learning_rate": 1.8678682318299032e-05, + "loss": 1.2779, + "step": 6390 + }, + { + "epoch": 1.9035350620823173, + "grad_norm": 0.2772911787033081, + "learning_rate": 1.8678203086703402e-05, + "loss": 1.2659, + "step": 6391 + }, + { + "epoch": 1.903832908281986, + "grad_norm": 0.27153968811035156, + "learning_rate": 1.8677723774366792e-05, + "loss": 1.2812, + "step": 6392 + }, + { + "epoch": 1.9041307544816546, + "grad_norm": 0.22545918822288513, + "learning_rate": 1.8677244381293655e-05, + "loss": 1.2555, + "step": 6393 + }, + { + "epoch": 1.9044286006813231, + "grad_norm": 0.3606754541397095, + "learning_rate": 1.8676764907488455e-05, + "loss": 1.271, + "step": 6394 + }, + { + "epoch": 1.904726446880992, + "grad_norm": 0.23116520047187805, + "learning_rate": 1.8676285352955653e-05, + "loss": 1.265, + "step": 6395 + }, + { + "epoch": 1.9050242930806605, + "grad_norm": 0.21856485307216644, + "learning_rate": 1.8675805717699705e-05, + "loss": 1.2694, + "step": 6396 + }, + { + "epoch": 1.905322139280329, + "grad_norm": 0.22345422208309174, + "learning_rate": 1.8675326001725086e-05, + "loss": 1.2642, + "step": 6397 + }, + { + "epoch": 1.9056199854799978, + "grad_norm": 0.22112628817558289, + "learning_rate": 1.8674846205036243e-05, + "loss": 1.2568, + "step": 6398 + }, + { + "epoch": 1.9059178316796666, + "grad_norm": 0.21793977916240692, + "learning_rate": 1.8674366327637652e-05, + "loss": 1.2516, + "step": 6399 + }, + { + "epoch": 1.906215677879335, + "grad_norm": 0.22539812326431274, + "learning_rate": 1.8673886369533772e-05, + "loss": 1.2735, + "step": 6400 + }, + { + "epoch": 1.9065135240790037, + "grad_norm": 0.23870933055877686, + "learning_rate": 1.8673406330729072e-05, + "loss": 1.2719, + "step": 6401 + }, + { + "epoch": 1.9068113702786724, + "grad_norm": 0.22308455407619476, + "learning_rate": 1.8672926211228018e-05, + "loss": 1.2672, + "step": 6402 + }, + { + "epoch": 1.907109216478341, + "grad_norm": 0.2290334850549698, + "learning_rate": 1.8672446011035074e-05, + "loss": 1.2778, + "step": 6403 + }, + { + "epoch": 1.9074070626780095, + "grad_norm": 0.22054670751094818, + "learning_rate": 1.867196573015471e-05, + "loss": 1.2746, + "step": 6404 + }, + { + "epoch": 1.9077049088776783, + "grad_norm": 0.22691991925239563, + "learning_rate": 1.8671485368591393e-05, + "loss": 1.2681, + "step": 6405 + }, + { + "epoch": 1.908002755077347, + "grad_norm": 0.2267318069934845, + "learning_rate": 1.8671004926349592e-05, + "loss": 1.2529, + "step": 6406 + }, + { + "epoch": 1.9083006012770156, + "grad_norm": 0.22116418182849884, + "learning_rate": 1.867052440343378e-05, + "loss": 1.2737, + "step": 6407 + }, + { + "epoch": 1.9085984474766842, + "grad_norm": 0.2230682671070099, + "learning_rate": 1.867004379984842e-05, + "loss": 1.2628, + "step": 6408 + }, + { + "epoch": 1.908896293676353, + "grad_norm": 0.226077601313591, + "learning_rate": 1.8669563115598e-05, + "loss": 1.284, + "step": 6409 + }, + { + "epoch": 1.9091941398760215, + "grad_norm": 0.21638865768909454, + "learning_rate": 1.8669082350686973e-05, + "loss": 1.2772, + "step": 6410 + }, + { + "epoch": 1.90949198607569, + "grad_norm": 0.23876522481441498, + "learning_rate": 1.866860150511982e-05, + "loss": 1.2668, + "step": 6411 + }, + { + "epoch": 1.9097898322753588, + "grad_norm": 0.22125433385372162, + "learning_rate": 1.8668120578901022e-05, + "loss": 1.2522, + "step": 6412 + }, + { + "epoch": 1.9100876784750276, + "grad_norm": 0.22033749520778656, + "learning_rate": 1.866763957203504e-05, + "loss": 1.2667, + "step": 6413 + }, + { + "epoch": 1.9103855246746961, + "grad_norm": 0.22628949582576752, + "learning_rate": 1.866715848452636e-05, + "loss": 1.2686, + "step": 6414 + }, + { + "epoch": 1.9106833708743647, + "grad_norm": 0.25021255016326904, + "learning_rate": 1.8666677316379453e-05, + "loss": 1.2807, + "step": 6415 + }, + { + "epoch": 1.9109812170740335, + "grad_norm": 0.2332649976015091, + "learning_rate": 1.8666196067598793e-05, + "loss": 1.285, + "step": 6416 + }, + { + "epoch": 1.911279063273702, + "grad_norm": 0.2220938801765442, + "learning_rate": 1.8665714738188866e-05, + "loss": 1.257, + "step": 6417 + }, + { + "epoch": 1.9115769094733706, + "grad_norm": 0.2171488106250763, + "learning_rate": 1.8665233328154142e-05, + "loss": 1.2579, + "step": 6418 + }, + { + "epoch": 1.9118747556730393, + "grad_norm": 0.2283460795879364, + "learning_rate": 1.8664751837499104e-05, + "loss": 1.2701, + "step": 6419 + }, + { + "epoch": 1.912172601872708, + "grad_norm": 0.2154398411512375, + "learning_rate": 1.866427026622823e-05, + "loss": 1.2668, + "step": 6420 + }, + { + "epoch": 1.9124704480723766, + "grad_norm": 0.21827664971351624, + "learning_rate": 1.8663788614346003e-05, + "loss": 1.2886, + "step": 6421 + }, + { + "epoch": 1.9127682942720452, + "grad_norm": 0.22104908525943756, + "learning_rate": 1.8663306881856906e-05, + "loss": 1.2813, + "step": 6422 + }, + { + "epoch": 1.913066140471714, + "grad_norm": 0.2262624204158783, + "learning_rate": 1.866282506876541e-05, + "loss": 1.2605, + "step": 6423 + }, + { + "epoch": 1.9133639866713825, + "grad_norm": 0.23348413407802582, + "learning_rate": 1.866234317507601e-05, + "loss": 1.2662, + "step": 6424 + }, + { + "epoch": 1.913661832871051, + "grad_norm": 0.22634485363960266, + "learning_rate": 1.8661861200793187e-05, + "loss": 1.2569, + "step": 6425 + }, + { + "epoch": 1.9139596790707198, + "grad_norm": 0.23128573596477509, + "learning_rate": 1.866137914592142e-05, + "loss": 1.2505, + "step": 6426 + }, + { + "epoch": 1.9142575252703886, + "grad_norm": 0.2224595546722412, + "learning_rate": 1.86608970104652e-05, + "loss": 1.2555, + "step": 6427 + }, + { + "epoch": 1.9145553714700572, + "grad_norm": 0.21892890334129333, + "learning_rate": 1.8660414794429003e-05, + "loss": 1.2743, + "step": 6428 + }, + { + "epoch": 1.9148532176697257, + "grad_norm": 0.24438150227069855, + "learning_rate": 1.8659932497817328e-05, + "loss": 1.299, + "step": 6429 + }, + { + "epoch": 1.9151510638693945, + "grad_norm": 0.21374495327472687, + "learning_rate": 1.8659450120634656e-05, + "loss": 1.261, + "step": 6430 + }, + { + "epoch": 1.915448910069063, + "grad_norm": 0.24549736082553864, + "learning_rate": 1.8658967662885472e-05, + "loss": 1.2712, + "step": 6431 + }, + { + "epoch": 1.9157467562687316, + "grad_norm": 0.2284165322780609, + "learning_rate": 1.8658485124574274e-05, + "loss": 1.2549, + "step": 6432 + }, + { + "epoch": 1.9160446024684004, + "grad_norm": 0.2533683478832245, + "learning_rate": 1.8658002505705543e-05, + "loss": 1.2804, + "step": 6433 + }, + { + "epoch": 1.9163424486680691, + "grad_norm": 0.21719850599765778, + "learning_rate": 1.8657519806283772e-05, + "loss": 1.2756, + "step": 6434 + }, + { + "epoch": 1.9166402948677377, + "grad_norm": 0.23185434937477112, + "learning_rate": 1.865703702631345e-05, + "loss": 1.2539, + "step": 6435 + }, + { + "epoch": 1.9169381410674062, + "grad_norm": 0.2468571960926056, + "learning_rate": 1.8656554165799074e-05, + "loss": 1.2479, + "step": 6436 + }, + { + "epoch": 1.917235987267075, + "grad_norm": 0.24522361159324646, + "learning_rate": 1.8656071224745132e-05, + "loss": 1.2761, + "step": 6437 + }, + { + "epoch": 1.9175338334667438, + "grad_norm": 0.22193396091461182, + "learning_rate": 1.8655588203156118e-05, + "loss": 1.2677, + "step": 6438 + }, + { + "epoch": 1.917831679666412, + "grad_norm": 0.2236398607492447, + "learning_rate": 1.8655105101036523e-05, + "loss": 1.2674, + "step": 6439 + }, + { + "epoch": 1.9181295258660809, + "grad_norm": 0.24252815544605255, + "learning_rate": 1.865462191839085e-05, + "loss": 1.2843, + "step": 6440 + }, + { + "epoch": 1.9184273720657496, + "grad_norm": 0.24570755660533905, + "learning_rate": 1.865413865522359e-05, + "loss": 1.2619, + "step": 6441 + }, + { + "epoch": 1.9187252182654182, + "grad_norm": 0.22457174956798553, + "learning_rate": 1.8653655311539234e-05, + "loss": 1.2696, + "step": 6442 + }, + { + "epoch": 1.9190230644650867, + "grad_norm": 0.2523292303085327, + "learning_rate": 1.8653171887342287e-05, + "loss": 1.2577, + "step": 6443 + }, + { + "epoch": 1.9193209106647555, + "grad_norm": 0.2227764129638672, + "learning_rate": 1.8652688382637243e-05, + "loss": 1.2581, + "step": 6444 + }, + { + "epoch": 1.9196187568644243, + "grad_norm": 0.25271645188331604, + "learning_rate": 1.8652204797428602e-05, + "loss": 1.2701, + "step": 6445 + }, + { + "epoch": 1.9199166030640926, + "grad_norm": 0.23058974742889404, + "learning_rate": 1.8651721131720857e-05, + "loss": 1.2532, + "step": 6446 + }, + { + "epoch": 1.9202144492637614, + "grad_norm": 0.2599303126335144, + "learning_rate": 1.865123738551852e-05, + "loss": 1.264, + "step": 6447 + }, + { + "epoch": 1.9205122954634302, + "grad_norm": 0.23858413100242615, + "learning_rate": 1.8650753558826083e-05, + "loss": 1.2429, + "step": 6448 + }, + { + "epoch": 1.9208101416630987, + "grad_norm": 0.2499847710132599, + "learning_rate": 1.865026965164805e-05, + "loss": 1.2853, + "step": 6449 + }, + { + "epoch": 1.9211079878627673, + "grad_norm": 0.2274659425020218, + "learning_rate": 1.864978566398892e-05, + "loss": 1.2637, + "step": 6450 + }, + { + "epoch": 1.921405834062436, + "grad_norm": 0.24274186789989471, + "learning_rate": 1.86493015958532e-05, + "loss": 1.2734, + "step": 6451 + }, + { + "epoch": 1.9217036802621048, + "grad_norm": 0.2235558032989502, + "learning_rate": 1.864881744724539e-05, + "loss": 1.2783, + "step": 6452 + }, + { + "epoch": 1.9220015264617731, + "grad_norm": 0.2452191561460495, + "learning_rate": 1.8648333218169998e-05, + "loss": 1.2716, + "step": 6453 + }, + { + "epoch": 1.922299372661442, + "grad_norm": 0.23667138814926147, + "learning_rate": 1.864784890863153e-05, + "loss": 1.2676, + "step": 6454 + }, + { + "epoch": 1.9225972188611107, + "grad_norm": 0.23631134629249573, + "learning_rate": 1.8647364518634488e-05, + "loss": 1.2631, + "step": 6455 + }, + { + "epoch": 1.9228950650607792, + "grad_norm": 0.2375621795654297, + "learning_rate": 1.864688004818338e-05, + "loss": 1.2724, + "step": 6456 + }, + { + "epoch": 1.9231929112604478, + "grad_norm": 0.2185078263282776, + "learning_rate": 1.8646395497282718e-05, + "loss": 1.263, + "step": 6457 + }, + { + "epoch": 1.9234907574601166, + "grad_norm": 0.22520285844802856, + "learning_rate": 1.8645910865937e-05, + "loss": 1.269, + "step": 6458 + }, + { + "epoch": 1.9237886036597853, + "grad_norm": 0.24822598695755005, + "learning_rate": 1.8645426154150744e-05, + "loss": 1.2605, + "step": 6459 + }, + { + "epoch": 1.9240864498594539, + "grad_norm": 0.26536160707473755, + "learning_rate": 1.8644941361928458e-05, + "loss": 1.2835, + "step": 6460 + }, + { + "epoch": 1.9243842960591224, + "grad_norm": 0.2194330245256424, + "learning_rate": 1.864445648927465e-05, + "loss": 1.2877, + "step": 6461 + }, + { + "epoch": 1.9246821422587912, + "grad_norm": 0.3066021502017975, + "learning_rate": 1.8643971536193835e-05, + "loss": 1.24, + "step": 6462 + }, + { + "epoch": 1.9249799884584597, + "grad_norm": 0.2796516418457031, + "learning_rate": 1.8643486502690517e-05, + "loss": 1.2638, + "step": 6463 + }, + { + "epoch": 1.9252778346581283, + "grad_norm": 0.27320075035095215, + "learning_rate": 1.864300138876922e-05, + "loss": 1.2736, + "step": 6464 + }, + { + "epoch": 1.925575680857797, + "grad_norm": 0.24786368012428284, + "learning_rate": 1.8642516194434448e-05, + "loss": 1.2495, + "step": 6465 + }, + { + "epoch": 1.9258735270574658, + "grad_norm": 0.23576407134532928, + "learning_rate": 1.864203091969072e-05, + "loss": 1.284, + "step": 6466 + }, + { + "epoch": 1.9261713732571344, + "grad_norm": 0.22346080839633942, + "learning_rate": 1.864154556454255e-05, + "loss": 1.2677, + "step": 6467 + }, + { + "epoch": 1.926469219456803, + "grad_norm": 0.23468804359436035, + "learning_rate": 1.8641060128994452e-05, + "loss": 1.256, + "step": 6468 + }, + { + "epoch": 1.9267670656564717, + "grad_norm": 0.22079093754291534, + "learning_rate": 1.8640574613050946e-05, + "loss": 1.2681, + "step": 6469 + }, + { + "epoch": 1.9270649118561403, + "grad_norm": 0.23220165073871613, + "learning_rate": 1.8640089016716545e-05, + "loss": 1.2501, + "step": 6470 + }, + { + "epoch": 1.9273627580558088, + "grad_norm": 0.251128613948822, + "learning_rate": 1.863960333999577e-05, + "loss": 1.2586, + "step": 6471 + }, + { + "epoch": 1.9276606042554776, + "grad_norm": 0.2199133038520813, + "learning_rate": 1.863911758289314e-05, + "loss": 1.2721, + "step": 6472 + }, + { + "epoch": 1.9279584504551464, + "grad_norm": 0.22844639420509338, + "learning_rate": 1.863863174541317e-05, + "loss": 1.2687, + "step": 6473 + }, + { + "epoch": 1.928256296654815, + "grad_norm": 0.22571970522403717, + "learning_rate": 1.863814582756039e-05, + "loss": 1.268, + "step": 6474 + }, + { + "epoch": 1.9285541428544835, + "grad_norm": 0.21965770423412323, + "learning_rate": 1.8637659829339307e-05, + "loss": 1.2592, + "step": 6475 + }, + { + "epoch": 1.9288519890541522, + "grad_norm": 0.21255746483802795, + "learning_rate": 1.863717375075445e-05, + "loss": 1.238, + "step": 6476 + }, + { + "epoch": 1.9291498352538208, + "grad_norm": 0.22225402295589447, + "learning_rate": 1.863668759181034e-05, + "loss": 1.2421, + "step": 6477 + }, + { + "epoch": 1.9294476814534893, + "grad_norm": 0.2166040986776352, + "learning_rate": 1.863620135251151e-05, + "loss": 1.2648, + "step": 6478 + }, + { + "epoch": 1.929745527653158, + "grad_norm": 0.2302701473236084, + "learning_rate": 1.863571503286247e-05, + "loss": 1.2662, + "step": 6479 + }, + { + "epoch": 1.9300433738528269, + "grad_norm": 0.26717764139175415, + "learning_rate": 1.863522863286775e-05, + "loss": 1.2623, + "step": 6480 + }, + { + "epoch": 1.9303412200524954, + "grad_norm": 0.39063355326652527, + "learning_rate": 1.8634742152531875e-05, + "loss": 1.2732, + "step": 6481 + }, + { + "epoch": 1.930639066252164, + "grad_norm": 0.27181732654571533, + "learning_rate": 1.863425559185937e-05, + "loss": 1.2676, + "step": 6482 + }, + { + "epoch": 1.9309369124518327, + "grad_norm": 0.24872452020645142, + "learning_rate": 1.863376895085477e-05, + "loss": 1.2705, + "step": 6483 + }, + { + "epoch": 1.9312347586515013, + "grad_norm": 0.2171843945980072, + "learning_rate": 1.863328222952259e-05, + "loss": 1.2668, + "step": 6484 + }, + { + "epoch": 1.9315326048511698, + "grad_norm": 0.21550366282463074, + "learning_rate": 1.8632795427867365e-05, + "loss": 1.262, + "step": 6485 + }, + { + "epoch": 1.9318304510508386, + "grad_norm": 0.2369561344385147, + "learning_rate": 1.8632308545893625e-05, + "loss": 1.2616, + "step": 6486 + }, + { + "epoch": 1.9321282972505074, + "grad_norm": 0.24757346510887146, + "learning_rate": 1.8631821583605898e-05, + "loss": 1.2698, + "step": 6487 + }, + { + "epoch": 1.932426143450176, + "grad_norm": 0.23112964630126953, + "learning_rate": 1.8631334541008717e-05, + "loss": 1.2723, + "step": 6488 + }, + { + "epoch": 1.9327239896498445, + "grad_norm": 0.2273424118757248, + "learning_rate": 1.863084741810661e-05, + "loss": 1.2709, + "step": 6489 + }, + { + "epoch": 1.9330218358495133, + "grad_norm": 0.2369772493839264, + "learning_rate": 1.863036021490411e-05, + "loss": 1.2515, + "step": 6490 + }, + { + "epoch": 1.9333196820491818, + "grad_norm": 0.23573176562786102, + "learning_rate": 1.862987293140575e-05, + "loss": 1.2516, + "step": 6491 + }, + { + "epoch": 1.9336175282488504, + "grad_norm": 0.23649613559246063, + "learning_rate": 1.8629385567616067e-05, + "loss": 1.2681, + "step": 6492 + }, + { + "epoch": 1.9339153744485191, + "grad_norm": 0.22525697946548462, + "learning_rate": 1.862889812353959e-05, + "loss": 1.2441, + "step": 6493 + }, + { + "epoch": 1.934213220648188, + "grad_norm": 0.22657667100429535, + "learning_rate": 1.8628410599180858e-05, + "loss": 1.2802, + "step": 6494 + }, + { + "epoch": 1.9345110668478565, + "grad_norm": 0.2088024765253067, + "learning_rate": 1.8627922994544408e-05, + "loss": 1.2486, + "step": 6495 + }, + { + "epoch": 1.934808913047525, + "grad_norm": 0.24303029477596283, + "learning_rate": 1.862743530963477e-05, + "loss": 1.2791, + "step": 6496 + }, + { + "epoch": 1.9351067592471938, + "grad_norm": 0.22134655714035034, + "learning_rate": 1.862694754445649e-05, + "loss": 1.2701, + "step": 6497 + }, + { + "epoch": 1.9354046054468623, + "grad_norm": 0.2253435254096985, + "learning_rate": 1.86264596990141e-05, + "loss": 1.2556, + "step": 6498 + }, + { + "epoch": 1.9357024516465309, + "grad_norm": 0.22450168430805206, + "learning_rate": 1.8625971773312138e-05, + "loss": 1.259, + "step": 6499 + }, + { + "epoch": 1.9360002978461996, + "grad_norm": 0.22210893034934998, + "learning_rate": 1.8625483767355146e-05, + "loss": 1.254, + "step": 6500 + }, + { + "epoch": 1.9360002978461996, + "eval_loss": 1.3413488864898682, + "eval_runtime": 22.2616, + "eval_samples_per_second": 77.892, + "eval_steps_per_second": 4.896, + "step": 6500 + }, + { + "epoch": 1.9362981440458684, + "grad_norm": 0.21273358166217804, + "learning_rate": 1.862499568114767e-05, + "loss": 1.268, + "step": 6501 + }, + { + "epoch": 1.936595990245537, + "grad_norm": 0.2237015664577484, + "learning_rate": 1.862450751469424e-05, + "loss": 1.2594, + "step": 6502 + }, + { + "epoch": 1.9368938364452055, + "grad_norm": 0.22538568079471588, + "learning_rate": 1.8624019267999407e-05, + "loss": 1.2558, + "step": 6503 + }, + { + "epoch": 1.9371916826448743, + "grad_norm": 0.22177664935588837, + "learning_rate": 1.8623530941067707e-05, + "loss": 1.2603, + "step": 6504 + }, + { + "epoch": 1.937489528844543, + "grad_norm": 0.22911646962165833, + "learning_rate": 1.8623042533903687e-05, + "loss": 1.2778, + "step": 6505 + }, + { + "epoch": 1.9377873750442114, + "grad_norm": 0.221419095993042, + "learning_rate": 1.8622554046511895e-05, + "loss": 1.2661, + "step": 6506 + }, + { + "epoch": 1.9380852212438802, + "grad_norm": 0.2237306833267212, + "learning_rate": 1.8622065478896863e-05, + "loss": 1.2711, + "step": 6507 + }, + { + "epoch": 1.938383067443549, + "grad_norm": 0.23403052985668182, + "learning_rate": 1.862157683106315e-05, + "loss": 1.2563, + "step": 6508 + }, + { + "epoch": 1.9386809136432175, + "grad_norm": 0.22545550763607025, + "learning_rate": 1.8621088103015297e-05, + "loss": 1.2692, + "step": 6509 + }, + { + "epoch": 1.938978759842886, + "grad_norm": 0.22285085916519165, + "learning_rate": 1.8620599294757853e-05, + "loss": 1.2472, + "step": 6510 + }, + { + "epoch": 1.9392766060425548, + "grad_norm": 0.2223236858844757, + "learning_rate": 1.862011040629536e-05, + "loss": 1.2892, + "step": 6511 + }, + { + "epoch": 1.9395744522422236, + "grad_norm": 0.2175566852092743, + "learning_rate": 1.8619621437632374e-05, + "loss": 1.2471, + "step": 6512 + }, + { + "epoch": 1.939872298441892, + "grad_norm": 0.2539050877094269, + "learning_rate": 1.861913238877344e-05, + "loss": 1.2717, + "step": 6513 + }, + { + "epoch": 1.9401701446415607, + "grad_norm": 0.24455590546131134, + "learning_rate": 1.861864325972311e-05, + "loss": 1.2586, + "step": 6514 + }, + { + "epoch": 1.9404679908412295, + "grad_norm": 0.22603529691696167, + "learning_rate": 1.861815405048593e-05, + "loss": 1.2603, + "step": 6515 + }, + { + "epoch": 1.940765837040898, + "grad_norm": 0.22951674461364746, + "learning_rate": 1.8617664761066457e-05, + "loss": 1.2715, + "step": 6516 + }, + { + "epoch": 1.9410636832405666, + "grad_norm": 0.23248666524887085, + "learning_rate": 1.8617175391469243e-05, + "loss": 1.2792, + "step": 6517 + }, + { + "epoch": 1.9413615294402353, + "grad_norm": 0.25327444076538086, + "learning_rate": 1.861668594169884e-05, + "loss": 1.2724, + "step": 6518 + }, + { + "epoch": 1.941659375639904, + "grad_norm": 0.22093702852725983, + "learning_rate": 1.8616196411759797e-05, + "loss": 1.2654, + "step": 6519 + }, + { + "epoch": 1.9419572218395726, + "grad_norm": 0.23170721530914307, + "learning_rate": 1.861570680165668e-05, + "loss": 1.2813, + "step": 6520 + }, + { + "epoch": 1.9422550680392412, + "grad_norm": 0.230038583278656, + "learning_rate": 1.8615217111394032e-05, + "loss": 1.2554, + "step": 6521 + }, + { + "epoch": 1.94255291423891, + "grad_norm": 0.24214862287044525, + "learning_rate": 1.8614727340976417e-05, + "loss": 1.2699, + "step": 6522 + }, + { + "epoch": 1.9428507604385785, + "grad_norm": 0.23127305507659912, + "learning_rate": 1.8614237490408387e-05, + "loss": 1.2652, + "step": 6523 + }, + { + "epoch": 1.943148606638247, + "grad_norm": 0.22470897436141968, + "learning_rate": 1.8613747559694502e-05, + "loss": 1.2639, + "step": 6524 + }, + { + "epoch": 1.9434464528379158, + "grad_norm": 0.23439840972423553, + "learning_rate": 1.8613257548839317e-05, + "loss": 1.2694, + "step": 6525 + }, + { + "epoch": 1.9437442990375846, + "grad_norm": 0.22015082836151123, + "learning_rate": 1.86127674578474e-05, + "loss": 1.2629, + "step": 6526 + }, + { + "epoch": 1.9440421452372532, + "grad_norm": 0.2245972901582718, + "learning_rate": 1.86122772867233e-05, + "loss": 1.2758, + "step": 6527 + }, + { + "epoch": 1.9443399914369217, + "grad_norm": 0.2232385128736496, + "learning_rate": 1.8611787035471583e-05, + "loss": 1.2594, + "step": 6528 + }, + { + "epoch": 1.9446378376365905, + "grad_norm": 0.22602230310440063, + "learning_rate": 1.8611296704096813e-05, + "loss": 1.2651, + "step": 6529 + }, + { + "epoch": 1.944935683836259, + "grad_norm": 0.2488381415605545, + "learning_rate": 1.8610806292603545e-05, + "loss": 1.2696, + "step": 6530 + }, + { + "epoch": 1.9452335300359276, + "grad_norm": 0.22943571209907532, + "learning_rate": 1.8610315800996342e-05, + "loss": 1.2712, + "step": 6531 + }, + { + "epoch": 1.9455313762355964, + "grad_norm": 0.23891986906528473, + "learning_rate": 1.8609825229279775e-05, + "loss": 1.2666, + "step": 6532 + }, + { + "epoch": 1.9458292224352651, + "grad_norm": 0.29686233401298523, + "learning_rate": 1.86093345774584e-05, + "loss": 1.2642, + "step": 6533 + }, + { + "epoch": 1.9461270686349337, + "grad_norm": 0.2327146977186203, + "learning_rate": 1.8608843845536794e-05, + "loss": 1.2575, + "step": 6534 + }, + { + "epoch": 1.9464249148346022, + "grad_norm": 0.22958245873451233, + "learning_rate": 1.8608353033519507e-05, + "loss": 1.258, + "step": 6535 + }, + { + "epoch": 1.946722761034271, + "grad_norm": 0.21778999269008636, + "learning_rate": 1.860786214141111e-05, + "loss": 1.2575, + "step": 6536 + }, + { + "epoch": 1.9470206072339395, + "grad_norm": 0.24746260046958923, + "learning_rate": 1.860737116921618e-05, + "loss": 1.2735, + "step": 6537 + }, + { + "epoch": 1.947318453433608, + "grad_norm": 0.2289373278617859, + "learning_rate": 1.8606880116939273e-05, + "loss": 1.2666, + "step": 6538 + }, + { + "epoch": 1.9476162996332769, + "grad_norm": 0.24286894500255585, + "learning_rate": 1.8606388984584968e-05, + "loss": 1.2663, + "step": 6539 + }, + { + "epoch": 1.9479141458329456, + "grad_norm": 0.22748291492462158, + "learning_rate": 1.8605897772157826e-05, + "loss": 1.2649, + "step": 6540 + }, + { + "epoch": 1.9482119920326142, + "grad_norm": 0.2637212574481964, + "learning_rate": 1.860540647966242e-05, + "loss": 1.2526, + "step": 6541 + }, + { + "epoch": 1.9485098382322827, + "grad_norm": 0.25491830706596375, + "learning_rate": 1.860491510710332e-05, + "loss": 1.2761, + "step": 6542 + }, + { + "epoch": 1.9488076844319515, + "grad_norm": 0.23540496826171875, + "learning_rate": 1.8604423654485103e-05, + "loss": 1.2788, + "step": 6543 + }, + { + "epoch": 1.94910553063162, + "grad_norm": 0.2479553520679474, + "learning_rate": 1.8603932121812334e-05, + "loss": 1.2588, + "step": 6544 + }, + { + "epoch": 1.9494033768312886, + "grad_norm": 0.21612919867038727, + "learning_rate": 1.860344050908959e-05, + "loss": 1.2431, + "step": 6545 + }, + { + "epoch": 1.9497012230309574, + "grad_norm": 0.23982878029346466, + "learning_rate": 1.8602948816321446e-05, + "loss": 1.2685, + "step": 6546 + }, + { + "epoch": 1.9499990692306262, + "grad_norm": 0.21687816083431244, + "learning_rate": 1.8602457043512475e-05, + "loss": 1.2629, + "step": 6547 + }, + { + "epoch": 1.9502969154302947, + "grad_norm": 0.2796175479888916, + "learning_rate": 1.8601965190667252e-05, + "loss": 1.2692, + "step": 6548 + }, + { + "epoch": 1.9505947616299633, + "grad_norm": 0.2920897603034973, + "learning_rate": 1.860147325779035e-05, + "loss": 1.2651, + "step": 6549 + }, + { + "epoch": 1.950892607829632, + "grad_norm": 0.22344376146793365, + "learning_rate": 1.8600981244886354e-05, + "loss": 1.2678, + "step": 6550 + }, + { + "epoch": 1.9511904540293006, + "grad_norm": 0.5246843099594116, + "learning_rate": 1.8600489151959834e-05, + "loss": 1.2464, + "step": 6551 + }, + { + "epoch": 1.9514883002289691, + "grad_norm": 0.2972206473350525, + "learning_rate": 1.8599996979015372e-05, + "loss": 1.2628, + "step": 6552 + }, + { + "epoch": 1.951786146428638, + "grad_norm": 0.27029719948768616, + "learning_rate": 1.8599504726057548e-05, + "loss": 1.2553, + "step": 6553 + }, + { + "epoch": 1.9520839926283067, + "grad_norm": 0.23671601712703705, + "learning_rate": 1.859901239309094e-05, + "loss": 1.2696, + "step": 6554 + }, + { + "epoch": 1.9523818388279752, + "grad_norm": 0.22710232436656952, + "learning_rate": 1.859851998012013e-05, + "loss": 1.2607, + "step": 6555 + }, + { + "epoch": 1.9526796850276438, + "grad_norm": 0.25671708583831787, + "learning_rate": 1.8598027487149696e-05, + "loss": 1.2546, + "step": 6556 + }, + { + "epoch": 1.9529775312273125, + "grad_norm": 0.250498503446579, + "learning_rate": 1.8597534914184224e-05, + "loss": 1.2697, + "step": 6557 + }, + { + "epoch": 1.953275377426981, + "grad_norm": 0.23408140242099762, + "learning_rate": 1.8597042261228298e-05, + "loss": 1.2725, + "step": 6558 + }, + { + "epoch": 1.9535732236266496, + "grad_norm": 0.21811716258525848, + "learning_rate": 1.8596549528286495e-05, + "loss": 1.2573, + "step": 6559 + }, + { + "epoch": 1.9538710698263184, + "grad_norm": 0.22456933557987213, + "learning_rate": 1.8596056715363403e-05, + "loss": 1.267, + "step": 6560 + }, + { + "epoch": 1.9541689160259872, + "grad_norm": 0.24056674540042877, + "learning_rate": 1.859556382246361e-05, + "loss": 1.2706, + "step": 6561 + }, + { + "epoch": 1.9544667622256557, + "grad_norm": 0.22696880996227264, + "learning_rate": 1.8595070849591697e-05, + "loss": 1.2634, + "step": 6562 + }, + { + "epoch": 1.9547646084253243, + "grad_norm": 0.22648760676383972, + "learning_rate": 1.8594577796752252e-05, + "loss": 1.2745, + "step": 6563 + }, + { + "epoch": 1.955062454624993, + "grad_norm": 0.22334381937980652, + "learning_rate": 1.8594084663949867e-05, + "loss": 1.2625, + "step": 6564 + }, + { + "epoch": 1.9553603008246616, + "grad_norm": 0.22345022857189178, + "learning_rate": 1.8593591451189124e-05, + "loss": 1.2702, + "step": 6565 + }, + { + "epoch": 1.9556581470243302, + "grad_norm": 0.22984817624092102, + "learning_rate": 1.8593098158474614e-05, + "loss": 1.2753, + "step": 6566 + }, + { + "epoch": 1.955955993223999, + "grad_norm": 0.2276160567998886, + "learning_rate": 1.8592604785810927e-05, + "loss": 1.2684, + "step": 6567 + }, + { + "epoch": 1.9562538394236677, + "grad_norm": 0.22295142710208893, + "learning_rate": 1.859211133320265e-05, + "loss": 1.2669, + "step": 6568 + }, + { + "epoch": 1.9565516856233363, + "grad_norm": 0.214729905128479, + "learning_rate": 1.859161780065438e-05, + "loss": 1.2693, + "step": 6569 + }, + { + "epoch": 1.9568495318230048, + "grad_norm": 0.21655964851379395, + "learning_rate": 1.85911241881707e-05, + "loss": 1.2537, + "step": 6570 + }, + { + "epoch": 1.9571473780226736, + "grad_norm": 0.23342803120613098, + "learning_rate": 1.8590630495756214e-05, + "loss": 1.2574, + "step": 6571 + }, + { + "epoch": 1.9574452242223424, + "grad_norm": 0.22633297741413116, + "learning_rate": 1.8590136723415507e-05, + "loss": 1.2549, + "step": 6572 + }, + { + "epoch": 1.9577430704220107, + "grad_norm": 0.21886546909809113, + "learning_rate": 1.8589642871153177e-05, + "loss": 1.2662, + "step": 6573 + }, + { + "epoch": 1.9580409166216795, + "grad_norm": 0.21713319420814514, + "learning_rate": 1.8589148938973816e-05, + "loss": 1.2763, + "step": 6574 + }, + { + "epoch": 1.9583387628213482, + "grad_norm": 0.22585688531398773, + "learning_rate": 1.858865492688202e-05, + "loss": 1.2773, + "step": 6575 + }, + { + "epoch": 1.9586366090210168, + "grad_norm": 0.22475779056549072, + "learning_rate": 1.8588160834882385e-05, + "loss": 1.2741, + "step": 6576 + }, + { + "epoch": 1.9589344552206853, + "grad_norm": 0.220277339220047, + "learning_rate": 1.858766666297951e-05, + "loss": 1.2547, + "step": 6577 + }, + { + "epoch": 1.959232301420354, + "grad_norm": 0.22024206817150116, + "learning_rate": 1.8587172411177993e-05, + "loss": 1.27, + "step": 6578 + }, + { + "epoch": 1.9595301476200229, + "grad_norm": 0.21963857114315033, + "learning_rate": 1.8586678079482427e-05, + "loss": 1.2442, + "step": 6579 + }, + { + "epoch": 1.9598279938196912, + "grad_norm": 0.23683059215545654, + "learning_rate": 1.8586183667897417e-05, + "loss": 1.2902, + "step": 6580 + }, + { + "epoch": 1.96012584001936, + "grad_norm": 0.23136749863624573, + "learning_rate": 1.8585689176427558e-05, + "loss": 1.2657, + "step": 6581 + }, + { + "epoch": 1.9604236862190287, + "grad_norm": 0.23223896324634552, + "learning_rate": 1.8585194605077457e-05, + "loss": 1.2644, + "step": 6582 + }, + { + "epoch": 1.9607215324186973, + "grad_norm": 0.2299426645040512, + "learning_rate": 1.8584699953851712e-05, + "loss": 1.2734, + "step": 6583 + }, + { + "epoch": 1.9610193786183658, + "grad_norm": 0.22070257365703583, + "learning_rate": 1.8584205222754924e-05, + "loss": 1.2609, + "step": 6584 + }, + { + "epoch": 1.9613172248180346, + "grad_norm": 0.2285425364971161, + "learning_rate": 1.85837104117917e-05, + "loss": 1.2711, + "step": 6585 + }, + { + "epoch": 1.9616150710177034, + "grad_norm": 0.2327187955379486, + "learning_rate": 1.8583215520966638e-05, + "loss": 1.2698, + "step": 6586 + }, + { + "epoch": 1.961912917217372, + "grad_norm": 0.2283291071653366, + "learning_rate": 1.858272055028435e-05, + "loss": 1.2511, + "step": 6587 + }, + { + "epoch": 1.9622107634170405, + "grad_norm": 0.2311006486415863, + "learning_rate": 1.858222549974943e-05, + "loss": 1.2515, + "step": 6588 + }, + { + "epoch": 1.9625086096167093, + "grad_norm": 0.23775631189346313, + "learning_rate": 1.858173036936649e-05, + "loss": 1.2527, + "step": 6589 + }, + { + "epoch": 1.9628064558163778, + "grad_norm": 0.2357291877269745, + "learning_rate": 1.8581235159140144e-05, + "loss": 1.2771, + "step": 6590 + }, + { + "epoch": 1.9631043020160464, + "grad_norm": 0.22054970264434814, + "learning_rate": 1.8580739869074987e-05, + "loss": 1.274, + "step": 6591 + }, + { + "epoch": 1.9634021482157151, + "grad_norm": 0.22896882891654968, + "learning_rate": 1.8580244499175634e-05, + "loss": 1.2659, + "step": 6592 + }, + { + "epoch": 1.963699994415384, + "grad_norm": 0.24621307849884033, + "learning_rate": 1.8579749049446695e-05, + "loss": 1.2692, + "step": 6593 + }, + { + "epoch": 1.9639978406150524, + "grad_norm": 0.2353813201189041, + "learning_rate": 1.857925351989277e-05, + "loss": 1.2431, + "step": 6594 + }, + { + "epoch": 1.964295686814721, + "grad_norm": 0.21590836346149445, + "learning_rate": 1.8578757910518485e-05, + "loss": 1.25, + "step": 6595 + }, + { + "epoch": 1.9645935330143898, + "grad_norm": 0.21876215934753418, + "learning_rate": 1.8578262221328436e-05, + "loss": 1.2567, + "step": 6596 + }, + { + "epoch": 1.9648913792140583, + "grad_norm": 0.22484147548675537, + "learning_rate": 1.8577766452327243e-05, + "loss": 1.2633, + "step": 6597 + }, + { + "epoch": 1.9651892254137269, + "grad_norm": 0.22392192482948303, + "learning_rate": 1.8577270603519517e-05, + "loss": 1.2672, + "step": 6598 + }, + { + "epoch": 1.9654870716133956, + "grad_norm": 0.24420735239982605, + "learning_rate": 1.8576774674909873e-05, + "loss": 1.2646, + "step": 6599 + }, + { + "epoch": 1.9657849178130644, + "grad_norm": 0.2220325618982315, + "learning_rate": 1.857627866650292e-05, + "loss": 1.2623, + "step": 6600 + }, + { + "epoch": 1.966082764012733, + "grad_norm": 0.22404667735099792, + "learning_rate": 1.8575782578303278e-05, + "loss": 1.2443, + "step": 6601 + }, + { + "epoch": 1.9663806102124015, + "grad_norm": 0.22577859461307526, + "learning_rate": 1.857528641031556e-05, + "loss": 1.2638, + "step": 6602 + }, + { + "epoch": 1.9666784564120703, + "grad_norm": 0.23402155935764313, + "learning_rate": 1.8574790162544382e-05, + "loss": 1.273, + "step": 6603 + }, + { + "epoch": 1.9669763026117388, + "grad_norm": 0.22014889121055603, + "learning_rate": 1.8574293834994363e-05, + "loss": 1.2592, + "step": 6604 + }, + { + "epoch": 1.9672741488114074, + "grad_norm": 0.22578471899032593, + "learning_rate": 1.857379742767012e-05, + "loss": 1.2805, + "step": 6605 + }, + { + "epoch": 1.9675719950110762, + "grad_norm": 0.21594074368476868, + "learning_rate": 1.8573300940576268e-05, + "loss": 1.2556, + "step": 6606 + }, + { + "epoch": 1.967869841210745, + "grad_norm": 0.23016615211963654, + "learning_rate": 1.8572804373717432e-05, + "loss": 1.2615, + "step": 6607 + }, + { + "epoch": 1.9681676874104135, + "grad_norm": 0.22241929173469543, + "learning_rate": 1.857230772709823e-05, + "loss": 1.258, + "step": 6608 + }, + { + "epoch": 1.968465533610082, + "grad_norm": 0.22699490189552307, + "learning_rate": 1.8571811000723282e-05, + "loss": 1.2653, + "step": 6609 + }, + { + "epoch": 1.9687633798097508, + "grad_norm": 0.2268591970205307, + "learning_rate": 1.8571314194597208e-05, + "loss": 1.248, + "step": 6610 + }, + { + "epoch": 1.9690612260094194, + "grad_norm": 0.21655899286270142, + "learning_rate": 1.8570817308724632e-05, + "loss": 1.2607, + "step": 6611 + }, + { + "epoch": 1.969359072209088, + "grad_norm": 0.23562391102313995, + "learning_rate": 1.857032034311018e-05, + "loss": 1.2725, + "step": 6612 + }, + { + "epoch": 1.9696569184087567, + "grad_norm": 0.22721697390079498, + "learning_rate": 1.856982329775847e-05, + "loss": 1.2697, + "step": 6613 + }, + { + "epoch": 1.9699547646084254, + "grad_norm": 0.22269241511821747, + "learning_rate": 1.856932617267413e-05, + "loss": 1.269, + "step": 6614 + }, + { + "epoch": 1.970252610808094, + "grad_norm": 0.2151211053133011, + "learning_rate": 1.856882896786178e-05, + "loss": 1.273, + "step": 6615 + }, + { + "epoch": 1.9705504570077625, + "grad_norm": 0.22445057332515717, + "learning_rate": 1.8568331683326054e-05, + "loss": 1.2638, + "step": 6616 + }, + { + "epoch": 1.9708483032074313, + "grad_norm": 0.23451349139213562, + "learning_rate": 1.8567834319071577e-05, + "loss": 1.2591, + "step": 6617 + }, + { + "epoch": 1.9711461494070999, + "grad_norm": 0.22861187160015106, + "learning_rate": 1.856733687510297e-05, + "loss": 1.268, + "step": 6618 + }, + { + "epoch": 1.9714439956067684, + "grad_norm": 0.22528326511383057, + "learning_rate": 1.8566839351424866e-05, + "loss": 1.2644, + "step": 6619 + }, + { + "epoch": 1.9717418418064372, + "grad_norm": 0.23457497358322144, + "learning_rate": 1.8566341748041895e-05, + "loss": 1.2612, + "step": 6620 + }, + { + "epoch": 1.972039688006106, + "grad_norm": 0.21923547983169556, + "learning_rate": 1.8565844064958684e-05, + "loss": 1.2498, + "step": 6621 + }, + { + "epoch": 1.9723375342057745, + "grad_norm": 0.2287691980600357, + "learning_rate": 1.8565346302179864e-05, + "loss": 1.2554, + "step": 6622 + }, + { + "epoch": 1.972635380405443, + "grad_norm": 0.2202005833387375, + "learning_rate": 1.8564848459710066e-05, + "loss": 1.2664, + "step": 6623 + }, + { + "epoch": 1.9729332266051118, + "grad_norm": 0.23299351334571838, + "learning_rate": 1.8564350537553927e-05, + "loss": 1.2563, + "step": 6624 + }, + { + "epoch": 1.9732310728047804, + "grad_norm": 0.22319965064525604, + "learning_rate": 1.856385253571607e-05, + "loss": 1.2551, + "step": 6625 + }, + { + "epoch": 1.973528919004449, + "grad_norm": 0.2123069018125534, + "learning_rate": 1.8563354454201133e-05, + "loss": 1.2541, + "step": 6626 + }, + { + "epoch": 1.9738267652041177, + "grad_norm": 0.21652795374393463, + "learning_rate": 1.8562856293013753e-05, + "loss": 1.2479, + "step": 6627 + }, + { + "epoch": 1.9741246114037865, + "grad_norm": 0.2229059785604477, + "learning_rate": 1.8562358052158556e-05, + "loss": 1.2518, + "step": 6628 + }, + { + "epoch": 1.974422457603455, + "grad_norm": 0.22529351711273193, + "learning_rate": 1.8561859731640192e-05, + "loss": 1.262, + "step": 6629 + }, + { + "epoch": 1.9747203038031236, + "grad_norm": 0.2385956346988678, + "learning_rate": 1.8561361331463286e-05, + "loss": 1.2775, + "step": 6630 + }, + { + "epoch": 1.9750181500027923, + "grad_norm": 0.23154640197753906, + "learning_rate": 1.8560862851632478e-05, + "loss": 1.2669, + "step": 6631 + }, + { + "epoch": 1.975315996202461, + "grad_norm": 0.2195601463317871, + "learning_rate": 1.8560364292152405e-05, + "loss": 1.2649, + "step": 6632 + }, + { + "epoch": 1.9756138424021294, + "grad_norm": 0.21839290857315063, + "learning_rate": 1.8559865653027707e-05, + "loss": 1.2638, + "step": 6633 + }, + { + "epoch": 1.9759116886017982, + "grad_norm": 0.2357621043920517, + "learning_rate": 1.855936693426302e-05, + "loss": 1.2662, + "step": 6634 + }, + { + "epoch": 1.976209534801467, + "grad_norm": 0.2239905297756195, + "learning_rate": 1.8558868135862992e-05, + "loss": 1.2662, + "step": 6635 + }, + { + "epoch": 1.9765073810011355, + "grad_norm": 0.22861617803573608, + "learning_rate": 1.8558369257832255e-05, + "loss": 1.2731, + "step": 6636 + }, + { + "epoch": 1.976805227200804, + "grad_norm": 0.22513408958911896, + "learning_rate": 1.8557870300175454e-05, + "loss": 1.2796, + "step": 6637 + }, + { + "epoch": 1.9771030734004729, + "grad_norm": 0.23186704516410828, + "learning_rate": 1.8557371262897235e-05, + "loss": 1.2547, + "step": 6638 + }, + { + "epoch": 1.9774009196001416, + "grad_norm": 0.21897944808006287, + "learning_rate": 1.8556872146002234e-05, + "loss": 1.2791, + "step": 6639 + }, + { + "epoch": 1.97769876579981, + "grad_norm": 0.22537602484226227, + "learning_rate": 1.85563729494951e-05, + "loss": 1.2689, + "step": 6640 + }, + { + "epoch": 1.9779966119994787, + "grad_norm": 0.21599406003952026, + "learning_rate": 1.8555873673380472e-05, + "loss": 1.2476, + "step": 6641 + }, + { + "epoch": 1.9782944581991475, + "grad_norm": 0.22699366509914398, + "learning_rate": 1.8555374317663e-05, + "loss": 1.2488, + "step": 6642 + }, + { + "epoch": 1.978592304398816, + "grad_norm": 0.23197655379772186, + "learning_rate": 1.8554874882347333e-05, + "loss": 1.2647, + "step": 6643 + }, + { + "epoch": 1.9788901505984846, + "grad_norm": 0.22245697677135468, + "learning_rate": 1.8554375367438106e-05, + "loss": 1.2524, + "step": 6644 + }, + { + "epoch": 1.9791879967981534, + "grad_norm": 0.22904571890830994, + "learning_rate": 1.855387577293998e-05, + "loss": 1.2691, + "step": 6645 + }, + { + "epoch": 1.9794858429978222, + "grad_norm": 0.23290230333805084, + "learning_rate": 1.8553376098857593e-05, + "loss": 1.2771, + "step": 6646 + }, + { + "epoch": 1.9797836891974905, + "grad_norm": 0.22315536439418793, + "learning_rate": 1.8552876345195597e-05, + "loss": 1.2729, + "step": 6647 + }, + { + "epoch": 1.9800815353971593, + "grad_norm": 0.21373401582241058, + "learning_rate": 1.8552376511958646e-05, + "loss": 1.2454, + "step": 6648 + }, + { + "epoch": 1.980379381596828, + "grad_norm": 0.2315627634525299, + "learning_rate": 1.8551876599151387e-05, + "loss": 1.2519, + "step": 6649 + }, + { + "epoch": 1.9806772277964966, + "grad_norm": 0.2316744327545166, + "learning_rate": 1.855137660677847e-05, + "loss": 1.2665, + "step": 6650 + }, + { + "epoch": 1.9809750739961651, + "grad_norm": 0.21895499527454376, + "learning_rate": 1.8550876534844545e-05, + "loss": 1.2584, + "step": 6651 + }, + { + "epoch": 1.981272920195834, + "grad_norm": 0.22238340973854065, + "learning_rate": 1.855037638335427e-05, + "loss": 1.2672, + "step": 6652 + }, + { + "epoch": 1.9815707663955027, + "grad_norm": 0.22337839007377625, + "learning_rate": 1.8549876152312297e-05, + "loss": 1.2557, + "step": 6653 + }, + { + "epoch": 1.9818686125951712, + "grad_norm": 0.22859609127044678, + "learning_rate": 1.8549375841723278e-05, + "loss": 1.2647, + "step": 6654 + }, + { + "epoch": 1.9821664587948398, + "grad_norm": 0.2241157442331314, + "learning_rate": 1.8548875451591865e-05, + "loss": 1.2656, + "step": 6655 + }, + { + "epoch": 1.9824643049945085, + "grad_norm": 0.22195187211036682, + "learning_rate": 1.854837498192272e-05, + "loss": 1.257, + "step": 6656 + }, + { + "epoch": 1.982762151194177, + "grad_norm": 0.23629960417747498, + "learning_rate": 1.8547874432720498e-05, + "loss": 1.2572, + "step": 6657 + }, + { + "epoch": 1.9830599973938456, + "grad_norm": 0.22006109356880188, + "learning_rate": 1.8547373803989854e-05, + "loss": 1.2616, + "step": 6658 + }, + { + "epoch": 1.9833578435935144, + "grad_norm": 0.2148144692182541, + "learning_rate": 1.8546873095735445e-05, + "loss": 1.2821, + "step": 6659 + }, + { + "epoch": 1.9836556897931832, + "grad_norm": 0.22610807418823242, + "learning_rate": 1.8546372307961934e-05, + "loss": 1.2518, + "step": 6660 + }, + { + "epoch": 1.9839535359928517, + "grad_norm": 0.23026686906814575, + "learning_rate": 1.8545871440673972e-05, + "loss": 1.2849, + "step": 6661 + }, + { + "epoch": 1.9842513821925203, + "grad_norm": 0.22461266815662384, + "learning_rate": 1.854537049387623e-05, + "loss": 1.238, + "step": 6662 + }, + { + "epoch": 1.984549228392189, + "grad_norm": 0.23078782856464386, + "learning_rate": 1.854486946757336e-05, + "loss": 1.2672, + "step": 6663 + }, + { + "epoch": 1.9848470745918576, + "grad_norm": 0.21966131031513214, + "learning_rate": 1.8544368361770026e-05, + "loss": 1.2612, + "step": 6664 + }, + { + "epoch": 1.9851449207915262, + "grad_norm": 0.22543630003929138, + "learning_rate": 1.8543867176470892e-05, + "loss": 1.2782, + "step": 6665 + }, + { + "epoch": 1.985442766991195, + "grad_norm": 0.24705076217651367, + "learning_rate": 1.8543365911680616e-05, + "loss": 1.27, + "step": 6666 + }, + { + "epoch": 1.9857406131908637, + "grad_norm": 0.21681620180606842, + "learning_rate": 1.854286456740387e-05, + "loss": 1.2607, + "step": 6667 + }, + { + "epoch": 1.9860384593905323, + "grad_norm": 0.22567439079284668, + "learning_rate": 1.8542363143645315e-05, + "loss": 1.2603, + "step": 6668 + }, + { + "epoch": 1.9863363055902008, + "grad_norm": 0.23189802467823029, + "learning_rate": 1.8541861640409613e-05, + "loss": 1.2477, + "step": 6669 + }, + { + "epoch": 1.9866341517898696, + "grad_norm": 0.23018525540828705, + "learning_rate": 1.854136005770143e-05, + "loss": 1.2612, + "step": 6670 + }, + { + "epoch": 1.9869319979895381, + "grad_norm": 0.22216768562793732, + "learning_rate": 1.8540858395525435e-05, + "loss": 1.2805, + "step": 6671 + }, + { + "epoch": 1.9872298441892067, + "grad_norm": 0.22187797725200653, + "learning_rate": 1.8540356653886297e-05, + "loss": 1.2497, + "step": 6672 + }, + { + "epoch": 1.9875276903888754, + "grad_norm": 0.21225842833518982, + "learning_rate": 1.853985483278868e-05, + "loss": 1.2488, + "step": 6673 + }, + { + "epoch": 1.9878255365885442, + "grad_norm": 0.2173687219619751, + "learning_rate": 1.8539352932237258e-05, + "loss": 1.2607, + "step": 6674 + }, + { + "epoch": 1.9881233827882128, + "grad_norm": 0.23274730145931244, + "learning_rate": 1.8538850952236695e-05, + "loss": 1.2768, + "step": 6675 + }, + { + "epoch": 1.9884212289878813, + "grad_norm": 0.22934773564338684, + "learning_rate": 1.853834889279167e-05, + "loss": 1.2659, + "step": 6676 + }, + { + "epoch": 1.98871907518755, + "grad_norm": 0.216777041554451, + "learning_rate": 1.853784675390684e-05, + "loss": 1.2558, + "step": 6677 + }, + { + "epoch": 1.9890169213872186, + "grad_norm": 0.22982190549373627, + "learning_rate": 1.8537344535586888e-05, + "loss": 1.2812, + "step": 6678 + }, + { + "epoch": 1.9893147675868872, + "grad_norm": 0.21795396506786346, + "learning_rate": 1.8536842237836484e-05, + "loss": 1.2637, + "step": 6679 + }, + { + "epoch": 1.989612613786556, + "grad_norm": 0.22477737069129944, + "learning_rate": 1.8536339860660302e-05, + "loss": 1.2525, + "step": 6680 + }, + { + "epoch": 1.9899104599862247, + "grad_norm": 0.22227859497070312, + "learning_rate": 1.8535837404063014e-05, + "loss": 1.2558, + "step": 6681 + }, + { + "epoch": 1.9902083061858933, + "grad_norm": 0.22554489970207214, + "learning_rate": 1.8535334868049297e-05, + "loss": 1.2551, + "step": 6682 + }, + { + "epoch": 1.9905061523855618, + "grad_norm": 0.22632233798503876, + "learning_rate": 1.853483225262382e-05, + "loss": 1.2681, + "step": 6683 + }, + { + "epoch": 1.9908039985852306, + "grad_norm": 0.23057745397090912, + "learning_rate": 1.853432955779127e-05, + "loss": 1.2752, + "step": 6684 + }, + { + "epoch": 1.9911018447848992, + "grad_norm": 0.2378273457288742, + "learning_rate": 1.8533826783556318e-05, + "loss": 1.2526, + "step": 6685 + }, + { + "epoch": 1.9913996909845677, + "grad_norm": 0.22854778170585632, + "learning_rate": 1.8533323929923643e-05, + "loss": 1.261, + "step": 6686 + }, + { + "epoch": 1.9916975371842365, + "grad_norm": 0.22855976223945618, + "learning_rate": 1.853282099689792e-05, + "loss": 1.2621, + "step": 6687 + }, + { + "epoch": 1.9919953833839052, + "grad_norm": 0.2171647548675537, + "learning_rate": 1.853231798448383e-05, + "loss": 1.2632, + "step": 6688 + }, + { + "epoch": 1.9922932295835738, + "grad_norm": 0.22770820558071136, + "learning_rate": 1.853181489268606e-05, + "loss": 1.2534, + "step": 6689 + }, + { + "epoch": 1.9925910757832423, + "grad_norm": 0.22236798703670502, + "learning_rate": 1.8531311721509278e-05, + "loss": 1.2623, + "step": 6690 + }, + { + "epoch": 1.9928889219829111, + "grad_norm": 0.24259105324745178, + "learning_rate": 1.8530808470958176e-05, + "loss": 1.256, + "step": 6691 + }, + { + "epoch": 1.9931867681825797, + "grad_norm": 0.2270156890153885, + "learning_rate": 1.853030514103743e-05, + "loss": 1.2743, + "step": 6692 + }, + { + "epoch": 1.9934846143822482, + "grad_norm": 0.21757015585899353, + "learning_rate": 1.852980173175173e-05, + "loss": 1.27, + "step": 6693 + }, + { + "epoch": 1.993782460581917, + "grad_norm": 0.22424596548080444, + "learning_rate": 1.852929824310575e-05, + "loss": 1.2655, + "step": 6694 + }, + { + "epoch": 1.9940803067815858, + "grad_norm": 0.23281924426555634, + "learning_rate": 1.8528794675104183e-05, + "loss": 1.267, + "step": 6695 + }, + { + "epoch": 1.9943781529812543, + "grad_norm": 0.22783441841602325, + "learning_rate": 1.8528291027751705e-05, + "loss": 1.2778, + "step": 6696 + }, + { + "epoch": 1.9946759991809229, + "grad_norm": 0.22311203181743622, + "learning_rate": 1.8527787301053013e-05, + "loss": 1.2722, + "step": 6697 + }, + { + "epoch": 1.9949738453805916, + "grad_norm": 0.22183965146541595, + "learning_rate": 1.8527283495012788e-05, + "loss": 1.2545, + "step": 6698 + }, + { + "epoch": 1.9952716915802604, + "grad_norm": 0.2308036834001541, + "learning_rate": 1.8526779609635714e-05, + "loss": 1.263, + "step": 6699 + }, + { + "epoch": 1.9955695377799287, + "grad_norm": 0.22947841882705688, + "learning_rate": 1.8526275644926482e-05, + "loss": 1.2741, + "step": 6700 + }, + { + "epoch": 1.9958673839795975, + "grad_norm": 0.22482331097126007, + "learning_rate": 1.8525771600889783e-05, + "loss": 1.2663, + "step": 6701 + }, + { + "epoch": 1.9961652301792663, + "grad_norm": 0.2146119326353073, + "learning_rate": 1.8525267477530304e-05, + "loss": 1.2715, + "step": 6702 + }, + { + "epoch": 1.9964630763789348, + "grad_norm": 0.21766456961631775, + "learning_rate": 1.852476327485274e-05, + "loss": 1.2624, + "step": 6703 + }, + { + "epoch": 1.9967609225786034, + "grad_norm": 0.21831873059272766, + "learning_rate": 1.8524258992861775e-05, + "loss": 1.2799, + "step": 6704 + }, + { + "epoch": 1.9970587687782722, + "grad_norm": 0.22991088032722473, + "learning_rate": 1.8523754631562102e-05, + "loss": 1.2683, + "step": 6705 + }, + { + "epoch": 1.997356614977941, + "grad_norm": 0.21384309232234955, + "learning_rate": 1.8523250190958417e-05, + "loss": 1.2463, + "step": 6706 + }, + { + "epoch": 1.9976544611776093, + "grad_norm": 0.2210661917924881, + "learning_rate": 1.8522745671055413e-05, + "loss": 1.2649, + "step": 6707 + }, + { + "epoch": 1.997952307377278, + "grad_norm": 0.22385594248771667, + "learning_rate": 1.852224107185778e-05, + "loss": 1.2675, + "step": 6708 + }, + { + "epoch": 1.9982501535769468, + "grad_norm": 0.21237967908382416, + "learning_rate": 1.8521736393370216e-05, + "loss": 1.2692, + "step": 6709 + }, + { + "epoch": 1.9985479997766153, + "grad_norm": 0.21873606741428375, + "learning_rate": 1.852123163559742e-05, + "loss": 1.2527, + "step": 6710 + }, + { + "epoch": 1.998845845976284, + "grad_norm": 0.22441710531711578, + "learning_rate": 1.8520726798544084e-05, + "loss": 1.2538, + "step": 6711 + }, + { + "epoch": 1.9991436921759527, + "grad_norm": 0.23059618473052979, + "learning_rate": 1.8520221882214898e-05, + "loss": 1.2514, + "step": 6712 + }, + { + "epoch": 1.9994415383756214, + "grad_norm": 0.22172501683235168, + "learning_rate": 1.8519716886614572e-05, + "loss": 1.2666, + "step": 6713 + }, + { + "epoch": 1.9997393845752898, + "grad_norm": 0.22111006081104279, + "learning_rate": 1.8519211811747798e-05, + "loss": 1.2677, + "step": 6714 + }, + { + "epoch": 2.0000372307749585, + "grad_norm": 0.21776264905929565, + "learning_rate": 1.8518706657619276e-05, + "loss": 1.2746, + "step": 6715 + }, + { + "epoch": 2.0003350769746273, + "grad_norm": 0.22761105000972748, + "learning_rate": 1.8518201424233705e-05, + "loss": 1.2487, + "step": 6716 + }, + { + "epoch": 2.000632923174296, + "grad_norm": 0.22182194888591766, + "learning_rate": 1.851769611159579e-05, + "loss": 1.2601, + "step": 6717 + }, + { + "epoch": 2.0009307693739644, + "grad_norm": 0.22404825687408447, + "learning_rate": 1.8517190719710226e-05, + "loss": 1.2548, + "step": 6718 + }, + { + "epoch": 2.001228615573633, + "grad_norm": 0.2240694761276245, + "learning_rate": 1.8516685248581724e-05, + "loss": 1.2632, + "step": 6719 + }, + { + "epoch": 2.001526461773302, + "grad_norm": 0.22302211821079254, + "learning_rate": 1.8516179698214974e-05, + "loss": 1.25, + "step": 6720 + }, + { + "epoch": 2.0018243079729703, + "grad_norm": 0.21828891336917877, + "learning_rate": 1.8515674068614692e-05, + "loss": 1.2722, + "step": 6721 + }, + { + "epoch": 2.002122154172639, + "grad_norm": 0.22998200356960297, + "learning_rate": 1.8515168359785573e-05, + "loss": 1.2611, + "step": 6722 + }, + { + "epoch": 2.002420000372308, + "grad_norm": 0.21669180691242218, + "learning_rate": 1.851466257173233e-05, + "loss": 1.2631, + "step": 6723 + }, + { + "epoch": 2.0027178465719766, + "grad_norm": 0.22056011855602264, + "learning_rate": 1.8514156704459663e-05, + "loss": 1.2469, + "step": 6724 + }, + { + "epoch": 2.003015692771645, + "grad_norm": 0.22987788915634155, + "learning_rate": 1.851365075797228e-05, + "loss": 1.2668, + "step": 6725 + }, + { + "epoch": 2.0033135389713137, + "grad_norm": 0.22454120218753815, + "learning_rate": 1.851314473227489e-05, + "loss": 1.2687, + "step": 6726 + }, + { + "epoch": 2.0036113851709825, + "grad_norm": 0.22449935972690582, + "learning_rate": 1.8512638627372198e-05, + "loss": 1.2532, + "step": 6727 + }, + { + "epoch": 2.003909231370651, + "grad_norm": 0.2300426959991455, + "learning_rate": 1.851213244326892e-05, + "loss": 1.2568, + "step": 6728 + }, + { + "epoch": 2.0042070775703196, + "grad_norm": 0.21341969072818756, + "learning_rate": 1.851162617996975e-05, + "loss": 1.2561, + "step": 6729 + }, + { + "epoch": 2.0045049237699883, + "grad_norm": 0.22083446383476257, + "learning_rate": 1.8511119837479413e-05, + "loss": 1.2485, + "step": 6730 + }, + { + "epoch": 2.004802769969657, + "grad_norm": 0.2172650694847107, + "learning_rate": 1.8510613415802617e-05, + "loss": 1.2642, + "step": 6731 + }, + { + "epoch": 2.0051006161693254, + "grad_norm": 0.21617619693279266, + "learning_rate": 1.8510106914944072e-05, + "loss": 1.2704, + "step": 6732 + }, + { + "epoch": 2.005398462368994, + "grad_norm": 0.21791894733905792, + "learning_rate": 1.8509600334908486e-05, + "loss": 1.2641, + "step": 6733 + }, + { + "epoch": 2.005696308568663, + "grad_norm": 0.22550319135189056, + "learning_rate": 1.850909367570058e-05, + "loss": 1.2607, + "step": 6734 + }, + { + "epoch": 2.0059941547683313, + "grad_norm": 0.22665993869304657, + "learning_rate": 1.8508586937325063e-05, + "loss": 1.2568, + "step": 6735 + }, + { + "epoch": 2.006292000968, + "grad_norm": 0.22282226383686066, + "learning_rate": 1.8508080119786652e-05, + "loss": 1.2451, + "step": 6736 + }, + { + "epoch": 2.006589847167669, + "grad_norm": 0.21354874968528748, + "learning_rate": 1.8507573223090062e-05, + "loss": 1.2531, + "step": 6737 + }, + { + "epoch": 2.0068876933673376, + "grad_norm": 0.219691663980484, + "learning_rate": 1.8507066247240005e-05, + "loss": 1.2587, + "step": 6738 + }, + { + "epoch": 2.007185539567006, + "grad_norm": 0.22585265338420868, + "learning_rate": 1.8506559192241203e-05, + "loss": 1.2709, + "step": 6739 + }, + { + "epoch": 2.0074833857666747, + "grad_norm": 0.2313259243965149, + "learning_rate": 1.850605205809837e-05, + "loss": 1.262, + "step": 6740 + }, + { + "epoch": 2.0077812319663435, + "grad_norm": 0.23173195123672485, + "learning_rate": 1.850554484481623e-05, + "loss": 1.2685, + "step": 6741 + }, + { + "epoch": 2.008079078166012, + "grad_norm": 0.24330610036849976, + "learning_rate": 1.8505037552399497e-05, + "loss": 1.2708, + "step": 6742 + }, + { + "epoch": 2.0083769243656806, + "grad_norm": 0.22211681306362152, + "learning_rate": 1.850453018085289e-05, + "loss": 1.2603, + "step": 6743 + }, + { + "epoch": 2.0086747705653494, + "grad_norm": 0.22535398602485657, + "learning_rate": 1.850402273018113e-05, + "loss": 1.2603, + "step": 6744 + }, + { + "epoch": 2.008972616765018, + "grad_norm": 0.21972212195396423, + "learning_rate": 1.8503515200388945e-05, + "loss": 1.263, + "step": 6745 + }, + { + "epoch": 2.0092704629646865, + "grad_norm": 0.22704623639583588, + "learning_rate": 1.850300759148105e-05, + "loss": 1.2467, + "step": 6746 + }, + { + "epoch": 2.0095683091643552, + "grad_norm": 0.23253682255744934, + "learning_rate": 1.8502499903462165e-05, + "loss": 1.2538, + "step": 6747 + }, + { + "epoch": 2.009866155364024, + "grad_norm": 0.22977900505065918, + "learning_rate": 1.8501992136337022e-05, + "loss": 1.282, + "step": 6748 + }, + { + "epoch": 2.0101640015636923, + "grad_norm": 0.23483891785144806, + "learning_rate": 1.850148429011034e-05, + "loss": 1.2598, + "step": 6749 + }, + { + "epoch": 2.010461847763361, + "grad_norm": 0.2185533195734024, + "learning_rate": 1.850097636478685e-05, + "loss": 1.244, + "step": 6750 + }, + { + "epoch": 2.01075969396303, + "grad_norm": 0.2195434421300888, + "learning_rate": 1.8500468360371268e-05, + "loss": 1.252, + "step": 6751 + }, + { + "epoch": 2.0110575401626987, + "grad_norm": 0.2338484823703766, + "learning_rate": 1.8499960276868324e-05, + "loss": 1.2842, + "step": 6752 + }, + { + "epoch": 2.011355386362367, + "grad_norm": 0.23106524348258972, + "learning_rate": 1.849945211428275e-05, + "loss": 1.267, + "step": 6753 + }, + { + "epoch": 2.0116532325620358, + "grad_norm": 0.229380264878273, + "learning_rate": 1.8498943872619266e-05, + "loss": 1.2556, + "step": 6754 + }, + { + "epoch": 2.0119510787617045, + "grad_norm": 0.2258760929107666, + "learning_rate": 1.8498435551882607e-05, + "loss": 1.2758, + "step": 6755 + }, + { + "epoch": 2.012248924961373, + "grad_norm": 0.2275063544511795, + "learning_rate": 1.84979271520775e-05, + "loss": 1.2667, + "step": 6756 + }, + { + "epoch": 2.0125467711610416, + "grad_norm": 0.22864452004432678, + "learning_rate": 1.8497418673208677e-05, + "loss": 1.2592, + "step": 6757 + }, + { + "epoch": 2.0128446173607104, + "grad_norm": 0.22659514844417572, + "learning_rate": 1.8496910115280865e-05, + "loss": 1.2486, + "step": 6758 + }, + { + "epoch": 2.013142463560379, + "grad_norm": 0.2362157255411148, + "learning_rate": 1.8496401478298798e-05, + "loss": 1.2685, + "step": 6759 + }, + { + "epoch": 2.0134403097600475, + "grad_norm": 0.2306687980890274, + "learning_rate": 1.8495892762267208e-05, + "loss": 1.2639, + "step": 6760 + }, + { + "epoch": 2.0137381559597163, + "grad_norm": 0.23007617890834808, + "learning_rate": 1.849538396719083e-05, + "loss": 1.2479, + "step": 6761 + }, + { + "epoch": 2.014036002159385, + "grad_norm": 0.24096724390983582, + "learning_rate": 1.8494875093074392e-05, + "loss": 1.257, + "step": 6762 + }, + { + "epoch": 2.0143338483590534, + "grad_norm": 0.22791236639022827, + "learning_rate": 1.8494366139922634e-05, + "loss": 1.2679, + "step": 6763 + }, + { + "epoch": 2.014631694558722, + "grad_norm": 0.2302304208278656, + "learning_rate": 1.849385710774029e-05, + "loss": 1.2734, + "step": 6764 + }, + { + "epoch": 2.014929540758391, + "grad_norm": 0.2323758900165558, + "learning_rate": 1.8493347996532097e-05, + "loss": 1.255, + "step": 6765 + }, + { + "epoch": 2.0152273869580597, + "grad_norm": 0.22466978430747986, + "learning_rate": 1.849283880630279e-05, + "loss": 1.2689, + "step": 6766 + }, + { + "epoch": 2.015525233157728, + "grad_norm": 0.23928982019424438, + "learning_rate": 1.8492329537057102e-05, + "loss": 1.2644, + "step": 6767 + }, + { + "epoch": 2.015823079357397, + "grad_norm": 0.2315400242805481, + "learning_rate": 1.849182018879978e-05, + "loss": 1.2554, + "step": 6768 + }, + { + "epoch": 2.0161209255570656, + "grad_norm": 0.23092469573020935, + "learning_rate": 1.849131076153556e-05, + "loss": 1.241, + "step": 6769 + }, + { + "epoch": 2.016418771756734, + "grad_norm": 0.21835115551948547, + "learning_rate": 1.8490801255269176e-05, + "loss": 1.2558, + "step": 6770 + }, + { + "epoch": 2.0167166179564027, + "grad_norm": 0.2587580978870392, + "learning_rate": 1.849029167000538e-05, + "loss": 1.249, + "step": 6771 + }, + { + "epoch": 2.0170144641560714, + "grad_norm": 0.2355932891368866, + "learning_rate": 1.84897820057489e-05, + "loss": 1.2595, + "step": 6772 + }, + { + "epoch": 2.01731231035574, + "grad_norm": 0.24647144973278046, + "learning_rate": 1.848927226250448e-05, + "loss": 1.2677, + "step": 6773 + }, + { + "epoch": 2.0176101565554085, + "grad_norm": 0.2150038331747055, + "learning_rate": 1.848876244027687e-05, + "loss": 1.2575, + "step": 6774 + }, + { + "epoch": 2.0179080027550773, + "grad_norm": 0.248734250664711, + "learning_rate": 1.8488252539070815e-05, + "loss": 1.2691, + "step": 6775 + }, + { + "epoch": 2.018205848954746, + "grad_norm": 0.22936883568763733, + "learning_rate": 1.848774255889105e-05, + "loss": 1.2564, + "step": 6776 + }, + { + "epoch": 2.018503695154415, + "grad_norm": 0.23732532560825348, + "learning_rate": 1.8487232499742323e-05, + "loss": 1.2576, + "step": 6777 + }, + { + "epoch": 2.018801541354083, + "grad_norm": 0.2249889373779297, + "learning_rate": 1.8486722361629377e-05, + "loss": 1.2661, + "step": 6778 + }, + { + "epoch": 2.019099387553752, + "grad_norm": 0.21732540428638458, + "learning_rate": 1.848621214455697e-05, + "loss": 1.2626, + "step": 6779 + }, + { + "epoch": 2.0193972337534207, + "grad_norm": 0.2325662225484848, + "learning_rate": 1.8485701848529835e-05, + "loss": 1.2554, + "step": 6780 + }, + { + "epoch": 2.019695079953089, + "grad_norm": 0.2356565296649933, + "learning_rate": 1.848519147355272e-05, + "loss": 1.2618, + "step": 6781 + }, + { + "epoch": 2.019992926152758, + "grad_norm": 0.23597142100334167, + "learning_rate": 1.8484681019630386e-05, + "loss": 1.2529, + "step": 6782 + }, + { + "epoch": 2.0202907723524266, + "grad_norm": 0.2436504364013672, + "learning_rate": 1.8484170486767574e-05, + "loss": 1.2584, + "step": 6783 + }, + { + "epoch": 2.0205886185520954, + "grad_norm": 0.22701208293437958, + "learning_rate": 1.8483659874969034e-05, + "loss": 1.2499, + "step": 6784 + }, + { + "epoch": 2.0208864647517637, + "grad_norm": 0.27323466539382935, + "learning_rate": 1.8483149184239515e-05, + "loss": 1.2675, + "step": 6785 + }, + { + "epoch": 2.0211843109514325, + "grad_norm": 0.22097352147102356, + "learning_rate": 1.848263841458377e-05, + "loss": 1.2548, + "step": 6786 + }, + { + "epoch": 2.0214821571511012, + "grad_norm": 0.2183672934770584, + "learning_rate": 1.8482127566006556e-05, + "loss": 1.266, + "step": 6787 + }, + { + "epoch": 2.0217800033507696, + "grad_norm": 0.2181762307882309, + "learning_rate": 1.848161663851262e-05, + "loss": 1.2596, + "step": 6788 + }, + { + "epoch": 2.0220778495504383, + "grad_norm": 0.2295231968164444, + "learning_rate": 1.8481105632106718e-05, + "loss": 1.2494, + "step": 6789 + }, + { + "epoch": 2.022375695750107, + "grad_norm": 0.22780396044254303, + "learning_rate": 1.8480594546793602e-05, + "loss": 1.2513, + "step": 6790 + }, + { + "epoch": 2.022673541949776, + "grad_norm": 0.23354724049568176, + "learning_rate": 1.8480083382578033e-05, + "loss": 1.2711, + "step": 6791 + }, + { + "epoch": 2.022971388149444, + "grad_norm": 0.2310808002948761, + "learning_rate": 1.847957213946476e-05, + "loss": 1.2674, + "step": 6792 + }, + { + "epoch": 2.023269234349113, + "grad_norm": 0.25081607699394226, + "learning_rate": 1.847906081745854e-05, + "loss": 1.2685, + "step": 6793 + }, + { + "epoch": 2.0235670805487818, + "grad_norm": 0.23171740770339966, + "learning_rate": 1.8478549416564132e-05, + "loss": 1.2666, + "step": 6794 + }, + { + "epoch": 2.02386492674845, + "grad_norm": 0.25872913002967834, + "learning_rate": 1.84780379367863e-05, + "loss": 1.2565, + "step": 6795 + }, + { + "epoch": 2.024162772948119, + "grad_norm": 0.2311062514781952, + "learning_rate": 1.8477526378129794e-05, + "loss": 1.2614, + "step": 6796 + }, + { + "epoch": 2.0244606191477876, + "grad_norm": 0.2204236090183258, + "learning_rate": 1.8477014740599376e-05, + "loss": 1.2553, + "step": 6797 + }, + { + "epoch": 2.0247584653474564, + "grad_norm": 0.23132838308811188, + "learning_rate": 1.8476503024199806e-05, + "loss": 1.2441, + "step": 6798 + }, + { + "epoch": 2.0250563115471247, + "grad_norm": 0.22377510368824005, + "learning_rate": 1.8475991228935847e-05, + "loss": 1.2536, + "step": 6799 + }, + { + "epoch": 2.0253541577467935, + "grad_norm": 0.22365237772464752, + "learning_rate": 1.847547935481226e-05, + "loss": 1.2769, + "step": 6800 + }, + { + "epoch": 2.0256520039464623, + "grad_norm": 0.23301854729652405, + "learning_rate": 1.8474967401833807e-05, + "loss": 1.2469, + "step": 6801 + }, + { + "epoch": 2.0259498501461306, + "grad_norm": 0.21874602138996124, + "learning_rate": 1.847445537000525e-05, + "loss": 1.2656, + "step": 6802 + }, + { + "epoch": 2.0262476963457994, + "grad_norm": 0.23669880628585815, + "learning_rate": 1.8473943259331358e-05, + "loss": 1.2685, + "step": 6803 + }, + { + "epoch": 2.026545542545468, + "grad_norm": 0.22757285833358765, + "learning_rate": 1.8473431069816887e-05, + "loss": 1.2484, + "step": 6804 + }, + { + "epoch": 2.026843388745137, + "grad_norm": 0.23288923501968384, + "learning_rate": 1.847291880146661e-05, + "loss": 1.2371, + "step": 6805 + }, + { + "epoch": 2.0271412349448052, + "grad_norm": 0.22801439464092255, + "learning_rate": 1.8472406454285287e-05, + "loss": 1.2581, + "step": 6806 + }, + { + "epoch": 2.027439081144474, + "grad_norm": 0.21672135591506958, + "learning_rate": 1.847189402827769e-05, + "loss": 1.2546, + "step": 6807 + }, + { + "epoch": 2.027736927344143, + "grad_norm": 0.22586698830127716, + "learning_rate": 1.8471381523448583e-05, + "loss": 1.258, + "step": 6808 + }, + { + "epoch": 2.028034773543811, + "grad_norm": 0.23314444720745087, + "learning_rate": 1.847086893980274e-05, + "loss": 1.2656, + "step": 6809 + }, + { + "epoch": 2.02833261974348, + "grad_norm": 0.24733838438987732, + "learning_rate": 1.847035627734492e-05, + "loss": 1.2514, + "step": 6810 + }, + { + "epoch": 2.0286304659431487, + "grad_norm": 0.23412738740444183, + "learning_rate": 1.84698435360799e-05, + "loss": 1.2699, + "step": 6811 + }, + { + "epoch": 2.0289283121428174, + "grad_norm": 0.2705174386501312, + "learning_rate": 1.846933071601245e-05, + "loss": 1.2667, + "step": 6812 + }, + { + "epoch": 2.0292261583424858, + "grad_norm": 0.22393466532230377, + "learning_rate": 1.8468817817147343e-05, + "loss": 1.2575, + "step": 6813 + }, + { + "epoch": 2.0295240045421545, + "grad_norm": 0.22294221818447113, + "learning_rate": 1.846830483948934e-05, + "loss": 1.261, + "step": 6814 + }, + { + "epoch": 2.0298218507418233, + "grad_norm": 0.2212774157524109, + "learning_rate": 1.846779178304323e-05, + "loss": 1.2584, + "step": 6815 + }, + { + "epoch": 2.0301196969414916, + "grad_norm": 0.29387468099594116, + "learning_rate": 1.8467278647813775e-05, + "loss": 1.2542, + "step": 6816 + }, + { + "epoch": 2.0304175431411604, + "grad_norm": 0.2356308400630951, + "learning_rate": 1.8466765433805754e-05, + "loss": 1.2584, + "step": 6817 + }, + { + "epoch": 2.030715389340829, + "grad_norm": 0.22963428497314453, + "learning_rate": 1.8466252141023937e-05, + "loss": 1.2508, + "step": 6818 + }, + { + "epoch": 2.031013235540498, + "grad_norm": 0.22229976952075958, + "learning_rate": 1.8465738769473107e-05, + "loss": 1.2522, + "step": 6819 + }, + { + "epoch": 2.0313110817401663, + "grad_norm": 0.22314177453517914, + "learning_rate": 1.8465225319158033e-05, + "loss": 1.2671, + "step": 6820 + }, + { + "epoch": 2.031608927939835, + "grad_norm": 0.2310333549976349, + "learning_rate": 1.8464711790083496e-05, + "loss": 1.2392, + "step": 6821 + }, + { + "epoch": 2.031906774139504, + "grad_norm": 0.22531390190124512, + "learning_rate": 1.8464198182254273e-05, + "loss": 1.2593, + "step": 6822 + }, + { + "epoch": 2.032204620339172, + "grad_norm": 0.23093685507774353, + "learning_rate": 1.8463684495675146e-05, + "loss": 1.2418, + "step": 6823 + }, + { + "epoch": 2.032502466538841, + "grad_norm": 0.22888115048408508, + "learning_rate": 1.8463170730350887e-05, + "loss": 1.2571, + "step": 6824 + }, + { + "epoch": 2.0328003127385097, + "grad_norm": 0.23483408987522125, + "learning_rate": 1.846265688628628e-05, + "loss": 1.2541, + "step": 6825 + }, + { + "epoch": 2.0330981589381785, + "grad_norm": 0.23314787447452545, + "learning_rate": 1.8462142963486105e-05, + "loss": 1.2576, + "step": 6826 + }, + { + "epoch": 2.033396005137847, + "grad_norm": 0.2304399311542511, + "learning_rate": 1.8461628961955148e-05, + "loss": 1.2925, + "step": 6827 + }, + { + "epoch": 2.0336938513375156, + "grad_norm": 0.22697293758392334, + "learning_rate": 1.8461114881698184e-05, + "loss": 1.2547, + "step": 6828 + }, + { + "epoch": 2.0339916975371843, + "grad_norm": 0.22943907976150513, + "learning_rate": 1.846060072272e-05, + "loss": 1.2579, + "step": 6829 + }, + { + "epoch": 2.0342895437368527, + "grad_norm": 0.2288867086172104, + "learning_rate": 1.8460086485025382e-05, + "loss": 1.2745, + "step": 6830 + }, + { + "epoch": 2.0345873899365214, + "grad_norm": 0.22665607929229736, + "learning_rate": 1.8459572168619105e-05, + "loss": 1.2611, + "step": 6831 + }, + { + "epoch": 2.03488523613619, + "grad_norm": 0.21407344937324524, + "learning_rate": 1.8459057773505968e-05, + "loss": 1.2605, + "step": 6832 + }, + { + "epoch": 2.035183082335859, + "grad_norm": 0.23433037102222443, + "learning_rate": 1.8458543299690747e-05, + "loss": 1.2699, + "step": 6833 + }, + { + "epoch": 2.0354809285355273, + "grad_norm": 0.2505665719509125, + "learning_rate": 1.845802874717823e-05, + "loss": 1.246, + "step": 6834 + }, + { + "epoch": 2.035778774735196, + "grad_norm": 0.25351783633232117, + "learning_rate": 1.8457514115973202e-05, + "loss": 1.2471, + "step": 6835 + }, + { + "epoch": 2.036076620934865, + "grad_norm": 0.22248394787311554, + "learning_rate": 1.845699940608046e-05, + "loss": 1.2662, + "step": 6836 + }, + { + "epoch": 2.036374467134533, + "grad_norm": 0.3558101952075958, + "learning_rate": 1.8456484617504787e-05, + "loss": 1.2368, + "step": 6837 + }, + { + "epoch": 2.036672313334202, + "grad_norm": 0.28423941135406494, + "learning_rate": 1.845596975025097e-05, + "loss": 1.2575, + "step": 6838 + }, + { + "epoch": 2.0369701595338707, + "grad_norm": 0.2607453763484955, + "learning_rate": 1.8455454804323802e-05, + "loss": 1.267, + "step": 6839 + }, + { + "epoch": 2.0372680057335395, + "grad_norm": 0.22378839552402496, + "learning_rate": 1.8454939779728077e-05, + "loss": 1.2569, + "step": 6840 + }, + { + "epoch": 2.037565851933208, + "grad_norm": 0.2533341348171234, + "learning_rate": 1.8454424676468582e-05, + "loss": 1.2545, + "step": 6841 + }, + { + "epoch": 2.0378636981328766, + "grad_norm": 0.261690616607666, + "learning_rate": 1.8453909494550108e-05, + "loss": 1.249, + "step": 6842 + }, + { + "epoch": 2.0381615443325454, + "grad_norm": 0.23489874601364136, + "learning_rate": 1.8453394233977455e-05, + "loss": 1.2653, + "step": 6843 + }, + { + "epoch": 2.038459390532214, + "grad_norm": 0.2246517837047577, + "learning_rate": 1.8452878894755414e-05, + "loss": 1.2666, + "step": 6844 + }, + { + "epoch": 2.0387572367318825, + "grad_norm": 0.2273196429014206, + "learning_rate": 1.8452363476888777e-05, + "loss": 1.269, + "step": 6845 + }, + { + "epoch": 2.0390550829315512, + "grad_norm": 0.23828180134296417, + "learning_rate": 1.8451847980382344e-05, + "loss": 1.2609, + "step": 6846 + }, + { + "epoch": 2.03935292913122, + "grad_norm": 0.2287394106388092, + "learning_rate": 1.8451332405240906e-05, + "loss": 1.2601, + "step": 6847 + }, + { + "epoch": 2.0396507753308883, + "grad_norm": 0.22577506303787231, + "learning_rate": 1.8450816751469264e-05, + "loss": 1.2557, + "step": 6848 + }, + { + "epoch": 2.039948621530557, + "grad_norm": 0.23476409912109375, + "learning_rate": 1.8450301019072212e-05, + "loss": 1.2583, + "step": 6849 + }, + { + "epoch": 2.040246467730226, + "grad_norm": 0.2375391721725464, + "learning_rate": 1.844978520805455e-05, + "loss": 1.2736, + "step": 6850 + }, + { + "epoch": 2.0405443139298947, + "grad_norm": 0.2250700443983078, + "learning_rate": 1.844926931842108e-05, + "loss": 1.2566, + "step": 6851 + }, + { + "epoch": 2.040842160129563, + "grad_norm": 0.21955886483192444, + "learning_rate": 1.8448753350176597e-05, + "loss": 1.2436, + "step": 6852 + }, + { + "epoch": 2.0411400063292318, + "grad_norm": 0.2313331663608551, + "learning_rate": 1.8448237303325905e-05, + "loss": 1.2628, + "step": 6853 + }, + { + "epoch": 2.0414378525289005, + "grad_norm": 0.2249409407377243, + "learning_rate": 1.8447721177873802e-05, + "loss": 1.2607, + "step": 6854 + }, + { + "epoch": 2.041735698728569, + "grad_norm": 0.23042838275432587, + "learning_rate": 1.8447204973825092e-05, + "loss": 1.2428, + "step": 6855 + }, + { + "epoch": 2.0420335449282376, + "grad_norm": 0.23948293924331665, + "learning_rate": 1.844668869118458e-05, + "loss": 1.2417, + "step": 6856 + }, + { + "epoch": 2.0423313911279064, + "grad_norm": 0.22150684893131256, + "learning_rate": 1.8446172329957063e-05, + "loss": 1.2508, + "step": 6857 + }, + { + "epoch": 2.042629237327575, + "grad_norm": 0.22669459879398346, + "learning_rate": 1.844565589014735e-05, + "loss": 1.2677, + "step": 6858 + }, + { + "epoch": 2.0429270835272435, + "grad_norm": 0.24645200371742249, + "learning_rate": 1.844513937176025e-05, + "loss": 1.2655, + "step": 6859 + }, + { + "epoch": 2.0432249297269123, + "grad_norm": 0.2355499267578125, + "learning_rate": 1.844462277480056e-05, + "loss": 1.24, + "step": 6860 + }, + { + "epoch": 2.043522775926581, + "grad_norm": 0.22788940370082855, + "learning_rate": 1.8444106099273086e-05, + "loss": 1.2664, + "step": 6861 + }, + { + "epoch": 2.0438206221262494, + "grad_norm": 0.22359760105609894, + "learning_rate": 1.8443589345182644e-05, + "loss": 1.2529, + "step": 6862 + }, + { + "epoch": 2.044118468325918, + "grad_norm": 0.2289741486310959, + "learning_rate": 1.8443072512534036e-05, + "loss": 1.2696, + "step": 6863 + }, + { + "epoch": 2.044416314525587, + "grad_norm": 0.2220015972852707, + "learning_rate": 1.844255560133207e-05, + "loss": 1.2524, + "step": 6864 + }, + { + "epoch": 2.0447141607252557, + "grad_norm": 0.23483262956142426, + "learning_rate": 1.844203861158156e-05, + "loss": 1.2668, + "step": 6865 + }, + { + "epoch": 2.045012006924924, + "grad_norm": 0.2522992491722107, + "learning_rate": 1.8441521543287312e-05, + "loss": 1.2647, + "step": 6866 + }, + { + "epoch": 2.045309853124593, + "grad_norm": 0.23144623637199402, + "learning_rate": 1.8441004396454136e-05, + "loss": 1.2507, + "step": 6867 + }, + { + "epoch": 2.0456076993242616, + "grad_norm": 0.22831657528877258, + "learning_rate": 1.8440487171086844e-05, + "loss": 1.2746, + "step": 6868 + }, + { + "epoch": 2.04590554552393, + "grad_norm": 0.22541166841983795, + "learning_rate": 1.843996986719025e-05, + "loss": 1.2666, + "step": 6869 + }, + { + "epoch": 2.0462033917235987, + "grad_norm": 0.2273152619600296, + "learning_rate": 1.8439452484769167e-05, + "loss": 1.2753, + "step": 6870 + }, + { + "epoch": 2.0465012379232674, + "grad_norm": 0.2365824282169342, + "learning_rate": 1.8438935023828405e-05, + "loss": 1.2541, + "step": 6871 + }, + { + "epoch": 2.046799084122936, + "grad_norm": 0.2436744123697281, + "learning_rate": 1.8438417484372785e-05, + "loss": 1.2623, + "step": 6872 + }, + { + "epoch": 2.0470969303226045, + "grad_norm": 0.22438614070415497, + "learning_rate": 1.8437899866407117e-05, + "loss": 1.2677, + "step": 6873 + }, + { + "epoch": 2.0473947765222733, + "grad_norm": 0.21692584455013275, + "learning_rate": 1.8437382169936217e-05, + "loss": 1.2601, + "step": 6874 + }, + { + "epoch": 2.047692622721942, + "grad_norm": 0.23106350004673004, + "learning_rate": 1.8436864394964905e-05, + "loss": 1.2381, + "step": 6875 + }, + { + "epoch": 2.0479904689216104, + "grad_norm": 0.23219044506549835, + "learning_rate": 1.8436346541497993e-05, + "loss": 1.2798, + "step": 6876 + }, + { + "epoch": 2.048288315121279, + "grad_norm": 0.2407381385564804, + "learning_rate": 1.84358286095403e-05, + "loss": 1.2648, + "step": 6877 + }, + { + "epoch": 2.048586161320948, + "grad_norm": 0.23298059403896332, + "learning_rate": 1.8435310599096653e-05, + "loss": 1.2559, + "step": 6878 + }, + { + "epoch": 2.0488840075206167, + "grad_norm": 0.22220873832702637, + "learning_rate": 1.843479251017186e-05, + "loss": 1.2796, + "step": 6879 + }, + { + "epoch": 2.049181853720285, + "grad_norm": 0.23294870555400848, + "learning_rate": 1.843427434277075e-05, + "loss": 1.2622, + "step": 6880 + }, + { + "epoch": 2.049479699919954, + "grad_norm": 0.21809184551239014, + "learning_rate": 1.843375609689814e-05, + "loss": 1.2516, + "step": 6881 + }, + { + "epoch": 2.0497775461196226, + "grad_norm": 0.2328861504793167, + "learning_rate": 1.8433237772558856e-05, + "loss": 1.2596, + "step": 6882 + }, + { + "epoch": 2.050075392319291, + "grad_norm": 0.2302510291337967, + "learning_rate": 1.843271936975771e-05, + "loss": 1.2471, + "step": 6883 + }, + { + "epoch": 2.0503732385189597, + "grad_norm": 0.24519892036914825, + "learning_rate": 1.8432200888499533e-05, + "loss": 1.2461, + "step": 6884 + }, + { + "epoch": 2.0506710847186285, + "grad_norm": 0.2264488786458969, + "learning_rate": 1.843168232878915e-05, + "loss": 1.242, + "step": 6885 + }, + { + "epoch": 2.0509689309182972, + "grad_norm": 0.2383405566215515, + "learning_rate": 1.8431163690631383e-05, + "loss": 1.2608, + "step": 6886 + }, + { + "epoch": 2.0512667771179656, + "grad_norm": 0.22933566570281982, + "learning_rate": 1.8430644974031057e-05, + "loss": 1.2852, + "step": 6887 + }, + { + "epoch": 2.0515646233176343, + "grad_norm": 0.2428789883852005, + "learning_rate": 1.8430126178992996e-05, + "loss": 1.2723, + "step": 6888 + }, + { + "epoch": 2.051862469517303, + "grad_norm": 0.2358553111553192, + "learning_rate": 1.8429607305522036e-05, + "loss": 1.239, + "step": 6889 + }, + { + "epoch": 2.0521603157169714, + "grad_norm": 0.22229686379432678, + "learning_rate": 1.8429088353622994e-05, + "loss": 1.2562, + "step": 6890 + }, + { + "epoch": 2.05245816191664, + "grad_norm": 0.2688424289226532, + "learning_rate": 1.8428569323300706e-05, + "loss": 1.2959, + "step": 6891 + }, + { + "epoch": 2.052756008116309, + "grad_norm": 0.22803576290607452, + "learning_rate": 1.8428050214559992e-05, + "loss": 1.2387, + "step": 6892 + }, + { + "epoch": 2.0530538543159778, + "grad_norm": 0.23481722176074982, + "learning_rate": 1.842753102740569e-05, + "loss": 1.2629, + "step": 6893 + }, + { + "epoch": 2.053351700515646, + "grad_norm": 0.23635835945606232, + "learning_rate": 1.8427011761842627e-05, + "loss": 1.2751, + "step": 6894 + }, + { + "epoch": 2.053649546715315, + "grad_norm": 0.21708984673023224, + "learning_rate": 1.8426492417875636e-05, + "loss": 1.2518, + "step": 6895 + }, + { + "epoch": 2.0539473929149836, + "grad_norm": 0.2377351075410843, + "learning_rate": 1.8425972995509545e-05, + "loss": 1.2634, + "step": 6896 + }, + { + "epoch": 2.054245239114652, + "grad_norm": 0.2265872359275818, + "learning_rate": 1.8425453494749193e-05, + "loss": 1.2558, + "step": 6897 + }, + { + "epoch": 2.0545430853143207, + "grad_norm": 0.2197393923997879, + "learning_rate": 1.8424933915599408e-05, + "loss": 1.2535, + "step": 6898 + }, + { + "epoch": 2.0548409315139895, + "grad_norm": 0.2214498519897461, + "learning_rate": 1.8424414258065028e-05, + "loss": 1.2559, + "step": 6899 + }, + { + "epoch": 2.0551387777136583, + "grad_norm": 0.23770642280578613, + "learning_rate": 1.842389452215088e-05, + "loss": 1.2604, + "step": 6900 + }, + { + "epoch": 2.0554366239133266, + "grad_norm": 0.2686885595321655, + "learning_rate": 1.8423374707861808e-05, + "loss": 1.2723, + "step": 6901 + }, + { + "epoch": 2.0557344701129954, + "grad_norm": 0.24521741271018982, + "learning_rate": 1.8422854815202645e-05, + "loss": 1.2574, + "step": 6902 + }, + { + "epoch": 2.056032316312664, + "grad_norm": 0.233657568693161, + "learning_rate": 1.842233484417823e-05, + "loss": 1.2642, + "step": 6903 + }, + { + "epoch": 2.0563301625123325, + "grad_norm": 0.2279568612575531, + "learning_rate": 1.84218147947934e-05, + "loss": 1.2602, + "step": 6904 + }, + { + "epoch": 2.0566280087120012, + "grad_norm": 0.22631187736988068, + "learning_rate": 1.842129466705299e-05, + "loss": 1.2694, + "step": 6905 + }, + { + "epoch": 2.05692585491167, + "grad_norm": 0.2367851287126541, + "learning_rate": 1.8420774460961846e-05, + "loss": 1.2638, + "step": 6906 + }, + { + "epoch": 2.057223701111339, + "grad_norm": 0.23497571051120758, + "learning_rate": 1.84202541765248e-05, + "loss": 1.2538, + "step": 6907 + }, + { + "epoch": 2.057521547311007, + "grad_norm": 0.22271881997585297, + "learning_rate": 1.84197338137467e-05, + "loss": 1.2397, + "step": 6908 + }, + { + "epoch": 2.057819393510676, + "grad_norm": 0.2256689965724945, + "learning_rate": 1.841921337263238e-05, + "loss": 1.274, + "step": 6909 + }, + { + "epoch": 2.0581172397103447, + "grad_norm": 0.2230585217475891, + "learning_rate": 1.8418692853186687e-05, + "loss": 1.2515, + "step": 6910 + }, + { + "epoch": 2.0584150859100134, + "grad_norm": 0.23039759695529938, + "learning_rate": 1.8418172255414463e-05, + "loss": 1.257, + "step": 6911 + }, + { + "epoch": 2.0587129321096818, + "grad_norm": 0.22781473398208618, + "learning_rate": 1.8417651579320555e-05, + "loss": 1.2669, + "step": 6912 + }, + { + "epoch": 2.0590107783093505, + "grad_norm": 0.23078389465808868, + "learning_rate": 1.8417130824909797e-05, + "loss": 1.2602, + "step": 6913 + }, + { + "epoch": 2.0593086245090193, + "grad_norm": 0.23632900416851044, + "learning_rate": 1.841660999218705e-05, + "loss": 1.2497, + "step": 6914 + }, + { + "epoch": 2.0596064707086876, + "grad_norm": 0.22319194674491882, + "learning_rate": 1.8416089081157142e-05, + "loss": 1.2702, + "step": 6915 + }, + { + "epoch": 2.0599043169083564, + "grad_norm": 0.23077838122844696, + "learning_rate": 1.8415568091824934e-05, + "loss": 1.2526, + "step": 6916 + }, + { + "epoch": 2.060202163108025, + "grad_norm": 0.24915198981761932, + "learning_rate": 1.8415047024195263e-05, + "loss": 1.2477, + "step": 6917 + }, + { + "epoch": 2.060500009307694, + "grad_norm": 0.22520673274993896, + "learning_rate": 1.8414525878272986e-05, + "loss": 1.2594, + "step": 6918 + }, + { + "epoch": 2.0607978555073623, + "grad_norm": 0.22590255737304688, + "learning_rate": 1.8414004654062943e-05, + "loss": 1.2708, + "step": 6919 + }, + { + "epoch": 2.061095701707031, + "grad_norm": 0.22848273813724518, + "learning_rate": 1.8413483351569986e-05, + "loss": 1.2622, + "step": 6920 + }, + { + "epoch": 2.0613935479067, + "grad_norm": 0.22753171622753143, + "learning_rate": 1.8412961970798974e-05, + "loss": 1.2611, + "step": 6921 + }, + { + "epoch": 2.061691394106368, + "grad_norm": 0.23591996729373932, + "learning_rate": 1.8412440511754745e-05, + "loss": 1.2688, + "step": 6922 + }, + { + "epoch": 2.061989240306037, + "grad_norm": 0.2515062987804413, + "learning_rate": 1.8411918974442154e-05, + "loss": 1.2748, + "step": 6923 + }, + { + "epoch": 2.0622870865057057, + "grad_norm": 0.41803601384162903, + "learning_rate": 1.841139735886606e-05, + "loss": 1.2672, + "step": 6924 + }, + { + "epoch": 2.0625849327053745, + "grad_norm": 0.2785837650299072, + "learning_rate": 1.841087566503131e-05, + "loss": 1.255, + "step": 6925 + }, + { + "epoch": 2.062882778905043, + "grad_norm": 0.28804776072502136, + "learning_rate": 1.8410353892942757e-05, + "loss": 1.2779, + "step": 6926 + }, + { + "epoch": 2.0631806251047116, + "grad_norm": 0.23044361174106598, + "learning_rate": 1.8409832042605258e-05, + "loss": 1.244, + "step": 6927 + }, + { + "epoch": 2.0634784713043803, + "grad_norm": 0.225747749209404, + "learning_rate": 1.840931011402367e-05, + "loss": 1.2545, + "step": 6928 + }, + { + "epoch": 2.0637763175040487, + "grad_norm": 0.26567065715789795, + "learning_rate": 1.8408788107202844e-05, + "loss": 1.2725, + "step": 6929 + }, + { + "epoch": 2.0640741637037174, + "grad_norm": 0.25946280360221863, + "learning_rate": 1.8408266022147643e-05, + "loss": 1.2642, + "step": 6930 + }, + { + "epoch": 2.064372009903386, + "grad_norm": 0.23841392993927002, + "learning_rate": 1.8407743858862915e-05, + "loss": 1.2548, + "step": 6931 + }, + { + "epoch": 2.064669856103055, + "grad_norm": 0.2300664484500885, + "learning_rate": 1.840722161735353e-05, + "loss": 1.2523, + "step": 6932 + }, + { + "epoch": 2.0649677023027233, + "grad_norm": 0.23629789054393768, + "learning_rate": 1.840669929762434e-05, + "loss": 1.2669, + "step": 6933 + }, + { + "epoch": 2.065265548502392, + "grad_norm": 0.23946943879127502, + "learning_rate": 1.8406176899680203e-05, + "loss": 1.2745, + "step": 6934 + }, + { + "epoch": 2.065563394702061, + "grad_norm": 0.23306971788406372, + "learning_rate": 1.8405654423525984e-05, + "loss": 1.2748, + "step": 6935 + }, + { + "epoch": 2.065861240901729, + "grad_norm": 0.21573369204998016, + "learning_rate": 1.840513186916654e-05, + "loss": 1.2631, + "step": 6936 + }, + { + "epoch": 2.066159087101398, + "grad_norm": 0.23127424716949463, + "learning_rate": 1.8404609236606736e-05, + "loss": 1.271, + "step": 6937 + }, + { + "epoch": 2.0664569333010667, + "grad_norm": 0.234597310423851, + "learning_rate": 1.8404086525851434e-05, + "loss": 1.2648, + "step": 6938 + }, + { + "epoch": 2.0667547795007355, + "grad_norm": 0.22970066964626312, + "learning_rate": 1.8403563736905498e-05, + "loss": 1.2408, + "step": 6939 + }, + { + "epoch": 2.067052625700404, + "grad_norm": 0.22580933570861816, + "learning_rate": 1.840304086977379e-05, + "loss": 1.2748, + "step": 6940 + }, + { + "epoch": 2.0673504719000726, + "grad_norm": 0.2299324870109558, + "learning_rate": 1.8402517924461173e-05, + "loss": 1.2589, + "step": 6941 + }, + { + "epoch": 2.0676483180997414, + "grad_norm": 0.2186276614665985, + "learning_rate": 1.840199490097251e-05, + "loss": 1.2525, + "step": 6942 + }, + { + "epoch": 2.0679461642994097, + "grad_norm": 0.2327873557806015, + "learning_rate": 1.840147179931268e-05, + "loss": 1.2632, + "step": 6943 + }, + { + "epoch": 2.0682440104990785, + "grad_norm": 0.23202918469905853, + "learning_rate": 1.8400948619486538e-05, + "loss": 1.2711, + "step": 6944 + }, + { + "epoch": 2.0685418566987472, + "grad_norm": 0.236239954829216, + "learning_rate": 1.8400425361498953e-05, + "loss": 1.2541, + "step": 6945 + }, + { + "epoch": 2.068839702898416, + "grad_norm": 0.21637538075447083, + "learning_rate": 1.8399902025354798e-05, + "loss": 1.2592, + "step": 6946 + }, + { + "epoch": 2.0691375490980843, + "grad_norm": 0.22838181257247925, + "learning_rate": 1.839937861105894e-05, + "loss": 1.2749, + "step": 6947 + }, + { + "epoch": 2.069435395297753, + "grad_norm": 0.2435142993927002, + "learning_rate": 1.839885511861625e-05, + "loss": 1.2732, + "step": 6948 + }, + { + "epoch": 2.069733241497422, + "grad_norm": 0.23763881623744965, + "learning_rate": 1.8398331548031595e-05, + "loss": 1.2604, + "step": 6949 + }, + { + "epoch": 2.07003108769709, + "grad_norm": 0.2310408502817154, + "learning_rate": 1.8397807899309847e-05, + "loss": 1.2513, + "step": 6950 + }, + { + "epoch": 2.070328933896759, + "grad_norm": 0.22499972581863403, + "learning_rate": 1.839728417245588e-05, + "loss": 1.2467, + "step": 6951 + }, + { + "epoch": 2.0706267800964278, + "grad_norm": 0.22741033136844635, + "learning_rate": 1.8396760367474565e-05, + "loss": 1.2614, + "step": 6952 + }, + { + "epoch": 2.0709246262960965, + "grad_norm": 0.23971451818943024, + "learning_rate": 1.839623648437078e-05, + "loss": 1.2663, + "step": 6953 + }, + { + "epoch": 2.071222472495765, + "grad_norm": 0.22412295639514923, + "learning_rate": 1.8395712523149392e-05, + "loss": 1.2601, + "step": 6954 + }, + { + "epoch": 2.0715203186954336, + "grad_norm": 0.22380006313323975, + "learning_rate": 1.839518848381528e-05, + "loss": 1.2614, + "step": 6955 + }, + { + "epoch": 2.0718181648951024, + "grad_norm": 0.2436816543340683, + "learning_rate": 1.8394664366373317e-05, + "loss": 1.2687, + "step": 6956 + }, + { + "epoch": 2.0721160110947707, + "grad_norm": 0.24516181647777557, + "learning_rate": 1.8394140170828382e-05, + "loss": 1.2494, + "step": 6957 + }, + { + "epoch": 2.0724138572944395, + "grad_norm": 0.21680797636508942, + "learning_rate": 1.8393615897185352e-05, + "loss": 1.2705, + "step": 6958 + }, + { + "epoch": 2.0727117034941083, + "grad_norm": 0.22413359582424164, + "learning_rate": 1.8393091545449103e-05, + "loss": 1.2561, + "step": 6959 + }, + { + "epoch": 2.073009549693777, + "grad_norm": 0.21973325312137604, + "learning_rate": 1.8392567115624514e-05, + "loss": 1.2504, + "step": 6960 + }, + { + "epoch": 2.0733073958934454, + "grad_norm": 0.2217159867286682, + "learning_rate": 1.8392042607716467e-05, + "loss": 1.2565, + "step": 6961 + }, + { + "epoch": 2.073605242093114, + "grad_norm": 0.22410368919372559, + "learning_rate": 1.839151802172984e-05, + "loss": 1.2835, + "step": 6962 + }, + { + "epoch": 2.073903088292783, + "grad_norm": 0.23028461635112762, + "learning_rate": 1.839099335766951e-05, + "loss": 1.2482, + "step": 6963 + }, + { + "epoch": 2.0742009344924517, + "grad_norm": 0.23592053353786469, + "learning_rate": 1.8390468615540366e-05, + "loss": 1.2453, + "step": 6964 + }, + { + "epoch": 2.07449878069212, + "grad_norm": 0.23165811598300934, + "learning_rate": 1.8389943795347284e-05, + "loss": 1.2465, + "step": 6965 + }, + { + "epoch": 2.074796626891789, + "grad_norm": 0.24902459979057312, + "learning_rate": 1.8389418897095145e-05, + "loss": 1.2702, + "step": 6966 + }, + { + "epoch": 2.0750944730914576, + "grad_norm": 0.24945801496505737, + "learning_rate": 1.8388893920788843e-05, + "loss": 1.2578, + "step": 6967 + }, + { + "epoch": 2.075392319291126, + "grad_norm": 0.22417689859867096, + "learning_rate": 1.8388368866433252e-05, + "loss": 1.2624, + "step": 6968 + }, + { + "epoch": 2.0756901654907947, + "grad_norm": 0.24911852180957794, + "learning_rate": 1.8387843734033258e-05, + "loss": 1.2526, + "step": 6969 + }, + { + "epoch": 2.0759880116904634, + "grad_norm": 0.23068749904632568, + "learning_rate": 1.8387318523593754e-05, + "loss": 1.2449, + "step": 6970 + }, + { + "epoch": 2.0762858578901318, + "grad_norm": 0.2273305356502533, + "learning_rate": 1.838679323511962e-05, + "loss": 1.2681, + "step": 6971 + }, + { + "epoch": 2.0765837040898005, + "grad_norm": 0.22893273830413818, + "learning_rate": 1.8386267868615747e-05, + "loss": 1.265, + "step": 6972 + }, + { + "epoch": 2.0768815502894693, + "grad_norm": 0.23285570740699768, + "learning_rate": 1.8385742424087022e-05, + "loss": 1.2615, + "step": 6973 + }, + { + "epoch": 2.077179396489138, + "grad_norm": 0.24007824063301086, + "learning_rate": 1.838521690153833e-05, + "loss": 1.2699, + "step": 6974 + }, + { + "epoch": 2.0774772426888064, + "grad_norm": 0.22753021121025085, + "learning_rate": 1.8384691300974563e-05, + "loss": 1.261, + "step": 6975 + }, + { + "epoch": 2.077775088888475, + "grad_norm": 0.2304292917251587, + "learning_rate": 1.8384165622400613e-05, + "loss": 1.255, + "step": 6976 + }, + { + "epoch": 2.078072935088144, + "grad_norm": 0.24333694577217102, + "learning_rate": 1.838363986582137e-05, + "loss": 1.2575, + "step": 6977 + }, + { + "epoch": 2.0783707812878127, + "grad_norm": 0.22278420627117157, + "learning_rate": 1.838311403124172e-05, + "loss": 1.2443, + "step": 6978 + }, + { + "epoch": 2.078668627487481, + "grad_norm": 0.22980521619319916, + "learning_rate": 1.8382588118666564e-05, + "loss": 1.2649, + "step": 6979 + }, + { + "epoch": 2.07896647368715, + "grad_norm": 0.22375096380710602, + "learning_rate": 1.838206212810079e-05, + "loss": 1.249, + "step": 6980 + }, + { + "epoch": 2.0792643198868186, + "grad_norm": 0.21912263333797455, + "learning_rate": 1.8381536059549298e-05, + "loss": 1.2602, + "step": 6981 + }, + { + "epoch": 2.079562166086487, + "grad_norm": 0.23730771243572235, + "learning_rate": 1.838100991301697e-05, + "loss": 1.2677, + "step": 6982 + }, + { + "epoch": 2.0798600122861557, + "grad_norm": 0.21935300529003143, + "learning_rate": 1.8380483688508713e-05, + "loss": 1.2646, + "step": 6983 + }, + { + "epoch": 2.0801578584858245, + "grad_norm": 0.22099409997463226, + "learning_rate": 1.8379957386029417e-05, + "loss": 1.2589, + "step": 6984 + }, + { + "epoch": 2.0804557046854932, + "grad_norm": 0.2275436371564865, + "learning_rate": 1.8379431005583977e-05, + "loss": 1.2545, + "step": 6985 + }, + { + "epoch": 2.0807535508851616, + "grad_norm": 0.22090966999530792, + "learning_rate": 1.83789045471773e-05, + "loss": 1.2587, + "step": 6986 + }, + { + "epoch": 2.0810513970848303, + "grad_norm": 0.23452916741371155, + "learning_rate": 1.8378378010814276e-05, + "loss": 1.2536, + "step": 6987 + }, + { + "epoch": 2.081349243284499, + "grad_norm": 0.24162159860134125, + "learning_rate": 1.8377851396499804e-05, + "loss": 1.2646, + "step": 6988 + }, + { + "epoch": 2.0816470894841674, + "grad_norm": 0.22596555948257446, + "learning_rate": 1.8377324704238785e-05, + "loss": 1.2561, + "step": 6989 + }, + { + "epoch": 2.081944935683836, + "grad_norm": 0.23950867354869843, + "learning_rate": 1.8376797934036118e-05, + "loss": 1.2505, + "step": 6990 + }, + { + "epoch": 2.082242781883505, + "grad_norm": 0.23692114651203156, + "learning_rate": 1.8376271085896706e-05, + "loss": 1.254, + "step": 6991 + }, + { + "epoch": 2.0825406280831738, + "grad_norm": 0.2381732314825058, + "learning_rate": 1.8375744159825452e-05, + "loss": 1.276, + "step": 6992 + }, + { + "epoch": 2.082838474282842, + "grad_norm": 0.22579234838485718, + "learning_rate": 1.8375217155827255e-05, + "loss": 1.2728, + "step": 6993 + }, + { + "epoch": 2.083136320482511, + "grad_norm": 0.22766689956188202, + "learning_rate": 1.837469007390702e-05, + "loss": 1.2296, + "step": 6994 + }, + { + "epoch": 2.0834341666821796, + "grad_norm": 0.2368287295103073, + "learning_rate": 1.8374162914069652e-05, + "loss": 1.2648, + "step": 6995 + }, + { + "epoch": 2.083732012881848, + "grad_norm": 0.21219930052757263, + "learning_rate": 1.8373635676320052e-05, + "loss": 1.2447, + "step": 6996 + }, + { + "epoch": 2.0840298590815167, + "grad_norm": 0.23508693277835846, + "learning_rate": 1.8373108360663126e-05, + "loss": 1.2649, + "step": 6997 + }, + { + "epoch": 2.0843277052811855, + "grad_norm": 0.23294752836227417, + "learning_rate": 1.8372580967103787e-05, + "loss": 1.246, + "step": 6998 + }, + { + "epoch": 2.0846255514808543, + "grad_norm": 0.2368563860654831, + "learning_rate": 1.8372053495646934e-05, + "loss": 1.2609, + "step": 6999 + }, + { + "epoch": 2.0849233976805226, + "grad_norm": 0.25801020860671997, + "learning_rate": 1.8371525946297474e-05, + "loss": 1.2652, + "step": 7000 + }, + { + "epoch": 2.0849233976805226, + "eval_loss": 1.3425718545913696, + "eval_runtime": 21.1491, + "eval_samples_per_second": 81.989, + "eval_steps_per_second": 5.154, + "step": 7000 + }, + { + "epoch": 2.0852212438801914, + "grad_norm": 0.23416325449943542, + "learning_rate": 1.8370998319060322e-05, + "loss": 1.2779, + "step": 7001 + }, + { + "epoch": 2.08551909007986, + "grad_norm": 0.22796592116355896, + "learning_rate": 1.837047061394038e-05, + "loss": 1.2707, + "step": 7002 + }, + { + "epoch": 2.0858169362795285, + "grad_norm": 0.24770605564117432, + "learning_rate": 1.8369942830942567e-05, + "loss": 1.2526, + "step": 7003 + }, + { + "epoch": 2.0861147824791972, + "grad_norm": 0.23605570197105408, + "learning_rate": 1.8369414970071783e-05, + "loss": 1.2479, + "step": 7004 + }, + { + "epoch": 2.086412628678866, + "grad_norm": 0.23263491690158844, + "learning_rate": 1.8368887031332945e-05, + "loss": 1.2474, + "step": 7005 + }, + { + "epoch": 2.086710474878535, + "grad_norm": 0.24133925139904022, + "learning_rate": 1.836835901473096e-05, + "loss": 1.2501, + "step": 7006 + }, + { + "epoch": 2.087008321078203, + "grad_norm": 0.20939506590366364, + "learning_rate": 1.836783092027075e-05, + "loss": 1.2444, + "step": 7007 + }, + { + "epoch": 2.087306167277872, + "grad_norm": 0.22284309566020966, + "learning_rate": 1.8367302747957216e-05, + "loss": 1.2544, + "step": 7008 + }, + { + "epoch": 2.0876040134775407, + "grad_norm": 0.24253371357917786, + "learning_rate": 1.8366774497795284e-05, + "loss": 1.2658, + "step": 7009 + }, + { + "epoch": 2.087901859677209, + "grad_norm": 0.23892003297805786, + "learning_rate": 1.8366246169789858e-05, + "loss": 1.2367, + "step": 7010 + }, + { + "epoch": 2.0881997058768778, + "grad_norm": 0.22930456697940826, + "learning_rate": 1.8365717763945862e-05, + "loss": 1.2601, + "step": 7011 + }, + { + "epoch": 2.0884975520765465, + "grad_norm": 0.2401750683784485, + "learning_rate": 1.836518928026821e-05, + "loss": 1.2688, + "step": 7012 + }, + { + "epoch": 2.0887953982762153, + "grad_norm": 0.2856822609901428, + "learning_rate": 1.8364660718761816e-05, + "loss": 1.2573, + "step": 7013 + }, + { + "epoch": 2.0890932444758836, + "grad_norm": 0.2995435297489166, + "learning_rate": 1.83641320794316e-05, + "loss": 1.2656, + "step": 7014 + }, + { + "epoch": 2.0893910906755524, + "grad_norm": 0.24359357357025146, + "learning_rate": 1.836360336228248e-05, + "loss": 1.2537, + "step": 7015 + }, + { + "epoch": 2.089688936875221, + "grad_norm": 0.5386900901794434, + "learning_rate": 1.8363074567319374e-05, + "loss": 1.2737, + "step": 7016 + }, + { + "epoch": 2.0899867830748895, + "grad_norm": 0.2589731216430664, + "learning_rate": 1.8362545694547202e-05, + "loss": 1.2579, + "step": 7017 + }, + { + "epoch": 2.0902846292745583, + "grad_norm": 0.273444801568985, + "learning_rate": 1.836201674397089e-05, + "loss": 1.2638, + "step": 7018 + }, + { + "epoch": 2.090582475474227, + "grad_norm": 0.23925411701202393, + "learning_rate": 1.8361487715595353e-05, + "loss": 1.2682, + "step": 7019 + }, + { + "epoch": 2.090880321673896, + "grad_norm": 0.2418997436761856, + "learning_rate": 1.8360958609425512e-05, + "loss": 1.2489, + "step": 7020 + }, + { + "epoch": 2.091178167873564, + "grad_norm": 0.2647736072540283, + "learning_rate": 1.8360429425466297e-05, + "loss": 1.2583, + "step": 7021 + }, + { + "epoch": 2.091476014073233, + "grad_norm": 0.2629542350769043, + "learning_rate": 1.8359900163722622e-05, + "loss": 1.259, + "step": 7022 + }, + { + "epoch": 2.0917738602729017, + "grad_norm": 0.24343015253543854, + "learning_rate": 1.835937082419942e-05, + "loss": 1.2715, + "step": 7023 + }, + { + "epoch": 2.09207170647257, + "grad_norm": 0.21744422614574432, + "learning_rate": 1.835884140690161e-05, + "loss": 1.2327, + "step": 7024 + }, + { + "epoch": 2.092369552672239, + "grad_norm": 0.2299731820821762, + "learning_rate": 1.8358311911834122e-05, + "loss": 1.2448, + "step": 7025 + }, + { + "epoch": 2.0926673988719076, + "grad_norm": 0.25502943992614746, + "learning_rate": 1.8357782339001877e-05, + "loss": 1.2617, + "step": 7026 + }, + { + "epoch": 2.0929652450715763, + "grad_norm": 0.22891417145729065, + "learning_rate": 1.835725268840981e-05, + "loss": 1.2534, + "step": 7027 + }, + { + "epoch": 2.0932630912712447, + "grad_norm": 0.2278069704771042, + "learning_rate": 1.835672296006284e-05, + "loss": 1.2672, + "step": 7028 + }, + { + "epoch": 2.0935609374709134, + "grad_norm": 0.22990724444389343, + "learning_rate": 1.8356193153965897e-05, + "loss": 1.2635, + "step": 7029 + }, + { + "epoch": 2.093858783670582, + "grad_norm": 0.23280049860477448, + "learning_rate": 1.8355663270123916e-05, + "loss": 1.2564, + "step": 7030 + }, + { + "epoch": 2.094156629870251, + "grad_norm": 0.2301468402147293, + "learning_rate": 1.8355133308541827e-05, + "loss": 1.2705, + "step": 7031 + }, + { + "epoch": 2.0944544760699193, + "grad_norm": 0.227765753865242, + "learning_rate": 1.8354603269224554e-05, + "loss": 1.2808, + "step": 7032 + }, + { + "epoch": 2.094752322269588, + "grad_norm": 0.22093096375465393, + "learning_rate": 1.8354073152177032e-05, + "loss": 1.2482, + "step": 7033 + }, + { + "epoch": 2.095050168469257, + "grad_norm": 0.21572571992874146, + "learning_rate": 1.835354295740419e-05, + "loss": 1.2628, + "step": 7034 + }, + { + "epoch": 2.095348014668925, + "grad_norm": 0.23582948744297028, + "learning_rate": 1.8353012684910968e-05, + "loss": 1.2565, + "step": 7035 + }, + { + "epoch": 2.095645860868594, + "grad_norm": 0.2237798422574997, + "learning_rate": 1.8352482334702296e-05, + "loss": 1.2664, + "step": 7036 + }, + { + "epoch": 2.0959437070682627, + "grad_norm": 0.22031529247760773, + "learning_rate": 1.8351951906783108e-05, + "loss": 1.2567, + "step": 7037 + }, + { + "epoch": 2.096241553267931, + "grad_norm": 0.22599531710147858, + "learning_rate": 1.8351421401158337e-05, + "loss": 1.2484, + "step": 7038 + }, + { + "epoch": 2.0965393994676, + "grad_norm": 0.22428205609321594, + "learning_rate": 1.835089081783292e-05, + "loss": 1.2602, + "step": 7039 + }, + { + "epoch": 2.0968372456672686, + "grad_norm": 0.2303728461265564, + "learning_rate": 1.8350360156811796e-05, + "loss": 1.2655, + "step": 7040 + }, + { + "epoch": 2.0971350918669374, + "grad_norm": 0.22629369795322418, + "learning_rate": 1.83498294180999e-05, + "loss": 1.2672, + "step": 7041 + }, + { + "epoch": 2.0974329380666057, + "grad_norm": 0.22464722394943237, + "learning_rate": 1.834929860170217e-05, + "loss": 1.2596, + "step": 7042 + }, + { + "epoch": 2.0977307842662745, + "grad_norm": 0.2226731777191162, + "learning_rate": 1.8348767707623544e-05, + "loss": 1.2639, + "step": 7043 + }, + { + "epoch": 2.0980286304659432, + "grad_norm": 0.24072232842445374, + "learning_rate": 1.8348236735868963e-05, + "loss": 1.2646, + "step": 7044 + }, + { + "epoch": 2.098326476665612, + "grad_norm": 0.21738742291927338, + "learning_rate": 1.8347705686443365e-05, + "loss": 1.2569, + "step": 7045 + }, + { + "epoch": 2.0986243228652803, + "grad_norm": 0.22332030534744263, + "learning_rate": 1.8347174559351693e-05, + "loss": 1.2646, + "step": 7046 + }, + { + "epoch": 2.098922169064949, + "grad_norm": 0.23183809220790863, + "learning_rate": 1.834664335459889e-05, + "loss": 1.2605, + "step": 7047 + }, + { + "epoch": 2.099220015264618, + "grad_norm": 0.23424433171749115, + "learning_rate": 1.8346112072189894e-05, + "loss": 1.2593, + "step": 7048 + }, + { + "epoch": 2.099517861464286, + "grad_norm": 0.2322787046432495, + "learning_rate": 1.834558071212965e-05, + "loss": 1.234, + "step": 7049 + }, + { + "epoch": 2.099815707663955, + "grad_norm": 0.23153313994407654, + "learning_rate": 1.8345049274423102e-05, + "loss": 1.2555, + "step": 7050 + }, + { + "epoch": 2.1001135538636237, + "grad_norm": 0.22688642144203186, + "learning_rate": 1.8344517759075193e-05, + "loss": 1.2356, + "step": 7051 + }, + { + "epoch": 2.1004114000632925, + "grad_norm": 0.22773028910160065, + "learning_rate": 1.8343986166090873e-05, + "loss": 1.25, + "step": 7052 + }, + { + "epoch": 2.100709246262961, + "grad_norm": 0.23619934916496277, + "learning_rate": 1.8343454495475083e-05, + "loss": 1.2741, + "step": 7053 + }, + { + "epoch": 2.1010070924626296, + "grad_norm": 0.22756488621234894, + "learning_rate": 1.8342922747232768e-05, + "loss": 1.2498, + "step": 7054 + }, + { + "epoch": 2.1013049386622984, + "grad_norm": 0.2158007025718689, + "learning_rate": 1.834239092136888e-05, + "loss": 1.2508, + "step": 7055 + }, + { + "epoch": 2.1016027848619667, + "grad_norm": 0.21473854780197144, + "learning_rate": 1.8341859017888363e-05, + "loss": 1.2704, + "step": 7056 + }, + { + "epoch": 2.1019006310616355, + "grad_norm": 0.22193129360675812, + "learning_rate": 1.834132703679617e-05, + "loss": 1.2839, + "step": 7057 + }, + { + "epoch": 2.1021984772613043, + "grad_norm": 0.23289480805397034, + "learning_rate": 1.8340794978097248e-05, + "loss": 1.266, + "step": 7058 + }, + { + "epoch": 2.102496323460973, + "grad_norm": 0.22643692791461945, + "learning_rate": 1.8340262841796546e-05, + "loss": 1.2644, + "step": 7059 + }, + { + "epoch": 2.1027941696606414, + "grad_norm": 0.2265244424343109, + "learning_rate": 1.8339730627899017e-05, + "loss": 1.2534, + "step": 7060 + }, + { + "epoch": 2.10309201586031, + "grad_norm": 0.22465166449546814, + "learning_rate": 1.8339198336409615e-05, + "loss": 1.243, + "step": 7061 + }, + { + "epoch": 2.103389862059979, + "grad_norm": 0.2255132794380188, + "learning_rate": 1.8338665967333288e-05, + "loss": 1.2651, + "step": 7062 + }, + { + "epoch": 2.1036877082596472, + "grad_norm": 0.23886889219284058, + "learning_rate": 1.833813352067499e-05, + "loss": 1.2605, + "step": 7063 + }, + { + "epoch": 2.103985554459316, + "grad_norm": 0.2307594269514084, + "learning_rate": 1.8337600996439678e-05, + "loss": 1.2612, + "step": 7064 + }, + { + "epoch": 2.104283400658985, + "grad_norm": 0.23562608659267426, + "learning_rate": 1.83370683946323e-05, + "loss": 1.2712, + "step": 7065 + }, + { + "epoch": 2.1045812468586536, + "grad_norm": 0.22392936050891876, + "learning_rate": 1.8336535715257818e-05, + "loss": 1.2533, + "step": 7066 + }, + { + "epoch": 2.104879093058322, + "grad_norm": 0.233314648270607, + "learning_rate": 1.8336002958321185e-05, + "loss": 1.2763, + "step": 7067 + }, + { + "epoch": 2.1051769392579907, + "grad_norm": 0.2265716791152954, + "learning_rate": 1.8335470123827356e-05, + "loss": 1.248, + "step": 7068 + }, + { + "epoch": 2.1054747854576594, + "grad_norm": 0.2358282506465912, + "learning_rate": 1.833493721178129e-05, + "loss": 1.2695, + "step": 7069 + }, + { + "epoch": 2.1057726316573278, + "grad_norm": 0.2352137416601181, + "learning_rate": 1.8334404222187953e-05, + "loss": 1.2627, + "step": 7070 + }, + { + "epoch": 2.1060704778569965, + "grad_norm": 0.23004014790058136, + "learning_rate": 1.8333871155052286e-05, + "loss": 1.2474, + "step": 7071 + }, + { + "epoch": 2.1063683240566653, + "grad_norm": 0.24043875932693481, + "learning_rate": 1.8333338010379264e-05, + "loss": 1.2649, + "step": 7072 + }, + { + "epoch": 2.106666170256334, + "grad_norm": 0.23524345457553864, + "learning_rate": 1.8332804788173843e-05, + "loss": 1.262, + "step": 7073 + }, + { + "epoch": 2.1069640164560024, + "grad_norm": 0.22107917070388794, + "learning_rate": 1.8332271488440985e-05, + "loss": 1.2511, + "step": 7074 + }, + { + "epoch": 2.107261862655671, + "grad_norm": 0.2195582240819931, + "learning_rate": 1.8331738111185648e-05, + "loss": 1.2771, + "step": 7075 + }, + { + "epoch": 2.10755970885534, + "grad_norm": 0.23306547105312347, + "learning_rate": 1.8331204656412796e-05, + "loss": 1.2649, + "step": 7076 + }, + { + "epoch": 2.1078575550550083, + "grad_norm": 0.2380416840314865, + "learning_rate": 1.8330671124127394e-05, + "loss": 1.273, + "step": 7077 + }, + { + "epoch": 2.108155401254677, + "grad_norm": 0.22080141305923462, + "learning_rate": 1.8330137514334405e-05, + "loss": 1.2669, + "step": 7078 + }, + { + "epoch": 2.108453247454346, + "grad_norm": 0.22362235188484192, + "learning_rate": 1.8329603827038792e-05, + "loss": 1.2627, + "step": 7079 + }, + { + "epoch": 2.1087510936540146, + "grad_norm": 0.2260538935661316, + "learning_rate": 1.8329070062245522e-05, + "loss": 1.2521, + "step": 7080 + }, + { + "epoch": 2.109048939853683, + "grad_norm": 0.22199952602386475, + "learning_rate": 1.8328536219959563e-05, + "loss": 1.2398, + "step": 7081 + }, + { + "epoch": 2.1093467860533517, + "grad_norm": 0.22181862592697144, + "learning_rate": 1.8328002300185878e-05, + "loss": 1.2525, + "step": 7082 + }, + { + "epoch": 2.1096446322530205, + "grad_norm": 0.23434802889823914, + "learning_rate": 1.8327468302929437e-05, + "loss": 1.2468, + "step": 7083 + }, + { + "epoch": 2.109942478452689, + "grad_norm": 0.2293621003627777, + "learning_rate": 1.8326934228195205e-05, + "loss": 1.2695, + "step": 7084 + }, + { + "epoch": 2.1102403246523576, + "grad_norm": 0.22341133654117584, + "learning_rate": 1.8326400075988157e-05, + "loss": 1.2357, + "step": 7085 + }, + { + "epoch": 2.1105381708520263, + "grad_norm": 0.24247558414936066, + "learning_rate": 1.8325865846313255e-05, + "loss": 1.2589, + "step": 7086 + }, + { + "epoch": 2.110836017051695, + "grad_norm": 0.22479978203773499, + "learning_rate": 1.8325331539175475e-05, + "loss": 1.2624, + "step": 7087 + }, + { + "epoch": 2.1111338632513634, + "grad_norm": 0.22271622717380524, + "learning_rate": 1.832479715457979e-05, + "loss": 1.263, + "step": 7088 + }, + { + "epoch": 2.111431709451032, + "grad_norm": 0.2161114364862442, + "learning_rate": 1.8324262692531162e-05, + "loss": 1.2583, + "step": 7089 + }, + { + "epoch": 2.111729555650701, + "grad_norm": 0.21842052042484283, + "learning_rate": 1.8323728153034576e-05, + "loss": 1.2611, + "step": 7090 + }, + { + "epoch": 2.1120274018503693, + "grad_norm": 0.22679197788238525, + "learning_rate": 1.8323193536094998e-05, + "loss": 1.2384, + "step": 7091 + }, + { + "epoch": 2.112325248050038, + "grad_norm": 0.2320265918970108, + "learning_rate": 1.8322658841717404e-05, + "loss": 1.2473, + "step": 7092 + }, + { + "epoch": 2.112623094249707, + "grad_norm": 0.22139060497283936, + "learning_rate": 1.8322124069906765e-05, + "loss": 1.2461, + "step": 7093 + }, + { + "epoch": 2.1129209404493756, + "grad_norm": 0.2263459712266922, + "learning_rate": 1.8321589220668062e-05, + "loss": 1.2569, + "step": 7094 + }, + { + "epoch": 2.113218786649044, + "grad_norm": 0.22318021953105927, + "learning_rate": 1.8321054294006265e-05, + "loss": 1.2519, + "step": 7095 + }, + { + "epoch": 2.1135166328487127, + "grad_norm": 0.21863384544849396, + "learning_rate": 1.832051928992636e-05, + "loss": 1.2587, + "step": 7096 + }, + { + "epoch": 2.1138144790483815, + "grad_norm": 0.2100280076265335, + "learning_rate": 1.8319984208433318e-05, + "loss": 1.2587, + "step": 7097 + }, + { + "epoch": 2.1141123252480503, + "grad_norm": 0.22919198870658875, + "learning_rate": 1.8319449049532115e-05, + "loss": 1.2711, + "step": 7098 + }, + { + "epoch": 2.1144101714477186, + "grad_norm": 0.24317790567874908, + "learning_rate": 1.8318913813227738e-05, + "loss": 1.2668, + "step": 7099 + }, + { + "epoch": 2.1147080176473874, + "grad_norm": 0.23060758411884308, + "learning_rate": 1.831837849952516e-05, + "loss": 1.2724, + "step": 7100 + }, + { + "epoch": 2.115005863847056, + "grad_norm": 0.22312051057815552, + "learning_rate": 1.8317843108429362e-05, + "loss": 1.2454, + "step": 7101 + }, + { + "epoch": 2.1153037100467245, + "grad_norm": 0.23319660127162933, + "learning_rate": 1.8317307639945332e-05, + "loss": 1.2777, + "step": 7102 + }, + { + "epoch": 2.1156015562463932, + "grad_norm": 0.22555091977119446, + "learning_rate": 1.8316772094078043e-05, + "loss": 1.2523, + "step": 7103 + }, + { + "epoch": 2.115899402446062, + "grad_norm": 0.22101464867591858, + "learning_rate": 1.8316236470832484e-05, + "loss": 1.2671, + "step": 7104 + }, + { + "epoch": 2.1161972486457303, + "grad_norm": 0.22322897613048553, + "learning_rate": 1.8315700770213637e-05, + "loss": 1.2543, + "step": 7105 + }, + { + "epoch": 2.116495094845399, + "grad_norm": 0.2308381050825119, + "learning_rate": 1.8315164992226482e-05, + "loss": 1.2727, + "step": 7106 + }, + { + "epoch": 2.116792941045068, + "grad_norm": 0.2280161827802658, + "learning_rate": 1.831462913687601e-05, + "loss": 1.2642, + "step": 7107 + }, + { + "epoch": 2.1170907872447366, + "grad_norm": 0.24484100937843323, + "learning_rate": 1.83140932041672e-05, + "loss": 1.2837, + "step": 7108 + }, + { + "epoch": 2.117388633444405, + "grad_norm": 0.23114639520645142, + "learning_rate": 1.8313557194105046e-05, + "loss": 1.2573, + "step": 7109 + }, + { + "epoch": 2.1176864796440737, + "grad_norm": 0.2284366935491562, + "learning_rate": 1.831302110669453e-05, + "loss": 1.2669, + "step": 7110 + }, + { + "epoch": 2.1179843258437425, + "grad_norm": 0.224045529961586, + "learning_rate": 1.8312484941940635e-05, + "loss": 1.2434, + "step": 7111 + }, + { + "epoch": 2.1182821720434113, + "grad_norm": 0.24650663137435913, + "learning_rate": 1.831194869984836e-05, + "loss": 1.2562, + "step": 7112 + }, + { + "epoch": 2.1185800182430796, + "grad_norm": 0.22716335952281952, + "learning_rate": 1.8311412380422688e-05, + "loss": 1.2444, + "step": 7113 + }, + { + "epoch": 2.1188778644427484, + "grad_norm": 0.2214750498533249, + "learning_rate": 1.831087598366861e-05, + "loss": 1.2655, + "step": 7114 + }, + { + "epoch": 2.119175710642417, + "grad_norm": 0.22327202558517456, + "learning_rate": 1.8310339509591118e-05, + "loss": 1.2576, + "step": 7115 + }, + { + "epoch": 2.1194735568420855, + "grad_norm": 0.2163681983947754, + "learning_rate": 1.8309802958195197e-05, + "loss": 1.2498, + "step": 7116 + }, + { + "epoch": 2.1197714030417543, + "grad_norm": 0.22627170383930206, + "learning_rate": 1.8309266329485847e-05, + "loss": 1.2722, + "step": 7117 + }, + { + "epoch": 2.120069249241423, + "grad_norm": 0.22877243161201477, + "learning_rate": 1.8308729623468056e-05, + "loss": 1.2592, + "step": 7118 + }, + { + "epoch": 2.120367095441092, + "grad_norm": 0.22487469017505646, + "learning_rate": 1.830819284014682e-05, + "loss": 1.2586, + "step": 7119 + }, + { + "epoch": 2.12066494164076, + "grad_norm": 0.23687994480133057, + "learning_rate": 1.8307655979527133e-05, + "loss": 1.2619, + "step": 7120 + }, + { + "epoch": 2.120962787840429, + "grad_norm": 0.21765540540218353, + "learning_rate": 1.8307119041613988e-05, + "loss": 1.255, + "step": 7121 + }, + { + "epoch": 2.1212606340400977, + "grad_norm": 0.22309458255767822, + "learning_rate": 1.8306582026412384e-05, + "loss": 1.2648, + "step": 7122 + }, + { + "epoch": 2.121558480239766, + "grad_norm": 0.22108714282512665, + "learning_rate": 1.830604493392731e-05, + "loss": 1.2516, + "step": 7123 + }, + { + "epoch": 2.121856326439435, + "grad_norm": 0.23164448142051697, + "learning_rate": 1.8305507764163772e-05, + "loss": 1.285, + "step": 7124 + }, + { + "epoch": 2.1221541726391036, + "grad_norm": 0.23148779571056366, + "learning_rate": 1.830497051712676e-05, + "loss": 1.2271, + "step": 7125 + }, + { + "epoch": 2.1224520188387723, + "grad_norm": 0.22546571493148804, + "learning_rate": 1.830443319282128e-05, + "loss": 1.2637, + "step": 7126 + }, + { + "epoch": 2.1227498650384407, + "grad_norm": 0.22211135923862457, + "learning_rate": 1.8303895791252327e-05, + "loss": 1.255, + "step": 7127 + }, + { + "epoch": 2.1230477112381094, + "grad_norm": 0.2260240763425827, + "learning_rate": 1.83033583124249e-05, + "loss": 1.2687, + "step": 7128 + }, + { + "epoch": 2.123345557437778, + "grad_norm": 0.23260627686977386, + "learning_rate": 1.8302820756344e-05, + "loss": 1.2624, + "step": 7129 + }, + { + "epoch": 2.1236434036374465, + "grad_norm": 0.2216912806034088, + "learning_rate": 1.830228312301463e-05, + "loss": 1.2567, + "step": 7130 + }, + { + "epoch": 2.1239412498371153, + "grad_norm": 0.2314852923154831, + "learning_rate": 1.8301745412441793e-05, + "loss": 1.2652, + "step": 7131 + }, + { + "epoch": 2.124239096036784, + "grad_norm": 0.23260203003883362, + "learning_rate": 1.8301207624630487e-05, + "loss": 1.2622, + "step": 7132 + }, + { + "epoch": 2.124536942236453, + "grad_norm": 0.22038774192333221, + "learning_rate": 1.8300669759585724e-05, + "loss": 1.2579, + "step": 7133 + }, + { + "epoch": 2.124834788436121, + "grad_norm": 0.22577354311943054, + "learning_rate": 1.83001318173125e-05, + "loss": 1.2662, + "step": 7134 + }, + { + "epoch": 2.12513263463579, + "grad_norm": 0.22296591103076935, + "learning_rate": 1.829959379781582e-05, + "loss": 1.2715, + "step": 7135 + }, + { + "epoch": 2.1254304808354587, + "grad_norm": 0.22403176128864288, + "learning_rate": 1.8299055701100696e-05, + "loss": 1.2463, + "step": 7136 + }, + { + "epoch": 2.125728327035127, + "grad_norm": 0.23563499748706818, + "learning_rate": 1.8298517527172128e-05, + "loss": 1.2562, + "step": 7137 + }, + { + "epoch": 2.126026173234796, + "grad_norm": 0.22584551572799683, + "learning_rate": 1.8297979276035128e-05, + "loss": 1.2479, + "step": 7138 + }, + { + "epoch": 2.1263240194344646, + "grad_norm": 0.23392264544963837, + "learning_rate": 1.8297440947694703e-05, + "loss": 1.2683, + "step": 7139 + }, + { + "epoch": 2.1266218656341334, + "grad_norm": 0.23094266653060913, + "learning_rate": 1.8296902542155862e-05, + "loss": 1.258, + "step": 7140 + }, + { + "epoch": 2.1269197118338017, + "grad_norm": 0.22943681478500366, + "learning_rate": 1.829636405942361e-05, + "loss": 1.2705, + "step": 7141 + }, + { + "epoch": 2.1272175580334705, + "grad_norm": 0.23909318447113037, + "learning_rate": 1.829582549950296e-05, + "loss": 1.2741, + "step": 7142 + }, + { + "epoch": 2.1275154042331392, + "grad_norm": 0.22960753738880157, + "learning_rate": 1.829528686239892e-05, + "loss": 1.2576, + "step": 7143 + }, + { + "epoch": 2.1278132504328076, + "grad_norm": 0.22400975227355957, + "learning_rate": 1.8294748148116505e-05, + "loss": 1.2507, + "step": 7144 + }, + { + "epoch": 2.1281110966324763, + "grad_norm": 0.2300107777118683, + "learning_rate": 1.829420935666073e-05, + "loss": 1.2666, + "step": 7145 + }, + { + "epoch": 2.128408942832145, + "grad_norm": 0.23621103167533875, + "learning_rate": 1.8293670488036598e-05, + "loss": 1.2505, + "step": 7146 + }, + { + "epoch": 2.128706789031814, + "grad_norm": 0.22813810408115387, + "learning_rate": 1.8293131542249135e-05, + "loss": 1.2746, + "step": 7147 + }, + { + "epoch": 2.129004635231482, + "grad_norm": 0.21974337100982666, + "learning_rate": 1.829259251930334e-05, + "loss": 1.2543, + "step": 7148 + }, + { + "epoch": 2.129302481431151, + "grad_norm": 0.21717719733715057, + "learning_rate": 1.8292053419204244e-05, + "loss": 1.2689, + "step": 7149 + }, + { + "epoch": 2.1296003276308197, + "grad_norm": 0.22804971039295197, + "learning_rate": 1.8291514241956853e-05, + "loss": 1.2672, + "step": 7150 + }, + { + "epoch": 2.1298981738304885, + "grad_norm": 0.23685619235038757, + "learning_rate": 1.8290974987566183e-05, + "loss": 1.2711, + "step": 7151 + }, + { + "epoch": 2.130196020030157, + "grad_norm": 0.23363293707370758, + "learning_rate": 1.8290435656037256e-05, + "loss": 1.2595, + "step": 7152 + }, + { + "epoch": 2.1304938662298256, + "grad_norm": 0.22478599846363068, + "learning_rate": 1.828989624737509e-05, + "loss": 1.2615, + "step": 7153 + }, + { + "epoch": 2.1307917124294944, + "grad_norm": 0.2230616807937622, + "learning_rate": 1.8289356761584698e-05, + "loss": 1.2661, + "step": 7154 + }, + { + "epoch": 2.1310895586291627, + "grad_norm": 0.22488093376159668, + "learning_rate": 1.8288817198671103e-05, + "loss": 1.255, + "step": 7155 + }, + { + "epoch": 2.1313874048288315, + "grad_norm": 0.22709622979164124, + "learning_rate": 1.828827755863933e-05, + "loss": 1.2545, + "step": 7156 + }, + { + "epoch": 2.1316852510285003, + "grad_norm": 0.2322174310684204, + "learning_rate": 1.8287737841494387e-05, + "loss": 1.2603, + "step": 7157 + }, + { + "epoch": 2.1319830972281686, + "grad_norm": 0.22935152053833008, + "learning_rate": 1.828719804724131e-05, + "loss": 1.2581, + "step": 7158 + }, + { + "epoch": 2.1322809434278374, + "grad_norm": 0.22376160323619843, + "learning_rate": 1.828665817588511e-05, + "loss": 1.2679, + "step": 7159 + }, + { + "epoch": 2.132578789627506, + "grad_norm": 0.22763124108314514, + "learning_rate": 1.8286118227430816e-05, + "loss": 1.2655, + "step": 7160 + }, + { + "epoch": 2.132876635827175, + "grad_norm": 0.22681979835033417, + "learning_rate": 1.828557820188345e-05, + "loss": 1.2472, + "step": 7161 + }, + { + "epoch": 2.1331744820268432, + "grad_norm": 0.23133428394794464, + "learning_rate": 1.8285038099248032e-05, + "loss": 1.2594, + "step": 7162 + }, + { + "epoch": 2.133472328226512, + "grad_norm": 0.22428368031978607, + "learning_rate": 1.8284497919529592e-05, + "loss": 1.2625, + "step": 7163 + }, + { + "epoch": 2.1337701744261808, + "grad_norm": 0.22396983206272125, + "learning_rate": 1.828395766273316e-05, + "loss": 1.2425, + "step": 7164 + }, + { + "epoch": 2.1340680206258495, + "grad_norm": 0.21950854361057281, + "learning_rate": 1.8283417328863752e-05, + "loss": 1.2489, + "step": 7165 + }, + { + "epoch": 2.134365866825518, + "grad_norm": 0.2416682243347168, + "learning_rate": 1.82828769179264e-05, + "loss": 1.2673, + "step": 7166 + }, + { + "epoch": 2.1346637130251866, + "grad_norm": 0.21806782484054565, + "learning_rate": 1.828233642992614e-05, + "loss": 1.2661, + "step": 7167 + }, + { + "epoch": 2.1349615592248554, + "grad_norm": 0.22544948756694794, + "learning_rate": 1.828179586486799e-05, + "loss": 1.262, + "step": 7168 + }, + { + "epoch": 2.1352594054245237, + "grad_norm": 0.22799630463123322, + "learning_rate": 1.8281255222756977e-05, + "loss": 1.2529, + "step": 7169 + }, + { + "epoch": 2.1355572516241925, + "grad_norm": 0.2227330505847931, + "learning_rate": 1.828071450359814e-05, + "loss": 1.2513, + "step": 7170 + }, + { + "epoch": 2.1358550978238613, + "grad_norm": 0.22194263339042664, + "learning_rate": 1.8280173707396507e-05, + "loss": 1.2589, + "step": 7171 + }, + { + "epoch": 2.1361529440235296, + "grad_norm": 0.2329639047384262, + "learning_rate": 1.827963283415711e-05, + "loss": 1.2685, + "step": 7172 + }, + { + "epoch": 2.1364507902231984, + "grad_norm": 0.23032844066619873, + "learning_rate": 1.8279091883884977e-05, + "loss": 1.2414, + "step": 7173 + }, + { + "epoch": 2.136748636422867, + "grad_norm": 0.2239466905593872, + "learning_rate": 1.8278550856585142e-05, + "loss": 1.2429, + "step": 7174 + }, + { + "epoch": 2.137046482622536, + "grad_norm": 0.22458164393901825, + "learning_rate": 1.8278009752262647e-05, + "loss": 1.2576, + "step": 7175 + }, + { + "epoch": 2.1373443288222043, + "grad_norm": 0.22926200926303864, + "learning_rate": 1.8277468570922515e-05, + "loss": 1.2485, + "step": 7176 + }, + { + "epoch": 2.137642175021873, + "grad_norm": 0.21348576247692108, + "learning_rate": 1.827692731256979e-05, + "loss": 1.2591, + "step": 7177 + }, + { + "epoch": 2.137940021221542, + "grad_norm": 0.22748205065727234, + "learning_rate": 1.82763859772095e-05, + "loss": 1.2777, + "step": 7178 + }, + { + "epoch": 2.1382378674212106, + "grad_norm": 0.22369438409805298, + "learning_rate": 1.8275844564846688e-05, + "loss": 1.2521, + "step": 7179 + }, + { + "epoch": 2.138535713620879, + "grad_norm": 0.23350432515144348, + "learning_rate": 1.8275303075486386e-05, + "loss": 1.2588, + "step": 7180 + }, + { + "epoch": 2.1388335598205477, + "grad_norm": 0.2353038489818573, + "learning_rate": 1.8274761509133635e-05, + "loss": 1.2676, + "step": 7181 + }, + { + "epoch": 2.1391314060202165, + "grad_norm": 0.22810761630535126, + "learning_rate": 1.8274219865793477e-05, + "loss": 1.2504, + "step": 7182 + }, + { + "epoch": 2.139429252219885, + "grad_norm": 0.21917693316936493, + "learning_rate": 1.8273678145470947e-05, + "loss": 1.2681, + "step": 7183 + }, + { + "epoch": 2.1397270984195536, + "grad_norm": 0.23463785648345947, + "learning_rate": 1.8273136348171082e-05, + "loss": 1.2598, + "step": 7184 + }, + { + "epoch": 2.1400249446192223, + "grad_norm": 0.2273704707622528, + "learning_rate": 1.827259447389893e-05, + "loss": 1.2754, + "step": 7185 + }, + { + "epoch": 2.140322790818891, + "grad_norm": 0.2283557802438736, + "learning_rate": 1.8272052522659525e-05, + "loss": 1.2435, + "step": 7186 + }, + { + "epoch": 2.1406206370185594, + "grad_norm": 0.23130172491073608, + "learning_rate": 1.827151049445792e-05, + "loss": 1.2578, + "step": 7187 + }, + { + "epoch": 2.140918483218228, + "grad_norm": 0.2366407811641693, + "learning_rate": 1.8270968389299145e-05, + "loss": 1.247, + "step": 7188 + }, + { + "epoch": 2.141216329417897, + "grad_norm": 0.22454547882080078, + "learning_rate": 1.8270426207188252e-05, + "loss": 1.2624, + "step": 7189 + }, + { + "epoch": 2.1415141756175653, + "grad_norm": 0.22233931720256805, + "learning_rate": 1.8269883948130283e-05, + "loss": 1.2723, + "step": 7190 + }, + { + "epoch": 2.141812021817234, + "grad_norm": 0.23232373595237732, + "learning_rate": 1.8269341612130284e-05, + "loss": 1.257, + "step": 7191 + }, + { + "epoch": 2.142109868016903, + "grad_norm": 0.22737377882003784, + "learning_rate": 1.82687991991933e-05, + "loss": 1.2609, + "step": 7192 + }, + { + "epoch": 2.1424077142165716, + "grad_norm": 0.2243301272392273, + "learning_rate": 1.8268256709324383e-05, + "loss": 1.2515, + "step": 7193 + }, + { + "epoch": 2.14270556041624, + "grad_norm": 0.23105941712856293, + "learning_rate": 1.826771414252857e-05, + "loss": 1.2595, + "step": 7194 + }, + { + "epoch": 2.1430034066159087, + "grad_norm": 0.22792844474315643, + "learning_rate": 1.826717149881091e-05, + "loss": 1.2664, + "step": 7195 + }, + { + "epoch": 2.1433012528155775, + "grad_norm": 0.22798797488212585, + "learning_rate": 1.8266628778176465e-05, + "loss": 1.2624, + "step": 7196 + }, + { + "epoch": 2.143599099015246, + "grad_norm": 0.24311818182468414, + "learning_rate": 1.826608598063027e-05, + "loss": 1.258, + "step": 7197 + }, + { + "epoch": 2.1438969452149146, + "grad_norm": 0.2258772999048233, + "learning_rate": 1.826554310617738e-05, + "loss": 1.2578, + "step": 7198 + }, + { + "epoch": 2.1441947914145834, + "grad_norm": 0.2354695200920105, + "learning_rate": 1.8265000154822846e-05, + "loss": 1.2454, + "step": 7199 + }, + { + "epoch": 2.144492637614252, + "grad_norm": 0.24303671717643738, + "learning_rate": 1.8264457126571723e-05, + "loss": 1.277, + "step": 7200 + }, + { + "epoch": 2.1447904838139205, + "grad_norm": 0.23406969010829926, + "learning_rate": 1.8263914021429057e-05, + "loss": 1.2581, + "step": 7201 + }, + { + "epoch": 2.1450883300135892, + "grad_norm": 0.2457258403301239, + "learning_rate": 1.8263370839399906e-05, + "loss": 1.2546, + "step": 7202 + }, + { + "epoch": 2.145386176213258, + "grad_norm": 0.2412896603345871, + "learning_rate": 1.8262827580489322e-05, + "loss": 1.2597, + "step": 7203 + }, + { + "epoch": 2.1456840224129263, + "grad_norm": 0.2142343819141388, + "learning_rate": 1.826228424470236e-05, + "loss": 1.2576, + "step": 7204 + }, + { + "epoch": 2.145981868612595, + "grad_norm": 0.2583129107952118, + "learning_rate": 1.826174083204407e-05, + "loss": 1.2615, + "step": 7205 + }, + { + "epoch": 2.146279714812264, + "grad_norm": 0.22269707918167114, + "learning_rate": 1.8261197342519513e-05, + "loss": 1.2541, + "step": 7206 + }, + { + "epoch": 2.1465775610119326, + "grad_norm": 0.23978152871131897, + "learning_rate": 1.8260653776133746e-05, + "loss": 1.2631, + "step": 7207 + }, + { + "epoch": 2.146875407211601, + "grad_norm": 0.2298610359430313, + "learning_rate": 1.8260110132891825e-05, + "loss": 1.2583, + "step": 7208 + }, + { + "epoch": 2.1471732534112697, + "grad_norm": 0.22163039445877075, + "learning_rate": 1.825956641279881e-05, + "loss": 1.2533, + "step": 7209 + }, + { + "epoch": 2.1474710996109385, + "grad_norm": 0.23827357590198517, + "learning_rate": 1.8259022615859756e-05, + "loss": 1.2775, + "step": 7210 + }, + { + "epoch": 2.147768945810607, + "grad_norm": 0.24706870317459106, + "learning_rate": 1.8258478742079725e-05, + "loss": 1.268, + "step": 7211 + }, + { + "epoch": 2.1480667920102756, + "grad_norm": 0.23246964812278748, + "learning_rate": 1.8257934791463774e-05, + "loss": 1.2754, + "step": 7212 + }, + { + "epoch": 2.1483646382099444, + "grad_norm": 0.22534261643886566, + "learning_rate": 1.8257390764016968e-05, + "loss": 1.2704, + "step": 7213 + }, + { + "epoch": 2.148662484409613, + "grad_norm": 0.2393663376569748, + "learning_rate": 1.8256846659744364e-05, + "loss": 1.273, + "step": 7214 + }, + { + "epoch": 2.1489603306092815, + "grad_norm": 0.21977217495441437, + "learning_rate": 1.8256302478651033e-05, + "loss": 1.2625, + "step": 7215 + }, + { + "epoch": 2.1492581768089503, + "grad_norm": 0.24201923608779907, + "learning_rate": 1.8255758220742025e-05, + "loss": 1.2478, + "step": 7216 + }, + { + "epoch": 2.149556023008619, + "grad_norm": 0.24218101799488068, + "learning_rate": 1.8255213886022412e-05, + "loss": 1.2552, + "step": 7217 + }, + { + "epoch": 2.149853869208288, + "grad_norm": 0.2628040909767151, + "learning_rate": 1.825466947449726e-05, + "loss": 1.2653, + "step": 7218 + }, + { + "epoch": 2.150151715407956, + "grad_norm": 0.23015199601650238, + "learning_rate": 1.825412498617163e-05, + "loss": 1.2422, + "step": 7219 + }, + { + "epoch": 2.150449561607625, + "grad_norm": 0.2507378160953522, + "learning_rate": 1.8253580421050587e-05, + "loss": 1.2785, + "step": 7220 + }, + { + "epoch": 2.1507474078072937, + "grad_norm": 0.2190084457397461, + "learning_rate": 1.8253035779139203e-05, + "loss": 1.2675, + "step": 7221 + }, + { + "epoch": 2.151045254006962, + "grad_norm": 0.23176588118076324, + "learning_rate": 1.825249106044254e-05, + "loss": 1.2642, + "step": 7222 + }, + { + "epoch": 2.1513431002066308, + "grad_norm": 0.22592073678970337, + "learning_rate": 1.8251946264965668e-05, + "loss": 1.2487, + "step": 7223 + }, + { + "epoch": 2.1516409464062995, + "grad_norm": 0.24401065707206726, + "learning_rate": 1.8251401392713655e-05, + "loss": 1.2684, + "step": 7224 + }, + { + "epoch": 2.151938792605968, + "grad_norm": 0.23437196016311646, + "learning_rate": 1.825085644369157e-05, + "loss": 1.2643, + "step": 7225 + }, + { + "epoch": 2.1522366388056366, + "grad_norm": 0.2282634824514389, + "learning_rate": 1.8250311417904488e-05, + "loss": 1.2717, + "step": 7226 + }, + { + "epoch": 2.1525344850053054, + "grad_norm": 0.23535984754562378, + "learning_rate": 1.8249766315357474e-05, + "loss": 1.2717, + "step": 7227 + }, + { + "epoch": 2.152832331204974, + "grad_norm": 0.22330625355243683, + "learning_rate": 1.82492211360556e-05, + "loss": 1.2542, + "step": 7228 + }, + { + "epoch": 2.1531301774046425, + "grad_norm": 0.2360328733921051, + "learning_rate": 1.824867588000394e-05, + "loss": 1.2545, + "step": 7229 + }, + { + "epoch": 2.1534280236043113, + "grad_norm": 0.2204810082912445, + "learning_rate": 1.824813054720757e-05, + "loss": 1.2444, + "step": 7230 + }, + { + "epoch": 2.15372586980398, + "grad_norm": 0.23123958706855774, + "learning_rate": 1.8247585137671562e-05, + "loss": 1.247, + "step": 7231 + }, + { + "epoch": 2.154023716003649, + "grad_norm": 0.22586317360401154, + "learning_rate": 1.8247039651400984e-05, + "loss": 1.2506, + "step": 7232 + }, + { + "epoch": 2.154321562203317, + "grad_norm": 0.22761893272399902, + "learning_rate": 1.8246494088400918e-05, + "loss": 1.2599, + "step": 7233 + }, + { + "epoch": 2.154619408402986, + "grad_norm": 0.2438083440065384, + "learning_rate": 1.8245948448676438e-05, + "loss": 1.2562, + "step": 7234 + }, + { + "epoch": 2.1549172546026547, + "grad_norm": 0.22488285601139069, + "learning_rate": 1.8245402732232622e-05, + "loss": 1.2649, + "step": 7235 + }, + { + "epoch": 2.155215100802323, + "grad_norm": 0.24174481630325317, + "learning_rate": 1.8244856939074544e-05, + "loss": 1.2448, + "step": 7236 + }, + { + "epoch": 2.155512947001992, + "grad_norm": 0.23343084752559662, + "learning_rate": 1.8244311069207285e-05, + "loss": 1.2557, + "step": 7237 + }, + { + "epoch": 2.1558107932016606, + "grad_norm": 0.27241039276123047, + "learning_rate": 1.824376512263592e-05, + "loss": 1.2537, + "step": 7238 + }, + { + "epoch": 2.156108639401329, + "grad_norm": 0.22729940712451935, + "learning_rate": 1.8243219099365534e-05, + "loss": 1.2502, + "step": 7239 + }, + { + "epoch": 2.1564064856009977, + "grad_norm": 0.23571279644966125, + "learning_rate": 1.8242672999401202e-05, + "loss": 1.2629, + "step": 7240 + }, + { + "epoch": 2.1567043318006665, + "grad_norm": 0.23134379088878632, + "learning_rate": 1.824212682274801e-05, + "loss": 1.2681, + "step": 7241 + }, + { + "epoch": 2.1570021780003352, + "grad_norm": 0.23377610743045807, + "learning_rate": 1.8241580569411038e-05, + "loss": 1.2497, + "step": 7242 + }, + { + "epoch": 2.1573000242000036, + "grad_norm": 0.2495405226945877, + "learning_rate": 1.824103423939536e-05, + "loss": 1.2578, + "step": 7243 + }, + { + "epoch": 2.1575978703996723, + "grad_norm": 0.24279369413852692, + "learning_rate": 1.824048783270607e-05, + "loss": 1.2564, + "step": 7244 + }, + { + "epoch": 2.157895716599341, + "grad_norm": 0.24328194558620453, + "learning_rate": 1.8239941349348246e-05, + "loss": 1.2721, + "step": 7245 + }, + { + "epoch": 2.15819356279901, + "grad_norm": 0.2252289354801178, + "learning_rate": 1.8239394789326978e-05, + "loss": 1.2542, + "step": 7246 + }, + { + "epoch": 2.158491408998678, + "grad_norm": 0.22826309502124786, + "learning_rate": 1.8238848152647345e-05, + "loss": 1.2661, + "step": 7247 + }, + { + "epoch": 2.158789255198347, + "grad_norm": 0.23042802512645721, + "learning_rate": 1.823830143931443e-05, + "loss": 1.2625, + "step": 7248 + }, + { + "epoch": 2.1590871013980157, + "grad_norm": 0.23423734307289124, + "learning_rate": 1.8237754649333334e-05, + "loss": 1.27, + "step": 7249 + }, + { + "epoch": 2.159384947597684, + "grad_norm": 0.2401869148015976, + "learning_rate": 1.8237207782709124e-05, + "loss": 1.2711, + "step": 7250 + }, + { + "epoch": 2.159682793797353, + "grad_norm": 0.24477271735668182, + "learning_rate": 1.8236660839446908e-05, + "loss": 1.272, + "step": 7251 + }, + { + "epoch": 2.1599806399970216, + "grad_norm": 0.23227417469024658, + "learning_rate": 1.8236113819551758e-05, + "loss": 1.2616, + "step": 7252 + }, + { + "epoch": 2.1602784861966904, + "grad_norm": 0.2281360775232315, + "learning_rate": 1.8235566723028776e-05, + "loss": 1.2566, + "step": 7253 + }, + { + "epoch": 2.1605763323963587, + "grad_norm": 0.3357749581336975, + "learning_rate": 1.8235019549883045e-05, + "loss": 1.2721, + "step": 7254 + }, + { + "epoch": 2.1608741785960275, + "grad_norm": 0.26791083812713623, + "learning_rate": 1.823447230011966e-05, + "loss": 1.2501, + "step": 7255 + }, + { + "epoch": 2.1611720247956963, + "grad_norm": 0.2542476952075958, + "learning_rate": 1.8233924973743707e-05, + "loss": 1.2634, + "step": 7256 + }, + { + "epoch": 2.1614698709953646, + "grad_norm": 0.22172990441322327, + "learning_rate": 1.823337757076028e-05, + "loss": 1.2648, + "step": 7257 + }, + { + "epoch": 2.1617677171950334, + "grad_norm": 0.2839290499687195, + "learning_rate": 1.823283009117448e-05, + "loss": 1.2669, + "step": 7258 + }, + { + "epoch": 2.162065563394702, + "grad_norm": 0.23253114521503448, + "learning_rate": 1.823228253499139e-05, + "loss": 1.2503, + "step": 7259 + }, + { + "epoch": 2.162363409594371, + "grad_norm": 0.23647059500217438, + "learning_rate": 1.823173490221611e-05, + "loss": 1.2686, + "step": 7260 + }, + { + "epoch": 2.1626612557940392, + "grad_norm": 0.24149256944656372, + "learning_rate": 1.8231187192853732e-05, + "loss": 1.2444, + "step": 7261 + }, + { + "epoch": 2.162959101993708, + "grad_norm": 0.233870729804039, + "learning_rate": 1.8230639406909357e-05, + "loss": 1.2642, + "step": 7262 + }, + { + "epoch": 2.1632569481933768, + "grad_norm": 0.21807897090911865, + "learning_rate": 1.8230091544388074e-05, + "loss": 1.2575, + "step": 7263 + }, + { + "epoch": 2.163554794393045, + "grad_norm": 0.24832923710346222, + "learning_rate": 1.8229543605294985e-05, + "loss": 1.2598, + "step": 7264 + }, + { + "epoch": 2.163852640592714, + "grad_norm": 0.23019640147686005, + "learning_rate": 1.822899558963519e-05, + "loss": 1.2529, + "step": 7265 + }, + { + "epoch": 2.1641504867923826, + "grad_norm": 0.22573243081569672, + "learning_rate": 1.8228447497413785e-05, + "loss": 1.2497, + "step": 7266 + }, + { + "epoch": 2.1644483329920514, + "grad_norm": 0.24307508766651154, + "learning_rate": 1.8227899328635867e-05, + "loss": 1.239, + "step": 7267 + }, + { + "epoch": 2.1647461791917197, + "grad_norm": 0.23096969723701477, + "learning_rate": 1.822735108330654e-05, + "loss": 1.2417, + "step": 7268 + }, + { + "epoch": 2.1650440253913885, + "grad_norm": 0.2541373074054718, + "learning_rate": 1.8226802761430905e-05, + "loss": 1.263, + "step": 7269 + }, + { + "epoch": 2.1653418715910573, + "grad_norm": 0.23824550211429596, + "learning_rate": 1.8226254363014058e-05, + "loss": 1.2667, + "step": 7270 + }, + { + "epoch": 2.1656397177907256, + "grad_norm": 0.23206298053264618, + "learning_rate": 1.8225705888061107e-05, + "loss": 1.2544, + "step": 7271 + }, + { + "epoch": 2.1659375639903944, + "grad_norm": 0.21915820240974426, + "learning_rate": 1.8225157336577153e-05, + "loss": 1.2517, + "step": 7272 + }, + { + "epoch": 2.166235410190063, + "grad_norm": 0.25287652015686035, + "learning_rate": 1.82246087085673e-05, + "loss": 1.2436, + "step": 7273 + }, + { + "epoch": 2.166533256389732, + "grad_norm": 0.24380943179130554, + "learning_rate": 1.8224060004036652e-05, + "loss": 1.2671, + "step": 7274 + }, + { + "epoch": 2.1668311025894003, + "grad_norm": 0.24164150655269623, + "learning_rate": 1.8223511222990313e-05, + "loss": 1.2468, + "step": 7275 + }, + { + "epoch": 2.167128948789069, + "grad_norm": 0.23552227020263672, + "learning_rate": 1.822296236543339e-05, + "loss": 1.277, + "step": 7276 + }, + { + "epoch": 2.167426794988738, + "grad_norm": 0.22162701189517975, + "learning_rate": 1.822241343137099e-05, + "loss": 1.2572, + "step": 7277 + }, + { + "epoch": 2.167724641188406, + "grad_norm": 0.23937855660915375, + "learning_rate": 1.822186442080822e-05, + "loss": 1.2497, + "step": 7278 + }, + { + "epoch": 2.168022487388075, + "grad_norm": 0.22830797731876373, + "learning_rate": 1.8221315333750187e-05, + "loss": 1.2713, + "step": 7279 + }, + { + "epoch": 2.1683203335877437, + "grad_norm": 0.2521073520183563, + "learning_rate": 1.8220766170202e-05, + "loss": 1.2629, + "step": 7280 + }, + { + "epoch": 2.1686181797874124, + "grad_norm": 0.2280997335910797, + "learning_rate": 1.822021693016877e-05, + "loss": 1.2534, + "step": 7281 + }, + { + "epoch": 2.1689160259870808, + "grad_norm": 0.22700686752796173, + "learning_rate": 1.82196676136556e-05, + "loss": 1.2643, + "step": 7282 + }, + { + "epoch": 2.1692138721867495, + "grad_norm": 0.22790886461734772, + "learning_rate": 1.8219118220667616e-05, + "loss": 1.2464, + "step": 7283 + }, + { + "epoch": 2.1695117183864183, + "grad_norm": 0.2317965030670166, + "learning_rate": 1.8218568751209914e-05, + "loss": 1.2647, + "step": 7284 + }, + { + "epoch": 2.169809564586087, + "grad_norm": 0.2157122939825058, + "learning_rate": 1.8218019205287613e-05, + "loss": 1.246, + "step": 7285 + }, + { + "epoch": 2.1701074107857554, + "grad_norm": 0.22612899541854858, + "learning_rate": 1.8217469582905826e-05, + "loss": 1.2781, + "step": 7286 + }, + { + "epoch": 2.170405256985424, + "grad_norm": 0.21930953860282898, + "learning_rate": 1.8216919884069663e-05, + "loss": 1.2552, + "step": 7287 + }, + { + "epoch": 2.170703103185093, + "grad_norm": 0.2246462106704712, + "learning_rate": 1.8216370108784243e-05, + "loss": 1.2596, + "step": 7288 + }, + { + "epoch": 2.1710009493847613, + "grad_norm": 0.2279270887374878, + "learning_rate": 1.821582025705468e-05, + "loss": 1.2403, + "step": 7289 + }, + { + "epoch": 2.17129879558443, + "grad_norm": 0.23340153694152832, + "learning_rate": 1.8215270328886084e-05, + "loss": 1.2641, + "step": 7290 + }, + { + "epoch": 2.171596641784099, + "grad_norm": 0.225138857960701, + "learning_rate": 1.8214720324283584e-05, + "loss": 1.2596, + "step": 7291 + }, + { + "epoch": 2.171894487983767, + "grad_norm": 0.22960343956947327, + "learning_rate": 1.8214170243252284e-05, + "loss": 1.2507, + "step": 7292 + }, + { + "epoch": 2.172192334183436, + "grad_norm": 0.25454360246658325, + "learning_rate": 1.8213620085797308e-05, + "loss": 1.262, + "step": 7293 + }, + { + "epoch": 2.1724901803831047, + "grad_norm": 0.2196982353925705, + "learning_rate": 1.8213069851923775e-05, + "loss": 1.2573, + "step": 7294 + }, + { + "epoch": 2.1727880265827735, + "grad_norm": 0.314749151468277, + "learning_rate": 1.8212519541636798e-05, + "loss": 1.244, + "step": 7295 + }, + { + "epoch": 2.173085872782442, + "grad_norm": 0.32642850279808044, + "learning_rate": 1.821196915494151e-05, + "loss": 1.2591, + "step": 7296 + }, + { + "epoch": 2.1733837189821106, + "grad_norm": 0.27008360624313354, + "learning_rate": 1.821141869184302e-05, + "loss": 1.2594, + "step": 7297 + }, + { + "epoch": 2.1736815651817794, + "grad_norm": 0.5230419039726257, + "learning_rate": 1.8210868152346456e-05, + "loss": 1.2637, + "step": 7298 + }, + { + "epoch": 2.173979411381448, + "grad_norm": 0.24664689600467682, + "learning_rate": 1.8210317536456934e-05, + "loss": 1.2583, + "step": 7299 + }, + { + "epoch": 2.1742772575811165, + "grad_norm": 0.2564030587673187, + "learning_rate": 1.820976684417958e-05, + "loss": 1.2536, + "step": 7300 + }, + { + "epoch": 2.1745751037807852, + "grad_norm": 0.2287197709083557, + "learning_rate": 1.8209216075519522e-05, + "loss": 1.2753, + "step": 7301 + }, + { + "epoch": 2.174872949980454, + "grad_norm": 0.22164995968341827, + "learning_rate": 1.8208665230481878e-05, + "loss": 1.2506, + "step": 7302 + }, + { + "epoch": 2.1751707961801223, + "grad_norm": 0.24038252234458923, + "learning_rate": 1.8208114309071776e-05, + "loss": 1.2522, + "step": 7303 + }, + { + "epoch": 2.175468642379791, + "grad_norm": 0.25487661361694336, + "learning_rate": 1.820756331129434e-05, + "loss": 1.2614, + "step": 7304 + }, + { + "epoch": 2.17576648857946, + "grad_norm": 0.25161856412887573, + "learning_rate": 1.82070122371547e-05, + "loss": 1.2811, + "step": 7305 + }, + { + "epoch": 2.176064334779128, + "grad_norm": 0.23490729928016663, + "learning_rate": 1.820646108665798e-05, + "loss": 1.2512, + "step": 7306 + }, + { + "epoch": 2.176362180978797, + "grad_norm": 0.22478432953357697, + "learning_rate": 1.8205909859809307e-05, + "loss": 1.2576, + "step": 7307 + }, + { + "epoch": 2.1766600271784657, + "grad_norm": 0.23823775351047516, + "learning_rate": 1.820535855661381e-05, + "loss": 1.2573, + "step": 7308 + }, + { + "epoch": 2.1769578733781345, + "grad_norm": 0.23629657924175262, + "learning_rate": 1.8204807177076617e-05, + "loss": 1.2723, + "step": 7309 + }, + { + "epoch": 2.177255719577803, + "grad_norm": 0.22689202427864075, + "learning_rate": 1.8204255721202867e-05, + "loss": 1.2475, + "step": 7310 + }, + { + "epoch": 2.1775535657774716, + "grad_norm": 0.22683067619800568, + "learning_rate": 1.820370418899768e-05, + "loss": 1.2508, + "step": 7311 + }, + { + "epoch": 2.1778514119771404, + "grad_norm": 0.23133999109268188, + "learning_rate": 1.8203152580466187e-05, + "loss": 1.2549, + "step": 7312 + }, + { + "epoch": 2.178149258176809, + "grad_norm": 0.23448750376701355, + "learning_rate": 1.820260089561353e-05, + "loss": 1.247, + "step": 7313 + }, + { + "epoch": 2.1784471043764775, + "grad_norm": 0.2387056052684784, + "learning_rate": 1.8202049134444837e-05, + "loss": 1.2543, + "step": 7314 + }, + { + "epoch": 2.1787449505761463, + "grad_norm": 0.2189805805683136, + "learning_rate": 1.820149729696524e-05, + "loss": 1.2484, + "step": 7315 + }, + { + "epoch": 2.179042796775815, + "grad_norm": 0.22526004910469055, + "learning_rate": 1.820094538317987e-05, + "loss": 1.2571, + "step": 7316 + }, + { + "epoch": 2.1793406429754834, + "grad_norm": 0.23390717804431915, + "learning_rate": 1.820039339309387e-05, + "loss": 1.2411, + "step": 7317 + }, + { + "epoch": 2.179638489175152, + "grad_norm": 0.22101286053657532, + "learning_rate": 1.8199841326712368e-05, + "loss": 1.2657, + "step": 7318 + }, + { + "epoch": 2.179936335374821, + "grad_norm": 0.24784044921398163, + "learning_rate": 1.8199289184040507e-05, + "loss": 1.2403, + "step": 7319 + }, + { + "epoch": 2.1802341815744897, + "grad_norm": 0.22040356695652008, + "learning_rate": 1.819873696508342e-05, + "loss": 1.2663, + "step": 7320 + }, + { + "epoch": 2.180532027774158, + "grad_norm": 0.23102213442325592, + "learning_rate": 1.8198184669846243e-05, + "loss": 1.2511, + "step": 7321 + }, + { + "epoch": 2.1808298739738268, + "grad_norm": 0.233845517039299, + "learning_rate": 1.819763229833412e-05, + "loss": 1.2594, + "step": 7322 + }, + { + "epoch": 2.1811277201734955, + "grad_norm": 0.22688744962215424, + "learning_rate": 1.8197079850552188e-05, + "loss": 1.2525, + "step": 7323 + }, + { + "epoch": 2.181425566373164, + "grad_norm": 0.21882183849811554, + "learning_rate": 1.8196527326505585e-05, + "loss": 1.2547, + "step": 7324 + }, + { + "epoch": 2.1817234125728326, + "grad_norm": 0.23973548412322998, + "learning_rate": 1.8195974726199454e-05, + "loss": 1.2478, + "step": 7325 + }, + { + "epoch": 2.1820212587725014, + "grad_norm": 0.23163484036922455, + "learning_rate": 1.8195422049638935e-05, + "loss": 1.2569, + "step": 7326 + }, + { + "epoch": 2.18231910497217, + "grad_norm": 0.23362986743450165, + "learning_rate": 1.8194869296829167e-05, + "loss": 1.2596, + "step": 7327 + }, + { + "epoch": 2.1826169511718385, + "grad_norm": 0.2282031625509262, + "learning_rate": 1.81943164677753e-05, + "loss": 1.2459, + "step": 7328 + }, + { + "epoch": 2.1829147973715073, + "grad_norm": 0.22614647448062897, + "learning_rate": 1.819376356248247e-05, + "loss": 1.2479, + "step": 7329 + }, + { + "epoch": 2.183212643571176, + "grad_norm": 0.22355595231056213, + "learning_rate": 1.819321058095583e-05, + "loss": 1.2726, + "step": 7330 + }, + { + "epoch": 2.1835104897708444, + "grad_norm": 0.2299981415271759, + "learning_rate": 1.8192657523200514e-05, + "loss": 1.255, + "step": 7331 + }, + { + "epoch": 2.183808335970513, + "grad_norm": 0.22681733965873718, + "learning_rate": 1.8192104389221677e-05, + "loss": 1.2557, + "step": 7332 + }, + { + "epoch": 2.184106182170182, + "grad_norm": 0.2261442244052887, + "learning_rate": 1.8191551179024462e-05, + "loss": 1.2635, + "step": 7333 + }, + { + "epoch": 2.1844040283698507, + "grad_norm": 0.2253769338130951, + "learning_rate": 1.819099789261401e-05, + "loss": 1.266, + "step": 7334 + }, + { + "epoch": 2.184701874569519, + "grad_norm": 0.224657341837883, + "learning_rate": 1.819044452999548e-05, + "loss": 1.2601, + "step": 7335 + }, + { + "epoch": 2.184999720769188, + "grad_norm": 0.22582083940505981, + "learning_rate": 1.818989109117401e-05, + "loss": 1.2542, + "step": 7336 + }, + { + "epoch": 2.1852975669688566, + "grad_norm": 0.22553744912147522, + "learning_rate": 1.818933757615476e-05, + "loss": 1.2606, + "step": 7337 + }, + { + "epoch": 2.185595413168525, + "grad_norm": 0.2209191471338272, + "learning_rate": 1.8188783984942865e-05, + "loss": 1.2512, + "step": 7338 + }, + { + "epoch": 2.1858932593681937, + "grad_norm": 0.23108714818954468, + "learning_rate": 1.818823031754349e-05, + "loss": 1.2642, + "step": 7339 + }, + { + "epoch": 2.1861911055678624, + "grad_norm": 0.22774134576320648, + "learning_rate": 1.818767657396178e-05, + "loss": 1.2722, + "step": 7340 + }, + { + "epoch": 2.186488951767531, + "grad_norm": 0.22755049169063568, + "learning_rate": 1.8187122754202884e-05, + "loss": 1.2588, + "step": 7341 + }, + { + "epoch": 2.1867867979671995, + "grad_norm": 0.22437641024589539, + "learning_rate": 1.818656885827196e-05, + "loss": 1.2487, + "step": 7342 + }, + { + "epoch": 2.1870846441668683, + "grad_norm": 0.23329325020313263, + "learning_rate": 1.818601488617416e-05, + "loss": 1.2527, + "step": 7343 + }, + { + "epoch": 2.187382490366537, + "grad_norm": 0.22857099771499634, + "learning_rate": 1.8185460837914635e-05, + "loss": 1.2625, + "step": 7344 + }, + { + "epoch": 2.1876803365662054, + "grad_norm": 0.2302483767271042, + "learning_rate": 1.8184906713498544e-05, + "loss": 1.2655, + "step": 7345 + }, + { + "epoch": 2.187978182765874, + "grad_norm": 0.23071642220020294, + "learning_rate": 1.8184352512931044e-05, + "loss": 1.2666, + "step": 7346 + }, + { + "epoch": 2.188276028965543, + "grad_norm": 0.23209547996520996, + "learning_rate": 1.818379823621728e-05, + "loss": 1.264, + "step": 7347 + }, + { + "epoch": 2.1885738751652117, + "grad_norm": 0.2276519536972046, + "learning_rate": 1.8183243883362424e-05, + "loss": 1.2591, + "step": 7348 + }, + { + "epoch": 2.18887172136488, + "grad_norm": 0.23050251603126526, + "learning_rate": 1.8182689454371622e-05, + "loss": 1.25, + "step": 7349 + }, + { + "epoch": 2.189169567564549, + "grad_norm": 0.2253330498933792, + "learning_rate": 1.818213494925004e-05, + "loss": 1.2655, + "step": 7350 + }, + { + "epoch": 2.1894674137642176, + "grad_norm": 0.2334011048078537, + "learning_rate": 1.8181580368002833e-05, + "loss": 1.2629, + "step": 7351 + }, + { + "epoch": 2.1897652599638864, + "grad_norm": 0.22390751540660858, + "learning_rate": 1.8181025710635163e-05, + "loss": 1.249, + "step": 7352 + }, + { + "epoch": 2.1900631061635547, + "grad_norm": 0.2347511351108551, + "learning_rate": 1.8180470977152188e-05, + "loss": 1.2778, + "step": 7353 + }, + { + "epoch": 2.1903609523632235, + "grad_norm": 0.22769004106521606, + "learning_rate": 1.8179916167559067e-05, + "loss": 1.2693, + "step": 7354 + }, + { + "epoch": 2.1906587985628923, + "grad_norm": 0.2368936538696289, + "learning_rate": 1.817936128186097e-05, + "loss": 1.2525, + "step": 7355 + }, + { + "epoch": 2.1909566447625606, + "grad_norm": 0.2429242581129074, + "learning_rate": 1.8178806320063054e-05, + "loss": 1.2667, + "step": 7356 + }, + { + "epoch": 2.1912544909622294, + "grad_norm": 0.2240075021982193, + "learning_rate": 1.817825128217048e-05, + "loss": 1.2665, + "step": 7357 + }, + { + "epoch": 2.191552337161898, + "grad_norm": 0.22910600900650024, + "learning_rate": 1.8177696168188417e-05, + "loss": 1.2589, + "step": 7358 + }, + { + "epoch": 2.1918501833615665, + "grad_norm": 0.2249964326620102, + "learning_rate": 1.817714097812203e-05, + "loss": 1.2532, + "step": 7359 + }, + { + "epoch": 2.1921480295612352, + "grad_norm": 0.23178963363170624, + "learning_rate": 1.817658571197648e-05, + "loss": 1.2586, + "step": 7360 + }, + { + "epoch": 2.192445875760904, + "grad_norm": 0.25021347403526306, + "learning_rate": 1.8176030369756935e-05, + "loss": 1.2577, + "step": 7361 + }, + { + "epoch": 2.1927437219605728, + "grad_norm": 0.2263617217540741, + "learning_rate": 1.8175474951468564e-05, + "loss": 1.2564, + "step": 7362 + }, + { + "epoch": 2.193041568160241, + "grad_norm": 0.24502508342266083, + "learning_rate": 1.8174919457116532e-05, + "loss": 1.2547, + "step": 7363 + }, + { + "epoch": 2.19333941435991, + "grad_norm": 0.23017960786819458, + "learning_rate": 1.8174363886706004e-05, + "loss": 1.255, + "step": 7364 + }, + { + "epoch": 2.1936372605595786, + "grad_norm": 0.2292788326740265, + "learning_rate": 1.8173808240242156e-05, + "loss": 1.2461, + "step": 7365 + }, + { + "epoch": 2.1939351067592474, + "grad_norm": 0.22475972771644592, + "learning_rate": 1.817325251773016e-05, + "loss": 1.2611, + "step": 7366 + }, + { + "epoch": 2.1942329529589157, + "grad_norm": 0.23396527767181396, + "learning_rate": 1.8172696719175172e-05, + "loss": 1.2573, + "step": 7367 + }, + { + "epoch": 2.1945307991585845, + "grad_norm": 0.24701645970344543, + "learning_rate": 1.8172140844582377e-05, + "loss": 1.2705, + "step": 7368 + }, + { + "epoch": 2.1948286453582533, + "grad_norm": 0.24896377325057983, + "learning_rate": 1.8171584893956943e-05, + "loss": 1.254, + "step": 7369 + }, + { + "epoch": 2.1951264915579216, + "grad_norm": 0.24997586011886597, + "learning_rate": 1.817102886730404e-05, + "loss": 1.2639, + "step": 7370 + }, + { + "epoch": 2.1954243377575904, + "grad_norm": 0.22887980937957764, + "learning_rate": 1.817047276462884e-05, + "loss": 1.2589, + "step": 7371 + }, + { + "epoch": 2.195722183957259, + "grad_norm": 0.22723832726478577, + "learning_rate": 1.8169916585936523e-05, + "loss": 1.2462, + "step": 7372 + }, + { + "epoch": 2.1960200301569275, + "grad_norm": 0.25205981731414795, + "learning_rate": 1.8169360331232258e-05, + "loss": 1.2354, + "step": 7373 + }, + { + "epoch": 2.1963178763565963, + "grad_norm": 0.23514196276664734, + "learning_rate": 1.8168804000521222e-05, + "loss": 1.2519, + "step": 7374 + }, + { + "epoch": 2.196615722556265, + "grad_norm": 0.23175553977489471, + "learning_rate": 1.8168247593808594e-05, + "loss": 1.2463, + "step": 7375 + }, + { + "epoch": 2.196913568755934, + "grad_norm": 0.22872765362262726, + "learning_rate": 1.816769111109955e-05, + "loss": 1.2668, + "step": 7376 + }, + { + "epoch": 2.197211414955602, + "grad_norm": 0.22961491346359253, + "learning_rate": 1.816713455239926e-05, + "loss": 1.238, + "step": 7377 + }, + { + "epoch": 2.197509261155271, + "grad_norm": 0.233178973197937, + "learning_rate": 1.816657791771291e-05, + "loss": 1.2572, + "step": 7378 + }, + { + "epoch": 2.1978071073549397, + "grad_norm": 0.23030194640159607, + "learning_rate": 1.816602120704568e-05, + "loss": 1.2474, + "step": 7379 + }, + { + "epoch": 2.1981049535546084, + "grad_norm": 0.23031818866729736, + "learning_rate": 1.8165464420402742e-05, + "loss": 1.2478, + "step": 7380 + }, + { + "epoch": 2.1984027997542768, + "grad_norm": 0.2297825664281845, + "learning_rate": 1.816490755778928e-05, + "loss": 1.2449, + "step": 7381 + }, + { + "epoch": 2.1987006459539455, + "grad_norm": 0.2581939995288849, + "learning_rate": 1.816435061921048e-05, + "loss": 1.2573, + "step": 7382 + }, + { + "epoch": 2.1989984921536143, + "grad_norm": 0.23687393963336945, + "learning_rate": 1.8163793604671516e-05, + "loss": 1.2747, + "step": 7383 + }, + { + "epoch": 2.1992963383532826, + "grad_norm": 0.22601406276226044, + "learning_rate": 1.8163236514177575e-05, + "loss": 1.258, + "step": 7384 + }, + { + "epoch": 2.1995941845529514, + "grad_norm": 0.2287813276052475, + "learning_rate": 1.816267934773384e-05, + "loss": 1.2549, + "step": 7385 + }, + { + "epoch": 2.19989203075262, + "grad_norm": 0.2404824048280716, + "learning_rate": 1.816212210534549e-05, + "loss": 1.264, + "step": 7386 + }, + { + "epoch": 2.200189876952289, + "grad_norm": 0.2244536280632019, + "learning_rate": 1.8161564787017716e-05, + "loss": 1.2583, + "step": 7387 + }, + { + "epoch": 2.2004877231519573, + "grad_norm": 0.23286356031894684, + "learning_rate": 1.81610073927557e-05, + "loss": 1.2674, + "step": 7388 + }, + { + "epoch": 2.200785569351626, + "grad_norm": 0.2290877103805542, + "learning_rate": 1.8160449922564627e-05, + "loss": 1.2494, + "step": 7389 + }, + { + "epoch": 2.201083415551295, + "grad_norm": 0.2407354712486267, + "learning_rate": 1.8159892376449685e-05, + "loss": 1.2622, + "step": 7390 + }, + { + "epoch": 2.201381261750963, + "grad_norm": 0.22834835946559906, + "learning_rate": 1.815933475441606e-05, + "loss": 1.259, + "step": 7391 + }, + { + "epoch": 2.201679107950632, + "grad_norm": 0.23079723119735718, + "learning_rate": 1.8158777056468942e-05, + "loss": 1.2622, + "step": 7392 + }, + { + "epoch": 2.2019769541503007, + "grad_norm": 0.22950461506843567, + "learning_rate": 1.815821928261352e-05, + "loss": 1.2458, + "step": 7393 + }, + { + "epoch": 2.2022748003499695, + "grad_norm": 0.2347557693719864, + "learning_rate": 1.8157661432854982e-05, + "loss": 1.258, + "step": 7394 + }, + { + "epoch": 2.202572646549638, + "grad_norm": 0.24329420924186707, + "learning_rate": 1.8157103507198522e-05, + "loss": 1.2569, + "step": 7395 + }, + { + "epoch": 2.2028704927493066, + "grad_norm": 0.23572076857089996, + "learning_rate": 1.8156545505649323e-05, + "loss": 1.2505, + "step": 7396 + }, + { + "epoch": 2.2031683389489753, + "grad_norm": 0.23422683775424957, + "learning_rate": 1.815598742821258e-05, + "loss": 1.2532, + "step": 7397 + }, + { + "epoch": 2.2034661851486437, + "grad_norm": 0.22762665152549744, + "learning_rate": 1.8155429274893493e-05, + "loss": 1.2584, + "step": 7398 + }, + { + "epoch": 2.2037640313483124, + "grad_norm": 0.2506447732448578, + "learning_rate": 1.8154871045697243e-05, + "loss": 1.2506, + "step": 7399 + }, + { + "epoch": 2.204061877547981, + "grad_norm": 0.24103914201259613, + "learning_rate": 1.815431274062903e-05, + "loss": 1.262, + "step": 7400 + }, + { + "epoch": 2.20435972374765, + "grad_norm": 0.23970681428909302, + "learning_rate": 1.815375435969405e-05, + "loss": 1.2573, + "step": 7401 + }, + { + "epoch": 2.2046575699473183, + "grad_norm": 0.2336505949497223, + "learning_rate": 1.8153195902897495e-05, + "loss": 1.2459, + "step": 7402 + }, + { + "epoch": 2.204955416146987, + "grad_norm": 0.27376458048820496, + "learning_rate": 1.8152637370244557e-05, + "loss": 1.2671, + "step": 7403 + }, + { + "epoch": 2.205253262346656, + "grad_norm": 0.2327093482017517, + "learning_rate": 1.8152078761740438e-05, + "loss": 1.2613, + "step": 7404 + }, + { + "epoch": 2.205551108546324, + "grad_norm": 0.2462681084871292, + "learning_rate": 1.815152007739034e-05, + "loss": 1.2664, + "step": 7405 + }, + { + "epoch": 2.205848954745993, + "grad_norm": 0.2664211690425873, + "learning_rate": 1.815096131719945e-05, + "loss": 1.2528, + "step": 7406 + }, + { + "epoch": 2.2061468009456617, + "grad_norm": 0.246301531791687, + "learning_rate": 1.8150402481172973e-05, + "loss": 1.2496, + "step": 7407 + }, + { + "epoch": 2.2064446471453305, + "grad_norm": 0.25915271043777466, + "learning_rate": 1.8149843569316107e-05, + "loss": 1.2721, + "step": 7408 + }, + { + "epoch": 2.206742493344999, + "grad_norm": 0.23023463785648346, + "learning_rate": 1.814928458163405e-05, + "loss": 1.2565, + "step": 7409 + }, + { + "epoch": 2.2070403395446676, + "grad_norm": 0.2400447577238083, + "learning_rate": 1.8148725518132005e-05, + "loss": 1.2508, + "step": 7410 + }, + { + "epoch": 2.2073381857443364, + "grad_norm": 0.226594939827919, + "learning_rate": 1.8148166378815178e-05, + "loss": 1.269, + "step": 7411 + }, + { + "epoch": 2.2076360319440047, + "grad_norm": 0.2427109032869339, + "learning_rate": 1.8147607163688763e-05, + "loss": 1.2441, + "step": 7412 + }, + { + "epoch": 2.2079338781436735, + "grad_norm": 0.23050576448440552, + "learning_rate": 1.8147047872757964e-05, + "loss": 1.2628, + "step": 7413 + }, + { + "epoch": 2.2082317243433423, + "grad_norm": 0.23181301355361938, + "learning_rate": 1.8146488506027996e-05, + "loss": 1.2536, + "step": 7414 + }, + { + "epoch": 2.208529570543011, + "grad_norm": 0.22633394598960876, + "learning_rate": 1.8145929063504043e-05, + "loss": 1.2426, + "step": 7415 + }, + { + "epoch": 2.2088274167426794, + "grad_norm": 0.22863715887069702, + "learning_rate": 1.814536954519133e-05, + "loss": 1.2457, + "step": 7416 + }, + { + "epoch": 2.209125262942348, + "grad_norm": 0.23071017861366272, + "learning_rate": 1.8144809951095052e-05, + "loss": 1.2585, + "step": 7417 + }, + { + "epoch": 2.209423109142017, + "grad_norm": 0.2343638688325882, + "learning_rate": 1.8144250281220412e-05, + "loss": 1.2429, + "step": 7418 + }, + { + "epoch": 2.2097209553416857, + "grad_norm": 0.23640675842761993, + "learning_rate": 1.814369053557263e-05, + "loss": 1.2481, + "step": 7419 + }, + { + "epoch": 2.210018801541354, + "grad_norm": 0.23015083372592926, + "learning_rate": 1.81431307141569e-05, + "loss": 1.2751, + "step": 7420 + }, + { + "epoch": 2.2103166477410228, + "grad_norm": 0.22590556740760803, + "learning_rate": 1.814257081697844e-05, + "loss": 1.2459, + "step": 7421 + }, + { + "epoch": 2.2106144939406915, + "grad_norm": 0.23690733313560486, + "learning_rate": 1.8142010844042454e-05, + "loss": 1.2473, + "step": 7422 + }, + { + "epoch": 2.21091234014036, + "grad_norm": 0.22188520431518555, + "learning_rate": 1.8141450795354155e-05, + "loss": 1.2513, + "step": 7423 + }, + { + "epoch": 2.2112101863400286, + "grad_norm": 0.2462684065103531, + "learning_rate": 1.8140890670918755e-05, + "loss": 1.2594, + "step": 7424 + }, + { + "epoch": 2.2115080325396974, + "grad_norm": 0.23598931729793549, + "learning_rate": 1.814033047074146e-05, + "loss": 1.2569, + "step": 7425 + }, + { + "epoch": 2.2118058787393657, + "grad_norm": 0.24653668701648712, + "learning_rate": 1.8139770194827485e-05, + "loss": 1.2657, + "step": 7426 + }, + { + "epoch": 2.2121037249390345, + "grad_norm": 0.27435415983200073, + "learning_rate": 1.8139209843182043e-05, + "loss": 1.2661, + "step": 7427 + }, + { + "epoch": 2.2124015711387033, + "grad_norm": 0.22956374287605286, + "learning_rate": 1.8138649415810348e-05, + "loss": 1.2546, + "step": 7428 + }, + { + "epoch": 2.212699417338372, + "grad_norm": 0.33536839485168457, + "learning_rate": 1.813808891271761e-05, + "loss": 1.2586, + "step": 7429 + }, + { + "epoch": 2.2129972635380404, + "grad_norm": 0.2709580063819885, + "learning_rate": 1.8137528333909048e-05, + "loss": 1.276, + "step": 7430 + }, + { + "epoch": 2.213295109737709, + "grad_norm": 0.2404656857252121, + "learning_rate": 1.813696767938988e-05, + "loss": 1.2687, + "step": 7431 + }, + { + "epoch": 2.213592955937378, + "grad_norm": 0.23287378251552582, + "learning_rate": 1.8136406949165315e-05, + "loss": 1.2365, + "step": 7432 + }, + { + "epoch": 2.2138908021370467, + "grad_norm": 0.255045086145401, + "learning_rate": 1.8135846143240575e-05, + "loss": 1.2567, + "step": 7433 + }, + { + "epoch": 2.214188648336715, + "grad_norm": 0.234689861536026, + "learning_rate": 1.8135285261620882e-05, + "loss": 1.2636, + "step": 7434 + }, + { + "epoch": 2.214486494536384, + "grad_norm": 0.23581352829933167, + "learning_rate": 1.8134724304311443e-05, + "loss": 1.2582, + "step": 7435 + }, + { + "epoch": 2.2147843407360526, + "grad_norm": 0.2270398736000061, + "learning_rate": 1.8134163271317483e-05, + "loss": 1.2559, + "step": 7436 + }, + { + "epoch": 2.215082186935721, + "grad_norm": 0.24003979563713074, + "learning_rate": 1.8133602162644225e-05, + "loss": 1.2608, + "step": 7437 + }, + { + "epoch": 2.2153800331353897, + "grad_norm": 0.23116470873355865, + "learning_rate": 1.813304097829688e-05, + "loss": 1.2734, + "step": 7438 + }, + { + "epoch": 2.2156778793350584, + "grad_norm": 0.25134536623954773, + "learning_rate": 1.813247971828068e-05, + "loss": 1.2421, + "step": 7439 + }, + { + "epoch": 2.2159757255347268, + "grad_norm": 0.2228419929742813, + "learning_rate": 1.8131918382600843e-05, + "loss": 1.2741, + "step": 7440 + }, + { + "epoch": 2.2162735717343955, + "grad_norm": 0.22911174595355988, + "learning_rate": 1.813135697126259e-05, + "loss": 1.2747, + "step": 7441 + }, + { + "epoch": 2.2165714179340643, + "grad_norm": 0.22643044590950012, + "learning_rate": 1.8130795484271147e-05, + "loss": 1.2472, + "step": 7442 + }, + { + "epoch": 2.216869264133733, + "grad_norm": 0.2381702959537506, + "learning_rate": 1.8130233921631733e-05, + "loss": 1.26, + "step": 7443 + }, + { + "epoch": 2.2171671103334014, + "grad_norm": 0.2607860863208771, + "learning_rate": 1.8129672283349577e-05, + "loss": 1.2545, + "step": 7444 + }, + { + "epoch": 2.21746495653307, + "grad_norm": 0.22908510267734528, + "learning_rate": 1.8129110569429906e-05, + "loss": 1.2681, + "step": 7445 + }, + { + "epoch": 2.217762802732739, + "grad_norm": 0.2557067573070526, + "learning_rate": 1.812854877987794e-05, + "loss": 1.2713, + "step": 7446 + }, + { + "epoch": 2.2180606489324077, + "grad_norm": 0.24011479318141937, + "learning_rate": 1.812798691469891e-05, + "loss": 1.2477, + "step": 7447 + }, + { + "epoch": 2.218358495132076, + "grad_norm": 0.2298644781112671, + "learning_rate": 1.8127424973898046e-05, + "loss": 1.2533, + "step": 7448 + }, + { + "epoch": 2.218656341331745, + "grad_norm": 0.23344075679779053, + "learning_rate": 1.8126862957480572e-05, + "loss": 1.2462, + "step": 7449 + }, + { + "epoch": 2.2189541875314136, + "grad_norm": 0.24125604331493378, + "learning_rate": 1.8126300865451716e-05, + "loss": 1.2507, + "step": 7450 + }, + { + "epoch": 2.219252033731082, + "grad_norm": 0.24440434575080872, + "learning_rate": 1.812573869781671e-05, + "loss": 1.256, + "step": 7451 + }, + { + "epoch": 2.2195498799307507, + "grad_norm": 0.24648922681808472, + "learning_rate": 1.8125176454580785e-05, + "loss": 1.2559, + "step": 7452 + }, + { + "epoch": 2.2198477261304195, + "grad_norm": 0.3261624276638031, + "learning_rate": 1.812461413574917e-05, + "loss": 1.2543, + "step": 7453 + }, + { + "epoch": 2.2201455723300882, + "grad_norm": 0.2771762013435364, + "learning_rate": 1.81240517413271e-05, + "loss": 1.245, + "step": 7454 + }, + { + "epoch": 2.2204434185297566, + "grad_norm": 0.2676098942756653, + "learning_rate": 1.81234892713198e-05, + "loss": 1.2548, + "step": 7455 + }, + { + "epoch": 2.2207412647294253, + "grad_norm": 0.33100032806396484, + "learning_rate": 1.8122926725732513e-05, + "loss": 1.2602, + "step": 7456 + }, + { + "epoch": 2.221039110929094, + "grad_norm": 0.23672987520694733, + "learning_rate": 1.812236410457047e-05, + "loss": 1.2741, + "step": 7457 + }, + { + "epoch": 2.2213369571287624, + "grad_norm": 0.238791361451149, + "learning_rate": 1.8121801407838903e-05, + "loss": 1.2719, + "step": 7458 + }, + { + "epoch": 2.221634803328431, + "grad_norm": 0.23731465637683868, + "learning_rate": 1.8121238635543043e-05, + "loss": 1.2602, + "step": 7459 + }, + { + "epoch": 2.2219326495281, + "grad_norm": 0.22845380008220673, + "learning_rate": 1.8120675787688134e-05, + "loss": 1.256, + "step": 7460 + }, + { + "epoch": 2.2222304957277688, + "grad_norm": 0.24039316177368164, + "learning_rate": 1.812011286427941e-05, + "loss": 1.2407, + "step": 7461 + }, + { + "epoch": 2.222528341927437, + "grad_norm": 0.22779229283332825, + "learning_rate": 1.811954986532211e-05, + "loss": 1.2619, + "step": 7462 + }, + { + "epoch": 2.222826188127106, + "grad_norm": 0.2333533763885498, + "learning_rate": 1.8118986790821468e-05, + "loss": 1.2559, + "step": 7463 + }, + { + "epoch": 2.2231240343267746, + "grad_norm": 0.23861506581306458, + "learning_rate": 1.8118423640782724e-05, + "loss": 1.2662, + "step": 7464 + }, + { + "epoch": 2.223421880526443, + "grad_norm": 0.22525864839553833, + "learning_rate": 1.811786041521112e-05, + "loss": 1.2527, + "step": 7465 + }, + { + "epoch": 2.2237197267261117, + "grad_norm": 0.23941968381404877, + "learning_rate": 1.8117297114111894e-05, + "loss": 1.2569, + "step": 7466 + }, + { + "epoch": 2.2240175729257805, + "grad_norm": 0.2459057718515396, + "learning_rate": 1.8116733737490292e-05, + "loss": 1.2558, + "step": 7467 + }, + { + "epoch": 2.2243154191254493, + "grad_norm": 0.2310280203819275, + "learning_rate": 1.8116170285351545e-05, + "loss": 1.2646, + "step": 7468 + }, + { + "epoch": 2.2246132653251176, + "grad_norm": 0.22232785820960999, + "learning_rate": 1.81156067577009e-05, + "loss": 1.2502, + "step": 7469 + }, + { + "epoch": 2.2249111115247864, + "grad_norm": 0.23295165598392487, + "learning_rate": 1.811504315454361e-05, + "loss": 1.2465, + "step": 7470 + }, + { + "epoch": 2.225208957724455, + "grad_norm": 0.24131250381469727, + "learning_rate": 1.8114479475884906e-05, + "loss": 1.2508, + "step": 7471 + }, + { + "epoch": 2.2255068039241235, + "grad_norm": 0.24913492798805237, + "learning_rate": 1.8113915721730036e-05, + "loss": 1.254, + "step": 7472 + }, + { + "epoch": 2.2258046501237922, + "grad_norm": 0.24638479948043823, + "learning_rate": 1.8113351892084242e-05, + "loss": 1.2634, + "step": 7473 + }, + { + "epoch": 2.226102496323461, + "grad_norm": 0.22303815186023712, + "learning_rate": 1.8112787986952776e-05, + "loss": 1.2719, + "step": 7474 + }, + { + "epoch": 2.22640034252313, + "grad_norm": 0.2328311800956726, + "learning_rate": 1.8112224006340887e-05, + "loss": 1.2465, + "step": 7475 + }, + { + "epoch": 2.226698188722798, + "grad_norm": 0.23571890592575073, + "learning_rate": 1.811165995025381e-05, + "loss": 1.2768, + "step": 7476 + }, + { + "epoch": 2.226996034922467, + "grad_norm": 0.25322532653808594, + "learning_rate": 1.8111095818696805e-05, + "loss": 1.2453, + "step": 7477 + }, + { + "epoch": 2.2272938811221357, + "grad_norm": 0.276750385761261, + "learning_rate": 1.8110531611675112e-05, + "loss": 1.2583, + "step": 7478 + }, + { + "epoch": 2.227591727321804, + "grad_norm": 0.28301140666007996, + "learning_rate": 1.8109967329193986e-05, + "loss": 1.2395, + "step": 7479 + }, + { + "epoch": 2.2278895735214728, + "grad_norm": 0.24565206468105316, + "learning_rate": 1.8109402971258676e-05, + "loss": 1.2594, + "step": 7480 + }, + { + "epoch": 2.2281874197211415, + "grad_norm": 0.3332265615463257, + "learning_rate": 1.8108838537874428e-05, + "loss": 1.2633, + "step": 7481 + }, + { + "epoch": 2.2284852659208103, + "grad_norm": 0.2968139946460724, + "learning_rate": 1.81082740290465e-05, + "loss": 1.2752, + "step": 7482 + }, + { + "epoch": 2.2287831121204786, + "grad_norm": 0.2606860399246216, + "learning_rate": 1.810770944478014e-05, + "loss": 1.2588, + "step": 7483 + }, + { + "epoch": 2.2290809583201474, + "grad_norm": 0.2384524643421173, + "learning_rate": 1.8107144785080604e-05, + "loss": 1.2421, + "step": 7484 + }, + { + "epoch": 2.229378804519816, + "grad_norm": 0.28585508465766907, + "learning_rate": 1.810658004995314e-05, + "loss": 1.2511, + "step": 7485 + }, + { + "epoch": 2.229676650719485, + "grad_norm": 0.29733842611312866, + "learning_rate": 1.810601523940301e-05, + "loss": 1.2553, + "step": 7486 + }, + { + "epoch": 2.2299744969191533, + "grad_norm": 0.23562990128993988, + "learning_rate": 1.8105450353435463e-05, + "loss": 1.2835, + "step": 7487 + }, + { + "epoch": 2.230272343118822, + "grad_norm": 0.23298020660877228, + "learning_rate": 1.8104885392055755e-05, + "loss": 1.2612, + "step": 7488 + }, + { + "epoch": 2.230570189318491, + "grad_norm": 0.23380275070667267, + "learning_rate": 1.8104320355269145e-05, + "loss": 1.2566, + "step": 7489 + }, + { + "epoch": 2.230868035518159, + "grad_norm": 0.2470693737268448, + "learning_rate": 1.8103755243080893e-05, + "loss": 1.2517, + "step": 7490 + }, + { + "epoch": 2.231165881717828, + "grad_norm": 0.22034691274166107, + "learning_rate": 1.8103190055496246e-05, + "loss": 1.2417, + "step": 7491 + }, + { + "epoch": 2.2314637279174967, + "grad_norm": 0.23405425250530243, + "learning_rate": 1.8102624792520472e-05, + "loss": 1.2783, + "step": 7492 + }, + { + "epoch": 2.231761574117165, + "grad_norm": 0.2418336421251297, + "learning_rate": 1.8102059454158824e-05, + "loss": 1.2496, + "step": 7493 + }, + { + "epoch": 2.232059420316834, + "grad_norm": 0.23358824849128723, + "learning_rate": 1.8101494040416566e-05, + "loss": 1.2414, + "step": 7494 + }, + { + "epoch": 2.2323572665165026, + "grad_norm": 0.2360301911830902, + "learning_rate": 1.810092855129896e-05, + "loss": 1.2538, + "step": 7495 + }, + { + "epoch": 2.2326551127161713, + "grad_norm": 0.2237144261598587, + "learning_rate": 1.8100362986811262e-05, + "loss": 1.2534, + "step": 7496 + }, + { + "epoch": 2.2329529589158397, + "grad_norm": 0.23044529557228088, + "learning_rate": 1.809979734695874e-05, + "loss": 1.2422, + "step": 7497 + }, + { + "epoch": 2.2332508051155084, + "grad_norm": 0.24516120553016663, + "learning_rate": 1.809923163174665e-05, + "loss": 1.2648, + "step": 7498 + }, + { + "epoch": 2.233548651315177, + "grad_norm": 0.24140438437461853, + "learning_rate": 1.8098665841180262e-05, + "loss": 1.2455, + "step": 7499 + }, + { + "epoch": 2.233846497514846, + "grad_norm": 0.22544613480567932, + "learning_rate": 1.8098099975264834e-05, + "loss": 1.2529, + "step": 7500 + }, + { + "epoch": 2.233846497514846, + "eval_loss": 1.343520164489746, + "eval_runtime": 20.7156, + "eval_samples_per_second": 83.705, + "eval_steps_per_second": 5.262, + "step": 7500 + }, + { + "epoch": 2.2341443437145143, + "grad_norm": 0.23673491179943085, + "learning_rate": 1.8097534034005636e-05, + "loss": 1.2488, + "step": 7501 + }, + { + "epoch": 2.234442189914183, + "grad_norm": 0.22079885005950928, + "learning_rate": 1.809696801740793e-05, + "loss": 1.2609, + "step": 7502 + }, + { + "epoch": 2.234740036113852, + "grad_norm": 0.2393401712179184, + "learning_rate": 1.809640192547698e-05, + "loss": 1.2668, + "step": 7503 + }, + { + "epoch": 2.23503788231352, + "grad_norm": 0.23307958245277405, + "learning_rate": 1.809583575821806e-05, + "loss": 1.238, + "step": 7504 + }, + { + "epoch": 2.235335728513189, + "grad_norm": 0.2271372377872467, + "learning_rate": 1.809526951563643e-05, + "loss": 1.2503, + "step": 7505 + }, + { + "epoch": 2.2356335747128577, + "grad_norm": 0.23181135952472687, + "learning_rate": 1.8094703197737364e-05, + "loss": 1.2561, + "step": 7506 + }, + { + "epoch": 2.2359314209125265, + "grad_norm": 0.22521166503429413, + "learning_rate": 1.809413680452613e-05, + "loss": 1.2544, + "step": 7507 + }, + { + "epoch": 2.236229267112195, + "grad_norm": 0.22727955877780914, + "learning_rate": 1.8093570336007996e-05, + "loss": 1.2475, + "step": 7508 + }, + { + "epoch": 2.2365271133118636, + "grad_norm": 0.22818197309970856, + "learning_rate": 1.8093003792188227e-05, + "loss": 1.2526, + "step": 7509 + }, + { + "epoch": 2.2368249595115324, + "grad_norm": 0.23357020318508148, + "learning_rate": 1.8092437173072105e-05, + "loss": 1.2754, + "step": 7510 + }, + { + "epoch": 2.2371228057112007, + "grad_norm": 0.23212124407291412, + "learning_rate": 1.8091870478664898e-05, + "loss": 1.2467, + "step": 7511 + }, + { + "epoch": 2.2374206519108695, + "grad_norm": 0.2305627465248108, + "learning_rate": 1.809130370897187e-05, + "loss": 1.282, + "step": 7512 + }, + { + "epoch": 2.2377184981105382, + "grad_norm": 0.23068830370903015, + "learning_rate": 1.8090736863998307e-05, + "loss": 1.2518, + "step": 7513 + }, + { + "epoch": 2.238016344310207, + "grad_norm": 0.23182810842990875, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.2579, + "step": 7514 + }, + { + "epoch": 2.2383141905098753, + "grad_norm": 0.21896368265151978, + "learning_rate": 1.8089602948230653e-05, + "loss": 1.2613, + "step": 7515 + }, + { + "epoch": 2.238612036709544, + "grad_norm": 0.23936767876148224, + "learning_rate": 1.8089035877447114e-05, + "loss": 1.2482, + "step": 7516 + }, + { + "epoch": 2.238909882909213, + "grad_norm": 0.22754739224910736, + "learning_rate": 1.808846873140413e-05, + "loss": 1.2672, + "step": 7517 + }, + { + "epoch": 2.239207729108881, + "grad_norm": 0.2435189187526703, + "learning_rate": 1.808790151010698e-05, + "loss": 1.2532, + "step": 7518 + }, + { + "epoch": 2.23950557530855, + "grad_norm": 0.2510552406311035, + "learning_rate": 1.808733421356095e-05, + "loss": 1.2701, + "step": 7519 + }, + { + "epoch": 2.2398034215082188, + "grad_norm": 0.22681453824043274, + "learning_rate": 1.8086766841771305e-05, + "loss": 1.2505, + "step": 7520 + }, + { + "epoch": 2.2401012677078875, + "grad_norm": 0.25242292881011963, + "learning_rate": 1.808619939474333e-05, + "loss": 1.2735, + "step": 7521 + }, + { + "epoch": 2.240399113907556, + "grad_norm": 0.2699648141860962, + "learning_rate": 1.8085631872482306e-05, + "loss": 1.2509, + "step": 7522 + }, + { + "epoch": 2.2406969601072246, + "grad_norm": 0.22968195378780365, + "learning_rate": 1.8085064274993507e-05, + "loss": 1.2615, + "step": 7523 + }, + { + "epoch": 2.2409948063068934, + "grad_norm": 0.22539275884628296, + "learning_rate": 1.8084496602282223e-05, + "loss": 1.2528, + "step": 7524 + }, + { + "epoch": 2.2412926525065617, + "grad_norm": 0.24039646983146667, + "learning_rate": 1.8083928854353732e-05, + "loss": 1.2634, + "step": 7525 + }, + { + "epoch": 2.2415904987062305, + "grad_norm": 0.23207874596118927, + "learning_rate": 1.808336103121331e-05, + "loss": 1.2787, + "step": 7526 + }, + { + "epoch": 2.2418883449058993, + "grad_norm": 0.23065651953220367, + "learning_rate": 1.808279313286625e-05, + "loss": 1.2545, + "step": 7527 + }, + { + "epoch": 2.242186191105568, + "grad_norm": 0.23620425164699554, + "learning_rate": 1.8082225159317827e-05, + "loss": 1.2435, + "step": 7528 + }, + { + "epoch": 2.2424840373052364, + "grad_norm": 0.22482654452323914, + "learning_rate": 1.8081657110573327e-05, + "loss": 1.2587, + "step": 7529 + }, + { + "epoch": 2.242781883504905, + "grad_norm": 0.228297159075737, + "learning_rate": 1.808108898663804e-05, + "loss": 1.2665, + "step": 7530 + }, + { + "epoch": 2.243079729704574, + "grad_norm": 0.23502565920352936, + "learning_rate": 1.808052078751725e-05, + "loss": 1.2735, + "step": 7531 + }, + { + "epoch": 2.2433775759042422, + "grad_norm": 0.24171894788742065, + "learning_rate": 1.8079952513216238e-05, + "loss": 1.2627, + "step": 7532 + }, + { + "epoch": 2.243675422103911, + "grad_norm": 0.22185218334197998, + "learning_rate": 1.8079384163740296e-05, + "loss": 1.2512, + "step": 7533 + }, + { + "epoch": 2.24397326830358, + "grad_norm": 0.22304552793502808, + "learning_rate": 1.8078815739094714e-05, + "loss": 1.2444, + "step": 7534 + }, + { + "epoch": 2.2442711145032486, + "grad_norm": 0.26106178760528564, + "learning_rate": 1.807824723928478e-05, + "loss": 1.2564, + "step": 7535 + }, + { + "epoch": 2.244568960702917, + "grad_norm": 0.37261083722114563, + "learning_rate": 1.8077678664315775e-05, + "loss": 1.2324, + "step": 7536 + }, + { + "epoch": 2.2448668069025857, + "grad_norm": 0.29698267579078674, + "learning_rate": 1.8077110014192997e-05, + "loss": 1.274, + "step": 7537 + }, + { + "epoch": 2.2451646531022544, + "grad_norm": 0.2707362473011017, + "learning_rate": 1.8076541288921733e-05, + "loss": 1.251, + "step": 7538 + }, + { + "epoch": 2.2454624993019228, + "grad_norm": 0.2235717624425888, + "learning_rate": 1.807597248850728e-05, + "loss": 1.2532, + "step": 7539 + }, + { + "epoch": 2.2457603455015915, + "grad_norm": 0.4530438780784607, + "learning_rate": 1.8075403612954926e-05, + "loss": 1.2535, + "step": 7540 + }, + { + "epoch": 2.2460581917012603, + "grad_norm": 0.23756301403045654, + "learning_rate": 1.8074834662269957e-05, + "loss": 1.261, + "step": 7541 + }, + { + "epoch": 2.246356037900929, + "grad_norm": 0.23115527629852295, + "learning_rate": 1.807426563645768e-05, + "loss": 1.2605, + "step": 7542 + }, + { + "epoch": 2.2466538841005974, + "grad_norm": 0.22856971621513367, + "learning_rate": 1.8073696535523383e-05, + "loss": 1.2416, + "step": 7543 + }, + { + "epoch": 2.246951730300266, + "grad_norm": 0.2366601526737213, + "learning_rate": 1.8073127359472355e-05, + "loss": 1.245, + "step": 7544 + }, + { + "epoch": 2.247249576499935, + "grad_norm": 0.22170701622962952, + "learning_rate": 1.8072558108309902e-05, + "loss": 1.2462, + "step": 7545 + }, + { + "epoch": 2.2475474226996033, + "grad_norm": 0.23127001523971558, + "learning_rate": 1.8071988782041308e-05, + "loss": 1.2541, + "step": 7546 + }, + { + "epoch": 2.247845268899272, + "grad_norm": 0.22585774958133698, + "learning_rate": 1.8071419380671883e-05, + "loss": 1.2622, + "step": 7547 + }, + { + "epoch": 2.248143115098941, + "grad_norm": 0.2309402972459793, + "learning_rate": 1.8070849904206916e-05, + "loss": 1.2733, + "step": 7548 + }, + { + "epoch": 2.2484409612986096, + "grad_norm": 0.22721268236637115, + "learning_rate": 1.807028035265171e-05, + "loss": 1.2347, + "step": 7549 + }, + { + "epoch": 2.248738807498278, + "grad_norm": 0.2178564965724945, + "learning_rate": 1.806971072601156e-05, + "loss": 1.2525, + "step": 7550 + }, + { + "epoch": 2.2490366536979467, + "grad_norm": 0.23110532760620117, + "learning_rate": 1.8069141024291768e-05, + "loss": 1.2599, + "step": 7551 + }, + { + "epoch": 2.2493344998976155, + "grad_norm": 0.2313280552625656, + "learning_rate": 1.8068571247497636e-05, + "loss": 1.246, + "step": 7552 + }, + { + "epoch": 2.2496323460972842, + "grad_norm": 0.22944484651088715, + "learning_rate": 1.806800139563446e-05, + "loss": 1.2483, + "step": 7553 + }, + { + "epoch": 2.2499301922969526, + "grad_norm": 0.23092588782310486, + "learning_rate": 1.806743146870755e-05, + "loss": 1.2585, + "step": 7554 + }, + { + "epoch": 2.2502280384966213, + "grad_norm": 0.22942329943180084, + "learning_rate": 1.80668614667222e-05, + "loss": 1.2677, + "step": 7555 + }, + { + "epoch": 2.25052588469629, + "grad_norm": 0.22068588435649872, + "learning_rate": 1.8066291389683717e-05, + "loss": 1.2677, + "step": 7556 + }, + { + "epoch": 2.2508237308959584, + "grad_norm": 0.22939462959766388, + "learning_rate": 1.8065721237597403e-05, + "loss": 1.2464, + "step": 7557 + }, + { + "epoch": 2.251121577095627, + "grad_norm": 0.2231801152229309, + "learning_rate": 1.806515101046857e-05, + "loss": 1.2502, + "step": 7558 + }, + { + "epoch": 2.251419423295296, + "grad_norm": 0.22507058084011078, + "learning_rate": 1.806458070830251e-05, + "loss": 1.2541, + "step": 7559 + }, + { + "epoch": 2.2517172694949643, + "grad_norm": 0.22735922038555145, + "learning_rate": 1.806401033110454e-05, + "loss": 1.2492, + "step": 7560 + }, + { + "epoch": 2.252015115694633, + "grad_norm": 0.22040493786334991, + "learning_rate": 1.806343987887997e-05, + "loss": 1.2396, + "step": 7561 + }, + { + "epoch": 2.252312961894302, + "grad_norm": 0.23119086027145386, + "learning_rate": 1.8062869351634095e-05, + "loss": 1.2757, + "step": 7562 + }, + { + "epoch": 2.2526108080939706, + "grad_norm": 0.24003298580646515, + "learning_rate": 1.806229874937223e-05, + "loss": 1.2753, + "step": 7563 + }, + { + "epoch": 2.252908654293639, + "grad_norm": 0.2215633988380432, + "learning_rate": 1.8061728072099682e-05, + "loss": 1.2654, + "step": 7564 + }, + { + "epoch": 2.2532065004933077, + "grad_norm": 0.22103771567344666, + "learning_rate": 1.806115731982176e-05, + "loss": 1.2547, + "step": 7565 + }, + { + "epoch": 2.2535043466929765, + "grad_norm": 0.23959650099277496, + "learning_rate": 1.8060586492543777e-05, + "loss": 1.265, + "step": 7566 + }, + { + "epoch": 2.2538021928926453, + "grad_norm": 0.23556892573833466, + "learning_rate": 1.8060015590271045e-05, + "loss": 1.2506, + "step": 7567 + }, + { + "epoch": 2.2541000390923136, + "grad_norm": 0.22280603647232056, + "learning_rate": 1.8059444613008873e-05, + "loss": 1.2708, + "step": 7568 + }, + { + "epoch": 2.2543978852919824, + "grad_norm": 0.2173597514629364, + "learning_rate": 1.8058873560762567e-05, + "loss": 1.2472, + "step": 7569 + }, + { + "epoch": 2.254695731491651, + "grad_norm": 0.22834259271621704, + "learning_rate": 1.8058302433537454e-05, + "loss": 1.2568, + "step": 7570 + }, + { + "epoch": 2.2549935776913195, + "grad_norm": 0.23622368276119232, + "learning_rate": 1.8057731231338836e-05, + "loss": 1.265, + "step": 7571 + }, + { + "epoch": 2.2552914238909882, + "grad_norm": 0.22571687400341034, + "learning_rate": 1.8057159954172032e-05, + "loss": 1.2505, + "step": 7572 + }, + { + "epoch": 2.255589270090657, + "grad_norm": 0.22665946185588837, + "learning_rate": 1.805658860204236e-05, + "loss": 1.2539, + "step": 7573 + }, + { + "epoch": 2.2558871162903253, + "grad_norm": 0.23141254484653473, + "learning_rate": 1.8056017174955127e-05, + "loss": 1.2576, + "step": 7574 + }, + { + "epoch": 2.256184962489994, + "grad_norm": 0.21993815898895264, + "learning_rate": 1.805544567291566e-05, + "loss": 1.247, + "step": 7575 + }, + { + "epoch": 2.256482808689663, + "grad_norm": 0.227073073387146, + "learning_rate": 1.8054874095929267e-05, + "loss": 1.2491, + "step": 7576 + }, + { + "epoch": 2.2567806548893317, + "grad_norm": 0.2353278547525406, + "learning_rate": 1.8054302444001274e-05, + "loss": 1.2501, + "step": 7577 + }, + { + "epoch": 2.257078501089, + "grad_norm": 0.23063941299915314, + "learning_rate": 1.805373071713699e-05, + "loss": 1.2669, + "step": 7578 + }, + { + "epoch": 2.2573763472886688, + "grad_norm": 0.23481705784797668, + "learning_rate": 1.8053158915341743e-05, + "loss": 1.2677, + "step": 7579 + }, + { + "epoch": 2.2576741934883375, + "grad_norm": 0.22463823854923248, + "learning_rate": 1.8052587038620852e-05, + "loss": 1.2582, + "step": 7580 + }, + { + "epoch": 2.2579720396880063, + "grad_norm": 0.22420097887516022, + "learning_rate": 1.8052015086979632e-05, + "loss": 1.2425, + "step": 7581 + }, + { + "epoch": 2.2582698858876746, + "grad_norm": 0.23631395399570465, + "learning_rate": 1.805144306042341e-05, + "loss": 1.2444, + "step": 7582 + }, + { + "epoch": 2.2585677320873434, + "grad_norm": 0.2266579270362854, + "learning_rate": 1.8050870958957504e-05, + "loss": 1.2528, + "step": 7583 + }, + { + "epoch": 2.258865578287012, + "grad_norm": 0.2310836911201477, + "learning_rate": 1.805029878258724e-05, + "loss": 1.2459, + "step": 7584 + }, + { + "epoch": 2.2591634244866805, + "grad_norm": 0.23519113659858704, + "learning_rate": 1.804972653131794e-05, + "loss": 1.2859, + "step": 7585 + }, + { + "epoch": 2.2594612706863493, + "grad_norm": 0.2356254607439041, + "learning_rate": 1.8049154205154928e-05, + "loss": 1.2536, + "step": 7586 + }, + { + "epoch": 2.259759116886018, + "grad_norm": 0.2175767570734024, + "learning_rate": 1.804858180410353e-05, + "loss": 1.2559, + "step": 7587 + }, + { + "epoch": 2.260056963085687, + "grad_norm": 0.23540367186069489, + "learning_rate": 1.804800932816907e-05, + "loss": 1.2646, + "step": 7588 + }, + { + "epoch": 2.260354809285355, + "grad_norm": 0.22534309327602386, + "learning_rate": 1.8047436777356875e-05, + "loss": 1.2613, + "step": 7589 + }, + { + "epoch": 2.260652655485024, + "grad_norm": 0.23676803708076477, + "learning_rate": 1.804686415167227e-05, + "loss": 1.2586, + "step": 7590 + }, + { + "epoch": 2.2609505016846927, + "grad_norm": 0.23621177673339844, + "learning_rate": 1.804629145112059e-05, + "loss": 1.2566, + "step": 7591 + }, + { + "epoch": 2.261248347884361, + "grad_norm": 0.22155267000198364, + "learning_rate": 1.8045718675707152e-05, + "loss": 1.2438, + "step": 7592 + }, + { + "epoch": 2.26154619408403, + "grad_norm": 0.23292042315006256, + "learning_rate": 1.8045145825437294e-05, + "loss": 1.259, + "step": 7593 + }, + { + "epoch": 2.2618440402836986, + "grad_norm": 0.22824744880199432, + "learning_rate": 1.804457290031634e-05, + "loss": 1.2487, + "step": 7594 + }, + { + "epoch": 2.2621418864833673, + "grad_norm": 0.23176303505897522, + "learning_rate": 1.804399990034963e-05, + "loss": 1.2323, + "step": 7595 + }, + { + "epoch": 2.2624397326830357, + "grad_norm": 0.24980776011943817, + "learning_rate": 1.804342682554248e-05, + "loss": 1.2654, + "step": 7596 + }, + { + "epoch": 2.2627375788827044, + "grad_norm": 0.24396789073944092, + "learning_rate": 1.8042853675900235e-05, + "loss": 1.27, + "step": 7597 + }, + { + "epoch": 2.263035425082373, + "grad_norm": 0.23645758628845215, + "learning_rate": 1.8042280451428222e-05, + "loss": 1.2403, + "step": 7598 + }, + { + "epoch": 2.2633332712820415, + "grad_norm": 0.23644089698791504, + "learning_rate": 1.8041707152131772e-05, + "loss": 1.2389, + "step": 7599 + }, + { + "epoch": 2.2636311174817103, + "grad_norm": 0.2311485856771469, + "learning_rate": 1.804113377801622e-05, + "loss": 1.2662, + "step": 7600 + }, + { + "epoch": 2.263928963681379, + "grad_norm": 0.23153501749038696, + "learning_rate": 1.8040560329086908e-05, + "loss": 1.2583, + "step": 7601 + }, + { + "epoch": 2.264226809881048, + "grad_norm": 0.23604245483875275, + "learning_rate": 1.8039986805349167e-05, + "loss": 1.2504, + "step": 7602 + }, + { + "epoch": 2.264524656080716, + "grad_norm": 0.22809988260269165, + "learning_rate": 1.8039413206808326e-05, + "loss": 1.2422, + "step": 7603 + }, + { + "epoch": 2.264822502280385, + "grad_norm": 0.2308470755815506, + "learning_rate": 1.803883953346973e-05, + "loss": 1.2558, + "step": 7604 + }, + { + "epoch": 2.2651203484800537, + "grad_norm": 0.23038657009601593, + "learning_rate": 1.803826578533871e-05, + "loss": 1.266, + "step": 7605 + }, + { + "epoch": 2.2654181946797225, + "grad_norm": 0.22560255229473114, + "learning_rate": 1.803769196242061e-05, + "loss": 1.2569, + "step": 7606 + }, + { + "epoch": 2.265716040879391, + "grad_norm": 0.23066085577011108, + "learning_rate": 1.8037118064720767e-05, + "loss": 1.2543, + "step": 7607 + }, + { + "epoch": 2.2660138870790596, + "grad_norm": 0.2334594577550888, + "learning_rate": 1.803654409224452e-05, + "loss": 1.2505, + "step": 7608 + }, + { + "epoch": 2.2663117332787284, + "grad_norm": 0.24509860575199127, + "learning_rate": 1.8035970044997212e-05, + "loss": 1.2572, + "step": 7609 + }, + { + "epoch": 2.2666095794783967, + "grad_norm": 0.23509445786476135, + "learning_rate": 1.803539592298418e-05, + "loss": 1.2586, + "step": 7610 + }, + { + "epoch": 2.2669074256780655, + "grad_norm": 0.2275058478116989, + "learning_rate": 1.803482172621076e-05, + "loss": 1.2646, + "step": 7611 + }, + { + "epoch": 2.2672052718777342, + "grad_norm": 0.2272939532995224, + "learning_rate": 1.803424745468231e-05, + "loss": 1.2736, + "step": 7612 + }, + { + "epoch": 2.2675031180774026, + "grad_norm": 0.2450646609067917, + "learning_rate": 1.8033673108404157e-05, + "loss": 1.2436, + "step": 7613 + }, + { + "epoch": 2.2678009642770713, + "grad_norm": 0.2289837896823883, + "learning_rate": 1.8033098687381656e-05, + "loss": 1.2453, + "step": 7614 + }, + { + "epoch": 2.26809881047674, + "grad_norm": 0.23882275819778442, + "learning_rate": 1.8032524191620143e-05, + "loss": 1.2587, + "step": 7615 + }, + { + "epoch": 2.268396656676409, + "grad_norm": 0.23556388914585114, + "learning_rate": 1.8031949621124967e-05, + "loss": 1.2615, + "step": 7616 + }, + { + "epoch": 2.268694502876077, + "grad_norm": 0.2411070019006729, + "learning_rate": 1.803137497590148e-05, + "loss": 1.2636, + "step": 7617 + }, + { + "epoch": 2.268992349075746, + "grad_norm": 0.23291894793510437, + "learning_rate": 1.8030800255955014e-05, + "loss": 1.2397, + "step": 7618 + }, + { + "epoch": 2.2692901952754148, + "grad_norm": 0.22428922355175018, + "learning_rate": 1.803022546129093e-05, + "loss": 1.2575, + "step": 7619 + }, + { + "epoch": 2.2695880414750835, + "grad_norm": 0.2388489842414856, + "learning_rate": 1.8029650591914566e-05, + "loss": 1.2649, + "step": 7620 + }, + { + "epoch": 2.269885887674752, + "grad_norm": 0.22928275167942047, + "learning_rate": 1.8029075647831274e-05, + "loss": 1.2601, + "step": 7621 + }, + { + "epoch": 2.2701837338744206, + "grad_norm": 0.2210404872894287, + "learning_rate": 1.8028500629046408e-05, + "loss": 1.2673, + "step": 7622 + }, + { + "epoch": 2.2704815800740894, + "grad_norm": 0.22059838473796844, + "learning_rate": 1.802792553556531e-05, + "loss": 1.2481, + "step": 7623 + }, + { + "epoch": 2.2707794262737577, + "grad_norm": 0.22074320912361145, + "learning_rate": 1.8027350367393337e-05, + "loss": 1.2415, + "step": 7624 + }, + { + "epoch": 2.2710772724734265, + "grad_norm": 0.24399973452091217, + "learning_rate": 1.8026775124535835e-05, + "loss": 1.2522, + "step": 7625 + }, + { + "epoch": 2.2713751186730953, + "grad_norm": 0.23464760184288025, + "learning_rate": 1.8026199806998163e-05, + "loss": 1.2716, + "step": 7626 + }, + { + "epoch": 2.2716729648727636, + "grad_norm": 0.2399558275938034, + "learning_rate": 1.8025624414785663e-05, + "loss": 1.2717, + "step": 7627 + }, + { + "epoch": 2.2719708110724324, + "grad_norm": 0.23549529910087585, + "learning_rate": 1.8025048947903698e-05, + "loss": 1.2663, + "step": 7628 + }, + { + "epoch": 2.272268657272101, + "grad_norm": 0.231827050447464, + "learning_rate": 1.802447340635762e-05, + "loss": 1.2398, + "step": 7629 + }, + { + "epoch": 2.27256650347177, + "grad_norm": 0.23063279688358307, + "learning_rate": 1.8023897790152778e-05, + "loss": 1.245, + "step": 7630 + }, + { + "epoch": 2.2728643496714382, + "grad_norm": 0.23611456155776978, + "learning_rate": 1.8023322099294533e-05, + "loss": 1.2436, + "step": 7631 + }, + { + "epoch": 2.273162195871107, + "grad_norm": 0.22566045820713043, + "learning_rate": 1.8022746333788243e-05, + "loss": 1.2592, + "step": 7632 + }, + { + "epoch": 2.273460042070776, + "grad_norm": 0.23024094104766846, + "learning_rate": 1.8022170493639258e-05, + "loss": 1.2664, + "step": 7633 + }, + { + "epoch": 2.2737578882704446, + "grad_norm": 0.25157734751701355, + "learning_rate": 1.8021594578852942e-05, + "loss": 1.2835, + "step": 7634 + }, + { + "epoch": 2.274055734470113, + "grad_norm": 0.2292707860469818, + "learning_rate": 1.802101858943465e-05, + "loss": 1.2735, + "step": 7635 + }, + { + "epoch": 2.2743535806697817, + "grad_norm": 0.23525993525981903, + "learning_rate": 1.8020442525389742e-05, + "loss": 1.2605, + "step": 7636 + }, + { + "epoch": 2.2746514268694504, + "grad_norm": 0.24633367359638214, + "learning_rate": 1.8019866386723582e-05, + "loss": 1.253, + "step": 7637 + }, + { + "epoch": 2.2749492730691188, + "grad_norm": 0.267260879278183, + "learning_rate": 1.801929017344152e-05, + "loss": 1.2732, + "step": 7638 + }, + { + "epoch": 2.2752471192687875, + "grad_norm": 0.236179381608963, + "learning_rate": 1.801871388554892e-05, + "loss": 1.2599, + "step": 7639 + }, + { + "epoch": 2.2755449654684563, + "grad_norm": 0.24293148517608643, + "learning_rate": 1.801813752305115e-05, + "loss": 1.2631, + "step": 7640 + }, + { + "epoch": 2.2758428116681246, + "grad_norm": 0.2451498657464981, + "learning_rate": 1.801756108595357e-05, + "loss": 1.246, + "step": 7641 + }, + { + "epoch": 2.2761406578677934, + "grad_norm": 0.2330862134695053, + "learning_rate": 1.801698457426154e-05, + "loss": 1.2497, + "step": 7642 + }, + { + "epoch": 2.276438504067462, + "grad_norm": 0.2343113273382187, + "learning_rate": 1.8016407987980427e-05, + "loss": 1.2576, + "step": 7643 + }, + { + "epoch": 2.276736350267131, + "grad_norm": 0.24515904486179352, + "learning_rate": 1.8015831327115592e-05, + "loss": 1.2512, + "step": 7644 + }, + { + "epoch": 2.2770341964667993, + "grad_norm": 0.24204449355602264, + "learning_rate": 1.8015254591672403e-05, + "loss": 1.2312, + "step": 7645 + }, + { + "epoch": 2.277332042666468, + "grad_norm": 0.2208164781332016, + "learning_rate": 1.8014677781656226e-05, + "loss": 1.2421, + "step": 7646 + }, + { + "epoch": 2.277629888866137, + "grad_norm": 0.23881329596042633, + "learning_rate": 1.801410089707243e-05, + "loss": 1.2381, + "step": 7647 + }, + { + "epoch": 2.2779277350658056, + "grad_norm": 0.23398655652999878, + "learning_rate": 1.8013523937926375e-05, + "loss": 1.2551, + "step": 7648 + }, + { + "epoch": 2.278225581265474, + "grad_norm": 0.22927062213420868, + "learning_rate": 1.801294690422343e-05, + "loss": 1.2526, + "step": 7649 + }, + { + "epoch": 2.2785234274651427, + "grad_norm": 0.24090448021888733, + "learning_rate": 1.8012369795968972e-05, + "loss": 1.2652, + "step": 7650 + }, + { + "epoch": 2.2788212736648115, + "grad_norm": 0.22747567296028137, + "learning_rate": 1.801179261316836e-05, + "loss": 1.2431, + "step": 7651 + }, + { + "epoch": 2.27911911986448, + "grad_norm": 0.24036529660224915, + "learning_rate": 1.8011215355826976e-05, + "loss": 1.2688, + "step": 7652 + }, + { + "epoch": 2.2794169660641486, + "grad_norm": 0.2285313606262207, + "learning_rate": 1.801063802395018e-05, + "loss": 1.2636, + "step": 7653 + }, + { + "epoch": 2.2797148122638173, + "grad_norm": 0.28348928689956665, + "learning_rate": 1.8010060617543346e-05, + "loss": 1.2525, + "step": 7654 + }, + { + "epoch": 2.280012658463486, + "grad_norm": 0.24212366342544556, + "learning_rate": 1.8009483136611847e-05, + "loss": 1.2716, + "step": 7655 + }, + { + "epoch": 2.2803105046631544, + "grad_norm": 0.2637227475643158, + "learning_rate": 1.800890558116106e-05, + "loss": 1.2647, + "step": 7656 + }, + { + "epoch": 2.280608350862823, + "grad_norm": 0.23166510462760925, + "learning_rate": 1.8008327951196352e-05, + "loss": 1.2686, + "step": 7657 + }, + { + "epoch": 2.280906197062492, + "grad_norm": 0.28819966316223145, + "learning_rate": 1.80077502467231e-05, + "loss": 1.2638, + "step": 7658 + }, + { + "epoch": 2.2812040432621603, + "grad_norm": 0.26619935035705566, + "learning_rate": 1.8007172467746677e-05, + "loss": 1.2556, + "step": 7659 + }, + { + "epoch": 2.281501889461829, + "grad_norm": 0.2544212341308594, + "learning_rate": 1.8006594614272462e-05, + "loss": 1.2497, + "step": 7660 + }, + { + "epoch": 2.281799735661498, + "grad_norm": 0.226546049118042, + "learning_rate": 1.800601668630583e-05, + "loss": 1.2616, + "step": 7661 + }, + { + "epoch": 2.2820975818611666, + "grad_norm": 0.27174046635627747, + "learning_rate": 1.8005438683852158e-05, + "loss": 1.244, + "step": 7662 + }, + { + "epoch": 2.282395428060835, + "grad_norm": 0.23841282725334167, + "learning_rate": 1.800486060691682e-05, + "loss": 1.2624, + "step": 7663 + }, + { + "epoch": 2.2826932742605037, + "grad_norm": 0.2460107058286667, + "learning_rate": 1.8004282455505202e-05, + "loss": 1.2618, + "step": 7664 + }, + { + "epoch": 2.2829911204601725, + "grad_norm": 0.24947495758533478, + "learning_rate": 1.800370422962268e-05, + "loss": 1.273, + "step": 7665 + }, + { + "epoch": 2.283288966659841, + "grad_norm": 0.22599336504936218, + "learning_rate": 1.8003125929274628e-05, + "loss": 1.2569, + "step": 7666 + }, + { + "epoch": 2.2835868128595096, + "grad_norm": 0.26056331396102905, + "learning_rate": 1.8002547554466433e-05, + "loss": 1.2623, + "step": 7667 + }, + { + "epoch": 2.2838846590591784, + "grad_norm": 0.2334567755460739, + "learning_rate": 1.800196910520347e-05, + "loss": 1.2523, + "step": 7668 + }, + { + "epoch": 2.284182505258847, + "grad_norm": 0.24040871858596802, + "learning_rate": 1.800139058149113e-05, + "loss": 1.2563, + "step": 7669 + }, + { + "epoch": 2.2844803514585155, + "grad_norm": 0.24634890258312225, + "learning_rate": 1.8000811983334788e-05, + "loss": 1.2549, + "step": 7670 + }, + { + "epoch": 2.2847781976581842, + "grad_norm": 0.23977108299732208, + "learning_rate": 1.8000233310739828e-05, + "loss": 1.2502, + "step": 7671 + }, + { + "epoch": 2.285076043857853, + "grad_norm": 0.2574748694896698, + "learning_rate": 1.799965456371164e-05, + "loss": 1.2725, + "step": 7672 + }, + { + "epoch": 2.285373890057522, + "grad_norm": 0.2517244219779968, + "learning_rate": 1.7999075742255602e-05, + "loss": 1.2605, + "step": 7673 + }, + { + "epoch": 2.28567173625719, + "grad_norm": 0.23184464871883392, + "learning_rate": 1.79984968463771e-05, + "loss": 1.2518, + "step": 7674 + }, + { + "epoch": 2.285969582456859, + "grad_norm": 0.24130327999591827, + "learning_rate": 1.799791787608152e-05, + "loss": 1.2655, + "step": 7675 + }, + { + "epoch": 2.2862674286565277, + "grad_norm": 0.23445512354373932, + "learning_rate": 1.7997338831374254e-05, + "loss": 1.2527, + "step": 7676 + }, + { + "epoch": 2.286565274856196, + "grad_norm": 0.2374877631664276, + "learning_rate": 1.7996759712260683e-05, + "loss": 1.26, + "step": 7677 + }, + { + "epoch": 2.2868631210558648, + "grad_norm": 0.23821516335010529, + "learning_rate": 1.7996180518746197e-05, + "loss": 1.2467, + "step": 7678 + }, + { + "epoch": 2.2871609672555335, + "grad_norm": 0.2226201891899109, + "learning_rate": 1.7995601250836184e-05, + "loss": 1.2445, + "step": 7679 + }, + { + "epoch": 2.287458813455202, + "grad_norm": 0.2388964295387268, + "learning_rate": 1.7995021908536037e-05, + "loss": 1.2637, + "step": 7680 + }, + { + "epoch": 2.2877566596548706, + "grad_norm": 0.22276601195335388, + "learning_rate": 1.7994442491851145e-05, + "loss": 1.2652, + "step": 7681 + }, + { + "epoch": 2.2880545058545394, + "grad_norm": 0.31236881017684937, + "learning_rate": 1.7993863000786893e-05, + "loss": 1.2496, + "step": 7682 + }, + { + "epoch": 2.288352352054208, + "grad_norm": 0.2572900950908661, + "learning_rate": 1.799328343534868e-05, + "loss": 1.2493, + "step": 7683 + }, + { + "epoch": 2.2886501982538765, + "grad_norm": 0.24791483581066132, + "learning_rate": 1.7992703795541895e-05, + "loss": 1.2462, + "step": 7684 + }, + { + "epoch": 2.2889480444535453, + "grad_norm": 0.23369233310222626, + "learning_rate": 1.799212408137193e-05, + "loss": 1.2589, + "step": 7685 + }, + { + "epoch": 2.289245890653214, + "grad_norm": 0.253845751285553, + "learning_rate": 1.799154429284418e-05, + "loss": 1.2519, + "step": 7686 + }, + { + "epoch": 2.289543736852883, + "grad_norm": 0.2269347906112671, + "learning_rate": 1.799096442996404e-05, + "loss": 1.2529, + "step": 7687 + }, + { + "epoch": 2.289841583052551, + "grad_norm": 0.24698413908481598, + "learning_rate": 1.7990384492736903e-05, + "loss": 1.2442, + "step": 7688 + }, + { + "epoch": 2.29013942925222, + "grad_norm": 0.23516568541526794, + "learning_rate": 1.7989804481168168e-05, + "loss": 1.2529, + "step": 7689 + }, + { + "epoch": 2.2904372754518887, + "grad_norm": 0.22608694434165955, + "learning_rate": 1.798922439526323e-05, + "loss": 1.2546, + "step": 7690 + }, + { + "epoch": 2.290735121651557, + "grad_norm": 0.23483608663082123, + "learning_rate": 1.7988644235027478e-05, + "loss": 1.2534, + "step": 7691 + }, + { + "epoch": 2.291032967851226, + "grad_norm": 0.24747517704963684, + "learning_rate": 1.7988064000466322e-05, + "loss": 1.2554, + "step": 7692 + }, + { + "epoch": 2.2913308140508946, + "grad_norm": 0.22679506242275238, + "learning_rate": 1.7987483691585156e-05, + "loss": 1.2411, + "step": 7693 + }, + { + "epoch": 2.291628660250563, + "grad_norm": 0.26324447989463806, + "learning_rate": 1.7986903308389375e-05, + "loss": 1.2531, + "step": 7694 + }, + { + "epoch": 2.2919265064502317, + "grad_norm": 0.23506203293800354, + "learning_rate": 1.7986322850884387e-05, + "loss": 1.2573, + "step": 7695 + }, + { + "epoch": 2.2922243526499004, + "grad_norm": 0.2332557588815689, + "learning_rate": 1.7985742319075584e-05, + "loss": 1.2588, + "step": 7696 + }, + { + "epoch": 2.292522198849569, + "grad_norm": 0.23972269892692566, + "learning_rate": 1.7985161712968372e-05, + "loss": 1.2614, + "step": 7697 + }, + { + "epoch": 2.2928200450492375, + "grad_norm": 0.23484289646148682, + "learning_rate": 1.798458103256815e-05, + "loss": 1.2537, + "step": 7698 + }, + { + "epoch": 2.2931178912489063, + "grad_norm": 0.22184839844703674, + "learning_rate": 1.7984000277880325e-05, + "loss": 1.2558, + "step": 7699 + }, + { + "epoch": 2.293415737448575, + "grad_norm": 0.2516103982925415, + "learning_rate": 1.7983419448910298e-05, + "loss": 1.2572, + "step": 7700 + }, + { + "epoch": 2.293713583648244, + "grad_norm": 0.2929055392742157, + "learning_rate": 1.798283854566347e-05, + "loss": 1.2588, + "step": 7701 + }, + { + "epoch": 2.294011429847912, + "grad_norm": 0.2357298880815506, + "learning_rate": 1.798225756814525e-05, + "loss": 1.2466, + "step": 7702 + }, + { + "epoch": 2.294309276047581, + "grad_norm": 0.23832914233207703, + "learning_rate": 1.798167651636104e-05, + "loss": 1.2322, + "step": 7703 + }, + { + "epoch": 2.2946071222472497, + "grad_norm": 0.24009771645069122, + "learning_rate": 1.798109539031625e-05, + "loss": 1.2525, + "step": 7704 + }, + { + "epoch": 2.294904968446918, + "grad_norm": 0.2367497831583023, + "learning_rate": 1.7980514190016283e-05, + "loss": 1.2573, + "step": 7705 + }, + { + "epoch": 2.295202814646587, + "grad_norm": 0.23563018441200256, + "learning_rate": 1.797993291546655e-05, + "loss": 1.2634, + "step": 7706 + }, + { + "epoch": 2.2955006608462556, + "grad_norm": 0.23822346329689026, + "learning_rate": 1.7979351566672454e-05, + "loss": 1.2506, + "step": 7707 + }, + { + "epoch": 2.295798507045924, + "grad_norm": 0.24498026072978973, + "learning_rate": 1.797877014363941e-05, + "loss": 1.2654, + "step": 7708 + }, + { + "epoch": 2.2960963532455927, + "grad_norm": 0.3724459111690521, + "learning_rate": 1.7978188646372818e-05, + "loss": 1.2516, + "step": 7709 + }, + { + "epoch": 2.2963941994452615, + "grad_norm": 0.2689630389213562, + "learning_rate": 1.79776070748781e-05, + "loss": 1.2433, + "step": 7710 + }, + { + "epoch": 2.2966920456449302, + "grad_norm": 0.2751366198062897, + "learning_rate": 1.797702542916066e-05, + "loss": 1.2307, + "step": 7711 + }, + { + "epoch": 2.2969898918445986, + "grad_norm": 0.298320472240448, + "learning_rate": 1.7976443709225912e-05, + "loss": 1.263, + "step": 7712 + }, + { + "epoch": 2.2972877380442673, + "grad_norm": 0.29242566227912903, + "learning_rate": 1.7975861915079263e-05, + "loss": 1.247, + "step": 7713 + }, + { + "epoch": 2.297585584243936, + "grad_norm": 0.23475217819213867, + "learning_rate": 1.7975280046726135e-05, + "loss": 1.2609, + "step": 7714 + }, + { + "epoch": 2.297883430443605, + "grad_norm": 0.26329460740089417, + "learning_rate": 1.7974698104171934e-05, + "loss": 1.2601, + "step": 7715 + }, + { + "epoch": 2.298181276643273, + "grad_norm": 0.22359760105609894, + "learning_rate": 1.797411608742208e-05, + "loss": 1.2702, + "step": 7716 + }, + { + "epoch": 2.298479122842942, + "grad_norm": 0.2478565275669098, + "learning_rate": 1.797353399648198e-05, + "loss": 1.2587, + "step": 7717 + }, + { + "epoch": 2.2987769690426108, + "grad_norm": 0.2553616464138031, + "learning_rate": 1.7972951831357056e-05, + "loss": 1.2609, + "step": 7718 + }, + { + "epoch": 2.299074815242279, + "grad_norm": 0.22546374797821045, + "learning_rate": 1.7972369592052726e-05, + "loss": 1.2599, + "step": 7719 + }, + { + "epoch": 2.299372661441948, + "grad_norm": 0.24717041850090027, + "learning_rate": 1.79717872785744e-05, + "loss": 1.2556, + "step": 7720 + }, + { + "epoch": 2.2996705076416166, + "grad_norm": 0.23478208482265472, + "learning_rate": 1.79712048909275e-05, + "loss": 1.2639, + "step": 7721 + }, + { + "epoch": 2.2999683538412854, + "grad_norm": 0.33374375104904175, + "learning_rate": 1.797062242911745e-05, + "loss": 1.2531, + "step": 7722 + }, + { + "epoch": 2.3002662000409537, + "grad_norm": 0.24589771032333374, + "learning_rate": 1.797003989314966e-05, + "loss": 1.2477, + "step": 7723 + }, + { + "epoch": 2.3005640462406225, + "grad_norm": 0.24167700111865997, + "learning_rate": 1.7969457283029554e-05, + "loss": 1.2493, + "step": 7724 + }, + { + "epoch": 2.3008618924402913, + "grad_norm": 0.229723259806633, + "learning_rate": 1.796887459876255e-05, + "loss": 1.2463, + "step": 7725 + }, + { + "epoch": 2.30115973863996, + "grad_norm": 0.23125611245632172, + "learning_rate": 1.7968291840354073e-05, + "loss": 1.2453, + "step": 7726 + }, + { + "epoch": 2.3014575848396284, + "grad_norm": 0.23347200453281403, + "learning_rate": 1.7967709007809544e-05, + "loss": 1.2579, + "step": 7727 + }, + { + "epoch": 2.301755431039297, + "grad_norm": 0.2410845160484314, + "learning_rate": 1.7967126101134386e-05, + "loss": 1.2632, + "step": 7728 + }, + { + "epoch": 2.302053277238966, + "grad_norm": 0.22133436799049377, + "learning_rate": 1.7966543120334016e-05, + "loss": 1.2621, + "step": 7729 + }, + { + "epoch": 2.3023511234386342, + "grad_norm": 0.24848565459251404, + "learning_rate": 1.796596006541387e-05, + "loss": 1.2479, + "step": 7730 + }, + { + "epoch": 2.302648969638303, + "grad_norm": 0.2426963597536087, + "learning_rate": 1.7965376936379358e-05, + "loss": 1.2469, + "step": 7731 + }, + { + "epoch": 2.302946815837972, + "grad_norm": 0.23645684123039246, + "learning_rate": 1.7964793733235916e-05, + "loss": 1.2509, + "step": 7732 + }, + { + "epoch": 2.30324466203764, + "grad_norm": 0.233712837100029, + "learning_rate": 1.796421045598897e-05, + "loss": 1.2359, + "step": 7733 + }, + { + "epoch": 2.303542508237309, + "grad_norm": 0.2517712116241455, + "learning_rate": 1.796362710464394e-05, + "loss": 1.2602, + "step": 7734 + }, + { + "epoch": 2.3038403544369777, + "grad_norm": 0.24844689667224884, + "learning_rate": 1.796304367920626e-05, + "loss": 1.2486, + "step": 7735 + }, + { + "epoch": 2.3041382006366464, + "grad_norm": 0.2320019155740738, + "learning_rate": 1.7962460179681357e-05, + "loss": 1.2745, + "step": 7736 + }, + { + "epoch": 2.3044360468363148, + "grad_norm": 0.2363097369670868, + "learning_rate": 1.796187660607465e-05, + "loss": 1.2531, + "step": 7737 + }, + { + "epoch": 2.3047338930359835, + "grad_norm": 0.24142713844776154, + "learning_rate": 1.7961292958391585e-05, + "loss": 1.2652, + "step": 7738 + }, + { + "epoch": 2.3050317392356523, + "grad_norm": 0.23480938374996185, + "learning_rate": 1.796070923663758e-05, + "loss": 1.2615, + "step": 7739 + }, + { + "epoch": 2.305329585435321, + "grad_norm": 0.23894020915031433, + "learning_rate": 1.7960125440818073e-05, + "loss": 1.2363, + "step": 7740 + }, + { + "epoch": 2.3056274316349894, + "grad_norm": 0.27292248606681824, + "learning_rate": 1.7959541570938487e-05, + "loss": 1.2273, + "step": 7741 + }, + { + "epoch": 2.305925277834658, + "grad_norm": 0.25207120180130005, + "learning_rate": 1.7958957627004265e-05, + "loss": 1.2726, + "step": 7742 + }, + { + "epoch": 2.306223124034327, + "grad_norm": 0.23265618085861206, + "learning_rate": 1.7958373609020833e-05, + "loss": 1.2501, + "step": 7743 + }, + { + "epoch": 2.3065209702339953, + "grad_norm": 0.24379467964172363, + "learning_rate": 1.7957789516993623e-05, + "loss": 1.2544, + "step": 7744 + }, + { + "epoch": 2.306818816433664, + "grad_norm": 0.33975672721862793, + "learning_rate": 1.7957205350928076e-05, + "loss": 1.2577, + "step": 7745 + }, + { + "epoch": 2.307116662633333, + "grad_norm": 0.353736013174057, + "learning_rate": 1.7956621110829624e-05, + "loss": 1.2658, + "step": 7746 + }, + { + "epoch": 2.307414508833001, + "grad_norm": 0.27018821239471436, + "learning_rate": 1.79560367967037e-05, + "loss": 1.2548, + "step": 7747 + }, + { + "epoch": 2.30771235503267, + "grad_norm": 0.621097207069397, + "learning_rate": 1.7955452408555744e-05, + "loss": 1.2628, + "step": 7748 + }, + { + "epoch": 2.3080102012323387, + "grad_norm": 0.2451144903898239, + "learning_rate": 1.7954867946391192e-05, + "loss": 1.27, + "step": 7749 + }, + { + "epoch": 2.3083080474320075, + "grad_norm": 0.2555001974105835, + "learning_rate": 1.795428341021548e-05, + "loss": 1.2554, + "step": 7750 + }, + { + "epoch": 2.308605893631676, + "grad_norm": 0.25533974170684814, + "learning_rate": 1.795369880003405e-05, + "loss": 1.2467, + "step": 7751 + }, + { + "epoch": 2.3089037398313446, + "grad_norm": 0.22745493054389954, + "learning_rate": 1.7953114115852336e-05, + "loss": 1.2617, + "step": 7752 + }, + { + "epoch": 2.3092015860310133, + "grad_norm": 0.22543756663799286, + "learning_rate": 1.7952529357675783e-05, + "loss": 1.2543, + "step": 7753 + }, + { + "epoch": 2.309499432230682, + "grad_norm": 0.26410216093063354, + "learning_rate": 1.795194452550983e-05, + "loss": 1.2555, + "step": 7754 + }, + { + "epoch": 2.3097972784303504, + "grad_norm": 0.24005134403705597, + "learning_rate": 1.7951359619359917e-05, + "loss": 1.2544, + "step": 7755 + }, + { + "epoch": 2.310095124630019, + "grad_norm": 0.2326568216085434, + "learning_rate": 1.7950774639231486e-05, + "loss": 1.2521, + "step": 7756 + }, + { + "epoch": 2.310392970829688, + "grad_norm": 0.21785187721252441, + "learning_rate": 1.7950189585129977e-05, + "loss": 1.2254, + "step": 7757 + }, + { + "epoch": 2.3106908170293563, + "grad_norm": 0.2363487184047699, + "learning_rate": 1.7949604457060845e-05, + "loss": 1.2518, + "step": 7758 + }, + { + "epoch": 2.310988663229025, + "grad_norm": 0.23210032284259796, + "learning_rate": 1.7949019255029517e-05, + "loss": 1.272, + "step": 7759 + }, + { + "epoch": 2.311286509428694, + "grad_norm": 0.23723196983337402, + "learning_rate": 1.794843397904145e-05, + "loss": 1.2384, + "step": 7760 + }, + { + "epoch": 2.311584355628362, + "grad_norm": 0.22842244803905487, + "learning_rate": 1.7947848629102082e-05, + "loss": 1.2448, + "step": 7761 + }, + { + "epoch": 2.311882201828031, + "grad_norm": 0.22675757110118866, + "learning_rate": 1.7947263205216864e-05, + "loss": 1.2597, + "step": 7762 + }, + { + "epoch": 2.3121800480276997, + "grad_norm": 0.246642604470253, + "learning_rate": 1.7946677707391244e-05, + "loss": 1.2428, + "step": 7763 + }, + { + "epoch": 2.3124778942273685, + "grad_norm": 0.22891004383563995, + "learning_rate": 1.7946092135630665e-05, + "loss": 1.2654, + "step": 7764 + }, + { + "epoch": 2.312775740427037, + "grad_norm": 0.23032474517822266, + "learning_rate": 1.7945506489940573e-05, + "loss": 1.2489, + "step": 7765 + }, + { + "epoch": 2.3130735866267056, + "grad_norm": 0.2301180064678192, + "learning_rate": 1.7944920770326422e-05, + "loss": 1.2439, + "step": 7766 + }, + { + "epoch": 2.3133714328263744, + "grad_norm": 0.22702735662460327, + "learning_rate": 1.794433497679366e-05, + "loss": 1.2643, + "step": 7767 + }, + { + "epoch": 2.313669279026043, + "grad_norm": 0.22810158133506775, + "learning_rate": 1.7943749109347742e-05, + "loss": 1.2681, + "step": 7768 + }, + { + "epoch": 2.3139671252257115, + "grad_norm": 0.2263939380645752, + "learning_rate": 1.7943163167994108e-05, + "loss": 1.2475, + "step": 7769 + }, + { + "epoch": 2.3142649714253802, + "grad_norm": 0.23496605455875397, + "learning_rate": 1.7942577152738218e-05, + "loss": 1.2654, + "step": 7770 + }, + { + "epoch": 2.314562817625049, + "grad_norm": 0.23172979056835175, + "learning_rate": 1.794199106358552e-05, + "loss": 1.255, + "step": 7771 + }, + { + "epoch": 2.3148606638247173, + "grad_norm": 0.23240801692008972, + "learning_rate": 1.794140490054147e-05, + "loss": 1.2627, + "step": 7772 + }, + { + "epoch": 2.315158510024386, + "grad_norm": 0.23094674944877625, + "learning_rate": 1.794081866361152e-05, + "loss": 1.2584, + "step": 7773 + }, + { + "epoch": 2.315456356224055, + "grad_norm": 0.23189008235931396, + "learning_rate": 1.794023235280112e-05, + "loss": 1.2571, + "step": 7774 + }, + { + "epoch": 2.315754202423723, + "grad_norm": 0.22598449885845184, + "learning_rate": 1.7939645968115734e-05, + "loss": 1.258, + "step": 7775 + }, + { + "epoch": 2.316052048623392, + "grad_norm": 0.2240254282951355, + "learning_rate": 1.793905950956081e-05, + "loss": 1.2351, + "step": 7776 + }, + { + "epoch": 2.3163498948230608, + "grad_norm": 0.22057874500751495, + "learning_rate": 1.7938472977141814e-05, + "loss": 1.2552, + "step": 7777 + }, + { + "epoch": 2.3166477410227295, + "grad_norm": 0.21995866298675537, + "learning_rate": 1.793788637086419e-05, + "loss": 1.2783, + "step": 7778 + }, + { + "epoch": 2.316945587222398, + "grad_norm": 0.23166123032569885, + "learning_rate": 1.7937299690733402e-05, + "loss": 1.2578, + "step": 7779 + }, + { + "epoch": 2.3172434334220666, + "grad_norm": 0.23654724657535553, + "learning_rate": 1.793671293675491e-05, + "loss": 1.2624, + "step": 7780 + }, + { + "epoch": 2.3175412796217354, + "grad_norm": 0.22917701303958893, + "learning_rate": 1.7936126108934174e-05, + "loss": 1.2544, + "step": 7781 + }, + { + "epoch": 2.317839125821404, + "grad_norm": 0.23114502429962158, + "learning_rate": 1.7935539207276648e-05, + "loss": 1.271, + "step": 7782 + }, + { + "epoch": 2.3181369720210725, + "grad_norm": 0.23824429512023926, + "learning_rate": 1.7934952231787797e-05, + "loss": 1.2563, + "step": 7783 + }, + { + "epoch": 2.3184348182207413, + "grad_norm": 0.23625443875789642, + "learning_rate": 1.7934365182473082e-05, + "loss": 1.2535, + "step": 7784 + }, + { + "epoch": 2.31873266442041, + "grad_norm": 0.2254304736852646, + "learning_rate": 1.793377805933796e-05, + "loss": 1.2567, + "step": 7785 + }, + { + "epoch": 2.3190305106200784, + "grad_norm": 0.2224108725786209, + "learning_rate": 1.79331908623879e-05, + "loss": 1.2665, + "step": 7786 + }, + { + "epoch": 2.319328356819747, + "grad_norm": 0.22796858847141266, + "learning_rate": 1.7932603591628363e-05, + "loss": 1.2637, + "step": 7787 + }, + { + "epoch": 2.319626203019416, + "grad_norm": 0.22788214683532715, + "learning_rate": 1.793201624706481e-05, + "loss": 1.2578, + "step": 7788 + }, + { + "epoch": 2.3199240492190847, + "grad_norm": 0.22808735072612762, + "learning_rate": 1.793142882870271e-05, + "loss": 1.2542, + "step": 7789 + }, + { + "epoch": 2.320221895418753, + "grad_norm": 0.21906575560569763, + "learning_rate": 1.7930841336547525e-05, + "loss": 1.2411, + "step": 7790 + }, + { + "epoch": 2.320519741618422, + "grad_norm": 0.22968780994415283, + "learning_rate": 1.7930253770604726e-05, + "loss": 1.2583, + "step": 7791 + }, + { + "epoch": 2.3208175878180906, + "grad_norm": 0.2303999662399292, + "learning_rate": 1.792966613087977e-05, + "loss": 1.2609, + "step": 7792 + }, + { + "epoch": 2.3211154340177593, + "grad_norm": 0.2238573133945465, + "learning_rate": 1.7929078417378135e-05, + "loss": 1.2414, + "step": 7793 + }, + { + "epoch": 2.3214132802174277, + "grad_norm": 0.22726310789585114, + "learning_rate": 1.792849063010528e-05, + "loss": 1.256, + "step": 7794 + }, + { + "epoch": 2.3217111264170964, + "grad_norm": 0.22456595301628113, + "learning_rate": 1.7927902769066682e-05, + "loss": 1.2592, + "step": 7795 + }, + { + "epoch": 2.322008972616765, + "grad_norm": 0.2278863936662674, + "learning_rate": 1.7927314834267804e-05, + "loss": 1.2402, + "step": 7796 + }, + { + "epoch": 2.3223068188164335, + "grad_norm": 0.2422952800989151, + "learning_rate": 1.7926726825714117e-05, + "loss": 1.2547, + "step": 7797 + }, + { + "epoch": 2.3226046650161023, + "grad_norm": 0.22926056385040283, + "learning_rate": 1.7926138743411095e-05, + "loss": 1.2543, + "step": 7798 + }, + { + "epoch": 2.322902511215771, + "grad_norm": 0.23135961592197418, + "learning_rate": 1.792555058736421e-05, + "loss": 1.2626, + "step": 7799 + }, + { + "epoch": 2.3232003574154394, + "grad_norm": 0.22803236544132233, + "learning_rate": 1.7924962357578928e-05, + "loss": 1.2424, + "step": 7800 + }, + { + "epoch": 2.323498203615108, + "grad_norm": 0.22060105204582214, + "learning_rate": 1.7924374054060725e-05, + "loss": 1.2479, + "step": 7801 + }, + { + "epoch": 2.323796049814777, + "grad_norm": 0.2315826714038849, + "learning_rate": 1.7923785676815078e-05, + "loss": 1.2546, + "step": 7802 + }, + { + "epoch": 2.3240938960144457, + "grad_norm": 0.2271241992712021, + "learning_rate": 1.7923197225847457e-05, + "loss": 1.2718, + "step": 7803 + }, + { + "epoch": 2.324391742214114, + "grad_norm": 0.23083896934986115, + "learning_rate": 1.792260870116334e-05, + "loss": 1.2639, + "step": 7804 + }, + { + "epoch": 2.324689588413783, + "grad_norm": 0.2252204567193985, + "learning_rate": 1.7922020102768197e-05, + "loss": 1.2534, + "step": 7805 + }, + { + "epoch": 2.3249874346134516, + "grad_norm": 0.21705453097820282, + "learning_rate": 1.792143143066751e-05, + "loss": 1.2344, + "step": 7806 + }, + { + "epoch": 2.3252852808131204, + "grad_norm": 0.23861151933670044, + "learning_rate": 1.7920842684866756e-05, + "loss": 1.2557, + "step": 7807 + }, + { + "epoch": 2.3255831270127887, + "grad_norm": 0.22201643884181976, + "learning_rate": 1.7920253865371407e-05, + "loss": 1.236, + "step": 7808 + }, + { + "epoch": 2.3258809732124575, + "grad_norm": 0.22820638120174408, + "learning_rate": 1.7919664972186946e-05, + "loss": 1.2531, + "step": 7809 + }, + { + "epoch": 2.3261788194121262, + "grad_norm": 0.23169077932834625, + "learning_rate": 1.7919076005318852e-05, + "loss": 1.2574, + "step": 7810 + }, + { + "epoch": 2.3264766656117946, + "grad_norm": 0.2316010743379593, + "learning_rate": 1.79184869647726e-05, + "loss": 1.2705, + "step": 7811 + }, + { + "epoch": 2.3267745118114633, + "grad_norm": 0.22276555001735687, + "learning_rate": 1.791789785055368e-05, + "loss": 1.2446, + "step": 7812 + }, + { + "epoch": 2.327072358011132, + "grad_norm": 0.22381189465522766, + "learning_rate": 1.791730866266756e-05, + "loss": 1.2644, + "step": 7813 + }, + { + "epoch": 2.3273702042108004, + "grad_norm": 0.23511162400245667, + "learning_rate": 1.7916719401119733e-05, + "loss": 1.2624, + "step": 7814 + }, + { + "epoch": 2.327668050410469, + "grad_norm": 0.21948397159576416, + "learning_rate": 1.7916130065915675e-05, + "loss": 1.2624, + "step": 7815 + }, + { + "epoch": 2.327965896610138, + "grad_norm": 0.22322896122932434, + "learning_rate": 1.7915540657060873e-05, + "loss": 1.2579, + "step": 7816 + }, + { + "epoch": 2.3282637428098067, + "grad_norm": 0.22729644179344177, + "learning_rate": 1.791495117456081e-05, + "loss": 1.2524, + "step": 7817 + }, + { + "epoch": 2.328561589009475, + "grad_norm": 0.22460895776748657, + "learning_rate": 1.7914361618420966e-05, + "loss": 1.267, + "step": 7818 + }, + { + "epoch": 2.328859435209144, + "grad_norm": 0.2263486236333847, + "learning_rate": 1.7913771988646832e-05, + "loss": 1.2576, + "step": 7819 + }, + { + "epoch": 2.3291572814088126, + "grad_norm": 0.22927837073802948, + "learning_rate": 1.791318228524389e-05, + "loss": 1.2536, + "step": 7820 + }, + { + "epoch": 2.3294551276084814, + "grad_norm": 0.23089389503002167, + "learning_rate": 1.7912592508217627e-05, + "loss": 1.2668, + "step": 7821 + }, + { + "epoch": 2.3297529738081497, + "grad_norm": 0.23298054933547974, + "learning_rate": 1.7912002657573533e-05, + "loss": 1.253, + "step": 7822 + }, + { + "epoch": 2.3300508200078185, + "grad_norm": 0.23412643373012543, + "learning_rate": 1.7911412733317096e-05, + "loss": 1.2693, + "step": 7823 + }, + { + "epoch": 2.3303486662074873, + "grad_norm": 0.21775412559509277, + "learning_rate": 1.7910822735453797e-05, + "loss": 1.2557, + "step": 7824 + }, + { + "epoch": 2.3306465124071556, + "grad_norm": 0.22747136652469635, + "learning_rate": 1.7910232663989135e-05, + "loss": 1.2553, + "step": 7825 + }, + { + "epoch": 2.3309443586068244, + "grad_norm": 0.22300876677036285, + "learning_rate": 1.7909642518928593e-05, + "loss": 1.2591, + "step": 7826 + }, + { + "epoch": 2.331242204806493, + "grad_norm": 0.22167746722698212, + "learning_rate": 1.790905230027767e-05, + "loss": 1.2485, + "step": 7827 + }, + { + "epoch": 2.3315400510061615, + "grad_norm": 0.22107626497745514, + "learning_rate": 1.790846200804185e-05, + "loss": 1.2575, + "step": 7828 + }, + { + "epoch": 2.3318378972058302, + "grad_norm": 0.2367164045572281, + "learning_rate": 1.790787164222662e-05, + "loss": 1.2628, + "step": 7829 + }, + { + "epoch": 2.332135743405499, + "grad_norm": 0.22507604956626892, + "learning_rate": 1.790728120283749e-05, + "loss": 1.2579, + "step": 7830 + }, + { + "epoch": 2.332433589605168, + "grad_norm": 0.22825001180171967, + "learning_rate": 1.7906690689879935e-05, + "loss": 1.2661, + "step": 7831 + }, + { + "epoch": 2.332731435804836, + "grad_norm": 0.2338763177394867, + "learning_rate": 1.790610010335946e-05, + "loss": 1.2355, + "step": 7832 + }, + { + "epoch": 2.333029282004505, + "grad_norm": 0.22398051619529724, + "learning_rate": 1.790550944328156e-05, + "loss": 1.2439, + "step": 7833 + }, + { + "epoch": 2.3333271282041737, + "grad_norm": 0.2237871140241623, + "learning_rate": 1.7904918709651723e-05, + "loss": 1.2651, + "step": 7834 + }, + { + "epoch": 2.3336249744038424, + "grad_norm": 0.22252443432807922, + "learning_rate": 1.790432790247545e-05, + "loss": 1.2581, + "step": 7835 + }, + { + "epoch": 2.3339228206035108, + "grad_norm": 0.21413569152355194, + "learning_rate": 1.7903737021758237e-05, + "loss": 1.2473, + "step": 7836 + }, + { + "epoch": 2.3342206668031795, + "grad_norm": 0.2181364744901657, + "learning_rate": 1.7903146067505582e-05, + "loss": 1.2618, + "step": 7837 + }, + { + "epoch": 2.3345185130028483, + "grad_norm": 0.22701337933540344, + "learning_rate": 1.790255503972298e-05, + "loss": 1.2466, + "step": 7838 + }, + { + "epoch": 2.3348163592025166, + "grad_norm": 0.2238103300333023, + "learning_rate": 1.7901963938415935e-05, + "loss": 1.2459, + "step": 7839 + }, + { + "epoch": 2.3351142054021854, + "grad_norm": 0.230665922164917, + "learning_rate": 1.7901372763589947e-05, + "loss": 1.2502, + "step": 7840 + }, + { + "epoch": 2.335412051601854, + "grad_norm": 0.2242044061422348, + "learning_rate": 1.790078151525051e-05, + "loss": 1.2501, + "step": 7841 + }, + { + "epoch": 2.3357098978015225, + "grad_norm": 0.23318134248256683, + "learning_rate": 1.790019019340313e-05, + "loss": 1.2467, + "step": 7842 + }, + { + "epoch": 2.3360077440011913, + "grad_norm": 0.2387678474187851, + "learning_rate": 1.7899598798053306e-05, + "loss": 1.2434, + "step": 7843 + }, + { + "epoch": 2.33630559020086, + "grad_norm": 0.22364795207977295, + "learning_rate": 1.789900732920654e-05, + "loss": 1.2656, + "step": 7844 + }, + { + "epoch": 2.336603436400529, + "grad_norm": 0.22308523952960968, + "learning_rate": 1.7898415786868338e-05, + "loss": 1.2808, + "step": 7845 + }, + { + "epoch": 2.336901282600197, + "grad_norm": 0.23038873076438904, + "learning_rate": 1.78978241710442e-05, + "loss": 1.2387, + "step": 7846 + }, + { + "epoch": 2.337199128799866, + "grad_norm": 0.22550682723522186, + "learning_rate": 1.7897232481739634e-05, + "loss": 1.2587, + "step": 7847 + }, + { + "epoch": 2.3374969749995347, + "grad_norm": 0.22604312002658844, + "learning_rate": 1.789664071896014e-05, + "loss": 1.2436, + "step": 7848 + }, + { + "epoch": 2.3377948211992035, + "grad_norm": 0.2248874008655548, + "learning_rate": 1.789604888271123e-05, + "loss": 1.2602, + "step": 7849 + }, + { + "epoch": 2.338092667398872, + "grad_norm": 0.22980491816997528, + "learning_rate": 1.7895456972998406e-05, + "loss": 1.2608, + "step": 7850 + }, + { + "epoch": 2.3383905135985406, + "grad_norm": 0.22804193198680878, + "learning_rate": 1.7894864989827176e-05, + "loss": 1.2568, + "step": 7851 + }, + { + "epoch": 2.3386883597982093, + "grad_norm": 0.2315458208322525, + "learning_rate": 1.7894272933203048e-05, + "loss": 1.2587, + "step": 7852 + }, + { + "epoch": 2.3389862059978777, + "grad_norm": 0.22518876194953918, + "learning_rate": 1.7893680803131528e-05, + "loss": 1.2641, + "step": 7853 + }, + { + "epoch": 2.3392840521975464, + "grad_norm": 0.2149897813796997, + "learning_rate": 1.789308859961813e-05, + "loss": 1.2486, + "step": 7854 + }, + { + "epoch": 2.339581898397215, + "grad_norm": 0.23355364799499512, + "learning_rate": 1.7892496322668363e-05, + "loss": 1.2454, + "step": 7855 + }, + { + "epoch": 2.339879744596884, + "grad_norm": 0.22749406099319458, + "learning_rate": 1.7891903972287733e-05, + "loss": 1.2615, + "step": 7856 + }, + { + "epoch": 2.3401775907965523, + "grad_norm": 0.23475691676139832, + "learning_rate": 1.7891311548481754e-05, + "loss": 1.2476, + "step": 7857 + }, + { + "epoch": 2.340475436996221, + "grad_norm": 0.24041210114955902, + "learning_rate": 1.789071905125594e-05, + "loss": 1.2372, + "step": 7858 + }, + { + "epoch": 2.34077328319589, + "grad_norm": 0.24427035450935364, + "learning_rate": 1.78901264806158e-05, + "loss": 1.2559, + "step": 7859 + }, + { + "epoch": 2.3410711293955586, + "grad_norm": 0.2267962247133255, + "learning_rate": 1.7889533836566845e-05, + "loss": 1.2533, + "step": 7860 + }, + { + "epoch": 2.341368975595227, + "grad_norm": 0.22979292273521423, + "learning_rate": 1.7888941119114597e-05, + "loss": 1.2531, + "step": 7861 + }, + { + "epoch": 2.3416668217948957, + "grad_norm": 0.2190021276473999, + "learning_rate": 1.7888348328264563e-05, + "loss": 1.2542, + "step": 7862 + }, + { + "epoch": 2.3419646679945645, + "grad_norm": 0.23141010105609894, + "learning_rate": 1.7887755464022265e-05, + "loss": 1.2546, + "step": 7863 + }, + { + "epoch": 2.342262514194233, + "grad_norm": 0.22548598051071167, + "learning_rate": 1.7887162526393212e-05, + "loss": 1.2325, + "step": 7864 + }, + { + "epoch": 2.3425603603939016, + "grad_norm": 0.22789525985717773, + "learning_rate": 1.7886569515382927e-05, + "loss": 1.2457, + "step": 7865 + }, + { + "epoch": 2.3428582065935704, + "grad_norm": 0.2295307219028473, + "learning_rate": 1.7885976430996922e-05, + "loss": 1.2633, + "step": 7866 + }, + { + "epoch": 2.3431560527932387, + "grad_norm": 0.2266976684331894, + "learning_rate": 1.7885383273240716e-05, + "loss": 1.2625, + "step": 7867 + }, + { + "epoch": 2.3434538989929075, + "grad_norm": 0.22434692084789276, + "learning_rate": 1.7884790042119826e-05, + "loss": 1.2661, + "step": 7868 + }, + { + "epoch": 2.3437517451925762, + "grad_norm": 0.23144297301769257, + "learning_rate": 1.7884196737639777e-05, + "loss": 1.2534, + "step": 7869 + }, + { + "epoch": 2.344049591392245, + "grad_norm": 0.24207472801208496, + "learning_rate": 1.788360335980609e-05, + "loss": 1.2495, + "step": 7870 + }, + { + "epoch": 2.3443474375919133, + "grad_norm": 0.22699899971485138, + "learning_rate": 1.7883009908624276e-05, + "loss": 1.2512, + "step": 7871 + }, + { + "epoch": 2.344645283791582, + "grad_norm": 0.23398232460021973, + "learning_rate": 1.7882416384099867e-05, + "loss": 1.2441, + "step": 7872 + }, + { + "epoch": 2.344943129991251, + "grad_norm": 0.2263219654560089, + "learning_rate": 1.7881822786238376e-05, + "loss": 1.2361, + "step": 7873 + }, + { + "epoch": 2.3452409761909196, + "grad_norm": 0.22675180435180664, + "learning_rate": 1.7881229115045333e-05, + "loss": 1.2534, + "step": 7874 + }, + { + "epoch": 2.345538822390588, + "grad_norm": 0.23145712912082672, + "learning_rate": 1.7880635370526257e-05, + "loss": 1.2602, + "step": 7875 + }, + { + "epoch": 2.3458366685902567, + "grad_norm": 0.225954070687294, + "learning_rate": 1.7880041552686674e-05, + "loss": 1.2608, + "step": 7876 + }, + { + "epoch": 2.3461345147899255, + "grad_norm": 0.2244177609682083, + "learning_rate": 1.7879447661532107e-05, + "loss": 1.226, + "step": 7877 + }, + { + "epoch": 2.346432360989594, + "grad_norm": 0.22313684225082397, + "learning_rate": 1.7878853697068085e-05, + "loss": 1.2526, + "step": 7878 + }, + { + "epoch": 2.3467302071892626, + "grad_norm": 0.23761752247810364, + "learning_rate": 1.787825965930013e-05, + "loss": 1.2514, + "step": 7879 + }, + { + "epoch": 2.3470280533889314, + "grad_norm": 0.2322452962398529, + "learning_rate": 1.7877665548233775e-05, + "loss": 1.2581, + "step": 7880 + }, + { + "epoch": 2.3473258995885997, + "grad_norm": 0.22980079054832458, + "learning_rate": 1.7877071363874542e-05, + "loss": 1.2764, + "step": 7881 + }, + { + "epoch": 2.3476237457882685, + "grad_norm": 0.24619711935520172, + "learning_rate": 1.7876477106227958e-05, + "loss": 1.2592, + "step": 7882 + }, + { + "epoch": 2.3479215919879373, + "grad_norm": 0.23735593259334564, + "learning_rate": 1.7875882775299557e-05, + "loss": 1.2743, + "step": 7883 + }, + { + "epoch": 2.348219438187606, + "grad_norm": 0.22482122480869293, + "learning_rate": 1.7875288371094867e-05, + "loss": 1.2553, + "step": 7884 + }, + { + "epoch": 2.3485172843872744, + "grad_norm": 0.2246832549571991, + "learning_rate": 1.7874693893619415e-05, + "loss": 1.2675, + "step": 7885 + }, + { + "epoch": 2.348815130586943, + "grad_norm": 0.2285926192998886, + "learning_rate": 1.7874099342878733e-05, + "loss": 1.26, + "step": 7886 + }, + { + "epoch": 2.349112976786612, + "grad_norm": 0.2274121195077896, + "learning_rate": 1.7873504718878362e-05, + "loss": 1.2479, + "step": 7887 + }, + { + "epoch": 2.3494108229862807, + "grad_norm": 0.2367192953824997, + "learning_rate": 1.7872910021623816e-05, + "loss": 1.2493, + "step": 7888 + }, + { + "epoch": 2.349708669185949, + "grad_norm": 0.22948838770389557, + "learning_rate": 1.7872315251120643e-05, + "loss": 1.2585, + "step": 7889 + }, + { + "epoch": 2.350006515385618, + "grad_norm": 0.23225794732570648, + "learning_rate": 1.7871720407374375e-05, + "loss": 1.2211, + "step": 7890 + }, + { + "epoch": 2.3503043615852865, + "grad_norm": 0.22939930856227875, + "learning_rate": 1.787112549039054e-05, + "loss": 1.2483, + "step": 7891 + }, + { + "epoch": 2.350602207784955, + "grad_norm": 0.242367222905159, + "learning_rate": 1.787053050017468e-05, + "loss": 1.2621, + "step": 7892 + }, + { + "epoch": 2.3509000539846236, + "grad_norm": 0.23725159466266632, + "learning_rate": 1.786993543673232e-05, + "loss": 1.2481, + "step": 7893 + }, + { + "epoch": 2.3511979001842924, + "grad_norm": 0.2657933235168457, + "learning_rate": 1.7869340300069012e-05, + "loss": 1.2441, + "step": 7894 + }, + { + "epoch": 2.3514957463839607, + "grad_norm": 0.2347121685743332, + "learning_rate": 1.786874509019028e-05, + "loss": 1.2523, + "step": 7895 + }, + { + "epoch": 2.3517935925836295, + "grad_norm": 0.23327769339084625, + "learning_rate": 1.7868149807101666e-05, + "loss": 1.2558, + "step": 7896 + }, + { + "epoch": 2.3520914387832983, + "grad_norm": 0.22707679867744446, + "learning_rate": 1.786755445080871e-05, + "loss": 1.2515, + "step": 7897 + }, + { + "epoch": 2.352389284982967, + "grad_norm": 0.2269888073205948, + "learning_rate": 1.786695902131695e-05, + "loss": 1.2621, + "step": 7898 + }, + { + "epoch": 2.3526871311826354, + "grad_norm": 0.22768308222293854, + "learning_rate": 1.7866363518631925e-05, + "loss": 1.2543, + "step": 7899 + }, + { + "epoch": 2.352984977382304, + "grad_norm": 0.21840879321098328, + "learning_rate": 1.7865767942759177e-05, + "loss": 1.2501, + "step": 7900 + }, + { + "epoch": 2.353282823581973, + "grad_norm": 0.23985709249973297, + "learning_rate": 1.786517229370425e-05, + "loss": 1.2419, + "step": 7901 + }, + { + "epoch": 2.3535806697816417, + "grad_norm": 0.2266675978899002, + "learning_rate": 1.7864576571472678e-05, + "loss": 1.2625, + "step": 7902 + }, + { + "epoch": 2.35387851598131, + "grad_norm": 0.23214082419872284, + "learning_rate": 1.7863980776070007e-05, + "loss": 1.2592, + "step": 7903 + }, + { + "epoch": 2.354176362180979, + "grad_norm": 0.23541660606861115, + "learning_rate": 1.7863384907501784e-05, + "loss": 1.2702, + "step": 7904 + }, + { + "epoch": 2.3544742083806476, + "grad_norm": 0.23487745225429535, + "learning_rate": 1.786278896577355e-05, + "loss": 1.2521, + "step": 7905 + }, + { + "epoch": 2.354772054580316, + "grad_norm": 0.23422829806804657, + "learning_rate": 1.7862192950890846e-05, + "loss": 1.2473, + "step": 7906 + }, + { + "epoch": 2.3550699007799847, + "grad_norm": 0.2254904955625534, + "learning_rate": 1.7861596862859224e-05, + "loss": 1.2518, + "step": 7907 + }, + { + "epoch": 2.3553677469796535, + "grad_norm": 0.23697738349437714, + "learning_rate": 1.7861000701684225e-05, + "loss": 1.2688, + "step": 7908 + }, + { + "epoch": 2.355665593179322, + "grad_norm": 0.22762492299079895, + "learning_rate": 1.7860404467371398e-05, + "loss": 1.2344, + "step": 7909 + }, + { + "epoch": 2.3559634393789906, + "grad_norm": 0.24504396319389343, + "learning_rate": 1.785980815992629e-05, + "loss": 1.256, + "step": 7910 + }, + { + "epoch": 2.3562612855786593, + "grad_norm": 0.2187090963125229, + "learning_rate": 1.785921177935445e-05, + "loss": 1.2615, + "step": 7911 + }, + { + "epoch": 2.356559131778328, + "grad_norm": 0.23367871344089508, + "learning_rate": 1.785861532566142e-05, + "loss": 1.2485, + "step": 7912 + }, + { + "epoch": 2.3568569779779964, + "grad_norm": 0.23267200589179993, + "learning_rate": 1.7858018798852758e-05, + "loss": 1.2613, + "step": 7913 + }, + { + "epoch": 2.357154824177665, + "grad_norm": 0.24347952008247375, + "learning_rate": 1.7857422198934012e-05, + "loss": 1.2732, + "step": 7914 + }, + { + "epoch": 2.357452670377334, + "grad_norm": 0.22890640795230865, + "learning_rate": 1.785682552591073e-05, + "loss": 1.2683, + "step": 7915 + }, + { + "epoch": 2.3577505165770027, + "grad_norm": 0.2342701554298401, + "learning_rate": 1.7856228779788462e-05, + "loss": 1.2664, + "step": 7916 + }, + { + "epoch": 2.358048362776671, + "grad_norm": 0.2646051049232483, + "learning_rate": 1.7855631960572764e-05, + "loss": 1.2586, + "step": 7917 + }, + { + "epoch": 2.35834620897634, + "grad_norm": 0.25664758682250977, + "learning_rate": 1.7855035068269192e-05, + "loss": 1.267, + "step": 7918 + }, + { + "epoch": 2.3586440551760086, + "grad_norm": 0.25148266553878784, + "learning_rate": 1.785443810288329e-05, + "loss": 1.2465, + "step": 7919 + }, + { + "epoch": 2.358941901375677, + "grad_norm": 0.2698698043823242, + "learning_rate": 1.7853841064420617e-05, + "loss": 1.247, + "step": 7920 + }, + { + "epoch": 2.3592397475753457, + "grad_norm": 0.2338656485080719, + "learning_rate": 1.785324395288673e-05, + "loss": 1.2559, + "step": 7921 + }, + { + "epoch": 2.3595375937750145, + "grad_norm": 0.24385720491409302, + "learning_rate": 1.7852646768287182e-05, + "loss": 1.2537, + "step": 7922 + }, + { + "epoch": 2.3598354399746833, + "grad_norm": 0.24207328259944916, + "learning_rate": 1.7852049510627526e-05, + "loss": 1.2706, + "step": 7923 + }, + { + "epoch": 2.3601332861743516, + "grad_norm": 0.2335837334394455, + "learning_rate": 1.7851452179913327e-05, + "loss": 1.2443, + "step": 7924 + }, + { + "epoch": 2.3604311323740204, + "grad_norm": 0.24712742865085602, + "learning_rate": 1.7850854776150136e-05, + "loss": 1.2575, + "step": 7925 + }, + { + "epoch": 2.360728978573689, + "grad_norm": 0.23162207007408142, + "learning_rate": 1.785025729934351e-05, + "loss": 1.2437, + "step": 7926 + }, + { + "epoch": 2.361026824773358, + "grad_norm": 0.2364334911108017, + "learning_rate": 1.784965974949901e-05, + "loss": 1.2456, + "step": 7927 + }, + { + "epoch": 2.3613246709730262, + "grad_norm": 0.22042107582092285, + "learning_rate": 1.7849062126622204e-05, + "loss": 1.2403, + "step": 7928 + }, + { + "epoch": 2.361622517172695, + "grad_norm": 0.2693556249141693, + "learning_rate": 1.7848464430718637e-05, + "loss": 1.2756, + "step": 7929 + }, + { + "epoch": 2.3619203633723638, + "grad_norm": 0.2263534814119339, + "learning_rate": 1.784786666179388e-05, + "loss": 1.258, + "step": 7930 + }, + { + "epoch": 2.362218209572032, + "grad_norm": 0.25349611043930054, + "learning_rate": 1.7847268819853493e-05, + "loss": 1.2588, + "step": 7931 + }, + { + "epoch": 2.362516055771701, + "grad_norm": 0.23080848157405853, + "learning_rate": 1.7846670904903032e-05, + "loss": 1.2447, + "step": 7932 + }, + { + "epoch": 2.3628139019713696, + "grad_norm": 0.23246009647846222, + "learning_rate": 1.784607291694807e-05, + "loss": 1.2686, + "step": 7933 + }, + { + "epoch": 2.363111748171038, + "grad_norm": 0.23326291143894196, + "learning_rate": 1.7845474855994166e-05, + "loss": 1.2531, + "step": 7934 + }, + { + "epoch": 2.3634095943707067, + "grad_norm": 0.23655524849891663, + "learning_rate": 1.784487672204688e-05, + "loss": 1.2538, + "step": 7935 + }, + { + "epoch": 2.3637074405703755, + "grad_norm": 0.21734671294689178, + "learning_rate": 1.7844278515111785e-05, + "loss": 1.2333, + "step": 7936 + }, + { + "epoch": 2.3640052867700443, + "grad_norm": 0.23813478648662567, + "learning_rate": 1.784368023519444e-05, + "loss": 1.2702, + "step": 7937 + }, + { + "epoch": 2.3643031329697126, + "grad_norm": 0.22581513226032257, + "learning_rate": 1.7843081882300414e-05, + "loss": 1.2531, + "step": 7938 + }, + { + "epoch": 2.3646009791693814, + "grad_norm": 0.2356468141078949, + "learning_rate": 1.7842483456435275e-05, + "loss": 1.2545, + "step": 7939 + }, + { + "epoch": 2.36489882536905, + "grad_norm": 0.23121199011802673, + "learning_rate": 1.784188495760459e-05, + "loss": 1.2583, + "step": 7940 + }, + { + "epoch": 2.365196671568719, + "grad_norm": 0.3000968098640442, + "learning_rate": 1.7841286385813922e-05, + "loss": 1.2641, + "step": 7941 + }, + { + "epoch": 2.3654945177683873, + "grad_norm": 0.26428544521331787, + "learning_rate": 1.7840687741068852e-05, + "loss": 1.2495, + "step": 7942 + }, + { + "epoch": 2.365792363968056, + "grad_norm": 0.2293512225151062, + "learning_rate": 1.7840089023374937e-05, + "loss": 1.2388, + "step": 7943 + }, + { + "epoch": 2.366090210167725, + "grad_norm": 0.25424709916114807, + "learning_rate": 1.7839490232737756e-05, + "loss": 1.246, + "step": 7944 + }, + { + "epoch": 2.366388056367393, + "grad_norm": 0.22907119989395142, + "learning_rate": 1.783889136916288e-05, + "loss": 1.2467, + "step": 7945 + }, + { + "epoch": 2.366685902567062, + "grad_norm": 0.24755051732063293, + "learning_rate": 1.7838292432655874e-05, + "loss": 1.2576, + "step": 7946 + }, + { + "epoch": 2.3669837487667307, + "grad_norm": 0.22845561802387238, + "learning_rate": 1.7837693423222314e-05, + "loss": 1.266, + "step": 7947 + }, + { + "epoch": 2.367281594966399, + "grad_norm": 0.23387497663497925, + "learning_rate": 1.7837094340867775e-05, + "loss": 1.2731, + "step": 7948 + }, + { + "epoch": 2.3675794411660678, + "grad_norm": 0.2515455186367035, + "learning_rate": 1.7836495185597828e-05, + "loss": 1.2627, + "step": 7949 + }, + { + "epoch": 2.3678772873657365, + "grad_norm": 0.22718527913093567, + "learning_rate": 1.783589595741805e-05, + "loss": 1.2673, + "step": 7950 + }, + { + "epoch": 2.3681751335654053, + "grad_norm": 0.2656766474246979, + "learning_rate": 1.7835296656334017e-05, + "loss": 1.2578, + "step": 7951 + }, + { + "epoch": 2.3684729797650736, + "grad_norm": 0.22987836599349976, + "learning_rate": 1.78346972823513e-05, + "loss": 1.2566, + "step": 7952 + }, + { + "epoch": 2.3687708259647424, + "grad_norm": 0.2829611301422119, + "learning_rate": 1.7834097835475475e-05, + "loss": 1.2561, + "step": 7953 + }, + { + "epoch": 2.369068672164411, + "grad_norm": 0.25986650586128235, + "learning_rate": 1.7833498315712126e-05, + "loss": 1.2722, + "step": 7954 + }, + { + "epoch": 2.36936651836408, + "grad_norm": 0.24209918081760406, + "learning_rate": 1.783289872306683e-05, + "loss": 1.252, + "step": 7955 + }, + { + "epoch": 2.3696643645637483, + "grad_norm": 0.3400126099586487, + "learning_rate": 1.7832299057545158e-05, + "loss": 1.245, + "step": 7956 + }, + { + "epoch": 2.369962210763417, + "grad_norm": 0.27828526496887207, + "learning_rate": 1.78316993191527e-05, + "loss": 1.247, + "step": 7957 + }, + { + "epoch": 2.370260056963086, + "grad_norm": 0.27232053875923157, + "learning_rate": 1.7831099507895026e-05, + "loss": 1.2762, + "step": 7958 + }, + { + "epoch": 2.370557903162754, + "grad_norm": 0.2513825595378876, + "learning_rate": 1.783049962377772e-05, + "loss": 1.2554, + "step": 7959 + }, + { + "epoch": 2.370855749362423, + "grad_norm": 0.28506922721862793, + "learning_rate": 1.7829899666806363e-05, + "loss": 1.2699, + "step": 7960 + }, + { + "epoch": 2.3711535955620917, + "grad_norm": 0.2429376244544983, + "learning_rate": 1.7829299636986536e-05, + "loss": 1.2713, + "step": 7961 + }, + { + "epoch": 2.37145144176176, + "grad_norm": 0.2558785080909729, + "learning_rate": 1.7828699534323828e-05, + "loss": 1.245, + "step": 7962 + }, + { + "epoch": 2.371749287961429, + "grad_norm": 0.23596563935279846, + "learning_rate": 1.7828099358823818e-05, + "loss": 1.244, + "step": 7963 + }, + { + "epoch": 2.3720471341610976, + "grad_norm": 0.2452450841665268, + "learning_rate": 1.7827499110492086e-05, + "loss": 1.2415, + "step": 7964 + }, + { + "epoch": 2.3723449803607664, + "grad_norm": 0.24221771955490112, + "learning_rate": 1.7826898789334223e-05, + "loss": 1.2358, + "step": 7965 + }, + { + "epoch": 2.3726428265604347, + "grad_norm": 0.2362058162689209, + "learning_rate": 1.7826298395355806e-05, + "loss": 1.2669, + "step": 7966 + }, + { + "epoch": 2.3729406727601035, + "grad_norm": 0.22643139958381653, + "learning_rate": 1.7825697928562433e-05, + "loss": 1.245, + "step": 7967 + }, + { + "epoch": 2.3732385189597722, + "grad_norm": 0.23953528702259064, + "learning_rate": 1.7825097388959682e-05, + "loss": 1.2377, + "step": 7968 + }, + { + "epoch": 2.373536365159441, + "grad_norm": 0.24942715466022491, + "learning_rate": 1.7824496776553143e-05, + "loss": 1.2499, + "step": 7969 + }, + { + "epoch": 2.3738342113591093, + "grad_norm": 0.23290188610553741, + "learning_rate": 1.7823896091348403e-05, + "loss": 1.2614, + "step": 7970 + }, + { + "epoch": 2.374132057558778, + "grad_norm": 0.239968404173851, + "learning_rate": 1.782329533335105e-05, + "loss": 1.2475, + "step": 7971 + }, + { + "epoch": 2.374429903758447, + "grad_norm": 0.25155937671661377, + "learning_rate": 1.7822694502566675e-05, + "loss": 1.2543, + "step": 7972 + }, + { + "epoch": 2.374727749958115, + "grad_norm": 0.2466360330581665, + "learning_rate": 1.7822093599000868e-05, + "loss": 1.2458, + "step": 7973 + }, + { + "epoch": 2.375025596157784, + "grad_norm": 0.23671437799930573, + "learning_rate": 1.782149262265922e-05, + "loss": 1.2507, + "step": 7974 + }, + { + "epoch": 2.3753234423574527, + "grad_norm": 0.2404567152261734, + "learning_rate": 1.782089157354732e-05, + "loss": 1.26, + "step": 7975 + }, + { + "epoch": 2.375621288557121, + "grad_norm": 0.23791678249835968, + "learning_rate": 1.7820290451670767e-05, + "loss": 1.234, + "step": 7976 + }, + { + "epoch": 2.37591913475679, + "grad_norm": 0.23145097494125366, + "learning_rate": 1.7819689257035144e-05, + "loss": 1.249, + "step": 7977 + }, + { + "epoch": 2.3762169809564586, + "grad_norm": 0.2443423867225647, + "learning_rate": 1.781908798964605e-05, + "loss": 1.2532, + "step": 7978 + }, + { + "epoch": 2.3765148271561274, + "grad_norm": 0.23321878910064697, + "learning_rate": 1.781848664950908e-05, + "loss": 1.2391, + "step": 7979 + }, + { + "epoch": 2.3768126733557957, + "grad_norm": 0.21833214163780212, + "learning_rate": 1.7817885236629824e-05, + "loss": 1.2568, + "step": 7980 + }, + { + "epoch": 2.3771105195554645, + "grad_norm": 0.23264311254024506, + "learning_rate": 1.7817283751013882e-05, + "loss": 1.2725, + "step": 7981 + }, + { + "epoch": 2.3774083657551333, + "grad_norm": 0.23435115814208984, + "learning_rate": 1.781668219266685e-05, + "loss": 1.246, + "step": 7982 + }, + { + "epoch": 2.377706211954802, + "grad_norm": 0.22859029471874237, + "learning_rate": 1.7816080561594322e-05, + "loss": 1.2546, + "step": 7983 + }, + { + "epoch": 2.3780040581544704, + "grad_norm": 0.23380321264266968, + "learning_rate": 1.7815478857801896e-05, + "loss": 1.2607, + "step": 7984 + }, + { + "epoch": 2.378301904354139, + "grad_norm": 0.23703013360500336, + "learning_rate": 1.781487708129517e-05, + "loss": 1.2652, + "step": 7985 + }, + { + "epoch": 2.378599750553808, + "grad_norm": 0.23852579295635223, + "learning_rate": 1.7814275232079748e-05, + "loss": 1.2528, + "step": 7986 + }, + { + "epoch": 2.3788975967534762, + "grad_norm": 0.2664247751235962, + "learning_rate": 1.7813673310161227e-05, + "loss": 1.2535, + "step": 7987 + }, + { + "epoch": 2.379195442953145, + "grad_norm": 0.2276148945093155, + "learning_rate": 1.7813071315545202e-05, + "loss": 1.2507, + "step": 7988 + }, + { + "epoch": 2.3794932891528138, + "grad_norm": 0.2421445995569229, + "learning_rate": 1.7812469248237277e-05, + "loss": 1.2532, + "step": 7989 + }, + { + "epoch": 2.3797911353524825, + "grad_norm": 0.22952455282211304, + "learning_rate": 1.7811867108243056e-05, + "loss": 1.2545, + "step": 7990 + }, + { + "epoch": 2.380088981552151, + "grad_norm": 0.23287591338157654, + "learning_rate": 1.781126489556814e-05, + "loss": 1.2528, + "step": 7991 + }, + { + "epoch": 2.3803868277518196, + "grad_norm": 0.24828951060771942, + "learning_rate": 1.7810662610218132e-05, + "loss": 1.2648, + "step": 7992 + }, + { + "epoch": 2.3806846739514884, + "grad_norm": 0.24405096471309662, + "learning_rate": 1.7810060252198634e-05, + "loss": 1.2523, + "step": 7993 + }, + { + "epoch": 2.380982520151157, + "grad_norm": 0.23873582482337952, + "learning_rate": 1.780945782151525e-05, + "loss": 1.2441, + "step": 7994 + }, + { + "epoch": 2.3812803663508255, + "grad_norm": 0.24015365540981293, + "learning_rate": 1.7808855318173586e-05, + "loss": 1.2482, + "step": 7995 + }, + { + "epoch": 2.3815782125504943, + "grad_norm": 0.24203690886497498, + "learning_rate": 1.780825274217925e-05, + "loss": 1.2496, + "step": 7996 + }, + { + "epoch": 2.381876058750163, + "grad_norm": 0.3029463589191437, + "learning_rate": 1.7807650093537844e-05, + "loss": 1.2459, + "step": 7997 + }, + { + "epoch": 2.3821739049498314, + "grad_norm": 0.3219797611236572, + "learning_rate": 1.780704737225498e-05, + "loss": 1.2558, + "step": 7998 + }, + { + "epoch": 2.3824717511495, + "grad_norm": 0.2756625711917877, + "learning_rate": 1.7806444578336258e-05, + "loss": 1.2429, + "step": 7999 + }, + { + "epoch": 2.382769597349169, + "grad_norm": 0.23735983669757843, + "learning_rate": 1.78058417117873e-05, + "loss": 1.2544, + "step": 8000 + }, + { + "epoch": 2.382769597349169, + "eval_loss": 1.3381941318511963, + "eval_runtime": 20.3569, + "eval_samples_per_second": 85.18, + "eval_steps_per_second": 5.354, + "step": 8000 + }, + { + "epoch": 2.3830674435488373, + "grad_norm": 0.23161669075489044, + "learning_rate": 1.7805238772613694e-05, + "loss": 1.2388, + "step": 8001 + }, + { + "epoch": 2.383365289748506, + "grad_norm": 0.25970157980918884, + "learning_rate": 1.780463576082107e-05, + "loss": 1.2579, + "step": 8002 + }, + { + "epoch": 2.383663135948175, + "grad_norm": 0.2822204530239105, + "learning_rate": 1.7804032676415028e-05, + "loss": 1.2651, + "step": 8003 + }, + { + "epoch": 2.3839609821478436, + "grad_norm": 0.24108311533927917, + "learning_rate": 1.780342951940118e-05, + "loss": 1.2437, + "step": 8004 + }, + { + "epoch": 2.384258828347512, + "grad_norm": 0.4408750534057617, + "learning_rate": 1.780282628978514e-05, + "loss": 1.2393, + "step": 8005 + }, + { + "epoch": 2.3845566745471807, + "grad_norm": 0.2541865110397339, + "learning_rate": 1.7802222987572522e-05, + "loss": 1.2684, + "step": 8006 + }, + { + "epoch": 2.3848545207468494, + "grad_norm": 0.2612745463848114, + "learning_rate": 1.780161961276893e-05, + "loss": 1.2633, + "step": 8007 + }, + { + "epoch": 2.385152366946518, + "grad_norm": 0.24160808324813843, + "learning_rate": 1.780101616537999e-05, + "loss": 1.2692, + "step": 8008 + }, + { + "epoch": 2.3854502131461865, + "grad_norm": 0.22500190138816833, + "learning_rate": 1.7800412645411306e-05, + "loss": 1.2589, + "step": 8009 + }, + { + "epoch": 2.3857480593458553, + "grad_norm": 0.234347864985466, + "learning_rate": 1.7799809052868503e-05, + "loss": 1.2352, + "step": 8010 + }, + { + "epoch": 2.386045905545524, + "grad_norm": 0.25336188077926636, + "learning_rate": 1.7799205387757186e-05, + "loss": 1.2487, + "step": 8011 + }, + { + "epoch": 2.3863437517451924, + "grad_norm": 0.23233574628829956, + "learning_rate": 1.7798601650082976e-05, + "loss": 1.2508, + "step": 8012 + }, + { + "epoch": 2.386641597944861, + "grad_norm": 0.22018462419509888, + "learning_rate": 1.7797997839851497e-05, + "loss": 1.2236, + "step": 8013 + }, + { + "epoch": 2.38693944414453, + "grad_norm": 0.2367081195116043, + "learning_rate": 1.7797393957068356e-05, + "loss": 1.2528, + "step": 8014 + }, + { + "epoch": 2.3872372903441983, + "grad_norm": 0.2383645623922348, + "learning_rate": 1.7796790001739174e-05, + "loss": 1.2541, + "step": 8015 + }, + { + "epoch": 2.387535136543867, + "grad_norm": 0.24189260601997375, + "learning_rate": 1.7796185973869575e-05, + "loss": 1.2545, + "step": 8016 + }, + { + "epoch": 2.387832982743536, + "grad_norm": 0.23932260274887085, + "learning_rate": 1.7795581873465174e-05, + "loss": 1.2445, + "step": 8017 + }, + { + "epoch": 2.3881308289432046, + "grad_norm": 0.22369612753391266, + "learning_rate": 1.7794977700531598e-05, + "loss": 1.2392, + "step": 8018 + }, + { + "epoch": 2.388428675142873, + "grad_norm": 0.2379557341337204, + "learning_rate": 1.7794373455074458e-05, + "loss": 1.257, + "step": 8019 + }, + { + "epoch": 2.3887265213425417, + "grad_norm": 0.23386241495609283, + "learning_rate": 1.7793769137099385e-05, + "loss": 1.236, + "step": 8020 + }, + { + "epoch": 2.3890243675422105, + "grad_norm": 0.2289877086877823, + "learning_rate": 1.7793164746611997e-05, + "loss": 1.2564, + "step": 8021 + }, + { + "epoch": 2.3893222137418793, + "grad_norm": 0.22471486032009125, + "learning_rate": 1.779256028361792e-05, + "loss": 1.2457, + "step": 8022 + }, + { + "epoch": 2.3896200599415476, + "grad_norm": 0.23774881660938263, + "learning_rate": 1.779195574812277e-05, + "loss": 1.2581, + "step": 8023 + }, + { + "epoch": 2.3899179061412164, + "grad_norm": 0.24830865859985352, + "learning_rate": 1.7791351140132182e-05, + "loss": 1.2566, + "step": 8024 + }, + { + "epoch": 2.390215752340885, + "grad_norm": 0.24003386497497559, + "learning_rate": 1.7790746459651775e-05, + "loss": 1.2417, + "step": 8025 + }, + { + "epoch": 2.3905135985405535, + "grad_norm": 0.22913935780525208, + "learning_rate": 1.7790141706687177e-05, + "loss": 1.2518, + "step": 8026 + }, + { + "epoch": 2.3908114447402222, + "grad_norm": 0.23181475698947906, + "learning_rate": 1.7789536881244017e-05, + "loss": 1.2511, + "step": 8027 + }, + { + "epoch": 2.391109290939891, + "grad_norm": 0.22720636427402496, + "learning_rate": 1.7788931983327914e-05, + "loss": 1.2594, + "step": 8028 + }, + { + "epoch": 2.3914071371395593, + "grad_norm": 0.23401761054992676, + "learning_rate": 1.7788327012944508e-05, + "loss": 1.2505, + "step": 8029 + }, + { + "epoch": 2.391704983339228, + "grad_norm": 0.21780389547348022, + "learning_rate": 1.7787721970099414e-05, + "loss": 1.2346, + "step": 8030 + }, + { + "epoch": 2.392002829538897, + "grad_norm": 0.2231125831604004, + "learning_rate": 1.7787116854798273e-05, + "loss": 1.2501, + "step": 8031 + }, + { + "epoch": 2.3923006757385656, + "grad_norm": 0.24449528753757477, + "learning_rate": 1.7786511667046706e-05, + "loss": 1.2587, + "step": 8032 + }, + { + "epoch": 2.392598521938234, + "grad_norm": 0.23590652644634247, + "learning_rate": 1.778590640685035e-05, + "loss": 1.2819, + "step": 8033 + }, + { + "epoch": 2.3928963681379027, + "grad_norm": 0.23900920152664185, + "learning_rate": 1.7785301074214835e-05, + "loss": 1.2559, + "step": 8034 + }, + { + "epoch": 2.3931942143375715, + "grad_norm": 0.23054538667201996, + "learning_rate": 1.778469566914579e-05, + "loss": 1.2668, + "step": 8035 + }, + { + "epoch": 2.3934920605372403, + "grad_norm": 0.23224182426929474, + "learning_rate": 1.7784090191648845e-05, + "loss": 1.2539, + "step": 8036 + }, + { + "epoch": 2.3937899067369086, + "grad_norm": 0.23819798231124878, + "learning_rate": 1.7783484641729643e-05, + "loss": 1.2313, + "step": 8037 + }, + { + "epoch": 2.3940877529365774, + "grad_norm": 0.23229552805423737, + "learning_rate": 1.7782879019393813e-05, + "loss": 1.2407, + "step": 8038 + }, + { + "epoch": 2.394385599136246, + "grad_norm": 0.236992746591568, + "learning_rate": 1.7782273324646987e-05, + "loss": 1.272, + "step": 8039 + }, + { + "epoch": 2.3946834453359145, + "grad_norm": 0.22687511146068573, + "learning_rate": 1.77816675574948e-05, + "loss": 1.2616, + "step": 8040 + }, + { + "epoch": 2.3949812915355833, + "grad_norm": 0.22410380840301514, + "learning_rate": 1.7781061717942895e-05, + "loss": 1.2527, + "step": 8041 + }, + { + "epoch": 2.395279137735252, + "grad_norm": 0.2301110178232193, + "learning_rate": 1.7780455805996902e-05, + "loss": 1.2534, + "step": 8042 + }, + { + "epoch": 2.3955769839349204, + "grad_norm": 0.22933273017406464, + "learning_rate": 1.7779849821662463e-05, + "loss": 1.2395, + "step": 8043 + }, + { + "epoch": 2.395874830134589, + "grad_norm": 0.2381003051996231, + "learning_rate": 1.7779243764945214e-05, + "loss": 1.2589, + "step": 8044 + }, + { + "epoch": 2.396172676334258, + "grad_norm": 0.2247830033302307, + "learning_rate": 1.7778637635850792e-05, + "loss": 1.2513, + "step": 8045 + }, + { + "epoch": 2.3964705225339267, + "grad_norm": 0.2232261747121811, + "learning_rate": 1.7778031434384834e-05, + "loss": 1.2609, + "step": 8046 + }, + { + "epoch": 2.396768368733595, + "grad_norm": 0.22561782598495483, + "learning_rate": 1.777742516055299e-05, + "loss": 1.2477, + "step": 8047 + }, + { + "epoch": 2.3970662149332638, + "grad_norm": 0.2415585070848465, + "learning_rate": 1.777681881436089e-05, + "loss": 1.2602, + "step": 8048 + }, + { + "epoch": 2.3973640611329325, + "grad_norm": 0.22911863029003143, + "learning_rate": 1.777621239581418e-05, + "loss": 1.2599, + "step": 8049 + }, + { + "epoch": 2.3976619073326013, + "grad_norm": 0.23407961428165436, + "learning_rate": 1.7775605904918505e-05, + "loss": 1.2643, + "step": 8050 + }, + { + "epoch": 2.3979597535322696, + "grad_norm": 0.22826075553894043, + "learning_rate": 1.77749993416795e-05, + "loss": 1.2462, + "step": 8051 + }, + { + "epoch": 2.3982575997319384, + "grad_norm": 0.2386498898267746, + "learning_rate": 1.7774392706102818e-05, + "loss": 1.2543, + "step": 8052 + }, + { + "epoch": 2.398555445931607, + "grad_norm": 0.2155432254076004, + "learning_rate": 1.777378599819409e-05, + "loss": 1.2602, + "step": 8053 + }, + { + "epoch": 2.3988532921312755, + "grad_norm": 0.23106734454631805, + "learning_rate": 1.7773179217958976e-05, + "loss": 1.2488, + "step": 8054 + }, + { + "epoch": 2.3991511383309443, + "grad_norm": 0.2344607561826706, + "learning_rate": 1.7772572365403112e-05, + "loss": 1.2502, + "step": 8055 + }, + { + "epoch": 2.399448984530613, + "grad_norm": 0.21689283847808838, + "learning_rate": 1.7771965440532145e-05, + "loss": 1.2509, + "step": 8056 + }, + { + "epoch": 2.399746830730282, + "grad_norm": 0.23053377866744995, + "learning_rate": 1.7771358443351724e-05, + "loss": 1.2453, + "step": 8057 + }, + { + "epoch": 2.40004467692995, + "grad_norm": 0.22964678704738617, + "learning_rate": 1.7770751373867494e-05, + "loss": 1.246, + "step": 8058 + }, + { + "epoch": 2.400342523129619, + "grad_norm": 0.21987757086753845, + "learning_rate": 1.7770144232085105e-05, + "loss": 1.2348, + "step": 8059 + }, + { + "epoch": 2.4006403693292877, + "grad_norm": 0.2204911708831787, + "learning_rate": 1.776953701801021e-05, + "loss": 1.244, + "step": 8060 + }, + { + "epoch": 2.4009382155289565, + "grad_norm": 0.21743136644363403, + "learning_rate": 1.7768929731648447e-05, + "loss": 1.235, + "step": 8061 + }, + { + "epoch": 2.401236061728625, + "grad_norm": 0.22693593800067902, + "learning_rate": 1.7768322373005474e-05, + "loss": 1.2409, + "step": 8062 + }, + { + "epoch": 2.4015339079282936, + "grad_norm": 0.23844876885414124, + "learning_rate": 1.776771494208694e-05, + "loss": 1.2527, + "step": 8063 + }, + { + "epoch": 2.4018317541279623, + "grad_norm": 0.23794607818126678, + "learning_rate": 1.77671074388985e-05, + "loss": 1.2596, + "step": 8064 + }, + { + "epoch": 2.4021296003276307, + "grad_norm": 0.2254185527563095, + "learning_rate": 1.7766499863445804e-05, + "loss": 1.2373, + "step": 8065 + }, + { + "epoch": 2.4024274465272994, + "grad_norm": 0.21888013184070587, + "learning_rate": 1.77658922157345e-05, + "loss": 1.2601, + "step": 8066 + }, + { + "epoch": 2.402725292726968, + "grad_norm": 0.25763723254203796, + "learning_rate": 1.776528449577025e-05, + "loss": 1.256, + "step": 8067 + }, + { + "epoch": 2.4030231389266365, + "grad_norm": 0.2353525310754776, + "learning_rate": 1.77646767035587e-05, + "loss": 1.2649, + "step": 8068 + }, + { + "epoch": 2.4033209851263053, + "grad_norm": 0.24496106803417206, + "learning_rate": 1.776406883910551e-05, + "loss": 1.2587, + "step": 8069 + }, + { + "epoch": 2.403618831325974, + "grad_norm": 0.24062785506248474, + "learning_rate": 1.7763460902416333e-05, + "loss": 1.2642, + "step": 8070 + }, + { + "epoch": 2.403916677525643, + "grad_norm": 0.23483458161354065, + "learning_rate": 1.7762852893496825e-05, + "loss": 1.2403, + "step": 8071 + }, + { + "epoch": 2.404214523725311, + "grad_norm": 0.23496435582637787, + "learning_rate": 1.7762244812352648e-05, + "loss": 1.2517, + "step": 8072 + }, + { + "epoch": 2.40451236992498, + "grad_norm": 0.23324401676654816, + "learning_rate": 1.7761636658989452e-05, + "loss": 1.2488, + "step": 8073 + }, + { + "epoch": 2.4048102161246487, + "grad_norm": 0.2287825495004654, + "learning_rate": 1.77610284334129e-05, + "loss": 1.2413, + "step": 8074 + }, + { + "epoch": 2.4051080623243175, + "grad_norm": 0.2442363053560257, + "learning_rate": 1.7760420135628652e-05, + "loss": 1.2495, + "step": 8075 + }, + { + "epoch": 2.405405908523986, + "grad_norm": 0.23072031140327454, + "learning_rate": 1.775981176564236e-05, + "loss": 1.2583, + "step": 8076 + }, + { + "epoch": 2.4057037547236546, + "grad_norm": 0.23112305998802185, + "learning_rate": 1.7759203323459693e-05, + "loss": 1.2585, + "step": 8077 + }, + { + "epoch": 2.4060016009233234, + "grad_norm": 0.24222977459430695, + "learning_rate": 1.7758594809086308e-05, + "loss": 1.2456, + "step": 8078 + }, + { + "epoch": 2.4062994471229917, + "grad_norm": 0.22285297513008118, + "learning_rate": 1.7757986222527864e-05, + "loss": 1.2626, + "step": 8079 + }, + { + "epoch": 2.4065972933226605, + "grad_norm": 0.22241798043251038, + "learning_rate": 1.775737756379003e-05, + "loss": 1.2452, + "step": 8080 + }, + { + "epoch": 2.4068951395223293, + "grad_norm": 0.24153169989585876, + "learning_rate": 1.7756768832878463e-05, + "loss": 1.246, + "step": 8081 + }, + { + "epoch": 2.4071929857219976, + "grad_norm": 0.23785129189491272, + "learning_rate": 1.7756160029798824e-05, + "loss": 1.2595, + "step": 8082 + }, + { + "epoch": 2.4074908319216664, + "grad_norm": 0.23555125296115875, + "learning_rate": 1.775555115455679e-05, + "loss": 1.2484, + "step": 8083 + }, + { + "epoch": 2.407788678121335, + "grad_norm": 0.24767501652240753, + "learning_rate": 1.775494220715801e-05, + "loss": 1.2438, + "step": 8084 + }, + { + "epoch": 2.408086524321004, + "grad_norm": 0.2364848554134369, + "learning_rate": 1.7754333187608163e-05, + "loss": 1.2685, + "step": 8085 + }, + { + "epoch": 2.4083843705206722, + "grad_norm": 0.24467679858207703, + "learning_rate": 1.7753724095912906e-05, + "loss": 1.2494, + "step": 8086 + }, + { + "epoch": 2.408682216720341, + "grad_norm": 0.2808564305305481, + "learning_rate": 1.775311493207791e-05, + "loss": 1.2435, + "step": 8087 + }, + { + "epoch": 2.4089800629200098, + "grad_norm": 0.22737839818000793, + "learning_rate": 1.775250569610884e-05, + "loss": 1.2472, + "step": 8088 + }, + { + "epoch": 2.4092779091196785, + "grad_norm": 0.2532578110694885, + "learning_rate": 1.775189638801137e-05, + "loss": 1.261, + "step": 8089 + }, + { + "epoch": 2.409575755319347, + "grad_norm": 0.23660686612129211, + "learning_rate": 1.7751287007791163e-05, + "loss": 1.2489, + "step": 8090 + }, + { + "epoch": 2.4098736015190156, + "grad_norm": 0.2513912618160248, + "learning_rate": 1.775067755545389e-05, + "loss": 1.2674, + "step": 8091 + }, + { + "epoch": 2.4101714477186844, + "grad_norm": 0.23578931391239166, + "learning_rate": 1.775006803100522e-05, + "loss": 1.2571, + "step": 8092 + }, + { + "epoch": 2.4104692939183527, + "grad_norm": 0.2183685600757599, + "learning_rate": 1.774945843445083e-05, + "loss": 1.2296, + "step": 8093 + }, + { + "epoch": 2.4107671401180215, + "grad_norm": 0.310764342546463, + "learning_rate": 1.7748848765796385e-05, + "loss": 1.2464, + "step": 8094 + }, + { + "epoch": 2.4110649863176903, + "grad_norm": 0.33203524351119995, + "learning_rate": 1.774823902504756e-05, + "loss": 1.2397, + "step": 8095 + }, + { + "epoch": 2.4113628325173586, + "grad_norm": 0.2394951581954956, + "learning_rate": 1.7747629212210033e-05, + "loss": 1.2333, + "step": 8096 + }, + { + "epoch": 2.4116606787170274, + "grad_norm": 0.46868062019348145, + "learning_rate": 1.7747019327289465e-05, + "loss": 1.2342, + "step": 8097 + }, + { + "epoch": 2.411958524916696, + "grad_norm": 0.29705339670181274, + "learning_rate": 1.774640937029154e-05, + "loss": 1.2477, + "step": 8098 + }, + { + "epoch": 2.412256371116365, + "grad_norm": 0.2741549611091614, + "learning_rate": 1.774579934122193e-05, + "loss": 1.2415, + "step": 8099 + }, + { + "epoch": 2.4125542173160333, + "grad_norm": 0.23555107414722443, + "learning_rate": 1.7745189240086313e-05, + "loss": 1.2648, + "step": 8100 + }, + { + "epoch": 2.412852063515702, + "grad_norm": 0.22980466485023499, + "learning_rate": 1.7744579066890363e-05, + "loss": 1.2498, + "step": 8101 + }, + { + "epoch": 2.413149909715371, + "grad_norm": 0.24976091086864471, + "learning_rate": 1.7743968821639757e-05, + "loss": 1.2497, + "step": 8102 + }, + { + "epoch": 2.4134477559150396, + "grad_norm": 0.256198912858963, + "learning_rate": 1.7743358504340173e-05, + "loss": 1.2587, + "step": 8103 + }, + { + "epoch": 2.413745602114708, + "grad_norm": 0.2384638488292694, + "learning_rate": 1.774274811499729e-05, + "loss": 1.258, + "step": 8104 + }, + { + "epoch": 2.4140434483143767, + "grad_norm": 0.22125479578971863, + "learning_rate": 1.7742137653616787e-05, + "loss": 1.2666, + "step": 8105 + }, + { + "epoch": 2.4143412945140454, + "grad_norm": 0.23319008946418762, + "learning_rate": 1.774152712020434e-05, + "loss": 1.2417, + "step": 8106 + }, + { + "epoch": 2.4146391407137138, + "grad_norm": 0.24594014883041382, + "learning_rate": 1.7740916514765638e-05, + "loss": 1.2693, + "step": 8107 + }, + { + "epoch": 2.4149369869133825, + "grad_norm": 0.23496028780937195, + "learning_rate": 1.7740305837306353e-05, + "loss": 1.2401, + "step": 8108 + }, + { + "epoch": 2.4152348331130513, + "grad_norm": 0.22659975290298462, + "learning_rate": 1.773969508783217e-05, + "loss": 1.2624, + "step": 8109 + }, + { + "epoch": 2.41553267931272, + "grad_norm": 0.22997890412807465, + "learning_rate": 1.7739084266348772e-05, + "loss": 1.2691, + "step": 8110 + }, + { + "epoch": 2.4158305255123884, + "grad_norm": 0.2514461278915405, + "learning_rate": 1.773847337286184e-05, + "loss": 1.2614, + "step": 8111 + }, + { + "epoch": 2.416128371712057, + "grad_norm": 0.2418598085641861, + "learning_rate": 1.773786240737706e-05, + "loss": 1.2583, + "step": 8112 + }, + { + "epoch": 2.416426217911726, + "grad_norm": 0.2289545089006424, + "learning_rate": 1.7737251369900118e-05, + "loss": 1.2521, + "step": 8113 + }, + { + "epoch": 2.4167240641113943, + "grad_norm": 0.22397193312644958, + "learning_rate": 1.7736640260436693e-05, + "loss": 1.2571, + "step": 8114 + }, + { + "epoch": 2.417021910311063, + "grad_norm": 0.2338794767856598, + "learning_rate": 1.7736029078992477e-05, + "loss": 1.2574, + "step": 8115 + }, + { + "epoch": 2.417319756510732, + "grad_norm": 0.235575869679451, + "learning_rate": 1.7735417825573154e-05, + "loss": 1.234, + "step": 8116 + }, + { + "epoch": 2.4176176027104006, + "grad_norm": 0.2261354774236679, + "learning_rate": 1.773480650018441e-05, + "loss": 1.2426, + "step": 8117 + }, + { + "epoch": 2.417915448910069, + "grad_norm": 0.22895053029060364, + "learning_rate": 1.773419510283193e-05, + "loss": 1.2641, + "step": 8118 + }, + { + "epoch": 2.4182132951097377, + "grad_norm": 0.23475541174411774, + "learning_rate": 1.7733583633521412e-05, + "loss": 1.2676, + "step": 8119 + }, + { + "epoch": 2.4185111413094065, + "grad_norm": 0.24860520660877228, + "learning_rate": 1.7732972092258535e-05, + "loss": 1.2355, + "step": 8120 + }, + { + "epoch": 2.418808987509075, + "grad_norm": 0.22481386363506317, + "learning_rate": 1.773236047904899e-05, + "loss": 1.2404, + "step": 8121 + }, + { + "epoch": 2.4191068337087436, + "grad_norm": 0.23251011967658997, + "learning_rate": 1.7731748793898472e-05, + "loss": 1.24, + "step": 8122 + }, + { + "epoch": 2.4194046799084123, + "grad_norm": 0.237873375415802, + "learning_rate": 1.7731137036812674e-05, + "loss": 1.2382, + "step": 8123 + }, + { + "epoch": 2.419702526108081, + "grad_norm": 0.2502184808254242, + "learning_rate": 1.773052520779728e-05, + "loss": 1.248, + "step": 8124 + }, + { + "epoch": 2.4200003723077494, + "grad_norm": 0.23529601097106934, + "learning_rate": 1.7729913306857987e-05, + "loss": 1.2515, + "step": 8125 + }, + { + "epoch": 2.420298218507418, + "grad_norm": 0.22526681423187256, + "learning_rate": 1.7729301334000486e-05, + "loss": 1.2516, + "step": 8126 + }, + { + "epoch": 2.420596064707087, + "grad_norm": 0.23735322058200836, + "learning_rate": 1.772868928923047e-05, + "loss": 1.2629, + "step": 8127 + }, + { + "epoch": 2.4208939109067558, + "grad_norm": 0.22896255552768707, + "learning_rate": 1.772807717255364e-05, + "loss": 1.2577, + "step": 8128 + }, + { + "epoch": 2.421191757106424, + "grad_norm": 0.23871658742427826, + "learning_rate": 1.772746498397568e-05, + "loss": 1.2457, + "step": 8129 + }, + { + "epoch": 2.421489603306093, + "grad_norm": 0.23917968571186066, + "learning_rate": 1.7726852723502296e-05, + "loss": 1.2349, + "step": 8130 + }, + { + "epoch": 2.4217874495057616, + "grad_norm": 0.22797973453998566, + "learning_rate": 1.772624039113918e-05, + "loss": 1.2533, + "step": 8131 + }, + { + "epoch": 2.42208529570543, + "grad_norm": 0.23172640800476074, + "learning_rate": 1.7725627986892028e-05, + "loss": 1.2257, + "step": 8132 + }, + { + "epoch": 2.4223831419050987, + "grad_norm": 0.2374148964881897, + "learning_rate": 1.772501551076654e-05, + "loss": 1.2345, + "step": 8133 + }, + { + "epoch": 2.4226809881047675, + "grad_norm": 0.2308272123336792, + "learning_rate": 1.7724402962768413e-05, + "loss": 1.2586, + "step": 8134 + }, + { + "epoch": 2.422978834304436, + "grad_norm": 0.23716746270656586, + "learning_rate": 1.7723790342903353e-05, + "loss": 1.2633, + "step": 8135 + }, + { + "epoch": 2.4232766805041046, + "grad_norm": 0.23746775090694427, + "learning_rate": 1.7723177651177046e-05, + "loss": 1.2371, + "step": 8136 + }, + { + "epoch": 2.4235745267037734, + "grad_norm": 0.25077927112579346, + "learning_rate": 1.77225648875952e-05, + "loss": 1.2462, + "step": 8137 + }, + { + "epoch": 2.423872372903442, + "grad_norm": 0.22465792298316956, + "learning_rate": 1.7721952052163517e-05, + "loss": 1.2437, + "step": 8138 + }, + { + "epoch": 2.4241702191031105, + "grad_norm": 0.2292993813753128, + "learning_rate": 1.77213391448877e-05, + "loss": 1.2634, + "step": 8139 + }, + { + "epoch": 2.4244680653027793, + "grad_norm": 0.22933197021484375, + "learning_rate": 1.7720726165773444e-05, + "loss": 1.2498, + "step": 8140 + }, + { + "epoch": 2.424765911502448, + "grad_norm": 0.2276661992073059, + "learning_rate": 1.7720113114826462e-05, + "loss": 1.2562, + "step": 8141 + }, + { + "epoch": 2.425063757702117, + "grad_norm": 0.2251230925321579, + "learning_rate": 1.771949999205245e-05, + "loss": 1.2293, + "step": 8142 + }, + { + "epoch": 2.425361603901785, + "grad_norm": 0.2342134565114975, + "learning_rate": 1.7718886797457118e-05, + "loss": 1.2502, + "step": 8143 + }, + { + "epoch": 2.425659450101454, + "grad_norm": 0.23211674392223358, + "learning_rate": 1.7718273531046167e-05, + "loss": 1.2669, + "step": 8144 + }, + { + "epoch": 2.4259572963011227, + "grad_norm": 0.24708154797554016, + "learning_rate": 1.7717660192825304e-05, + "loss": 1.2456, + "step": 8145 + }, + { + "epoch": 2.426255142500791, + "grad_norm": 0.2347024530172348, + "learning_rate": 1.7717046782800236e-05, + "loss": 1.2599, + "step": 8146 + }, + { + "epoch": 2.4265529887004598, + "grad_norm": 0.2513992190361023, + "learning_rate": 1.7716433300976667e-05, + "loss": 1.2559, + "step": 8147 + }, + { + "epoch": 2.4268508349001285, + "grad_norm": 0.23596066236495972, + "learning_rate": 1.7715819747360313e-05, + "loss": 1.2442, + "step": 8148 + }, + { + "epoch": 2.427148681099797, + "grad_norm": 0.2340855449438095, + "learning_rate": 1.7715206121956874e-05, + "loss": 1.2506, + "step": 8149 + }, + { + "epoch": 2.4274465272994656, + "grad_norm": 0.22590996325016022, + "learning_rate": 1.771459242477206e-05, + "loss": 1.2454, + "step": 8150 + }, + { + "epoch": 2.4277443734991344, + "grad_norm": 0.23030374944210052, + "learning_rate": 1.7713978655811583e-05, + "loss": 1.2528, + "step": 8151 + }, + { + "epoch": 2.428042219698803, + "grad_norm": 0.24000777304172516, + "learning_rate": 1.7713364815081154e-05, + "loss": 1.2424, + "step": 8152 + }, + { + "epoch": 2.4283400658984715, + "grad_norm": 0.24114497005939484, + "learning_rate": 1.7712750902586485e-05, + "loss": 1.2519, + "step": 8153 + }, + { + "epoch": 2.4286379120981403, + "grad_norm": 0.23782244324684143, + "learning_rate": 1.7712136918333285e-05, + "loss": 1.2687, + "step": 8154 + }, + { + "epoch": 2.428935758297809, + "grad_norm": 0.23510950803756714, + "learning_rate": 1.7711522862327267e-05, + "loss": 1.2696, + "step": 8155 + }, + { + "epoch": 2.429233604497478, + "grad_norm": 0.24703507125377655, + "learning_rate": 1.7710908734574147e-05, + "loss": 1.2383, + "step": 8156 + }, + { + "epoch": 2.429531450697146, + "grad_norm": 0.22979791462421417, + "learning_rate": 1.7710294535079633e-05, + "loss": 1.2644, + "step": 8157 + }, + { + "epoch": 2.429829296896815, + "grad_norm": 0.23439759016036987, + "learning_rate": 1.7709680263849445e-05, + "loss": 1.2621, + "step": 8158 + }, + { + "epoch": 2.4301271430964837, + "grad_norm": 0.2432500123977661, + "learning_rate": 1.770906592088929e-05, + "loss": 1.2688, + "step": 8159 + }, + { + "epoch": 2.430424989296152, + "grad_norm": 0.28154420852661133, + "learning_rate": 1.77084515062049e-05, + "loss": 1.2555, + "step": 8160 + }, + { + "epoch": 2.430722835495821, + "grad_norm": 0.22606149315834045, + "learning_rate": 1.7707837019801975e-05, + "loss": 1.256, + "step": 8161 + }, + { + "epoch": 2.4310206816954896, + "grad_norm": 0.21935401856899261, + "learning_rate": 1.770722246168624e-05, + "loss": 1.2425, + "step": 8162 + }, + { + "epoch": 2.431318527895158, + "grad_norm": 0.23840375244617462, + "learning_rate": 1.770660783186341e-05, + "loss": 1.2658, + "step": 8163 + }, + { + "epoch": 2.4316163740948267, + "grad_norm": 0.24386407434940338, + "learning_rate": 1.7705993130339204e-05, + "loss": 1.257, + "step": 8164 + }, + { + "epoch": 2.4319142202944954, + "grad_norm": 0.2271236926317215, + "learning_rate": 1.770537835711934e-05, + "loss": 1.2692, + "step": 8165 + }, + { + "epoch": 2.432212066494164, + "grad_norm": 0.23163583874702454, + "learning_rate": 1.7704763512209543e-05, + "loss": 1.2615, + "step": 8166 + }, + { + "epoch": 2.4325099126938325, + "grad_norm": 0.23572777211666107, + "learning_rate": 1.770414859561553e-05, + "loss": 1.2308, + "step": 8167 + }, + { + "epoch": 2.4328077588935013, + "grad_norm": 0.26480162143707275, + "learning_rate": 1.7703533607343017e-05, + "loss": 1.2534, + "step": 8168 + }, + { + "epoch": 2.43310560509317, + "grad_norm": 0.25801411271095276, + "learning_rate": 1.7702918547397734e-05, + "loss": 1.2568, + "step": 8169 + }, + { + "epoch": 2.433403451292839, + "grad_norm": 0.28933751583099365, + "learning_rate": 1.77023034157854e-05, + "loss": 1.2473, + "step": 8170 + }, + { + "epoch": 2.433701297492507, + "grad_norm": 0.23392869532108307, + "learning_rate": 1.7701688212511738e-05, + "loss": 1.2563, + "step": 8171 + }, + { + "epoch": 2.433999143692176, + "grad_norm": 0.24702076613903046, + "learning_rate": 1.7701072937582473e-05, + "loss": 1.238, + "step": 8172 + }, + { + "epoch": 2.4342969898918447, + "grad_norm": 0.2662452459335327, + "learning_rate": 1.7700457591003327e-05, + "loss": 1.2503, + "step": 8173 + }, + { + "epoch": 2.434594836091513, + "grad_norm": 0.23712895810604095, + "learning_rate": 1.7699842172780028e-05, + "loss": 1.2525, + "step": 8174 + }, + { + "epoch": 2.434892682291182, + "grad_norm": 0.3020852506160736, + "learning_rate": 1.7699226682918297e-05, + "loss": 1.2593, + "step": 8175 + }, + { + "epoch": 2.4351905284908506, + "grad_norm": 0.2775361239910126, + "learning_rate": 1.7698611121423867e-05, + "loss": 1.2568, + "step": 8176 + }, + { + "epoch": 2.4354883746905194, + "grad_norm": 0.269408643245697, + "learning_rate": 1.769799548830246e-05, + "loss": 1.2647, + "step": 8177 + }, + { + "epoch": 2.4357862208901877, + "grad_norm": 0.2418518215417862, + "learning_rate": 1.769737978355981e-05, + "loss": 1.2479, + "step": 8178 + }, + { + "epoch": 2.4360840670898565, + "grad_norm": 0.3381626307964325, + "learning_rate": 1.7696764007201638e-05, + "loss": 1.2746, + "step": 8179 + }, + { + "epoch": 2.4363819132895252, + "grad_norm": 0.23209111392498016, + "learning_rate": 1.7696148159233676e-05, + "loss": 1.2399, + "step": 8180 + }, + { + "epoch": 2.4366797594891936, + "grad_norm": 0.24149450659751892, + "learning_rate": 1.7695532239661655e-05, + "loss": 1.2634, + "step": 8181 + }, + { + "epoch": 2.4369776056888623, + "grad_norm": 0.2611485719680786, + "learning_rate": 1.7694916248491304e-05, + "loss": 1.2582, + "step": 8182 + }, + { + "epoch": 2.437275451888531, + "grad_norm": 0.22782889008522034, + "learning_rate": 1.7694300185728353e-05, + "loss": 1.2535, + "step": 8183 + }, + { + "epoch": 2.4375732980882, + "grad_norm": 0.22121752798557281, + "learning_rate": 1.7693684051378538e-05, + "loss": 1.2483, + "step": 8184 + }, + { + "epoch": 2.437871144287868, + "grad_norm": 0.24565903842449188, + "learning_rate": 1.7693067845447588e-05, + "loss": 1.2679, + "step": 8185 + }, + { + "epoch": 2.438168990487537, + "grad_norm": 0.24552184343338013, + "learning_rate": 1.7692451567941232e-05, + "loss": 1.2434, + "step": 8186 + }, + { + "epoch": 2.4384668366872058, + "grad_norm": 0.2353091686964035, + "learning_rate": 1.7691835218865217e-05, + "loss": 1.2488, + "step": 8187 + }, + { + "epoch": 2.438764682886874, + "grad_norm": 0.2381007969379425, + "learning_rate": 1.7691218798225262e-05, + "loss": 1.2654, + "step": 8188 + }, + { + "epoch": 2.439062529086543, + "grad_norm": 0.23425203561782837, + "learning_rate": 1.7690602306027114e-05, + "loss": 1.2554, + "step": 8189 + }, + { + "epoch": 2.4393603752862116, + "grad_norm": 0.23426920175552368, + "learning_rate": 1.7689985742276504e-05, + "loss": 1.2618, + "step": 8190 + }, + { + "epoch": 2.4396582214858804, + "grad_norm": 0.24380579590797424, + "learning_rate": 1.7689369106979166e-05, + "loss": 1.2411, + "step": 8191 + }, + { + "epoch": 2.4399560676855487, + "grad_norm": 0.22629253566265106, + "learning_rate": 1.7688752400140842e-05, + "loss": 1.2417, + "step": 8192 + }, + { + "epoch": 2.4402539138852175, + "grad_norm": 0.22987054288387299, + "learning_rate": 1.7688135621767262e-05, + "loss": 1.2657, + "step": 8193 + }, + { + "epoch": 2.4405517600848863, + "grad_norm": 0.2381831705570221, + "learning_rate": 1.7687518771864173e-05, + "loss": 1.237, + "step": 8194 + }, + { + "epoch": 2.440849606284555, + "grad_norm": 0.24247990548610687, + "learning_rate": 1.7686901850437316e-05, + "loss": 1.2607, + "step": 8195 + }, + { + "epoch": 2.4411474524842234, + "grad_norm": 0.22796709835529327, + "learning_rate": 1.768628485749242e-05, + "loss": 1.2441, + "step": 8196 + }, + { + "epoch": 2.441445298683892, + "grad_norm": 0.24143260717391968, + "learning_rate": 1.768566779303523e-05, + "loss": 1.2676, + "step": 8197 + }, + { + "epoch": 2.441743144883561, + "grad_norm": 0.23797409236431122, + "learning_rate": 1.7685050657071488e-05, + "loss": 1.2605, + "step": 8198 + }, + { + "epoch": 2.4420409910832293, + "grad_norm": 0.23557357490062714, + "learning_rate": 1.7684433449606938e-05, + "loss": 1.2568, + "step": 8199 + }, + { + "epoch": 2.442338837282898, + "grad_norm": 0.22420549392700195, + "learning_rate": 1.7683816170647322e-05, + "loss": 1.2412, + "step": 8200 + }, + { + "epoch": 2.442636683482567, + "grad_norm": 0.23614062368869781, + "learning_rate": 1.768319882019838e-05, + "loss": 1.2578, + "step": 8201 + }, + { + "epoch": 2.442934529682235, + "grad_norm": 0.2433299571275711, + "learning_rate": 1.7682581398265856e-05, + "loss": 1.2596, + "step": 8202 + }, + { + "epoch": 2.443232375881904, + "grad_norm": 0.24343116581439972, + "learning_rate": 1.7681963904855495e-05, + "loss": 1.2633, + "step": 8203 + }, + { + "epoch": 2.4435302220815727, + "grad_norm": 0.23356005549430847, + "learning_rate": 1.7681346339973044e-05, + "loss": 1.2509, + "step": 8204 + }, + { + "epoch": 2.4438280682812414, + "grad_norm": 0.24294687807559967, + "learning_rate": 1.7680728703624247e-05, + "loss": 1.2458, + "step": 8205 + }, + { + "epoch": 2.4441259144809098, + "grad_norm": 0.24111899733543396, + "learning_rate": 1.7680110995814855e-05, + "loss": 1.2502, + "step": 8206 + }, + { + "epoch": 2.4444237606805785, + "grad_norm": 0.23431457579135895, + "learning_rate": 1.7679493216550607e-05, + "loss": 1.2385, + "step": 8207 + }, + { + "epoch": 2.4447216068802473, + "grad_norm": 0.24026180803775787, + "learning_rate": 1.7678875365837253e-05, + "loss": 1.2535, + "step": 8208 + }, + { + "epoch": 2.445019453079916, + "grad_norm": 0.23439113795757294, + "learning_rate": 1.7678257443680546e-05, + "loss": 1.2459, + "step": 8209 + }, + { + "epoch": 2.4453172992795844, + "grad_norm": 0.24530130624771118, + "learning_rate": 1.7677639450086232e-05, + "loss": 1.2664, + "step": 8210 + }, + { + "epoch": 2.445615145479253, + "grad_norm": 0.24152901768684387, + "learning_rate": 1.7677021385060057e-05, + "loss": 1.2488, + "step": 8211 + }, + { + "epoch": 2.445912991678922, + "grad_norm": 0.2546330690383911, + "learning_rate": 1.767640324860778e-05, + "loss": 1.2481, + "step": 8212 + }, + { + "epoch": 2.4462108378785903, + "grad_norm": 0.24023675918579102, + "learning_rate": 1.7675785040735146e-05, + "loss": 1.265, + "step": 8213 + }, + { + "epoch": 2.446508684078259, + "grad_norm": 0.24142986536026, + "learning_rate": 1.7675166761447905e-05, + "loss": 1.2538, + "step": 8214 + }, + { + "epoch": 2.446806530277928, + "grad_norm": 0.2541647255420685, + "learning_rate": 1.7674548410751815e-05, + "loss": 1.2538, + "step": 8215 + }, + { + "epoch": 2.447104376477596, + "grad_norm": 0.23540470004081726, + "learning_rate": 1.7673929988652627e-05, + "loss": 1.2587, + "step": 8216 + }, + { + "epoch": 2.447402222677265, + "grad_norm": 0.23725782334804535, + "learning_rate": 1.7673311495156093e-05, + "loss": 1.2451, + "step": 8217 + }, + { + "epoch": 2.4477000688769337, + "grad_norm": 0.2404332160949707, + "learning_rate": 1.767269293026797e-05, + "loss": 1.251, + "step": 8218 + }, + { + "epoch": 2.4479979150766025, + "grad_norm": 0.22643688321113586, + "learning_rate": 1.767207429399401e-05, + "loss": 1.2541, + "step": 8219 + }, + { + "epoch": 2.448295761276271, + "grad_norm": 0.2497372180223465, + "learning_rate": 1.7671455586339973e-05, + "loss": 1.2448, + "step": 8220 + }, + { + "epoch": 2.4485936074759396, + "grad_norm": 0.22984126210212708, + "learning_rate": 1.7670836807311614e-05, + "loss": 1.2538, + "step": 8221 + }, + { + "epoch": 2.4488914536756083, + "grad_norm": 0.2346605509519577, + "learning_rate": 1.7670217956914685e-05, + "loss": 1.2471, + "step": 8222 + }, + { + "epoch": 2.449189299875277, + "grad_norm": 0.23125341534614563, + "learning_rate": 1.766959903515495e-05, + "loss": 1.234, + "step": 8223 + }, + { + "epoch": 2.4494871460749454, + "grad_norm": 0.23240256309509277, + "learning_rate": 1.7668980042038166e-05, + "loss": 1.2543, + "step": 8224 + }, + { + "epoch": 2.449784992274614, + "grad_norm": 0.2352667599916458, + "learning_rate": 1.7668360977570085e-05, + "loss": 1.2489, + "step": 8225 + }, + { + "epoch": 2.450082838474283, + "grad_norm": 0.23893482983112335, + "learning_rate": 1.7667741841756477e-05, + "loss": 1.2503, + "step": 8226 + }, + { + "epoch": 2.4503806846739513, + "grad_norm": 0.23530250787734985, + "learning_rate": 1.7667122634603097e-05, + "loss": 1.2487, + "step": 8227 + }, + { + "epoch": 2.45067853087362, + "grad_norm": 0.22781215608119965, + "learning_rate": 1.7666503356115706e-05, + "loss": 1.2583, + "step": 8228 + }, + { + "epoch": 2.450976377073289, + "grad_norm": 0.23113566637039185, + "learning_rate": 1.766588400630007e-05, + "loss": 1.2665, + "step": 8229 + }, + { + "epoch": 2.451274223272957, + "grad_norm": 0.23249532282352448, + "learning_rate": 1.7665264585161947e-05, + "loss": 1.2568, + "step": 8230 + }, + { + "epoch": 2.451572069472626, + "grad_norm": 0.2413317710161209, + "learning_rate": 1.7664645092707102e-05, + "loss": 1.2554, + "step": 8231 + }, + { + "epoch": 2.4518699156722947, + "grad_norm": 0.33798786997795105, + "learning_rate": 1.7664025528941295e-05, + "loss": 1.258, + "step": 8232 + }, + { + "epoch": 2.4521677618719635, + "grad_norm": 0.330740362405777, + "learning_rate": 1.7663405893870296e-05, + "loss": 1.2459, + "step": 8233 + }, + { + "epoch": 2.452465608071632, + "grad_norm": 0.28560367226600647, + "learning_rate": 1.7662786187499865e-05, + "loss": 1.2482, + "step": 8234 + }, + { + "epoch": 2.4527634542713006, + "grad_norm": 0.5696402788162231, + "learning_rate": 1.7662166409835772e-05, + "loss": 1.2583, + "step": 8235 + }, + { + "epoch": 2.4530613004709694, + "grad_norm": 0.24457116425037384, + "learning_rate": 1.7661546560883784e-05, + "loss": 1.2402, + "step": 8236 + }, + { + "epoch": 2.453359146670638, + "grad_norm": 0.24923206865787506, + "learning_rate": 1.766092664064966e-05, + "loss": 1.2388, + "step": 8237 + }, + { + "epoch": 2.4536569928703065, + "grad_norm": 0.24296973645687103, + "learning_rate": 1.766030664913917e-05, + "loss": 1.2567, + "step": 8238 + }, + { + "epoch": 2.4539548390699752, + "grad_norm": 0.2376331090927124, + "learning_rate": 1.765968658635809e-05, + "loss": 1.2694, + "step": 8239 + }, + { + "epoch": 2.454252685269644, + "grad_norm": 0.2334982305765152, + "learning_rate": 1.7659066452312184e-05, + "loss": 1.2507, + "step": 8240 + }, + { + "epoch": 2.4545505314693123, + "grad_norm": 0.25672271847724915, + "learning_rate": 1.7658446247007224e-05, + "loss": 1.2584, + "step": 8241 + }, + { + "epoch": 2.454848377668981, + "grad_norm": 0.24763455986976624, + "learning_rate": 1.7657825970448975e-05, + "loss": 1.2332, + "step": 8242 + }, + { + "epoch": 2.45514622386865, + "grad_norm": 0.23960673809051514, + "learning_rate": 1.7657205622643214e-05, + "loss": 1.2475, + "step": 8243 + }, + { + "epoch": 2.4554440700683187, + "grad_norm": 0.22872363030910492, + "learning_rate": 1.7656585203595708e-05, + "loss": 1.2622, + "step": 8244 + }, + { + "epoch": 2.455741916267987, + "grad_norm": 0.22929058969020844, + "learning_rate": 1.7655964713312232e-05, + "loss": 1.2567, + "step": 8245 + }, + { + "epoch": 2.4560397624676558, + "grad_norm": 0.23774370551109314, + "learning_rate": 1.7655344151798554e-05, + "loss": 1.2499, + "step": 8246 + }, + { + "epoch": 2.4563376086673245, + "grad_norm": 0.23429562151432037, + "learning_rate": 1.7654723519060457e-05, + "loss": 1.2536, + "step": 8247 + }, + { + "epoch": 2.456635454866993, + "grad_norm": 0.22949855029582977, + "learning_rate": 1.7654102815103708e-05, + "loss": 1.2442, + "step": 8248 + }, + { + "epoch": 2.4569333010666616, + "grad_norm": 0.2358241230249405, + "learning_rate": 1.7653482039934084e-05, + "loss": 1.254, + "step": 8249 + }, + { + "epoch": 2.4572311472663304, + "grad_norm": 0.23049359023571014, + "learning_rate": 1.765286119355736e-05, + "loss": 1.2416, + "step": 8250 + }, + { + "epoch": 2.457528993465999, + "grad_norm": 0.22333648800849915, + "learning_rate": 1.7652240275979312e-05, + "loss": 1.2468, + "step": 8251 + }, + { + "epoch": 2.4578268396656675, + "grad_norm": 0.21928377449512482, + "learning_rate": 1.7651619287205722e-05, + "loss": 1.2514, + "step": 8252 + }, + { + "epoch": 2.4581246858653363, + "grad_norm": 0.23795348405838013, + "learning_rate": 1.7650998227242357e-05, + "loss": 1.2476, + "step": 8253 + }, + { + "epoch": 2.458422532065005, + "grad_norm": 0.2292827069759369, + "learning_rate": 1.7650377096095007e-05, + "loss": 1.2392, + "step": 8254 + }, + { + "epoch": 2.4587203782646734, + "grad_norm": 0.22743940353393555, + "learning_rate": 1.7649755893769446e-05, + "loss": 1.2573, + "step": 8255 + }, + { + "epoch": 2.459018224464342, + "grad_norm": 0.22143808007240295, + "learning_rate": 1.764913462027145e-05, + "loss": 1.2518, + "step": 8256 + }, + { + "epoch": 2.459316070664011, + "grad_norm": 0.22706541419029236, + "learning_rate": 1.7648513275606802e-05, + "loss": 1.2339, + "step": 8257 + }, + { + "epoch": 2.4596139168636797, + "grad_norm": 0.23309893906116486, + "learning_rate": 1.7647891859781284e-05, + "loss": 1.2525, + "step": 8258 + }, + { + "epoch": 2.459911763063348, + "grad_norm": 0.22900895774364471, + "learning_rate": 1.764727037280068e-05, + "loss": 1.2633, + "step": 8259 + }, + { + "epoch": 2.460209609263017, + "grad_norm": 0.22730329632759094, + "learning_rate": 1.7646648814670765e-05, + "loss": 1.2409, + "step": 8260 + }, + { + "epoch": 2.4605074554626856, + "grad_norm": 0.22470271587371826, + "learning_rate": 1.764602718539733e-05, + "loss": 1.2409, + "step": 8261 + }, + { + "epoch": 2.4608053016623543, + "grad_norm": 0.2320781648159027, + "learning_rate": 1.7645405484986152e-05, + "loss": 1.2581, + "step": 8262 + }, + { + "epoch": 2.4611031478620227, + "grad_norm": 0.21992452442646027, + "learning_rate": 1.764478371344302e-05, + "loss": 1.2258, + "step": 8263 + }, + { + "epoch": 2.4614009940616914, + "grad_norm": 0.21929426491260529, + "learning_rate": 1.7644161870773715e-05, + "loss": 1.2574, + "step": 8264 + }, + { + "epoch": 2.46169884026136, + "grad_norm": 0.23645782470703125, + "learning_rate": 1.7643539956984026e-05, + "loss": 1.26, + "step": 8265 + }, + { + "epoch": 2.4619966864610285, + "grad_norm": 0.23062999546527863, + "learning_rate": 1.7642917972079737e-05, + "loss": 1.2329, + "step": 8266 + }, + { + "epoch": 2.4622945326606973, + "grad_norm": 0.23762331902980804, + "learning_rate": 1.7642295916066636e-05, + "loss": 1.2575, + "step": 8267 + }, + { + "epoch": 2.462592378860366, + "grad_norm": 0.23018376529216766, + "learning_rate": 1.7641673788950506e-05, + "loss": 1.2355, + "step": 8268 + }, + { + "epoch": 2.4628902250600344, + "grad_norm": 0.22822463512420654, + "learning_rate": 1.764105159073714e-05, + "loss": 1.2329, + "step": 8269 + }, + { + "epoch": 2.463188071259703, + "grad_norm": 0.23241938650608063, + "learning_rate": 1.7640429321432335e-05, + "loss": 1.2513, + "step": 8270 + }, + { + "epoch": 2.463485917459372, + "grad_norm": 0.23877792060375214, + "learning_rate": 1.7639806981041864e-05, + "loss": 1.247, + "step": 8271 + }, + { + "epoch": 2.4637837636590407, + "grad_norm": 0.2348979413509369, + "learning_rate": 1.7639184569571522e-05, + "loss": 1.255, + "step": 8272 + }, + { + "epoch": 2.464081609858709, + "grad_norm": 0.22891142964363098, + "learning_rate": 1.7638562087027106e-05, + "loss": 1.255, + "step": 8273 + }, + { + "epoch": 2.464379456058378, + "grad_norm": 0.23356257379055023, + "learning_rate": 1.7637939533414407e-05, + "loss": 1.2586, + "step": 8274 + }, + { + "epoch": 2.4646773022580466, + "grad_norm": 0.2342851608991623, + "learning_rate": 1.763731690873921e-05, + "loss": 1.2519, + "step": 8275 + }, + { + "epoch": 2.4649751484577154, + "grad_norm": 0.2412482500076294, + "learning_rate": 1.7636694213007316e-05, + "loss": 1.243, + "step": 8276 + }, + { + "epoch": 2.4652729946573837, + "grad_norm": 0.22454293072223663, + "learning_rate": 1.7636071446224508e-05, + "loss": 1.235, + "step": 8277 + }, + { + "epoch": 2.4655708408570525, + "grad_norm": 0.23977334797382355, + "learning_rate": 1.7635448608396592e-05, + "loss": 1.2599, + "step": 8278 + }, + { + "epoch": 2.4658686870567212, + "grad_norm": 0.23532359302043915, + "learning_rate": 1.7634825699529357e-05, + "loss": 1.2615, + "step": 8279 + }, + { + "epoch": 2.4661665332563896, + "grad_norm": 0.2302032858133316, + "learning_rate": 1.76342027196286e-05, + "loss": 1.2655, + "step": 8280 + }, + { + "epoch": 2.4664643794560583, + "grad_norm": 0.2321433573961258, + "learning_rate": 1.7633579668700114e-05, + "loss": 1.2524, + "step": 8281 + }, + { + "epoch": 2.466762225655727, + "grad_norm": 0.2206072211265564, + "learning_rate": 1.7632956546749696e-05, + "loss": 1.256, + "step": 8282 + }, + { + "epoch": 2.4670600718553954, + "grad_norm": 0.2309189736843109, + "learning_rate": 1.763233335378315e-05, + "loss": 1.2492, + "step": 8283 + }, + { + "epoch": 2.467357918055064, + "grad_norm": 0.2347707450389862, + "learning_rate": 1.7631710089806265e-05, + "loss": 1.2542, + "step": 8284 + }, + { + "epoch": 2.467655764254733, + "grad_norm": 0.23785153031349182, + "learning_rate": 1.7631086754824845e-05, + "loss": 1.2479, + "step": 8285 + }, + { + "epoch": 2.4679536104544018, + "grad_norm": 0.2298462986946106, + "learning_rate": 1.7630463348844694e-05, + "loss": 1.2641, + "step": 8286 + }, + { + "epoch": 2.46825145665407, + "grad_norm": 0.23382051289081573, + "learning_rate": 1.76298398718716e-05, + "loss": 1.2599, + "step": 8287 + }, + { + "epoch": 2.468549302853739, + "grad_norm": 0.2526465356349945, + "learning_rate": 1.7629216323911377e-05, + "loss": 1.29, + "step": 8288 + }, + { + "epoch": 2.4688471490534076, + "grad_norm": 0.2504581809043884, + "learning_rate": 1.7628592704969815e-05, + "loss": 1.2599, + "step": 8289 + }, + { + "epoch": 2.4691449952530764, + "grad_norm": 0.2236238569021225, + "learning_rate": 1.762796901505272e-05, + "loss": 1.2481, + "step": 8290 + }, + { + "epoch": 2.4694428414527447, + "grad_norm": 0.22754114866256714, + "learning_rate": 1.76273452541659e-05, + "loss": 1.234, + "step": 8291 + }, + { + "epoch": 2.4697406876524135, + "grad_norm": 0.23117542266845703, + "learning_rate": 1.7626721422315153e-05, + "loss": 1.2252, + "step": 8292 + }, + { + "epoch": 2.4700385338520823, + "grad_norm": 0.23307673633098602, + "learning_rate": 1.7626097519506285e-05, + "loss": 1.2422, + "step": 8293 + }, + { + "epoch": 2.4703363800517506, + "grad_norm": 0.23654675483703613, + "learning_rate": 1.76254735457451e-05, + "loss": 1.2352, + "step": 8294 + }, + { + "epoch": 2.4706342262514194, + "grad_norm": 0.23664255440235138, + "learning_rate": 1.76248495010374e-05, + "loss": 1.2593, + "step": 8295 + }, + { + "epoch": 2.470932072451088, + "grad_norm": 0.2272440791130066, + "learning_rate": 1.7624225385388998e-05, + "loss": 1.2538, + "step": 8296 + }, + { + "epoch": 2.4712299186507565, + "grad_norm": 0.22650983929634094, + "learning_rate": 1.7623601198805695e-05, + "loss": 1.2515, + "step": 8297 + }, + { + "epoch": 2.4715277648504252, + "grad_norm": 0.23413802683353424, + "learning_rate": 1.76229769412933e-05, + "loss": 1.2614, + "step": 8298 + }, + { + "epoch": 2.471825611050094, + "grad_norm": 0.2297011911869049, + "learning_rate": 1.7622352612857622e-05, + "loss": 1.256, + "step": 8299 + }, + { + "epoch": 2.472123457249763, + "grad_norm": 0.23197448253631592, + "learning_rate": 1.762172821350447e-05, + "loss": 1.237, + "step": 8300 + }, + { + "epoch": 2.472421303449431, + "grad_norm": 0.23182253539562225, + "learning_rate": 1.7621103743239652e-05, + "loss": 1.2585, + "step": 8301 + }, + { + "epoch": 2.4727191496491, + "grad_norm": 0.2433755099773407, + "learning_rate": 1.7620479202068977e-05, + "loss": 1.2453, + "step": 8302 + }, + { + "epoch": 2.4730169958487687, + "grad_norm": 0.22609774768352509, + "learning_rate": 1.761985458999826e-05, + "loss": 1.2519, + "step": 8303 + }, + { + "epoch": 2.4733148420484374, + "grad_norm": 0.24316880106925964, + "learning_rate": 1.7619229907033307e-05, + "loss": 1.2365, + "step": 8304 + }, + { + "epoch": 2.4736126882481058, + "grad_norm": 0.2347557693719864, + "learning_rate": 1.7618605153179933e-05, + "loss": 1.2437, + "step": 8305 + }, + { + "epoch": 2.4739105344477745, + "grad_norm": 0.2216932475566864, + "learning_rate": 1.7617980328443952e-05, + "loss": 1.2369, + "step": 8306 + }, + { + "epoch": 2.4742083806474433, + "grad_norm": 0.24111506342887878, + "learning_rate": 1.7617355432831175e-05, + "loss": 1.2569, + "step": 8307 + }, + { + "epoch": 2.4745062268471116, + "grad_norm": 0.23701366782188416, + "learning_rate": 1.7616730466347415e-05, + "loss": 1.2542, + "step": 8308 + }, + { + "epoch": 2.4748040730467804, + "grad_norm": 0.23881372809410095, + "learning_rate": 1.7616105428998487e-05, + "loss": 1.2449, + "step": 8309 + }, + { + "epoch": 2.475101919246449, + "grad_norm": 0.23791971802711487, + "learning_rate": 1.7615480320790208e-05, + "loss": 1.2613, + "step": 8310 + }, + { + "epoch": 2.475399765446118, + "grad_norm": 0.2507789433002472, + "learning_rate": 1.7614855141728395e-05, + "loss": 1.2469, + "step": 8311 + }, + { + "epoch": 2.4756976116457863, + "grad_norm": 0.22700440883636475, + "learning_rate": 1.761422989181886e-05, + "loss": 1.2544, + "step": 8312 + }, + { + "epoch": 2.475995457845455, + "grad_norm": 0.23406459391117096, + "learning_rate": 1.7613604571067425e-05, + "loss": 1.2394, + "step": 8313 + }, + { + "epoch": 2.476293304045124, + "grad_norm": 0.227818563580513, + "learning_rate": 1.7612979179479903e-05, + "loss": 1.2446, + "step": 8314 + }, + { + "epoch": 2.4765911502447926, + "grad_norm": 0.23256555199623108, + "learning_rate": 1.7612353717062117e-05, + "loss": 1.2715, + "step": 8315 + }, + { + "epoch": 2.476888996444461, + "grad_norm": 0.22174721956253052, + "learning_rate": 1.7611728183819888e-05, + "loss": 1.2467, + "step": 8316 + }, + { + "epoch": 2.4771868426441297, + "grad_norm": 0.23079760372638702, + "learning_rate": 1.761110257975903e-05, + "loss": 1.2595, + "step": 8317 + }, + { + "epoch": 2.4774846888437985, + "grad_norm": 0.23321036994457245, + "learning_rate": 1.7610476904885363e-05, + "loss": 1.2507, + "step": 8318 + }, + { + "epoch": 2.477782535043467, + "grad_norm": 0.22463597357273102, + "learning_rate": 1.7609851159204716e-05, + "loss": 1.2277, + "step": 8319 + }, + { + "epoch": 2.4780803812431356, + "grad_norm": 0.23679356276988983, + "learning_rate": 1.7609225342722906e-05, + "loss": 1.2531, + "step": 8320 + }, + { + "epoch": 2.4783782274428043, + "grad_norm": 0.22336843609809875, + "learning_rate": 1.7608599455445753e-05, + "loss": 1.2311, + "step": 8321 + }, + { + "epoch": 2.4786760736424727, + "grad_norm": 0.23606246709823608, + "learning_rate": 1.7607973497379083e-05, + "loss": 1.2543, + "step": 8322 + }, + { + "epoch": 2.4789739198421414, + "grad_norm": 0.23542477190494537, + "learning_rate": 1.7607347468528718e-05, + "loss": 1.231, + "step": 8323 + }, + { + "epoch": 2.47927176604181, + "grad_norm": 0.2172466218471527, + "learning_rate": 1.760672136890049e-05, + "loss": 1.251, + "step": 8324 + }, + { + "epoch": 2.479569612241479, + "grad_norm": 0.23283933103084564, + "learning_rate": 1.7606095198500213e-05, + "loss": 1.2563, + "step": 8325 + }, + { + "epoch": 2.4798674584411473, + "grad_norm": 0.23413917422294617, + "learning_rate": 1.7605468957333722e-05, + "loss": 1.2655, + "step": 8326 + }, + { + "epoch": 2.480165304640816, + "grad_norm": 0.22806623578071594, + "learning_rate": 1.760484264540684e-05, + "loss": 1.2641, + "step": 8327 + }, + { + "epoch": 2.480463150840485, + "grad_norm": 0.22838027775287628, + "learning_rate": 1.7604216262725392e-05, + "loss": 1.2412, + "step": 8328 + }, + { + "epoch": 2.4807609970401536, + "grad_norm": 0.23101909458637238, + "learning_rate": 1.7603589809295205e-05, + "loss": 1.2404, + "step": 8329 + }, + { + "epoch": 2.481058843239822, + "grad_norm": 0.22244912385940552, + "learning_rate": 1.7602963285122115e-05, + "loss": 1.2399, + "step": 8330 + }, + { + "epoch": 2.4813566894394907, + "grad_norm": 0.22440123558044434, + "learning_rate": 1.7602336690211944e-05, + "loss": 1.2486, + "step": 8331 + }, + { + "epoch": 2.4816545356391595, + "grad_norm": 0.24199633300304413, + "learning_rate": 1.7601710024570524e-05, + "loss": 1.2348, + "step": 8332 + }, + { + "epoch": 2.481952381838828, + "grad_norm": 0.2698337137699127, + "learning_rate": 1.7601083288203685e-05, + "loss": 1.2496, + "step": 8333 + }, + { + "epoch": 2.4822502280384966, + "grad_norm": 0.24751141667366028, + "learning_rate": 1.7600456481117257e-05, + "loss": 1.2594, + "step": 8334 + }, + { + "epoch": 2.4825480742381654, + "grad_norm": 0.23082788288593292, + "learning_rate": 1.7599829603317075e-05, + "loss": 1.2473, + "step": 8335 + }, + { + "epoch": 2.4828459204378337, + "grad_norm": 0.21850734949111938, + "learning_rate": 1.7599202654808972e-05, + "loss": 1.2519, + "step": 8336 + }, + { + "epoch": 2.4831437666375025, + "grad_norm": 0.2733902335166931, + "learning_rate": 1.7598575635598775e-05, + "loss": 1.253, + "step": 8337 + }, + { + "epoch": 2.4834416128371712, + "grad_norm": 0.2962580919265747, + "learning_rate": 1.759794854569232e-05, + "loss": 1.2546, + "step": 8338 + }, + { + "epoch": 2.48373945903684, + "grad_norm": 0.23880480229854584, + "learning_rate": 1.7597321385095445e-05, + "loss": 1.2532, + "step": 8339 + }, + { + "epoch": 2.4840373052365083, + "grad_norm": 0.31910011172294617, + "learning_rate": 1.7596694153813984e-05, + "loss": 1.2539, + "step": 8340 + }, + { + "epoch": 2.484335151436177, + "grad_norm": 0.29905804991722107, + "learning_rate": 1.759606685185377e-05, + "loss": 1.2373, + "step": 8341 + }, + { + "epoch": 2.484632997635846, + "grad_norm": 0.25430893898010254, + "learning_rate": 1.759543947922064e-05, + "loss": 1.2527, + "step": 8342 + }, + { + "epoch": 2.4849308438355147, + "grad_norm": 0.30623292922973633, + "learning_rate": 1.7594812035920434e-05, + "loss": 1.2703, + "step": 8343 + }, + { + "epoch": 2.485228690035183, + "grad_norm": 0.2231859266757965, + "learning_rate": 1.7594184521958986e-05, + "loss": 1.2616, + "step": 8344 + }, + { + "epoch": 2.4855265362348518, + "grad_norm": 0.24325911700725555, + "learning_rate": 1.7593556937342136e-05, + "loss": 1.2464, + "step": 8345 + }, + { + "epoch": 2.4858243824345205, + "grad_norm": 0.23927530646324158, + "learning_rate": 1.7592929282075722e-05, + "loss": 1.2496, + "step": 8346 + }, + { + "epoch": 2.486122228634189, + "grad_norm": 0.24509276449680328, + "learning_rate": 1.7592301556165584e-05, + "loss": 1.2581, + "step": 8347 + }, + { + "epoch": 2.4864200748338576, + "grad_norm": 0.2705968916416168, + "learning_rate": 1.7591673759617564e-05, + "loss": 1.2318, + "step": 8348 + }, + { + "epoch": 2.4867179210335264, + "grad_norm": 0.27921196818351746, + "learning_rate": 1.7591045892437503e-05, + "loss": 1.242, + "step": 8349 + }, + { + "epoch": 2.4870157672331947, + "grad_norm": 0.27228739857673645, + "learning_rate": 1.7590417954631238e-05, + "loss": 1.2626, + "step": 8350 + }, + { + "epoch": 2.4873136134328635, + "grad_norm": 0.4469757080078125, + "learning_rate": 1.7589789946204617e-05, + "loss": 1.2477, + "step": 8351 + }, + { + "epoch": 2.4876114596325323, + "grad_norm": 0.28334805369377136, + "learning_rate": 1.758916186716348e-05, + "loss": 1.2571, + "step": 8352 + }, + { + "epoch": 2.487909305832201, + "grad_norm": 0.2805827558040619, + "learning_rate": 1.758853371751367e-05, + "loss": 1.2554, + "step": 8353 + }, + { + "epoch": 2.4882071520318694, + "grad_norm": 0.2342253029346466, + "learning_rate": 1.758790549726103e-05, + "loss": 1.2336, + "step": 8354 + }, + { + "epoch": 2.488504998231538, + "grad_norm": 0.26948752999305725, + "learning_rate": 1.758727720641141e-05, + "loss": 1.2533, + "step": 8355 + }, + { + "epoch": 2.488802844431207, + "grad_norm": 0.26174086332321167, + "learning_rate": 1.7586648844970652e-05, + "loss": 1.2521, + "step": 8356 + }, + { + "epoch": 2.4891006906308757, + "grad_norm": 0.23120491206645966, + "learning_rate": 1.7586020412944603e-05, + "loss": 1.2392, + "step": 8357 + }, + { + "epoch": 2.489398536830544, + "grad_norm": 0.24739497900009155, + "learning_rate": 1.758539191033911e-05, + "loss": 1.2678, + "step": 8358 + }, + { + "epoch": 2.489696383030213, + "grad_norm": 0.23574678599834442, + "learning_rate": 1.758476333716002e-05, + "loss": 1.2505, + "step": 8359 + }, + { + "epoch": 2.4899942292298816, + "grad_norm": 0.24132049083709717, + "learning_rate": 1.7584134693413178e-05, + "loss": 1.2475, + "step": 8360 + }, + { + "epoch": 2.49029207542955, + "grad_norm": 0.22077466547489166, + "learning_rate": 1.758350597910444e-05, + "loss": 1.2442, + "step": 8361 + }, + { + "epoch": 2.4905899216292187, + "grad_norm": 0.2397473156452179, + "learning_rate": 1.7582877194239652e-05, + "loss": 1.2504, + "step": 8362 + }, + { + "epoch": 2.4908877678288874, + "grad_norm": 0.24787132441997528, + "learning_rate": 1.7582248338824662e-05, + "loss": 1.2522, + "step": 8363 + }, + { + "epoch": 2.4911856140285558, + "grad_norm": 0.23322662711143494, + "learning_rate": 1.7581619412865322e-05, + "loss": 1.2616, + "step": 8364 + }, + { + "epoch": 2.4914834602282245, + "grad_norm": 0.2274780422449112, + "learning_rate": 1.7580990416367484e-05, + "loss": 1.2441, + "step": 8365 + }, + { + "epoch": 2.4917813064278933, + "grad_norm": 0.23379875719547272, + "learning_rate": 1.7580361349337006e-05, + "loss": 1.2425, + "step": 8366 + }, + { + "epoch": 2.492079152627562, + "grad_norm": 0.23417527973651886, + "learning_rate": 1.757973221177973e-05, + "loss": 1.2451, + "step": 8367 + }, + { + "epoch": 2.4923769988272304, + "grad_norm": 0.23419426381587982, + "learning_rate": 1.7579103003701514e-05, + "loss": 1.2589, + "step": 8368 + }, + { + "epoch": 2.492674845026899, + "grad_norm": 0.23790061473846436, + "learning_rate": 1.757847372510821e-05, + "loss": 1.2371, + "step": 8369 + }, + { + "epoch": 2.492972691226568, + "grad_norm": 0.22814206779003143, + "learning_rate": 1.757784437600568e-05, + "loss": 1.2725, + "step": 8370 + }, + { + "epoch": 2.4932705374262367, + "grad_norm": 0.2249986231327057, + "learning_rate": 1.757721495639977e-05, + "loss": 1.2418, + "step": 8371 + }, + { + "epoch": 2.493568383625905, + "grad_norm": 0.24132882058620453, + "learning_rate": 1.7576585466296346e-05, + "loss": 1.2399, + "step": 8372 + }, + { + "epoch": 2.493866229825574, + "grad_norm": 0.24062736332416534, + "learning_rate": 1.7575955905701257e-05, + "loss": 1.2523, + "step": 8373 + }, + { + "epoch": 2.4941640760252426, + "grad_norm": 0.23082296550273895, + "learning_rate": 1.7575326274620362e-05, + "loss": 1.2506, + "step": 8374 + }, + { + "epoch": 2.494461922224911, + "grad_norm": 0.23409095406532288, + "learning_rate": 1.757469657305952e-05, + "loss": 1.2485, + "step": 8375 + }, + { + "epoch": 2.4947597684245797, + "grad_norm": 0.23145711421966553, + "learning_rate": 1.757406680102459e-05, + "loss": 1.2593, + "step": 8376 + }, + { + "epoch": 2.4950576146242485, + "grad_norm": 0.23635199666023254, + "learning_rate": 1.7573436958521428e-05, + "loss": 1.2491, + "step": 8377 + }, + { + "epoch": 2.4953554608239172, + "grad_norm": 0.23779116570949554, + "learning_rate": 1.75728070455559e-05, + "loss": 1.2788, + "step": 8378 + }, + { + "epoch": 2.4956533070235856, + "grad_norm": 0.23136919736862183, + "learning_rate": 1.7572177062133863e-05, + "loss": 1.2767, + "step": 8379 + }, + { + "epoch": 2.4959511532232543, + "grad_norm": 0.2295750081539154, + "learning_rate": 1.7571547008261175e-05, + "loss": 1.2734, + "step": 8380 + }, + { + "epoch": 2.496248999422923, + "grad_norm": 0.23830348253250122, + "learning_rate": 1.7570916883943704e-05, + "loss": 1.2625, + "step": 8381 + }, + { + "epoch": 2.496546845622592, + "grad_norm": 0.2223721295595169, + "learning_rate": 1.7570286689187312e-05, + "loss": 1.2436, + "step": 8382 + }, + { + "epoch": 2.49684469182226, + "grad_norm": 0.29212528467178345, + "learning_rate": 1.7569656423997858e-05, + "loss": 1.2531, + "step": 8383 + }, + { + "epoch": 2.497142538021929, + "grad_norm": 0.2426663041114807, + "learning_rate": 1.756902608838121e-05, + "loss": 1.2533, + "step": 8384 + }, + { + "epoch": 2.4974403842215978, + "grad_norm": 0.27768445014953613, + "learning_rate": 1.7568395682343226e-05, + "loss": 1.231, + "step": 8385 + }, + { + "epoch": 2.497738230421266, + "grad_norm": 0.23751646280288696, + "learning_rate": 1.756776520588978e-05, + "loss": 1.2624, + "step": 8386 + }, + { + "epoch": 2.498036076620935, + "grad_norm": 0.26912587881088257, + "learning_rate": 1.7567134659026734e-05, + "loss": 1.2522, + "step": 8387 + }, + { + "epoch": 2.4983339228206036, + "grad_norm": 0.2330433577299118, + "learning_rate": 1.7566504041759954e-05, + "loss": 1.2488, + "step": 8388 + }, + { + "epoch": 2.498631769020272, + "grad_norm": 0.24332500994205475, + "learning_rate": 1.756587335409531e-05, + "loss": 1.2642, + "step": 8389 + }, + { + "epoch": 2.4989296152199407, + "grad_norm": 0.22631646692752838, + "learning_rate": 1.7565242596038664e-05, + "loss": 1.2486, + "step": 8390 + }, + { + "epoch": 2.4992274614196095, + "grad_norm": 0.268858939409256, + "learning_rate": 1.7564611767595888e-05, + "loss": 1.2471, + "step": 8391 + }, + { + "epoch": 2.4995253076192783, + "grad_norm": 0.2353544533252716, + "learning_rate": 1.7563980868772853e-05, + "loss": 1.2551, + "step": 8392 + }, + { + "epoch": 2.4998231538189466, + "grad_norm": 0.23257684707641602, + "learning_rate": 1.7563349899575427e-05, + "loss": 1.2442, + "step": 8393 + }, + { + "epoch": 2.5001210000186154, + "grad_norm": 0.24033115804195404, + "learning_rate": 1.756271886000948e-05, + "loss": 1.2563, + "step": 8394 + }, + { + "epoch": 2.500418846218284, + "grad_norm": 0.2385256290435791, + "learning_rate": 1.7562087750080885e-05, + "loss": 1.2446, + "step": 8395 + }, + { + "epoch": 2.500716692417953, + "grad_norm": 0.2354026585817337, + "learning_rate": 1.756145656979551e-05, + "loss": 1.2405, + "step": 8396 + }, + { + "epoch": 2.5010145386176212, + "grad_norm": 0.23773229122161865, + "learning_rate": 1.756082531915923e-05, + "loss": 1.2423, + "step": 8397 + }, + { + "epoch": 2.50131238481729, + "grad_norm": 0.38805413246154785, + "learning_rate": 1.7560193998177922e-05, + "loss": 1.2595, + "step": 8398 + }, + { + "epoch": 2.501610231016959, + "grad_norm": 0.32371848821640015, + "learning_rate": 1.755956260685745e-05, + "loss": 1.2408, + "step": 8399 + }, + { + "epoch": 2.501908077216627, + "grad_norm": 0.28234052658081055, + "learning_rate": 1.7558931145203697e-05, + "loss": 1.2483, + "step": 8400 + }, + { + "epoch": 2.502205923416296, + "grad_norm": 0.3353901505470276, + "learning_rate": 1.7558299613222534e-05, + "loss": 1.263, + "step": 8401 + }, + { + "epoch": 2.5025037696159647, + "grad_norm": 0.23078328371047974, + "learning_rate": 1.755766801091984e-05, + "loss": 1.2556, + "step": 8402 + }, + { + "epoch": 2.502801615815633, + "grad_norm": 0.23619240522384644, + "learning_rate": 1.7557036338301486e-05, + "loss": 1.2478, + "step": 8403 + }, + { + "epoch": 2.5030994620153018, + "grad_norm": 0.2363365888595581, + "learning_rate": 1.755640459537335e-05, + "loss": 1.236, + "step": 8404 + }, + { + "epoch": 2.5033973082149705, + "grad_norm": 0.2406965047121048, + "learning_rate": 1.7555772782141315e-05, + "loss": 1.2811, + "step": 8405 + }, + { + "epoch": 2.5036951544146393, + "grad_norm": 0.23396317660808563, + "learning_rate": 1.7555140898611257e-05, + "loss": 1.2602, + "step": 8406 + }, + { + "epoch": 2.5039930006143076, + "grad_norm": 0.23090584576129913, + "learning_rate": 1.7554508944789055e-05, + "loss": 1.2536, + "step": 8407 + }, + { + "epoch": 2.5042908468139764, + "grad_norm": 0.2459108680486679, + "learning_rate": 1.7553876920680584e-05, + "loss": 1.2713, + "step": 8408 + }, + { + "epoch": 2.504588693013645, + "grad_norm": 0.22830830514431, + "learning_rate": 1.755324482629173e-05, + "loss": 1.2472, + "step": 8409 + }, + { + "epoch": 2.504886539213314, + "grad_norm": 0.23577916622161865, + "learning_rate": 1.7552612661628373e-05, + "loss": 1.2472, + "step": 8410 + }, + { + "epoch": 2.5051843854129823, + "grad_norm": 0.24144499003887177, + "learning_rate": 1.755198042669639e-05, + "loss": 1.2436, + "step": 8411 + }, + { + "epoch": 2.505482231612651, + "grad_norm": 0.2292211502790451, + "learning_rate": 1.755134812150167e-05, + "loss": 1.2664, + "step": 8412 + }, + { + "epoch": 2.50578007781232, + "grad_norm": 0.2611818313598633, + "learning_rate": 1.755071574605009e-05, + "loss": 1.2461, + "step": 8413 + }, + { + "epoch": 2.506077924011988, + "grad_norm": 0.2922768294811249, + "learning_rate": 1.755008330034754e-05, + "loss": 1.2442, + "step": 8414 + }, + { + "epoch": 2.506375770211657, + "grad_norm": 0.24702337384223938, + "learning_rate": 1.7549450784399894e-05, + "loss": 1.2644, + "step": 8415 + }, + { + "epoch": 2.5066736164113257, + "grad_norm": 0.22738125920295715, + "learning_rate": 1.7548818198213048e-05, + "loss": 1.2602, + "step": 8416 + }, + { + "epoch": 2.506971462610994, + "grad_norm": 0.258711040019989, + "learning_rate": 1.7548185541792883e-05, + "loss": 1.2542, + "step": 8417 + }, + { + "epoch": 2.507269308810663, + "grad_norm": 0.2498316615819931, + "learning_rate": 1.754755281514528e-05, + "loss": 1.2729, + "step": 8418 + }, + { + "epoch": 2.5075671550103316, + "grad_norm": 0.25507616996765137, + "learning_rate": 1.7546920018276136e-05, + "loss": 1.2532, + "step": 8419 + }, + { + "epoch": 2.5078650012100003, + "grad_norm": 0.2289208471775055, + "learning_rate": 1.7546287151191332e-05, + "loss": 1.2541, + "step": 8420 + }, + { + "epoch": 2.508162847409669, + "grad_norm": 0.2770848274230957, + "learning_rate": 1.7545654213896756e-05, + "loss": 1.2645, + "step": 8421 + }, + { + "epoch": 2.5084606936093374, + "grad_norm": 0.2718277871608734, + "learning_rate": 1.7545021206398297e-05, + "loss": 1.2522, + "step": 8422 + }, + { + "epoch": 2.508758539809006, + "grad_norm": 0.2465401440858841, + "learning_rate": 1.754438812870185e-05, + "loss": 1.2502, + "step": 8423 + }, + { + "epoch": 2.509056386008675, + "grad_norm": 0.3208794593811035, + "learning_rate": 1.7543754980813298e-05, + "loss": 1.2609, + "step": 8424 + }, + { + "epoch": 2.5093542322083433, + "grad_norm": 0.24295663833618164, + "learning_rate": 1.754312176273853e-05, + "loss": 1.2328, + "step": 8425 + }, + { + "epoch": 2.509652078408012, + "grad_norm": 0.25769492983818054, + "learning_rate": 1.7542488474483446e-05, + "loss": 1.2591, + "step": 8426 + }, + { + "epoch": 2.509949924607681, + "grad_norm": 0.2720673680305481, + "learning_rate": 1.7541855116053935e-05, + "loss": 1.2625, + "step": 8427 + }, + { + "epoch": 2.510247770807349, + "grad_norm": 0.22432737052440643, + "learning_rate": 1.754122168745589e-05, + "loss": 1.2352, + "step": 8428 + }, + { + "epoch": 2.510545617007018, + "grad_norm": 0.2628263831138611, + "learning_rate": 1.7540588188695197e-05, + "loss": 1.262, + "step": 8429 + }, + { + "epoch": 2.5108434632066867, + "grad_norm": 0.2659337520599365, + "learning_rate": 1.753995461977776e-05, + "loss": 1.2459, + "step": 8430 + }, + { + "epoch": 2.511141309406355, + "grad_norm": 0.23539134860038757, + "learning_rate": 1.7539320980709466e-05, + "loss": 1.2491, + "step": 8431 + }, + { + "epoch": 2.511439155606024, + "grad_norm": 0.2545980215072632, + "learning_rate": 1.7538687271496214e-05, + "loss": 1.2634, + "step": 8432 + }, + { + "epoch": 2.5117370018056926, + "grad_norm": 0.2392890602350235, + "learning_rate": 1.7538053492143902e-05, + "loss": 1.2412, + "step": 8433 + }, + { + "epoch": 2.5120348480053614, + "grad_norm": 0.25248026847839355, + "learning_rate": 1.7537419642658423e-05, + "loss": 1.2418, + "step": 8434 + }, + { + "epoch": 2.51233269420503, + "grad_norm": 0.3714005947113037, + "learning_rate": 1.7536785723045674e-05, + "loss": 1.2549, + "step": 8435 + }, + { + "epoch": 2.5126305404046985, + "grad_norm": 0.3023560047149658, + "learning_rate": 1.7536151733311557e-05, + "loss": 1.2344, + "step": 8436 + }, + { + "epoch": 2.5129283866043672, + "grad_norm": 0.282634973526001, + "learning_rate": 1.753551767346197e-05, + "loss": 1.2538, + "step": 8437 + }, + { + "epoch": 2.513226232804036, + "grad_norm": 0.31312984228134155, + "learning_rate": 1.7534883543502804e-05, + "loss": 1.2514, + "step": 8438 + }, + { + "epoch": 2.5135240790037043, + "grad_norm": 0.2593906819820404, + "learning_rate": 1.7534249343439967e-05, + "loss": 1.2328, + "step": 8439 + }, + { + "epoch": 2.513821925203373, + "grad_norm": 0.25793519616127014, + "learning_rate": 1.7533615073279363e-05, + "loss": 1.2425, + "step": 8440 + }, + { + "epoch": 2.514119771403042, + "grad_norm": 0.2638314366340637, + "learning_rate": 1.753298073302688e-05, + "loss": 1.2366, + "step": 8441 + }, + { + "epoch": 2.51441761760271, + "grad_norm": 0.23106567561626434, + "learning_rate": 1.7532346322688434e-05, + "loss": 1.2503, + "step": 8442 + }, + { + "epoch": 2.514715463802379, + "grad_norm": 0.3165026605129242, + "learning_rate": 1.753171184226992e-05, + "loss": 1.2629, + "step": 8443 + }, + { + "epoch": 2.5150133100020478, + "grad_norm": 0.2291480451822281, + "learning_rate": 1.7531077291777242e-05, + "loss": 1.2405, + "step": 8444 + }, + { + "epoch": 2.515311156201716, + "grad_norm": 0.24233947694301605, + "learning_rate": 1.75304426712163e-05, + "loss": 1.245, + "step": 8445 + }, + { + "epoch": 2.515609002401385, + "grad_norm": 0.2441457360982895, + "learning_rate": 1.7529807980593006e-05, + "loss": 1.2502, + "step": 8446 + }, + { + "epoch": 2.5159068486010536, + "grad_norm": 0.21896037459373474, + "learning_rate": 1.752917321991326e-05, + "loss": 1.2495, + "step": 8447 + }, + { + "epoch": 2.5162046948007224, + "grad_norm": 0.2734503448009491, + "learning_rate": 1.752853838918297e-05, + "loss": 1.2435, + "step": 8448 + }, + { + "epoch": 2.516502541000391, + "grad_norm": 0.23741139471530914, + "learning_rate": 1.7527903488408044e-05, + "loss": 1.258, + "step": 8449 + }, + { + "epoch": 2.5168003872000595, + "grad_norm": 0.24481236934661865, + "learning_rate": 1.7527268517594383e-05, + "loss": 1.2531, + "step": 8450 + }, + { + "epoch": 2.5170982333997283, + "grad_norm": 0.23682580888271332, + "learning_rate": 1.75266334767479e-05, + "loss": 1.2584, + "step": 8451 + }, + { + "epoch": 2.517396079599397, + "grad_norm": 0.29860150814056396, + "learning_rate": 1.75259983658745e-05, + "loss": 1.2407, + "step": 8452 + }, + { + "epoch": 2.5176939257990654, + "grad_norm": 0.23807692527770996, + "learning_rate": 1.7525363184980095e-05, + "loss": 1.2344, + "step": 8453 + }, + { + "epoch": 2.517991771998734, + "grad_norm": 0.25496482849121094, + "learning_rate": 1.7524727934070596e-05, + "loss": 1.2577, + "step": 8454 + }, + { + "epoch": 2.518289618198403, + "grad_norm": 0.2405746430158615, + "learning_rate": 1.752409261315191e-05, + "loss": 1.2408, + "step": 8455 + }, + { + "epoch": 2.5185874643980712, + "grad_norm": 0.31379228830337524, + "learning_rate": 1.7523457222229944e-05, + "loss": 1.256, + "step": 8456 + }, + { + "epoch": 2.51888531059774, + "grad_norm": 0.292829304933548, + "learning_rate": 1.7522821761310616e-05, + "loss": 1.2628, + "step": 8457 + }, + { + "epoch": 2.519183156797409, + "grad_norm": 0.2625550925731659, + "learning_rate": 1.752218623039984e-05, + "loss": 1.2408, + "step": 8458 + }, + { + "epoch": 2.519481002997077, + "grad_norm": 0.2893580198287964, + "learning_rate": 1.7521550629503524e-05, + "loss": 1.2384, + "step": 8459 + }, + { + "epoch": 2.519778849196746, + "grad_norm": 0.24691055715084076, + "learning_rate": 1.752091495862758e-05, + "loss": 1.2414, + "step": 8460 + }, + { + "epoch": 2.5200766953964147, + "grad_norm": 0.26145702600479126, + "learning_rate": 1.752027921777793e-05, + "loss": 1.2303, + "step": 8461 + }, + { + "epoch": 2.5203745415960834, + "grad_norm": 0.22267287969589233, + "learning_rate": 1.751964340696048e-05, + "loss": 1.24, + "step": 8462 + }, + { + "epoch": 2.520672387795752, + "grad_norm": 0.26279503107070923, + "learning_rate": 1.751900752618115e-05, + "loss": 1.2453, + "step": 8463 + }, + { + "epoch": 2.5209702339954205, + "grad_norm": 0.23793211579322815, + "learning_rate": 1.751837157544586e-05, + "loss": 1.2428, + "step": 8464 + }, + { + "epoch": 2.5212680801950893, + "grad_norm": 0.24436452984809875, + "learning_rate": 1.7517735554760518e-05, + "loss": 1.2452, + "step": 8465 + }, + { + "epoch": 2.521565926394758, + "grad_norm": 0.24439586699008942, + "learning_rate": 1.7517099464131045e-05, + "loss": 1.2544, + "step": 8466 + }, + { + "epoch": 2.5218637725944264, + "grad_norm": 0.2703554630279541, + "learning_rate": 1.7516463303563364e-05, + "loss": 1.2608, + "step": 8467 + }, + { + "epoch": 2.522161618794095, + "grad_norm": 0.2439625859260559, + "learning_rate": 1.7515827073063388e-05, + "loss": 1.2464, + "step": 8468 + }, + { + "epoch": 2.522459464993764, + "grad_norm": 0.2397177666425705, + "learning_rate": 1.7515190772637038e-05, + "loss": 1.2514, + "step": 8469 + }, + { + "epoch": 2.5227573111934323, + "grad_norm": 0.23239301145076752, + "learning_rate": 1.7514554402290235e-05, + "loss": 1.2628, + "step": 8470 + }, + { + "epoch": 2.523055157393101, + "grad_norm": 0.2251158356666565, + "learning_rate": 1.75139179620289e-05, + "loss": 1.2434, + "step": 8471 + }, + { + "epoch": 2.52335300359277, + "grad_norm": 0.23615312576293945, + "learning_rate": 1.751328145185895e-05, + "loss": 1.2347, + "step": 8472 + }, + { + "epoch": 2.5236508497924386, + "grad_norm": 0.2576223909854889, + "learning_rate": 1.7512644871786312e-05, + "loss": 1.2346, + "step": 8473 + }, + { + "epoch": 2.523948695992107, + "grad_norm": 0.2452310174703598, + "learning_rate": 1.751200822181691e-05, + "loss": 1.2472, + "step": 8474 + }, + { + "epoch": 2.5242465421917757, + "grad_norm": 0.25342339277267456, + "learning_rate": 1.751137150195666e-05, + "loss": 1.2573, + "step": 8475 + }, + { + "epoch": 2.5245443883914445, + "grad_norm": 0.3129463493824005, + "learning_rate": 1.7510734712211494e-05, + "loss": 1.2415, + "step": 8476 + }, + { + "epoch": 2.5248422345911132, + "grad_norm": 0.30068057775497437, + "learning_rate": 1.751009785258733e-05, + "loss": 1.2541, + "step": 8477 + }, + { + "epoch": 2.5251400807907816, + "grad_norm": 0.23290516436100006, + "learning_rate": 1.75094609230901e-05, + "loss": 1.2629, + "step": 8478 + }, + { + "epoch": 2.5254379269904503, + "grad_norm": 0.3935732841491699, + "learning_rate": 1.7508823923725723e-05, + "loss": 1.2463, + "step": 8479 + }, + { + "epoch": 2.525735773190119, + "grad_norm": 0.30810996890068054, + "learning_rate": 1.750818685450013e-05, + "loss": 1.2537, + "step": 8480 + }, + { + "epoch": 2.5260336193897874, + "grad_norm": 0.28003424406051636, + "learning_rate": 1.7507549715419245e-05, + "loss": 1.2443, + "step": 8481 + }, + { + "epoch": 2.526331465589456, + "grad_norm": 0.23418410122394562, + "learning_rate": 1.7506912506489002e-05, + "loss": 1.2507, + "step": 8482 + }, + { + "epoch": 2.526629311789125, + "grad_norm": 0.32362639904022217, + "learning_rate": 1.750627522771532e-05, + "loss": 1.2509, + "step": 8483 + }, + { + "epoch": 2.5269271579887933, + "grad_norm": 0.2395038902759552, + "learning_rate": 1.7505637879104137e-05, + "loss": 1.2429, + "step": 8484 + }, + { + "epoch": 2.527225004188462, + "grad_norm": 0.24742889404296875, + "learning_rate": 1.7505000460661378e-05, + "loss": 1.2534, + "step": 8485 + }, + { + "epoch": 2.527522850388131, + "grad_norm": 0.2470606416463852, + "learning_rate": 1.7504362972392977e-05, + "loss": 1.2462, + "step": 8486 + }, + { + "epoch": 2.5278206965877996, + "grad_norm": 0.23291617631912231, + "learning_rate": 1.750372541430486e-05, + "loss": 1.2446, + "step": 8487 + }, + { + "epoch": 2.5281185427874684, + "grad_norm": 0.22437015175819397, + "learning_rate": 1.7503087786402962e-05, + "loss": 1.23, + "step": 8488 + }, + { + "epoch": 2.5284163889871367, + "grad_norm": 0.2314341962337494, + "learning_rate": 1.750245008869322e-05, + "loss": 1.2428, + "step": 8489 + }, + { + "epoch": 2.5287142351868055, + "grad_norm": 0.24021659791469574, + "learning_rate": 1.7501812321181556e-05, + "loss": 1.25, + "step": 8490 + }, + { + "epoch": 2.5290120813864743, + "grad_norm": 0.23177403211593628, + "learning_rate": 1.7501174483873914e-05, + "loss": 1.2478, + "step": 8491 + }, + { + "epoch": 2.5293099275861426, + "grad_norm": 0.22764109075069427, + "learning_rate": 1.750053657677622e-05, + "loss": 1.2393, + "step": 8492 + }, + { + "epoch": 2.5296077737858114, + "grad_norm": 0.24774837493896484, + "learning_rate": 1.7499898599894415e-05, + "loss": 1.2576, + "step": 8493 + }, + { + "epoch": 2.52990561998548, + "grad_norm": 0.22911469638347626, + "learning_rate": 1.7499260553234434e-05, + "loss": 1.2438, + "step": 8494 + }, + { + "epoch": 2.5302034661851485, + "grad_norm": 0.24415606260299683, + "learning_rate": 1.749862243680221e-05, + "loss": 1.2526, + "step": 8495 + }, + { + "epoch": 2.5305013123848172, + "grad_norm": 0.24845975637435913, + "learning_rate": 1.7497984250603687e-05, + "loss": 1.2468, + "step": 8496 + }, + { + "epoch": 2.530799158584486, + "grad_norm": 0.2423754185438156, + "learning_rate": 1.7497345994644795e-05, + "loss": 1.261, + "step": 8497 + }, + { + "epoch": 2.5310970047841543, + "grad_norm": 0.2517605125904083, + "learning_rate": 1.7496707668931474e-05, + "loss": 1.2638, + "step": 8498 + }, + { + "epoch": 2.531394850983823, + "grad_norm": 0.2364327609539032, + "learning_rate": 1.7496069273469664e-05, + "loss": 1.2519, + "step": 8499 + }, + { + "epoch": 2.531692697183492, + "grad_norm": 0.2540130019187927, + "learning_rate": 1.7495430808265307e-05, + "loss": 1.2511, + "step": 8500 + }, + { + "epoch": 2.531692697183492, + "eval_loss": 1.3396437168121338, + "eval_runtime": 19.6651, + "eval_samples_per_second": 88.177, + "eval_steps_per_second": 5.543, + "step": 8500 + }, + { + "epoch": 2.5319905433831607, + "grad_norm": 0.23343361914157867, + "learning_rate": 1.7494792273324337e-05, + "loss": 1.2657, + "step": 8501 + }, + { + "epoch": 2.5322883895828294, + "grad_norm": 0.238958477973938, + "learning_rate": 1.7494153668652702e-05, + "loss": 1.2398, + "step": 8502 + }, + { + "epoch": 2.5325862357824978, + "grad_norm": 0.2486771047115326, + "learning_rate": 1.7493514994256336e-05, + "loss": 1.2617, + "step": 8503 + }, + { + "epoch": 2.5328840819821665, + "grad_norm": 0.24094340205192566, + "learning_rate": 1.749287625014119e-05, + "loss": 1.2398, + "step": 8504 + }, + { + "epoch": 2.5331819281818353, + "grad_norm": 0.24892079830169678, + "learning_rate": 1.74922374363132e-05, + "loss": 1.2634, + "step": 8505 + }, + { + "epoch": 2.5334797743815036, + "grad_norm": 0.24563492834568024, + "learning_rate": 1.749159855277831e-05, + "loss": 1.2632, + "step": 8506 + }, + { + "epoch": 2.5337776205811724, + "grad_norm": 0.268160343170166, + "learning_rate": 1.7490959599542467e-05, + "loss": 1.2442, + "step": 8507 + }, + { + "epoch": 2.534075466780841, + "grad_norm": 0.25259169936180115, + "learning_rate": 1.7490320576611613e-05, + "loss": 1.2319, + "step": 8508 + }, + { + "epoch": 2.5343733129805095, + "grad_norm": 0.29176679253578186, + "learning_rate": 1.7489681483991696e-05, + "loss": 1.2304, + "step": 8509 + }, + { + "epoch": 2.5346711591801783, + "grad_norm": 0.24003355205059052, + "learning_rate": 1.748904232168866e-05, + "loss": 1.2592, + "step": 8510 + }, + { + "epoch": 2.534969005379847, + "grad_norm": 0.24610748887062073, + "learning_rate": 1.7488403089708455e-05, + "loss": 1.2395, + "step": 8511 + }, + { + "epoch": 2.5352668515795154, + "grad_norm": 0.24857553839683533, + "learning_rate": 1.7487763788057022e-05, + "loss": 1.2578, + "step": 8512 + }, + { + "epoch": 2.535564697779184, + "grad_norm": 0.2514202296733856, + "learning_rate": 1.7487124416740315e-05, + "loss": 1.2422, + "step": 8513 + }, + { + "epoch": 2.535862543978853, + "grad_norm": 0.26457077264785767, + "learning_rate": 1.7486484975764278e-05, + "loss": 1.2681, + "step": 8514 + }, + { + "epoch": 2.5361603901785217, + "grad_norm": 0.2324807494878769, + "learning_rate": 1.7485845465134866e-05, + "loss": 1.2478, + "step": 8515 + }, + { + "epoch": 2.5364582363781905, + "grad_norm": 0.2839811146259308, + "learning_rate": 1.7485205884858024e-05, + "loss": 1.246, + "step": 8516 + }, + { + "epoch": 2.536756082577859, + "grad_norm": 0.24369819462299347, + "learning_rate": 1.7484566234939705e-05, + "loss": 1.2394, + "step": 8517 + }, + { + "epoch": 2.5370539287775276, + "grad_norm": 0.24273979663848877, + "learning_rate": 1.7483926515385862e-05, + "loss": 1.2449, + "step": 8518 + }, + { + "epoch": 2.5373517749771963, + "grad_norm": 0.2644827365875244, + "learning_rate": 1.748328672620244e-05, + "loss": 1.2445, + "step": 8519 + }, + { + "epoch": 2.5376496211768647, + "grad_norm": 0.23556312918663025, + "learning_rate": 1.7482646867395396e-05, + "loss": 1.2518, + "step": 8520 + }, + { + "epoch": 2.5379474673765334, + "grad_norm": 0.2538624107837677, + "learning_rate": 1.7482006938970685e-05, + "loss": 1.2402, + "step": 8521 + }, + { + "epoch": 2.538245313576202, + "grad_norm": 0.23888514935970306, + "learning_rate": 1.7481366940934256e-05, + "loss": 1.2346, + "step": 8522 + }, + { + "epoch": 2.5385431597758705, + "grad_norm": 0.2887038290500641, + "learning_rate": 1.748072687329207e-05, + "loss": 1.244, + "step": 8523 + }, + { + "epoch": 2.5388410059755393, + "grad_norm": 0.3057018518447876, + "learning_rate": 1.7480086736050076e-05, + "loss": 1.2572, + "step": 8524 + }, + { + "epoch": 2.539138852175208, + "grad_norm": 0.2414216697216034, + "learning_rate": 1.7479446529214232e-05, + "loss": 1.2524, + "step": 8525 + }, + { + "epoch": 2.5394366983748764, + "grad_norm": 0.4179585874080658, + "learning_rate": 1.7478806252790497e-05, + "loss": 1.2575, + "step": 8526 + }, + { + "epoch": 2.539734544574545, + "grad_norm": 0.33019116520881653, + "learning_rate": 1.7478165906784826e-05, + "loss": 1.2562, + "step": 8527 + }, + { + "epoch": 2.540032390774214, + "grad_norm": 0.2703005373477936, + "learning_rate": 1.747752549120317e-05, + "loss": 1.2565, + "step": 8528 + }, + { + "epoch": 2.5403302369738827, + "grad_norm": 0.23929455876350403, + "learning_rate": 1.74768850060515e-05, + "loss": 1.2523, + "step": 8529 + }, + { + "epoch": 2.5406280831735515, + "grad_norm": 0.30801793932914734, + "learning_rate": 1.7476244451335767e-05, + "loss": 1.2488, + "step": 8530 + }, + { + "epoch": 2.54092592937322, + "grad_norm": 0.2433696687221527, + "learning_rate": 1.747560382706193e-05, + "loss": 1.2386, + "step": 8531 + }, + { + "epoch": 2.5412237755728886, + "grad_norm": 0.241928830742836, + "learning_rate": 1.7474963133235955e-05, + "loss": 1.2512, + "step": 8532 + }, + { + "epoch": 2.5415216217725574, + "grad_norm": 0.2384069710969925, + "learning_rate": 1.7474322369863797e-05, + "loss": 1.2702, + "step": 8533 + }, + { + "epoch": 2.5418194679722257, + "grad_norm": 0.2398209571838379, + "learning_rate": 1.7473681536951424e-05, + "loss": 1.2574, + "step": 8534 + }, + { + "epoch": 2.5421173141718945, + "grad_norm": 0.23053257167339325, + "learning_rate": 1.747304063450479e-05, + "loss": 1.2515, + "step": 8535 + }, + { + "epoch": 2.5424151603715632, + "grad_norm": 0.23549732565879822, + "learning_rate": 1.7472399662529865e-05, + "loss": 1.2402, + "step": 8536 + }, + { + "epoch": 2.5427130065712316, + "grad_norm": 0.2632571756839752, + "learning_rate": 1.747175862103261e-05, + "loss": 1.2505, + "step": 8537 + }, + { + "epoch": 2.5430108527709003, + "grad_norm": 0.23406517505645752, + "learning_rate": 1.7471117510018988e-05, + "loss": 1.2592, + "step": 8538 + }, + { + "epoch": 2.543308698970569, + "grad_norm": 0.23157420754432678, + "learning_rate": 1.7470476329494962e-05, + "loss": 1.2542, + "step": 8539 + }, + { + "epoch": 2.543606545170238, + "grad_norm": 0.2320341169834137, + "learning_rate": 1.7469835079466502e-05, + "loss": 1.2353, + "step": 8540 + }, + { + "epoch": 2.543904391369906, + "grad_norm": 0.2633756697177887, + "learning_rate": 1.7469193759939576e-05, + "loss": 1.2778, + "step": 8541 + }, + { + "epoch": 2.544202237569575, + "grad_norm": 0.2357613444328308, + "learning_rate": 1.7468552370920145e-05, + "loss": 1.2499, + "step": 8542 + }, + { + "epoch": 2.5445000837692437, + "grad_norm": 0.23448744416236877, + "learning_rate": 1.7467910912414177e-05, + "loss": 1.2647, + "step": 8543 + }, + { + "epoch": 2.5447979299689125, + "grad_norm": 0.24945658445358276, + "learning_rate": 1.7467269384427644e-05, + "loss": 1.2528, + "step": 8544 + }, + { + "epoch": 2.545095776168581, + "grad_norm": 0.23232224583625793, + "learning_rate": 1.746662778696651e-05, + "loss": 1.2527, + "step": 8545 + }, + { + "epoch": 2.5453936223682496, + "grad_norm": 0.2522088587284088, + "learning_rate": 1.7465986120036746e-05, + "loss": 1.2477, + "step": 8546 + }, + { + "epoch": 2.5456914685679184, + "grad_norm": 0.2554646134376526, + "learning_rate": 1.7465344383644326e-05, + "loss": 1.2545, + "step": 8547 + }, + { + "epoch": 2.5459893147675867, + "grad_norm": 0.2312830537557602, + "learning_rate": 1.7464702577795215e-05, + "loss": 1.253, + "step": 8548 + }, + { + "epoch": 2.5462871609672555, + "grad_norm": 0.25109291076660156, + "learning_rate": 1.7464060702495386e-05, + "loss": 1.2509, + "step": 8549 + }, + { + "epoch": 2.5465850071669243, + "grad_norm": 0.257561594247818, + "learning_rate": 1.7463418757750815e-05, + "loss": 1.2446, + "step": 8550 + }, + { + "epoch": 2.5468828533665926, + "grad_norm": 0.22968845069408417, + "learning_rate": 1.7462776743567465e-05, + "loss": 1.2351, + "step": 8551 + }, + { + "epoch": 2.5471806995662614, + "grad_norm": 0.2320071905851364, + "learning_rate": 1.746213465995132e-05, + "loss": 1.2455, + "step": 8552 + }, + { + "epoch": 2.54747854576593, + "grad_norm": 0.24376271665096283, + "learning_rate": 1.7461492506908348e-05, + "loss": 1.2612, + "step": 8553 + }, + { + "epoch": 2.547776391965599, + "grad_norm": 0.23271118104457855, + "learning_rate": 1.7460850284444527e-05, + "loss": 1.272, + "step": 8554 + }, + { + "epoch": 2.5480742381652677, + "grad_norm": 0.25115910172462463, + "learning_rate": 1.7460207992565827e-05, + "loss": 1.2402, + "step": 8555 + }, + { + "epoch": 2.548372084364936, + "grad_norm": 0.2954905331134796, + "learning_rate": 1.745956563127823e-05, + "loss": 1.2444, + "step": 8556 + }, + { + "epoch": 2.548669930564605, + "grad_norm": 0.25200897455215454, + "learning_rate": 1.7458923200587705e-05, + "loss": 1.2584, + "step": 8557 + }, + { + "epoch": 2.5489677767642736, + "grad_norm": 0.27656984329223633, + "learning_rate": 1.7458280700500232e-05, + "loss": 1.2443, + "step": 8558 + }, + { + "epoch": 2.549265622963942, + "grad_norm": 0.47544968128204346, + "learning_rate": 1.7457638131021795e-05, + "loss": 1.2482, + "step": 8559 + }, + { + "epoch": 2.5495634691636107, + "grad_norm": 0.30716463923454285, + "learning_rate": 1.7456995492158366e-05, + "loss": 1.2535, + "step": 8560 + }, + { + "epoch": 2.5498613153632794, + "grad_norm": 0.2898482084274292, + "learning_rate": 1.745635278391592e-05, + "loss": 1.243, + "step": 8561 + }, + { + "epoch": 2.5501591615629478, + "grad_norm": 0.23438648879528046, + "learning_rate": 1.745571000630045e-05, + "loss": 1.2324, + "step": 8562 + }, + { + "epoch": 2.5504570077626165, + "grad_norm": 0.28218358755111694, + "learning_rate": 1.7455067159317924e-05, + "loss": 1.2544, + "step": 8563 + }, + { + "epoch": 2.5507548539622853, + "grad_norm": 0.2600075602531433, + "learning_rate": 1.7454424242974327e-05, + "loss": 1.2629, + "step": 8564 + }, + { + "epoch": 2.5510527001619536, + "grad_norm": 0.224159374833107, + "learning_rate": 1.7453781257275643e-05, + "loss": 1.258, + "step": 8565 + }, + { + "epoch": 2.5513505463616224, + "grad_norm": 0.22335480153560638, + "learning_rate": 1.745313820222785e-05, + "loss": 1.2405, + "step": 8566 + }, + { + "epoch": 2.551648392561291, + "grad_norm": 0.22238238155841827, + "learning_rate": 1.7452495077836936e-05, + "loss": 1.2426, + "step": 8567 + }, + { + "epoch": 2.55194623876096, + "grad_norm": 0.2267833650112152, + "learning_rate": 1.745185188410888e-05, + "loss": 1.2595, + "step": 8568 + }, + { + "epoch": 2.5522440849606287, + "grad_norm": 0.23567399382591248, + "learning_rate": 1.7451208621049668e-05, + "loss": 1.2339, + "step": 8569 + }, + { + "epoch": 2.552541931160297, + "grad_norm": 0.2311100959777832, + "learning_rate": 1.7450565288665284e-05, + "loss": 1.2591, + "step": 8570 + }, + { + "epoch": 2.552839777359966, + "grad_norm": 0.22517523169517517, + "learning_rate": 1.7449921886961716e-05, + "loss": 1.2495, + "step": 8571 + }, + { + "epoch": 2.5531376235596346, + "grad_norm": 0.2333524376153946, + "learning_rate": 1.7449278415944947e-05, + "loss": 1.2469, + "step": 8572 + }, + { + "epoch": 2.553435469759303, + "grad_norm": 0.23362566530704498, + "learning_rate": 1.7448634875620967e-05, + "loss": 1.2544, + "step": 8573 + }, + { + "epoch": 2.5537333159589717, + "grad_norm": 0.23605845868587494, + "learning_rate": 1.7447991265995764e-05, + "loss": 1.2485, + "step": 8574 + }, + { + "epoch": 2.5540311621586405, + "grad_norm": 0.23068703711032867, + "learning_rate": 1.7447347587075317e-05, + "loss": 1.2562, + "step": 8575 + }, + { + "epoch": 2.554329008358309, + "grad_norm": 0.23618610203266144, + "learning_rate": 1.7446703838865624e-05, + "loss": 1.2611, + "step": 8576 + }, + { + "epoch": 2.5546268545579776, + "grad_norm": 0.22635234892368317, + "learning_rate": 1.7446060021372674e-05, + "loss": 1.2531, + "step": 8577 + }, + { + "epoch": 2.5549247007576463, + "grad_norm": 0.22549006342887878, + "learning_rate": 1.744541613460245e-05, + "loss": 1.2445, + "step": 8578 + }, + { + "epoch": 2.5552225469573147, + "grad_norm": 0.22893446683883667, + "learning_rate": 1.7444772178560955e-05, + "loss": 1.2486, + "step": 8579 + }, + { + "epoch": 2.5555203931569834, + "grad_norm": 0.22884921729564667, + "learning_rate": 1.7444128153254164e-05, + "loss": 1.2346, + "step": 8580 + }, + { + "epoch": 2.555818239356652, + "grad_norm": 0.2412279099225998, + "learning_rate": 1.744348405868808e-05, + "loss": 1.2553, + "step": 8581 + }, + { + "epoch": 2.556116085556321, + "grad_norm": 0.23234815895557404, + "learning_rate": 1.7442839894868698e-05, + "loss": 1.2352, + "step": 8582 + }, + { + "epoch": 2.5564139317559897, + "grad_norm": 0.2420787811279297, + "learning_rate": 1.7442195661802004e-05, + "loss": 1.2629, + "step": 8583 + }, + { + "epoch": 2.556711777955658, + "grad_norm": 0.23283688724040985, + "learning_rate": 1.744155135949399e-05, + "loss": 1.2468, + "step": 8584 + }, + { + "epoch": 2.557009624155327, + "grad_norm": 0.2487560659646988, + "learning_rate": 1.744090698795066e-05, + "loss": 1.2376, + "step": 8585 + }, + { + "epoch": 2.5573074703549956, + "grad_norm": 0.23037666082382202, + "learning_rate": 1.7440262547178e-05, + "loss": 1.2385, + "step": 8586 + }, + { + "epoch": 2.557605316554664, + "grad_norm": 0.23633430898189545, + "learning_rate": 1.7439618037182013e-05, + "loss": 1.2423, + "step": 8587 + }, + { + "epoch": 2.5579031627543327, + "grad_norm": 0.22492575645446777, + "learning_rate": 1.743897345796869e-05, + "loss": 1.2616, + "step": 8588 + }, + { + "epoch": 2.5582010089540015, + "grad_norm": 0.2327558845281601, + "learning_rate": 1.7438328809544033e-05, + "loss": 1.2409, + "step": 8589 + }, + { + "epoch": 2.55849885515367, + "grad_norm": 0.23688359558582306, + "learning_rate": 1.7437684091914036e-05, + "loss": 1.2617, + "step": 8590 + }, + { + "epoch": 2.5587967013533386, + "grad_norm": 0.22410444915294647, + "learning_rate": 1.74370393050847e-05, + "loss": 1.2518, + "step": 8591 + }, + { + "epoch": 2.5590945475530074, + "grad_norm": 0.22046175599098206, + "learning_rate": 1.7436394449062016e-05, + "loss": 1.2572, + "step": 8592 + }, + { + "epoch": 2.5593923937526757, + "grad_norm": 0.2666095197200775, + "learning_rate": 1.7435749523851996e-05, + "loss": 1.2697, + "step": 8593 + }, + { + "epoch": 2.5596902399523445, + "grad_norm": 0.2578844726085663, + "learning_rate": 1.743510452946063e-05, + "loss": 1.2382, + "step": 8594 + }, + { + "epoch": 2.5599880861520132, + "grad_norm": 0.23768317699432373, + "learning_rate": 1.7434459465893927e-05, + "loss": 1.2316, + "step": 8595 + }, + { + "epoch": 2.560285932351682, + "grad_norm": 0.35362330079078674, + "learning_rate": 1.7433814333157886e-05, + "loss": 1.2572, + "step": 8596 + }, + { + "epoch": 2.5605837785513508, + "grad_norm": 0.31020310521125793, + "learning_rate": 1.743316913125851e-05, + "loss": 1.269, + "step": 8597 + }, + { + "epoch": 2.560881624751019, + "grad_norm": 0.28512439131736755, + "learning_rate": 1.743252386020179e-05, + "loss": 1.238, + "step": 8598 + }, + { + "epoch": 2.561179470950688, + "grad_norm": 0.4712526500225067, + "learning_rate": 1.7431878519993745e-05, + "loss": 1.2553, + "step": 8599 + }, + { + "epoch": 2.5614773171503566, + "grad_norm": 0.23439401388168335, + "learning_rate": 1.743123311064038e-05, + "loss": 1.2388, + "step": 8600 + }, + { + "epoch": 2.561775163350025, + "grad_norm": 0.2374618649482727, + "learning_rate": 1.7430587632147685e-05, + "loss": 1.2579, + "step": 8601 + }, + { + "epoch": 2.5620730095496937, + "grad_norm": 0.2444273978471756, + "learning_rate": 1.7429942084521676e-05, + "loss": 1.2566, + "step": 8602 + }, + { + "epoch": 2.5623708557493625, + "grad_norm": 0.23217162489891052, + "learning_rate": 1.742929646776836e-05, + "loss": 1.2568, + "step": 8603 + }, + { + "epoch": 2.562668701949031, + "grad_norm": 0.23166996240615845, + "learning_rate": 1.742865078189374e-05, + "loss": 1.243, + "step": 8604 + }, + { + "epoch": 2.5629665481486996, + "grad_norm": 0.22938452661037445, + "learning_rate": 1.7428005026903823e-05, + "loss": 1.2373, + "step": 8605 + }, + { + "epoch": 2.5632643943483684, + "grad_norm": 0.2335512489080429, + "learning_rate": 1.742735920280462e-05, + "loss": 1.2368, + "step": 8606 + }, + { + "epoch": 2.563562240548037, + "grad_norm": 0.23666122555732727, + "learning_rate": 1.7426713309602132e-05, + "loss": 1.2343, + "step": 8607 + }, + { + "epoch": 2.5638600867477055, + "grad_norm": 0.23143525421619415, + "learning_rate": 1.742606734730238e-05, + "loss": 1.2336, + "step": 8608 + }, + { + "epoch": 2.5641579329473743, + "grad_norm": 0.23421306908130646, + "learning_rate": 1.7425421315911368e-05, + "loss": 1.2564, + "step": 8609 + }, + { + "epoch": 2.564455779147043, + "grad_norm": 0.2414148598909378, + "learning_rate": 1.7424775215435106e-05, + "loss": 1.2594, + "step": 8610 + }, + { + "epoch": 2.564753625346712, + "grad_norm": 0.2428535521030426, + "learning_rate": 1.7424129045879605e-05, + "loss": 1.2325, + "step": 8611 + }, + { + "epoch": 2.56505147154638, + "grad_norm": 0.23873963952064514, + "learning_rate": 1.742348280725088e-05, + "loss": 1.24, + "step": 8612 + }, + { + "epoch": 2.565349317746049, + "grad_norm": 0.22313565015792847, + "learning_rate": 1.742283649955494e-05, + "loss": 1.258, + "step": 8613 + }, + { + "epoch": 2.5656471639457177, + "grad_norm": 0.2256631702184677, + "learning_rate": 1.74221901227978e-05, + "loss": 1.2456, + "step": 8614 + }, + { + "epoch": 2.565945010145386, + "grad_norm": 0.22985881567001343, + "learning_rate": 1.7421543676985476e-05, + "loss": 1.2435, + "step": 8615 + }, + { + "epoch": 2.566242856345055, + "grad_norm": 0.22781185805797577, + "learning_rate": 1.7420897162123976e-05, + "loss": 1.2363, + "step": 8616 + }, + { + "epoch": 2.5665407025447236, + "grad_norm": 0.2470318078994751, + "learning_rate": 1.742025057821932e-05, + "loss": 1.2529, + "step": 8617 + }, + { + "epoch": 2.566838548744392, + "grad_norm": 0.22905333340168, + "learning_rate": 1.7419603925277524e-05, + "loss": 1.2711, + "step": 8618 + }, + { + "epoch": 2.5671363949440607, + "grad_norm": 0.2343169003725052, + "learning_rate": 1.7418957203304604e-05, + "loss": 1.2482, + "step": 8619 + }, + { + "epoch": 2.5674342411437294, + "grad_norm": 0.22915878891944885, + "learning_rate": 1.741831041230657e-05, + "loss": 1.2481, + "step": 8620 + }, + { + "epoch": 2.567732087343398, + "grad_norm": 0.22456571459770203, + "learning_rate": 1.7417663552289452e-05, + "loss": 1.2636, + "step": 8621 + }, + { + "epoch": 2.568029933543067, + "grad_norm": 0.22265848517417908, + "learning_rate": 1.7417016623259263e-05, + "loss": 1.2563, + "step": 8622 + }, + { + "epoch": 2.5683277797427353, + "grad_norm": 0.225847065448761, + "learning_rate": 1.741636962522202e-05, + "loss": 1.2412, + "step": 8623 + }, + { + "epoch": 2.568625625942404, + "grad_norm": 0.23029930889606476, + "learning_rate": 1.7415722558183738e-05, + "loss": 1.24, + "step": 8624 + }, + { + "epoch": 2.568923472142073, + "grad_norm": 0.2382858693599701, + "learning_rate": 1.741507542215045e-05, + "loss": 1.2733, + "step": 8625 + }, + { + "epoch": 2.569221318341741, + "grad_norm": 0.224764883518219, + "learning_rate": 1.7414428217128165e-05, + "loss": 1.2436, + "step": 8626 + }, + { + "epoch": 2.56951916454141, + "grad_norm": 0.2427278608083725, + "learning_rate": 1.741378094312291e-05, + "loss": 1.2535, + "step": 8627 + }, + { + "epoch": 2.5698170107410787, + "grad_norm": 0.25812211632728577, + "learning_rate": 1.741313360014071e-05, + "loss": 1.2486, + "step": 8628 + }, + { + "epoch": 2.570114856940747, + "grad_norm": 0.22946663200855255, + "learning_rate": 1.741248618818758e-05, + "loss": 1.2525, + "step": 8629 + }, + { + "epoch": 2.570412703140416, + "grad_norm": 0.23297150433063507, + "learning_rate": 1.7411838707269552e-05, + "loss": 1.2492, + "step": 8630 + }, + { + "epoch": 2.5707105493400846, + "grad_norm": 0.23864395916461945, + "learning_rate": 1.7411191157392642e-05, + "loss": 1.252, + "step": 8631 + }, + { + "epoch": 2.571008395539753, + "grad_norm": 0.22829514741897583, + "learning_rate": 1.7410543538562884e-05, + "loss": 1.2591, + "step": 8632 + }, + { + "epoch": 2.5713062417394217, + "grad_norm": 0.236654132604599, + "learning_rate": 1.7409895850786293e-05, + "loss": 1.2432, + "step": 8633 + }, + { + "epoch": 2.5716040879390905, + "grad_norm": 0.23477168381214142, + "learning_rate": 1.74092480940689e-05, + "loss": 1.2565, + "step": 8634 + }, + { + "epoch": 2.5719019341387592, + "grad_norm": 0.23336704075336456, + "learning_rate": 1.7408600268416733e-05, + "loss": 1.2504, + "step": 8635 + }, + { + "epoch": 2.572199780338428, + "grad_norm": 0.23772408068180084, + "learning_rate": 1.7407952373835818e-05, + "loss": 1.2576, + "step": 8636 + }, + { + "epoch": 2.5724976265380963, + "grad_norm": 0.22036735713481903, + "learning_rate": 1.740730441033218e-05, + "loss": 1.2614, + "step": 8637 + }, + { + "epoch": 2.572795472737765, + "grad_norm": 0.23383082449436188, + "learning_rate": 1.7406656377911854e-05, + "loss": 1.2492, + "step": 8638 + }, + { + "epoch": 2.573093318937434, + "grad_norm": 0.2365119755268097, + "learning_rate": 1.7406008276580866e-05, + "loss": 1.2564, + "step": 8639 + }, + { + "epoch": 2.573391165137102, + "grad_norm": 0.254342257976532, + "learning_rate": 1.7405360106345242e-05, + "loss": 1.2643, + "step": 8640 + }, + { + "epoch": 2.573689011336771, + "grad_norm": 0.24152617156505585, + "learning_rate": 1.740471186721102e-05, + "loss": 1.2541, + "step": 8641 + }, + { + "epoch": 2.5739868575364397, + "grad_norm": 0.2462647706270218, + "learning_rate": 1.7404063559184227e-05, + "loss": 1.2535, + "step": 8642 + }, + { + "epoch": 2.574284703736108, + "grad_norm": 0.29389768838882446, + "learning_rate": 1.740341518227089e-05, + "loss": 1.2555, + "step": 8643 + }, + { + "epoch": 2.574582549935777, + "grad_norm": 0.23136746883392334, + "learning_rate": 1.740276673647705e-05, + "loss": 1.2454, + "step": 8644 + }, + { + "epoch": 2.5748803961354456, + "grad_norm": 0.2376822680234909, + "learning_rate": 1.7402118221808733e-05, + "loss": 1.2337, + "step": 8645 + }, + { + "epoch": 2.575178242335114, + "grad_norm": 0.22440697252750397, + "learning_rate": 1.740146963827198e-05, + "loss": 1.2516, + "step": 8646 + }, + { + "epoch": 2.5754760885347827, + "grad_norm": 0.2741200029850006, + "learning_rate": 1.740082098587282e-05, + "loss": 1.2492, + "step": 8647 + }, + { + "epoch": 2.5757739347344515, + "grad_norm": 0.22482140362262726, + "learning_rate": 1.740017226461729e-05, + "loss": 1.2261, + "step": 8648 + }, + { + "epoch": 2.5760717809341203, + "grad_norm": 0.2622748017311096, + "learning_rate": 1.7399523474511423e-05, + "loss": 1.2265, + "step": 8649 + }, + { + "epoch": 2.576369627133789, + "grad_norm": 0.23331144452095032, + "learning_rate": 1.7398874615561258e-05, + "loss": 1.2458, + "step": 8650 + }, + { + "epoch": 2.5766674733334574, + "grad_norm": 0.295108824968338, + "learning_rate": 1.7398225687772834e-05, + "loss": 1.2508, + "step": 8651 + }, + { + "epoch": 2.576965319533126, + "grad_norm": 0.2672525644302368, + "learning_rate": 1.7397576691152185e-05, + "loss": 1.2467, + "step": 8652 + }, + { + "epoch": 2.577263165732795, + "grad_norm": 0.24952372908592224, + "learning_rate": 1.7396927625705345e-05, + "loss": 1.2628, + "step": 8653 + }, + { + "epoch": 2.5775610119324632, + "grad_norm": 0.28745022416114807, + "learning_rate": 1.7396278491438363e-05, + "loss": 1.2386, + "step": 8654 + }, + { + "epoch": 2.577858858132132, + "grad_norm": 0.23667721450328827, + "learning_rate": 1.7395629288357275e-05, + "loss": 1.2449, + "step": 8655 + }, + { + "epoch": 2.5781567043318008, + "grad_norm": 0.2344713658094406, + "learning_rate": 1.7394980016468113e-05, + "loss": 1.2529, + "step": 8656 + }, + { + "epoch": 2.578454550531469, + "grad_norm": 0.22538915276527405, + "learning_rate": 1.739433067577693e-05, + "loss": 1.2434, + "step": 8657 + }, + { + "epoch": 2.578752396731138, + "grad_norm": 0.25074049830436707, + "learning_rate": 1.7393681266289758e-05, + "loss": 1.2527, + "step": 8658 + }, + { + "epoch": 2.5790502429308066, + "grad_norm": 0.25742632150650024, + "learning_rate": 1.7393031788012643e-05, + "loss": 1.2529, + "step": 8659 + }, + { + "epoch": 2.579348089130475, + "grad_norm": 0.2562355399131775, + "learning_rate": 1.7392382240951628e-05, + "loss": 1.2493, + "step": 8660 + }, + { + "epoch": 2.5796459353301437, + "grad_norm": 0.318065345287323, + "learning_rate": 1.7391732625112754e-05, + "loss": 1.2406, + "step": 8661 + }, + { + "epoch": 2.5799437815298125, + "grad_norm": 0.2281266450881958, + "learning_rate": 1.7391082940502065e-05, + "loss": 1.2515, + "step": 8662 + }, + { + "epoch": 2.5802416277294813, + "grad_norm": 0.26424264907836914, + "learning_rate": 1.739043318712561e-05, + "loss": 1.2479, + "step": 8663 + }, + { + "epoch": 2.58053947392915, + "grad_norm": 0.25391125679016113, + "learning_rate": 1.7389783364989432e-05, + "loss": 1.2616, + "step": 8664 + }, + { + "epoch": 2.5808373201288184, + "grad_norm": 0.366413414478302, + "learning_rate": 1.7389133474099577e-05, + "loss": 1.2408, + "step": 8665 + }, + { + "epoch": 2.581135166328487, + "grad_norm": 0.2793494760990143, + "learning_rate": 1.7388483514462088e-05, + "loss": 1.2511, + "step": 8666 + }, + { + "epoch": 2.581433012528156, + "grad_norm": 0.26306232810020447, + "learning_rate": 1.7387833486083013e-05, + "loss": 1.2346, + "step": 8667 + }, + { + "epoch": 2.5817308587278243, + "grad_norm": 0.2654370367527008, + "learning_rate": 1.7387183388968404e-05, + "loss": 1.2446, + "step": 8668 + }, + { + "epoch": 2.582028704927493, + "grad_norm": 0.2921411991119385, + "learning_rate": 1.7386533223124308e-05, + "loss": 1.2647, + "step": 8669 + }, + { + "epoch": 2.582326551127162, + "grad_norm": 0.2798786163330078, + "learning_rate": 1.7385882988556774e-05, + "loss": 1.2329, + "step": 8670 + }, + { + "epoch": 2.58262439732683, + "grad_norm": 0.26488715410232544, + "learning_rate": 1.7385232685271845e-05, + "loss": 1.2483, + "step": 8671 + }, + { + "epoch": 2.582922243526499, + "grad_norm": 0.23640671372413635, + "learning_rate": 1.7384582313275583e-05, + "loss": 1.2382, + "step": 8672 + }, + { + "epoch": 2.5832200897261677, + "grad_norm": 0.34911125898361206, + "learning_rate": 1.738393187257403e-05, + "loss": 1.259, + "step": 8673 + }, + { + "epoch": 2.5835179359258365, + "grad_norm": 0.2304478883743286, + "learning_rate": 1.738328136317324e-05, + "loss": 1.2427, + "step": 8674 + }, + { + "epoch": 2.583815782125505, + "grad_norm": 0.25745683908462524, + "learning_rate": 1.7382630785079267e-05, + "loss": 1.2363, + "step": 8675 + }, + { + "epoch": 2.5841136283251736, + "grad_norm": 0.2522353231906891, + "learning_rate": 1.7381980138298165e-05, + "loss": 1.2429, + "step": 8676 + }, + { + "epoch": 2.5844114745248423, + "grad_norm": 0.22577343881130219, + "learning_rate": 1.7381329422835986e-05, + "loss": 1.2455, + "step": 8677 + }, + { + "epoch": 2.584709320724511, + "grad_norm": 0.257718563079834, + "learning_rate": 1.738067863869878e-05, + "loss": 1.2662, + "step": 8678 + }, + { + "epoch": 2.5850071669241794, + "grad_norm": 0.24879182875156403, + "learning_rate": 1.738002778589261e-05, + "loss": 1.2385, + "step": 8679 + }, + { + "epoch": 2.585305013123848, + "grad_norm": 0.2378930002450943, + "learning_rate": 1.737937686442352e-05, + "loss": 1.2622, + "step": 8680 + }, + { + "epoch": 2.585602859323517, + "grad_norm": 0.2391463816165924, + "learning_rate": 1.7378725874297578e-05, + "loss": 1.2576, + "step": 8681 + }, + { + "epoch": 2.5859007055231853, + "grad_norm": 0.25435328483581543, + "learning_rate": 1.7378074815520836e-05, + "loss": 1.2336, + "step": 8682 + }, + { + "epoch": 2.586198551722854, + "grad_norm": 0.21997854113578796, + "learning_rate": 1.737742368809935e-05, + "loss": 1.2388, + "step": 8683 + }, + { + "epoch": 2.586496397922523, + "grad_norm": 0.2943100035190582, + "learning_rate": 1.737677249203918e-05, + "loss": 1.2491, + "step": 8684 + }, + { + "epoch": 2.586794244122191, + "grad_norm": 0.23588617146015167, + "learning_rate": 1.737612122734638e-05, + "loss": 1.2534, + "step": 8685 + }, + { + "epoch": 2.58709209032186, + "grad_norm": 0.23237860202789307, + "learning_rate": 1.7375469894027018e-05, + "loss": 1.242, + "step": 8686 + }, + { + "epoch": 2.5873899365215287, + "grad_norm": 0.22945088148117065, + "learning_rate": 1.7374818492087146e-05, + "loss": 1.2466, + "step": 8687 + }, + { + "epoch": 2.5876877827211975, + "grad_norm": 0.30818605422973633, + "learning_rate": 1.7374167021532828e-05, + "loss": 1.2404, + "step": 8688 + }, + { + "epoch": 2.5879856289208663, + "grad_norm": 0.27361810207366943, + "learning_rate": 1.7373515482370125e-05, + "loss": 1.2653, + "step": 8689 + }, + { + "epoch": 2.5882834751205346, + "grad_norm": 0.24611662328243256, + "learning_rate": 1.7372863874605103e-05, + "loss": 1.2407, + "step": 8690 + }, + { + "epoch": 2.5885813213202034, + "grad_norm": 0.2687990069389343, + "learning_rate": 1.7372212198243815e-05, + "loss": 1.2316, + "step": 8691 + }, + { + "epoch": 2.588879167519872, + "grad_norm": 0.25579991936683655, + "learning_rate": 1.7371560453292327e-05, + "loss": 1.2482, + "step": 8692 + }, + { + "epoch": 2.5891770137195405, + "grad_norm": 0.263778418302536, + "learning_rate": 1.737090863975671e-05, + "loss": 1.2404, + "step": 8693 + }, + { + "epoch": 2.5894748599192092, + "grad_norm": 0.238523930311203, + "learning_rate": 1.737025675764302e-05, + "loss": 1.2448, + "step": 8694 + }, + { + "epoch": 2.589772706118878, + "grad_norm": 0.2523680627346039, + "learning_rate": 1.7369604806957326e-05, + "loss": 1.2455, + "step": 8695 + }, + { + "epoch": 2.5900705523185463, + "grad_norm": 0.2315751165151596, + "learning_rate": 1.7368952787705694e-05, + "loss": 1.2293, + "step": 8696 + }, + { + "epoch": 2.590368398518215, + "grad_norm": 0.23198474943637848, + "learning_rate": 1.736830069989419e-05, + "loss": 1.2535, + "step": 8697 + }, + { + "epoch": 2.590666244717884, + "grad_norm": 0.2466469556093216, + "learning_rate": 1.736764854352888e-05, + "loss": 1.2425, + "step": 8698 + }, + { + "epoch": 2.590964090917552, + "grad_norm": 0.23793677985668182, + "learning_rate": 1.736699631861583e-05, + "loss": 1.2635, + "step": 8699 + }, + { + "epoch": 2.591261937117221, + "grad_norm": 0.2367255985736847, + "learning_rate": 1.7366344025161114e-05, + "loss": 1.2564, + "step": 8700 + }, + { + "epoch": 2.5915597833168897, + "grad_norm": 0.22675922513008118, + "learning_rate": 1.7365691663170793e-05, + "loss": 1.2563, + "step": 8701 + }, + { + "epoch": 2.5918576295165585, + "grad_norm": 0.2322254180908203, + "learning_rate": 1.7365039232650945e-05, + "loss": 1.2597, + "step": 8702 + }, + { + "epoch": 2.5921554757162273, + "grad_norm": 0.26231998205184937, + "learning_rate": 1.7364386733607634e-05, + "loss": 1.2647, + "step": 8703 + }, + { + "epoch": 2.5924533219158956, + "grad_norm": 0.265171080827713, + "learning_rate": 1.736373416604693e-05, + "loss": 1.2474, + "step": 8704 + }, + { + "epoch": 2.5927511681155644, + "grad_norm": 0.2530490458011627, + "learning_rate": 1.7363081529974906e-05, + "loss": 1.2497, + "step": 8705 + }, + { + "epoch": 2.593049014315233, + "grad_norm": 0.23230291903018951, + "learning_rate": 1.736242882539764e-05, + "loss": 1.2536, + "step": 8706 + }, + { + "epoch": 2.5933468605149015, + "grad_norm": 0.35599738359451294, + "learning_rate": 1.7361776052321196e-05, + "loss": 1.2626, + "step": 8707 + }, + { + "epoch": 2.5936447067145703, + "grad_norm": 0.3302066922187805, + "learning_rate": 1.7361123210751652e-05, + "loss": 1.268, + "step": 8708 + }, + { + "epoch": 2.593942552914239, + "grad_norm": 0.300210565328598, + "learning_rate": 1.736047030069508e-05, + "loss": 1.2578, + "step": 8709 + }, + { + "epoch": 2.5942403991139074, + "grad_norm": 0.47248217463493347, + "learning_rate": 1.7359817322157556e-05, + "loss": 1.267, + "step": 8710 + }, + { + "epoch": 2.594538245313576, + "grad_norm": 0.23757919669151306, + "learning_rate": 1.7359164275145154e-05, + "loss": 1.2513, + "step": 8711 + }, + { + "epoch": 2.594836091513245, + "grad_norm": 0.24626424908638, + "learning_rate": 1.7358511159663952e-05, + "loss": 1.2544, + "step": 8712 + }, + { + "epoch": 2.5951339377129132, + "grad_norm": 0.23010654747486115, + "learning_rate": 1.735785797572002e-05, + "loss": 1.2376, + "step": 8713 + }, + { + "epoch": 2.595431783912582, + "grad_norm": 0.23664626479148865, + "learning_rate": 1.7357204723319447e-05, + "loss": 1.2512, + "step": 8714 + }, + { + "epoch": 2.5957296301122508, + "grad_norm": 0.2520230710506439, + "learning_rate": 1.7356551402468303e-05, + "loss": 1.2513, + "step": 8715 + }, + { + "epoch": 2.5960274763119195, + "grad_norm": 0.2515430748462677, + "learning_rate": 1.7355898013172666e-05, + "loss": 1.247, + "step": 8716 + }, + { + "epoch": 2.5963253225115883, + "grad_norm": 0.24009092152118683, + "learning_rate": 1.7355244555438616e-05, + "loss": 1.256, + "step": 8717 + }, + { + "epoch": 2.5966231687112566, + "grad_norm": 0.23353387415409088, + "learning_rate": 1.7354591029272236e-05, + "loss": 1.2531, + "step": 8718 + }, + { + "epoch": 2.5969210149109254, + "grad_norm": 0.24309256672859192, + "learning_rate": 1.7353937434679597e-05, + "loss": 1.2393, + "step": 8719 + }, + { + "epoch": 2.597218861110594, + "grad_norm": 0.24077485501766205, + "learning_rate": 1.735328377166679e-05, + "loss": 1.2395, + "step": 8720 + }, + { + "epoch": 2.5975167073102625, + "grad_norm": 0.24538271129131317, + "learning_rate": 1.7352630040239895e-05, + "loss": 1.2602, + "step": 8721 + }, + { + "epoch": 2.5978145535099313, + "grad_norm": 0.2278101146221161, + "learning_rate": 1.735197624040499e-05, + "loss": 1.2472, + "step": 8722 + }, + { + "epoch": 2.5981123997096, + "grad_norm": 0.23970970511436462, + "learning_rate": 1.7351322372168162e-05, + "loss": 1.2486, + "step": 8723 + }, + { + "epoch": 2.5984102459092684, + "grad_norm": 0.24316518008708954, + "learning_rate": 1.735066843553549e-05, + "loss": 1.2553, + "step": 8724 + }, + { + "epoch": 2.598708092108937, + "grad_norm": 0.2406611293554306, + "learning_rate": 1.7350014430513064e-05, + "loss": 1.2492, + "step": 8725 + }, + { + "epoch": 2.599005938308606, + "grad_norm": 0.23540037870407104, + "learning_rate": 1.7349360357106967e-05, + "loss": 1.2318, + "step": 8726 + }, + { + "epoch": 2.5993037845082747, + "grad_norm": 0.2322627454996109, + "learning_rate": 1.734870621532328e-05, + "loss": 1.2515, + "step": 8727 + }, + { + "epoch": 2.599601630707943, + "grad_norm": 0.24906675517559052, + "learning_rate": 1.734805200516809e-05, + "loss": 1.2591, + "step": 8728 + }, + { + "epoch": 2.599899476907612, + "grad_norm": 0.23014332354068756, + "learning_rate": 1.734739772664749e-05, + "loss": 1.2416, + "step": 8729 + }, + { + "epoch": 2.6001973231072806, + "grad_norm": 0.23581655323505402, + "learning_rate": 1.734674337976756e-05, + "loss": 1.2431, + "step": 8730 + }, + { + "epoch": 2.6004951693069494, + "grad_norm": 0.22533650696277618, + "learning_rate": 1.7346088964534395e-05, + "loss": 1.2538, + "step": 8731 + }, + { + "epoch": 2.6007930155066177, + "grad_norm": 0.2528383433818817, + "learning_rate": 1.7345434480954074e-05, + "loss": 1.2367, + "step": 8732 + }, + { + "epoch": 2.6010908617062865, + "grad_norm": 0.24218665063381195, + "learning_rate": 1.7344779929032695e-05, + "loss": 1.2365, + "step": 8733 + }, + { + "epoch": 2.6013887079059552, + "grad_norm": 0.2333836704492569, + "learning_rate": 1.7344125308776348e-05, + "loss": 1.2715, + "step": 8734 + }, + { + "epoch": 2.6016865541056236, + "grad_norm": 0.24962691962718964, + "learning_rate": 1.7343470620191112e-05, + "loss": 1.2621, + "step": 8735 + }, + { + "epoch": 2.6019844003052923, + "grad_norm": 0.25837767124176025, + "learning_rate": 1.7342815863283092e-05, + "loss": 1.2569, + "step": 8736 + }, + { + "epoch": 2.602282246504961, + "grad_norm": 0.24159511923789978, + "learning_rate": 1.7342161038058378e-05, + "loss": 1.252, + "step": 8737 + }, + { + "epoch": 2.6025800927046294, + "grad_norm": 0.256168931722641, + "learning_rate": 1.734150614452305e-05, + "loss": 1.2546, + "step": 8738 + }, + { + "epoch": 2.602877938904298, + "grad_norm": 0.24095866084098816, + "learning_rate": 1.734085118268322e-05, + "loss": 1.2452, + "step": 8739 + }, + { + "epoch": 2.603175785103967, + "grad_norm": 0.2776990234851837, + "learning_rate": 1.7340196152544965e-05, + "loss": 1.2586, + "step": 8740 + }, + { + "epoch": 2.6034736313036357, + "grad_norm": 0.24577198922634125, + "learning_rate": 1.7339541054114385e-05, + "loss": 1.2543, + "step": 8741 + }, + { + "epoch": 2.603771477503304, + "grad_norm": 0.259563148021698, + "learning_rate": 1.7338885887397577e-05, + "loss": 1.2554, + "step": 8742 + }, + { + "epoch": 2.604069323702973, + "grad_norm": 0.25315192341804504, + "learning_rate": 1.7338230652400637e-05, + "loss": 1.2538, + "step": 8743 + }, + { + "epoch": 2.6043671699026416, + "grad_norm": 0.2514294683933258, + "learning_rate": 1.7337575349129657e-05, + "loss": 1.2538, + "step": 8744 + }, + { + "epoch": 2.6046650161023104, + "grad_norm": 0.2541000545024872, + "learning_rate": 1.7336919977590742e-05, + "loss": 1.2407, + "step": 8745 + }, + { + "epoch": 2.6049628623019787, + "grad_norm": 0.23705552518367767, + "learning_rate": 1.7336264537789977e-05, + "loss": 1.2447, + "step": 8746 + }, + { + "epoch": 2.6052607085016475, + "grad_norm": 0.2308826446533203, + "learning_rate": 1.733560902973347e-05, + "loss": 1.2509, + "step": 8747 + }, + { + "epoch": 2.6055585547013163, + "grad_norm": 0.23354917764663696, + "learning_rate": 1.7334953453427315e-05, + "loss": 1.2605, + "step": 8748 + }, + { + "epoch": 2.6058564009009846, + "grad_norm": 0.2550734281539917, + "learning_rate": 1.7334297808877612e-05, + "loss": 1.2496, + "step": 8749 + }, + { + "epoch": 2.6061542471006534, + "grad_norm": 0.25695809721946716, + "learning_rate": 1.7333642096090468e-05, + "loss": 1.2624, + "step": 8750 + }, + { + "epoch": 2.606452093300322, + "grad_norm": 0.23154670000076294, + "learning_rate": 1.7332986315071977e-05, + "loss": 1.274, + "step": 8751 + }, + { + "epoch": 2.6067499394999905, + "grad_norm": 0.25509071350097656, + "learning_rate": 1.7332330465828238e-05, + "loss": 1.2524, + "step": 8752 + }, + { + "epoch": 2.6070477856996592, + "grad_norm": 0.22895443439483643, + "learning_rate": 1.7331674548365357e-05, + "loss": 1.2541, + "step": 8753 + }, + { + "epoch": 2.607345631899328, + "grad_norm": 0.23581722378730774, + "learning_rate": 1.7331018562689435e-05, + "loss": 1.2499, + "step": 8754 + }, + { + "epoch": 2.6076434780989968, + "grad_norm": 0.24990935623645782, + "learning_rate": 1.7330362508806578e-05, + "loss": 1.2467, + "step": 8755 + }, + { + "epoch": 2.6079413242986655, + "grad_norm": 0.29261094331741333, + "learning_rate": 1.7329706386722888e-05, + "loss": 1.2502, + "step": 8756 + }, + { + "epoch": 2.608239170498334, + "grad_norm": 0.2690500020980835, + "learning_rate": 1.7329050196444467e-05, + "loss": 1.245, + "step": 8757 + }, + { + "epoch": 2.6085370166980026, + "grad_norm": 0.22287563979625702, + "learning_rate": 1.7328393937977424e-05, + "loss": 1.2421, + "step": 8758 + }, + { + "epoch": 2.6088348628976714, + "grad_norm": 0.3303421437740326, + "learning_rate": 1.732773761132786e-05, + "loss": 1.2412, + "step": 8759 + }, + { + "epoch": 2.6091327090973397, + "grad_norm": 0.2834303379058838, + "learning_rate": 1.732708121650189e-05, + "loss": 1.2459, + "step": 8760 + }, + { + "epoch": 2.6094305552970085, + "grad_norm": 0.2834492027759552, + "learning_rate": 1.7326424753505612e-05, + "loss": 1.2489, + "step": 8761 + }, + { + "epoch": 2.6097284014966773, + "grad_norm": 0.27727025747299194, + "learning_rate": 1.7325768222345137e-05, + "loss": 1.2515, + "step": 8762 + }, + { + "epoch": 2.6100262476963456, + "grad_norm": 0.2752852141857147, + "learning_rate": 1.7325111623026575e-05, + "loss": 1.2469, + "step": 8763 + }, + { + "epoch": 2.6103240938960144, + "grad_norm": 0.2830751836299896, + "learning_rate": 1.7324454955556032e-05, + "loss": 1.2554, + "step": 8764 + }, + { + "epoch": 2.610621940095683, + "grad_norm": 0.2597208321094513, + "learning_rate": 1.732379821993962e-05, + "loss": 1.2391, + "step": 8765 + }, + { + "epoch": 2.6109197862953515, + "grad_norm": 0.3108302652835846, + "learning_rate": 1.7323141416183448e-05, + "loss": 1.2347, + "step": 8766 + }, + { + "epoch": 2.6112176324950203, + "grad_norm": 0.22924266755580902, + "learning_rate": 1.732248454429363e-05, + "loss": 1.2474, + "step": 8767 + }, + { + "epoch": 2.611515478694689, + "grad_norm": 0.2544800043106079, + "learning_rate": 1.732182760427627e-05, + "loss": 1.2372, + "step": 8768 + }, + { + "epoch": 2.611813324894358, + "grad_norm": 0.24528615176677704, + "learning_rate": 1.7321170596137486e-05, + "loss": 1.2595, + "step": 8769 + }, + { + "epoch": 2.6121111710940266, + "grad_norm": 0.3880791664123535, + "learning_rate": 1.7320513519883392e-05, + "loss": 1.2448, + "step": 8770 + }, + { + "epoch": 2.612409017293695, + "grad_norm": 0.2784728407859802, + "learning_rate": 1.7319856375520093e-05, + "loss": 1.2508, + "step": 8771 + }, + { + "epoch": 2.6127068634933637, + "grad_norm": 0.2712165415287018, + "learning_rate": 1.7319199163053713e-05, + "loss": 1.2603, + "step": 8772 + }, + { + "epoch": 2.6130047096930324, + "grad_norm": 0.2408665120601654, + "learning_rate": 1.7318541882490362e-05, + "loss": 1.2371, + "step": 8773 + }, + { + "epoch": 2.6133025558927008, + "grad_norm": 0.3493192195892334, + "learning_rate": 1.7317884533836154e-05, + "loss": 1.2572, + "step": 8774 + }, + { + "epoch": 2.6136004020923695, + "grad_norm": 0.2725376784801483, + "learning_rate": 1.7317227117097207e-05, + "loss": 1.2561, + "step": 8775 + }, + { + "epoch": 2.6138982482920383, + "grad_norm": 0.25890398025512695, + "learning_rate": 1.7316569632279637e-05, + "loss": 1.2454, + "step": 8776 + }, + { + "epoch": 2.6141960944917066, + "grad_norm": 0.23684841394424438, + "learning_rate": 1.731591207938956e-05, + "loss": 1.2578, + "step": 8777 + }, + { + "epoch": 2.6144939406913754, + "grad_norm": 0.34290099143981934, + "learning_rate": 1.7315254458433097e-05, + "loss": 1.232, + "step": 8778 + }, + { + "epoch": 2.614791786891044, + "grad_norm": 0.22663018107414246, + "learning_rate": 1.731459676941636e-05, + "loss": 1.236, + "step": 8779 + }, + { + "epoch": 2.6150896330907125, + "grad_norm": 0.2505500614643097, + "learning_rate": 1.731393901234548e-05, + "loss": 1.2405, + "step": 8780 + }, + { + "epoch": 2.6153874792903813, + "grad_norm": 0.24107787013053894, + "learning_rate": 1.7313281187226564e-05, + "loss": 1.2511, + "step": 8781 + }, + { + "epoch": 2.61568532549005, + "grad_norm": 0.2568422853946686, + "learning_rate": 1.7312623294065737e-05, + "loss": 1.2387, + "step": 8782 + }, + { + "epoch": 2.615983171689719, + "grad_norm": 0.3152964413166046, + "learning_rate": 1.7311965332869122e-05, + "loss": 1.2349, + "step": 8783 + }, + { + "epoch": 2.6162810178893876, + "grad_norm": 0.2565925419330597, + "learning_rate": 1.731130730364284e-05, + "loss": 1.2345, + "step": 8784 + }, + { + "epoch": 2.616578864089056, + "grad_norm": 0.25363919138908386, + "learning_rate": 1.7310649206393012e-05, + "loss": 1.2492, + "step": 8785 + }, + { + "epoch": 2.6168767102887247, + "grad_norm": 0.22520498931407928, + "learning_rate": 1.730999104112576e-05, + "loss": 1.2495, + "step": 8786 + }, + { + "epoch": 2.6171745564883935, + "grad_norm": 0.30125007033348083, + "learning_rate": 1.730933280784721e-05, + "loss": 1.2478, + "step": 8787 + }, + { + "epoch": 2.617472402688062, + "grad_norm": 0.24418741464614868, + "learning_rate": 1.730867450656348e-05, + "loss": 1.2557, + "step": 8788 + }, + { + "epoch": 2.6177702488877306, + "grad_norm": 0.2495867758989334, + "learning_rate": 1.7308016137280705e-05, + "loss": 1.243, + "step": 8789 + }, + { + "epoch": 2.6180680950873993, + "grad_norm": 0.24253544211387634, + "learning_rate": 1.7307357700005e-05, + "loss": 1.2542, + "step": 8790 + }, + { + "epoch": 2.6183659412870677, + "grad_norm": 0.2348175346851349, + "learning_rate": 1.73066991947425e-05, + "loss": 1.2529, + "step": 8791 + }, + { + "epoch": 2.6186637874867364, + "grad_norm": 0.26520952582359314, + "learning_rate": 1.730604062149933e-05, + "loss": 1.2565, + "step": 8792 + }, + { + "epoch": 2.618961633686405, + "grad_norm": 0.23837199807167053, + "learning_rate": 1.7305381980281608e-05, + "loss": 1.2457, + "step": 8793 + }, + { + "epoch": 2.619259479886074, + "grad_norm": 0.26091358065605164, + "learning_rate": 1.7304723271095473e-05, + "loss": 1.2593, + "step": 8794 + }, + { + "epoch": 2.6195573260857423, + "grad_norm": 0.22857588529586792, + "learning_rate": 1.7304064493947047e-05, + "loss": 1.2514, + "step": 8795 + }, + { + "epoch": 2.619855172285411, + "grad_norm": 0.314048707485199, + "learning_rate": 1.7303405648842462e-05, + "loss": 1.2584, + "step": 8796 + }, + { + "epoch": 2.62015301848508, + "grad_norm": 0.2389499694108963, + "learning_rate": 1.7302746735787847e-05, + "loss": 1.2465, + "step": 8797 + }, + { + "epoch": 2.6204508646847486, + "grad_norm": 0.2749323546886444, + "learning_rate": 1.7302087754789334e-05, + "loss": 1.2414, + "step": 8798 + }, + { + "epoch": 2.620748710884417, + "grad_norm": 0.2627394497394562, + "learning_rate": 1.7301428705853053e-05, + "loss": 1.2636, + "step": 8799 + }, + { + "epoch": 2.6210465570840857, + "grad_norm": 0.2637195885181427, + "learning_rate": 1.730076958898513e-05, + "loss": 1.2467, + "step": 8800 + }, + { + "epoch": 2.6213444032837545, + "grad_norm": 0.2565141022205353, + "learning_rate": 1.730011040419171e-05, + "loss": 1.2584, + "step": 8801 + }, + { + "epoch": 2.621642249483423, + "grad_norm": 0.250991553068161, + "learning_rate": 1.7299451151478915e-05, + "loss": 1.2471, + "step": 8802 + }, + { + "epoch": 2.6219400956830916, + "grad_norm": 0.344510942697525, + "learning_rate": 1.7298791830852886e-05, + "loss": 1.2504, + "step": 8803 + }, + { + "epoch": 2.6222379418827604, + "grad_norm": 0.28531351685523987, + "learning_rate": 1.7298132442319752e-05, + "loss": 1.2615, + "step": 8804 + }, + { + "epoch": 2.6225357880824287, + "grad_norm": 0.24256275594234467, + "learning_rate": 1.7297472985885647e-05, + "loss": 1.2617, + "step": 8805 + }, + { + "epoch": 2.6228336342820975, + "grad_norm": 0.30849483609199524, + "learning_rate": 1.7296813461556712e-05, + "loss": 1.2423, + "step": 8806 + }, + { + "epoch": 2.6231314804817663, + "grad_norm": 0.23779423534870148, + "learning_rate": 1.729615386933908e-05, + "loss": 1.2356, + "step": 8807 + }, + { + "epoch": 2.623429326681435, + "grad_norm": 0.27032792568206787, + "learning_rate": 1.7295494209238887e-05, + "loss": 1.2396, + "step": 8808 + }, + { + "epoch": 2.623727172881104, + "grad_norm": 0.2340458184480667, + "learning_rate": 1.729483448126227e-05, + "loss": 1.2555, + "step": 8809 + }, + { + "epoch": 2.624025019080772, + "grad_norm": 0.3646049499511719, + "learning_rate": 1.729417468541537e-05, + "loss": 1.2333, + "step": 8810 + }, + { + "epoch": 2.624322865280441, + "grad_norm": 0.2888542413711548, + "learning_rate": 1.7293514821704326e-05, + "loss": 1.2538, + "step": 8811 + }, + { + "epoch": 2.6246207114801097, + "grad_norm": 0.268113911151886, + "learning_rate": 1.7292854890135275e-05, + "loss": 1.233, + "step": 8812 + }, + { + "epoch": 2.624918557679778, + "grad_norm": 0.26564687490463257, + "learning_rate": 1.7292194890714356e-05, + "loss": 1.233, + "step": 8813 + }, + { + "epoch": 2.6252164038794468, + "grad_norm": 0.2861003875732422, + "learning_rate": 1.729153482344771e-05, + "loss": 1.2288, + "step": 8814 + }, + { + "epoch": 2.6255142500791155, + "grad_norm": 0.2574459910392761, + "learning_rate": 1.729087468834148e-05, + "loss": 1.2522, + "step": 8815 + }, + { + "epoch": 2.625812096278784, + "grad_norm": 0.26207756996154785, + "learning_rate": 1.7290214485401806e-05, + "loss": 1.2457, + "step": 8816 + }, + { + "epoch": 2.6261099424784526, + "grad_norm": 0.24351252615451813, + "learning_rate": 1.7289554214634834e-05, + "loss": 1.2381, + "step": 8817 + }, + { + "epoch": 2.6264077886781214, + "grad_norm": 0.3597773611545563, + "learning_rate": 1.72888938760467e-05, + "loss": 1.2461, + "step": 8818 + }, + { + "epoch": 2.6267056348777897, + "grad_norm": 0.25255200266838074, + "learning_rate": 1.7288233469643555e-05, + "loss": 1.2269, + "step": 8819 + }, + { + "epoch": 2.6270034810774585, + "grad_norm": 0.26288414001464844, + "learning_rate": 1.728757299543154e-05, + "loss": 1.2437, + "step": 8820 + }, + { + "epoch": 2.6273013272771273, + "grad_norm": 0.25060439109802246, + "learning_rate": 1.7286912453416803e-05, + "loss": 1.2679, + "step": 8821 + }, + { + "epoch": 2.627599173476796, + "grad_norm": 0.2610555589199066, + "learning_rate": 1.7286251843605483e-05, + "loss": 1.2498, + "step": 8822 + }, + { + "epoch": 2.627897019676465, + "grad_norm": 0.3638914227485657, + "learning_rate": 1.728559116600373e-05, + "loss": 1.2395, + "step": 8823 + }, + { + "epoch": 2.628194865876133, + "grad_norm": 0.23534813523292542, + "learning_rate": 1.7284930420617696e-05, + "loss": 1.2355, + "step": 8824 + }, + { + "epoch": 2.628492712075802, + "grad_norm": 0.25642281770706177, + "learning_rate": 1.7284269607453522e-05, + "loss": 1.2522, + "step": 8825 + }, + { + "epoch": 2.6287905582754707, + "grad_norm": 0.24854740500450134, + "learning_rate": 1.7283608726517354e-05, + "loss": 1.2413, + "step": 8826 + }, + { + "epoch": 2.629088404475139, + "grad_norm": 0.231789693236351, + "learning_rate": 1.7282947777815348e-05, + "loss": 1.2545, + "step": 8827 + }, + { + "epoch": 2.629386250674808, + "grad_norm": 0.26283374428749084, + "learning_rate": 1.7282286761353648e-05, + "loss": 1.2466, + "step": 8828 + }, + { + "epoch": 2.6296840968744766, + "grad_norm": 0.24622972309589386, + "learning_rate": 1.7281625677138408e-05, + "loss": 1.2502, + "step": 8829 + }, + { + "epoch": 2.629981943074145, + "grad_norm": 0.2555600702762604, + "learning_rate": 1.7280964525175773e-05, + "loss": 1.242, + "step": 8830 + }, + { + "epoch": 2.6302797892738137, + "grad_norm": 0.25393983721733093, + "learning_rate": 1.72803033054719e-05, + "loss": 1.2529, + "step": 8831 + }, + { + "epoch": 2.6305776354734824, + "grad_norm": 0.22881856560707092, + "learning_rate": 1.727964201803294e-05, + "loss": 1.2709, + "step": 8832 + }, + { + "epoch": 2.6308754816731508, + "grad_norm": 0.31835678219795227, + "learning_rate": 1.7278980662865044e-05, + "loss": 1.2738, + "step": 8833 + }, + { + "epoch": 2.6311733278728195, + "grad_norm": 0.2397090345621109, + "learning_rate": 1.7278319239974363e-05, + "loss": 1.2488, + "step": 8834 + }, + { + "epoch": 2.6314711740724883, + "grad_norm": 0.26797693967819214, + "learning_rate": 1.7277657749367055e-05, + "loss": 1.247, + "step": 8835 + }, + { + "epoch": 2.631769020272157, + "grad_norm": 0.2677474617958069, + "learning_rate": 1.7276996191049274e-05, + "loss": 1.2328, + "step": 8836 + }, + { + "epoch": 2.632066866471826, + "grad_norm": 0.2518165111541748, + "learning_rate": 1.727633456502717e-05, + "loss": 1.26, + "step": 8837 + }, + { + "epoch": 2.632364712671494, + "grad_norm": 0.27545467019081116, + "learning_rate": 1.7275672871306907e-05, + "loss": 1.2531, + "step": 8838 + }, + { + "epoch": 2.632662558871163, + "grad_norm": 0.23853330314159393, + "learning_rate": 1.7275011109894634e-05, + "loss": 1.2507, + "step": 8839 + }, + { + "epoch": 2.6329604050708317, + "grad_norm": 0.24203594028949738, + "learning_rate": 1.7274349280796513e-05, + "loss": 1.2516, + "step": 8840 + }, + { + "epoch": 2.6332582512705, + "grad_norm": 0.24164514243602753, + "learning_rate": 1.72736873840187e-05, + "loss": 1.2553, + "step": 8841 + }, + { + "epoch": 2.633556097470169, + "grad_norm": 0.25497618317604065, + "learning_rate": 1.727302541956735e-05, + "loss": 1.2582, + "step": 8842 + }, + { + "epoch": 2.6338539436698376, + "grad_norm": 0.27957555651664734, + "learning_rate": 1.7272363387448625e-05, + "loss": 1.247, + "step": 8843 + }, + { + "epoch": 2.634151789869506, + "grad_norm": 0.24557183682918549, + "learning_rate": 1.727170128766868e-05, + "loss": 1.2357, + "step": 8844 + }, + { + "epoch": 2.6344496360691747, + "grad_norm": 0.2701668441295624, + "learning_rate": 1.7271039120233685e-05, + "loss": 1.2467, + "step": 8845 + }, + { + "epoch": 2.6347474822688435, + "grad_norm": 0.2438381314277649, + "learning_rate": 1.727037688514979e-05, + "loss": 1.2555, + "step": 8846 + }, + { + "epoch": 2.635045328468512, + "grad_norm": 0.37517014145851135, + "learning_rate": 1.7269714582423165e-05, + "loss": 1.2504, + "step": 8847 + }, + { + "epoch": 2.6353431746681806, + "grad_norm": 0.24342043697834015, + "learning_rate": 1.7269052212059966e-05, + "loss": 1.2584, + "step": 8848 + }, + { + "epoch": 2.6356410208678493, + "grad_norm": 0.2818973958492279, + "learning_rate": 1.726838977406636e-05, + "loss": 1.2392, + "step": 8849 + }, + { + "epoch": 2.635938867067518, + "grad_norm": 0.23356172442436218, + "learning_rate": 1.7267727268448503e-05, + "loss": 1.2671, + "step": 8850 + }, + { + "epoch": 2.636236713267187, + "grad_norm": 0.27586445212364197, + "learning_rate": 1.726706469521257e-05, + "loss": 1.2602, + "step": 8851 + }, + { + "epoch": 2.636534559466855, + "grad_norm": 0.2238815575838089, + "learning_rate": 1.7266402054364712e-05, + "loss": 1.2456, + "step": 8852 + }, + { + "epoch": 2.636832405666524, + "grad_norm": 0.23642008006572723, + "learning_rate": 1.7265739345911105e-05, + "loss": 1.2401, + "step": 8853 + }, + { + "epoch": 2.6371302518661928, + "grad_norm": 0.24406154453754425, + "learning_rate": 1.726507656985791e-05, + "loss": 1.2396, + "step": 8854 + }, + { + "epoch": 2.637428098065861, + "grad_norm": 0.2447628676891327, + "learning_rate": 1.72644137262113e-05, + "loss": 1.2439, + "step": 8855 + }, + { + "epoch": 2.63772594426553, + "grad_norm": 0.23084242641925812, + "learning_rate": 1.726375081497743e-05, + "loss": 1.2395, + "step": 8856 + }, + { + "epoch": 2.6380237904651986, + "grad_norm": 0.21892322599887848, + "learning_rate": 1.7263087836162477e-05, + "loss": 1.2503, + "step": 8857 + }, + { + "epoch": 2.638321636664867, + "grad_norm": 0.24469958245754242, + "learning_rate": 1.7262424789772607e-05, + "loss": 1.2472, + "step": 8858 + }, + { + "epoch": 2.6386194828645357, + "grad_norm": 0.24429583549499512, + "learning_rate": 1.726176167581399e-05, + "loss": 1.2565, + "step": 8859 + }, + { + "epoch": 2.6389173290642045, + "grad_norm": 0.2654368281364441, + "learning_rate": 1.7261098494292786e-05, + "loss": 1.2473, + "step": 8860 + }, + { + "epoch": 2.6392151752638733, + "grad_norm": 0.22381658852100372, + "learning_rate": 1.7260435245215182e-05, + "loss": 1.2494, + "step": 8861 + }, + { + "epoch": 2.6395130214635416, + "grad_norm": 0.2904224395751953, + "learning_rate": 1.7259771928587334e-05, + "loss": 1.2409, + "step": 8862 + }, + { + "epoch": 2.6398108676632104, + "grad_norm": 0.23257623612880707, + "learning_rate": 1.7259108544415425e-05, + "loss": 1.242, + "step": 8863 + }, + { + "epoch": 2.640108713862879, + "grad_norm": 0.3013313114643097, + "learning_rate": 1.725844509270562e-05, + "loss": 1.2302, + "step": 8864 + }, + { + "epoch": 2.640406560062548, + "grad_norm": 0.27417224645614624, + "learning_rate": 1.725778157346409e-05, + "loss": 1.2538, + "step": 8865 + }, + { + "epoch": 2.6407044062622163, + "grad_norm": 0.2881911098957062, + "learning_rate": 1.7257117986697013e-05, + "loss": 1.2465, + "step": 8866 + }, + { + "epoch": 2.641002252461885, + "grad_norm": 0.27448558807373047, + "learning_rate": 1.7256454332410556e-05, + "loss": 1.2466, + "step": 8867 + }, + { + "epoch": 2.641300098661554, + "grad_norm": 0.26696836948394775, + "learning_rate": 1.7255790610610908e-05, + "loss": 1.2378, + "step": 8868 + }, + { + "epoch": 2.641597944861222, + "grad_norm": 0.25289347767829895, + "learning_rate": 1.7255126821304228e-05, + "loss": 1.2508, + "step": 8869 + }, + { + "epoch": 2.641895791060891, + "grad_norm": 0.27484360337257385, + "learning_rate": 1.72544629644967e-05, + "loss": 1.2532, + "step": 8870 + }, + { + "epoch": 2.6421936372605597, + "grad_norm": 0.2266952246427536, + "learning_rate": 1.7253799040194503e-05, + "loss": 1.2331, + "step": 8871 + }, + { + "epoch": 2.642491483460228, + "grad_norm": 0.4980355203151703, + "learning_rate": 1.7253135048403808e-05, + "loss": 1.2608, + "step": 8872 + }, + { + "epoch": 2.6427893296598968, + "grad_norm": 0.28572672605514526, + "learning_rate": 1.7252470989130794e-05, + "loss": 1.2465, + "step": 8873 + }, + { + "epoch": 2.6430871758595655, + "grad_norm": 0.28275179862976074, + "learning_rate": 1.725180686238164e-05, + "loss": 1.2432, + "step": 8874 + }, + { + "epoch": 2.6433850220592343, + "grad_norm": 0.2620451748371124, + "learning_rate": 1.725114266816253e-05, + "loss": 1.2261, + "step": 8875 + }, + { + "epoch": 2.643682868258903, + "grad_norm": 0.23029804229736328, + "learning_rate": 1.7250478406479632e-05, + "loss": 1.2427, + "step": 8876 + }, + { + "epoch": 2.6439807144585714, + "grad_norm": 0.2825107276439667, + "learning_rate": 1.724981407733914e-05, + "loss": 1.2483, + "step": 8877 + }, + { + "epoch": 2.64427856065824, + "grad_norm": 0.30828404426574707, + "learning_rate": 1.7249149680747225e-05, + "loss": 1.2614, + "step": 8878 + }, + { + "epoch": 2.644576406857909, + "grad_norm": 0.23624587059020996, + "learning_rate": 1.724848521671007e-05, + "loss": 1.2612, + "step": 8879 + }, + { + "epoch": 2.6448742530575773, + "grad_norm": 0.24183116853237152, + "learning_rate": 1.724782068523386e-05, + "loss": 1.2401, + "step": 8880 + }, + { + "epoch": 2.645172099257246, + "grad_norm": 0.2445949912071228, + "learning_rate": 1.7247156086324776e-05, + "loss": 1.256, + "step": 8881 + }, + { + "epoch": 2.645469945456915, + "grad_norm": 0.25342297554016113, + "learning_rate": 1.7246491419989002e-05, + "loss": 1.2516, + "step": 8882 + }, + { + "epoch": 2.645767791656583, + "grad_norm": 0.24959760904312134, + "learning_rate": 1.7245826686232722e-05, + "loss": 1.2689, + "step": 8883 + }, + { + "epoch": 2.646065637856252, + "grad_norm": 0.23694045841693878, + "learning_rate": 1.724516188506212e-05, + "loss": 1.2302, + "step": 8884 + }, + { + "epoch": 2.6463634840559207, + "grad_norm": 0.3361573815345764, + "learning_rate": 1.724449701648338e-05, + "loss": 1.2655, + "step": 8885 + }, + { + "epoch": 2.646661330255589, + "grad_norm": 0.2407407909631729, + "learning_rate": 1.724383208050269e-05, + "loss": 1.2471, + "step": 8886 + }, + { + "epoch": 2.646959176455258, + "grad_norm": 0.24118772149085999, + "learning_rate": 1.7243167077126234e-05, + "loss": 1.2424, + "step": 8887 + }, + { + "epoch": 2.6472570226549266, + "grad_norm": 0.23885789513587952, + "learning_rate": 1.7242502006360203e-05, + "loss": 1.2703, + "step": 8888 + }, + { + "epoch": 2.6475548688545953, + "grad_norm": 0.24546580016613007, + "learning_rate": 1.7241836868210783e-05, + "loss": 1.2654, + "step": 8889 + }, + { + "epoch": 2.647852715054264, + "grad_norm": 0.22894099354743958, + "learning_rate": 1.7241171662684162e-05, + "loss": 1.2448, + "step": 8890 + }, + { + "epoch": 2.6481505612539324, + "grad_norm": 0.2342892438173294, + "learning_rate": 1.724050638978653e-05, + "loss": 1.2566, + "step": 8891 + }, + { + "epoch": 2.648448407453601, + "grad_norm": 0.23162709176540375, + "learning_rate": 1.7239841049524074e-05, + "loss": 1.2571, + "step": 8892 + }, + { + "epoch": 2.64874625365327, + "grad_norm": 0.2768057584762573, + "learning_rate": 1.7239175641902985e-05, + "loss": 1.2423, + "step": 8893 + }, + { + "epoch": 2.6490440998529383, + "grad_norm": 0.23433153331279755, + "learning_rate": 1.7238510166929457e-05, + "loss": 1.2486, + "step": 8894 + }, + { + "epoch": 2.649341946052607, + "grad_norm": 0.24096284806728363, + "learning_rate": 1.7237844624609678e-05, + "loss": 1.246, + "step": 8895 + }, + { + "epoch": 2.649639792252276, + "grad_norm": 0.24480442702770233, + "learning_rate": 1.723717901494984e-05, + "loss": 1.2442, + "step": 8896 + }, + { + "epoch": 2.649937638451944, + "grad_norm": 0.23331323266029358, + "learning_rate": 1.7236513337956136e-05, + "loss": 1.2463, + "step": 8897 + }, + { + "epoch": 2.650235484651613, + "grad_norm": 0.23452609777450562, + "learning_rate": 1.7235847593634764e-05, + "loss": 1.2486, + "step": 8898 + }, + { + "epoch": 2.6505333308512817, + "grad_norm": 0.2568226158618927, + "learning_rate": 1.7235181781991915e-05, + "loss": 1.2689, + "step": 8899 + }, + { + "epoch": 2.65083117705095, + "grad_norm": 0.2641594111919403, + "learning_rate": 1.7234515903033782e-05, + "loss": 1.2318, + "step": 8900 + }, + { + "epoch": 2.651129023250619, + "grad_norm": 0.231359601020813, + "learning_rate": 1.723384995676656e-05, + "loss": 1.2387, + "step": 8901 + }, + { + "epoch": 2.6514268694502876, + "grad_norm": 0.23017573356628418, + "learning_rate": 1.723318394319645e-05, + "loss": 1.2469, + "step": 8902 + }, + { + "epoch": 2.6517247156499564, + "grad_norm": 0.2576947808265686, + "learning_rate": 1.7232517862329642e-05, + "loss": 1.2415, + "step": 8903 + }, + { + "epoch": 2.652022561849625, + "grad_norm": 0.22903068363666534, + "learning_rate": 1.7231851714172336e-05, + "loss": 1.2454, + "step": 8904 + }, + { + "epoch": 2.6523204080492935, + "grad_norm": 0.24105381965637207, + "learning_rate": 1.723118549873073e-05, + "loss": 1.2361, + "step": 8905 + }, + { + "epoch": 2.6526182542489622, + "grad_norm": 0.2395734190940857, + "learning_rate": 1.723051921601102e-05, + "loss": 1.2416, + "step": 8906 + }, + { + "epoch": 2.652916100448631, + "grad_norm": 0.24813932180404663, + "learning_rate": 1.722985286601941e-05, + "loss": 1.2449, + "step": 8907 + }, + { + "epoch": 2.6532139466482993, + "grad_norm": 0.2432156652212143, + "learning_rate": 1.7229186448762093e-05, + "loss": 1.2563, + "step": 8908 + }, + { + "epoch": 2.653511792847968, + "grad_norm": 0.2334570735692978, + "learning_rate": 1.722851996424528e-05, + "loss": 1.241, + "step": 8909 + }, + { + "epoch": 2.653809639047637, + "grad_norm": 0.23906110227108002, + "learning_rate": 1.7227853412475158e-05, + "loss": 1.2623, + "step": 8910 + }, + { + "epoch": 2.654107485247305, + "grad_norm": 0.23201993107795715, + "learning_rate": 1.722718679345794e-05, + "loss": 1.2486, + "step": 8911 + }, + { + "epoch": 2.654405331446974, + "grad_norm": 0.2461710125207901, + "learning_rate": 1.722652010719982e-05, + "loss": 1.2365, + "step": 8912 + }, + { + "epoch": 2.6547031776466428, + "grad_norm": 0.23649927973747253, + "learning_rate": 1.7225853353707e-05, + "loss": 1.2453, + "step": 8913 + }, + { + "epoch": 2.655001023846311, + "grad_norm": 0.2550906538963318, + "learning_rate": 1.7225186532985698e-05, + "loss": 1.2586, + "step": 8914 + }, + { + "epoch": 2.65529887004598, + "grad_norm": 0.23077456653118134, + "learning_rate": 1.7224519645042104e-05, + "loss": 1.2569, + "step": 8915 + }, + { + "epoch": 2.6555967162456486, + "grad_norm": 0.2596879303455353, + "learning_rate": 1.7223852689882425e-05, + "loss": 1.2433, + "step": 8916 + }, + { + "epoch": 2.6558945624453174, + "grad_norm": 0.23304714262485504, + "learning_rate": 1.722318566751287e-05, + "loss": 1.2546, + "step": 8917 + }, + { + "epoch": 2.656192408644986, + "grad_norm": 0.25453877449035645, + "learning_rate": 1.722251857793964e-05, + "loss": 1.251, + "step": 8918 + }, + { + "epoch": 2.6564902548446545, + "grad_norm": 0.24530337750911713, + "learning_rate": 1.7221851421168943e-05, + "loss": 1.2359, + "step": 8919 + }, + { + "epoch": 2.6567881010443233, + "grad_norm": 0.36431068181991577, + "learning_rate": 1.7221184197206993e-05, + "loss": 1.2488, + "step": 8920 + }, + { + "epoch": 2.657085947243992, + "grad_norm": 0.31601202487945557, + "learning_rate": 1.7220516906059986e-05, + "loss": 1.2416, + "step": 8921 + }, + { + "epoch": 2.6573837934436604, + "grad_norm": 0.24828962981700897, + "learning_rate": 1.721984954773414e-05, + "loss": 1.2479, + "step": 8922 + }, + { + "epoch": 2.657681639643329, + "grad_norm": 0.36653435230255127, + "learning_rate": 1.721918212223566e-05, + "loss": 1.2519, + "step": 8923 + }, + { + "epoch": 2.657979485842998, + "grad_norm": 0.26021063327789307, + "learning_rate": 1.7218514629570756e-05, + "loss": 1.2496, + "step": 8924 + }, + { + "epoch": 2.6582773320426663, + "grad_norm": 0.2584244906902313, + "learning_rate": 1.721784706974564e-05, + "loss": 1.2466, + "step": 8925 + }, + { + "epoch": 2.658575178242335, + "grad_norm": 0.22850880026817322, + "learning_rate": 1.7217179442766515e-05, + "loss": 1.2398, + "step": 8926 + }, + { + "epoch": 2.658873024442004, + "grad_norm": 0.28881415724754333, + "learning_rate": 1.7216511748639605e-05, + "loss": 1.2431, + "step": 8927 + }, + { + "epoch": 2.6591708706416726, + "grad_norm": 0.2384619563817978, + "learning_rate": 1.7215843987371114e-05, + "loss": 1.2348, + "step": 8928 + }, + { + "epoch": 2.659468716841341, + "grad_norm": 0.26697084307670593, + "learning_rate": 1.7215176158967256e-05, + "loss": 1.2305, + "step": 8929 + }, + { + "epoch": 2.6597665630410097, + "grad_norm": 0.2338797152042389, + "learning_rate": 1.7214508263434244e-05, + "loss": 1.2421, + "step": 8930 + }, + { + "epoch": 2.6600644092406784, + "grad_norm": 0.34127315878868103, + "learning_rate": 1.7213840300778297e-05, + "loss": 1.2468, + "step": 8931 + }, + { + "epoch": 2.660362255440347, + "grad_norm": 0.26202157139778137, + "learning_rate": 1.7213172271005623e-05, + "loss": 1.2385, + "step": 8932 + }, + { + "epoch": 2.6606601016400155, + "grad_norm": 0.3033076524734497, + "learning_rate": 1.721250417412244e-05, + "loss": 1.236, + "step": 8933 + }, + { + "epoch": 2.6609579478396843, + "grad_norm": 0.24419964849948883, + "learning_rate": 1.7211836010134965e-05, + "loss": 1.2542, + "step": 8934 + }, + { + "epoch": 2.661255794039353, + "grad_norm": 0.49213454127311707, + "learning_rate": 1.7211167779049414e-05, + "loss": 1.2475, + "step": 8935 + }, + { + "epoch": 2.6615536402390214, + "grad_norm": 0.23509784042835236, + "learning_rate": 1.7210499480872003e-05, + "loss": 1.2434, + "step": 8936 + }, + { + "epoch": 2.66185148643869, + "grad_norm": 0.24729567766189575, + "learning_rate": 1.720983111560895e-05, + "loss": 1.2399, + "step": 8937 + }, + { + "epoch": 2.662149332638359, + "grad_norm": 0.24425852298736572, + "learning_rate": 1.7209162683266473e-05, + "loss": 1.2512, + "step": 8938 + }, + { + "epoch": 2.6624471788380273, + "grad_norm": 0.23785199224948883, + "learning_rate": 1.720849418385079e-05, + "loss": 1.2525, + "step": 8939 + }, + { + "epoch": 2.662745025037696, + "grad_norm": 0.23505166172981262, + "learning_rate": 1.7207825617368125e-05, + "loss": 1.2473, + "step": 8940 + }, + { + "epoch": 2.663042871237365, + "grad_norm": 0.23657561838626862, + "learning_rate": 1.72071569838247e-05, + "loss": 1.254, + "step": 8941 + }, + { + "epoch": 2.6633407174370336, + "grad_norm": 0.23401376605033875, + "learning_rate": 1.7206488283226726e-05, + "loss": 1.2789, + "step": 8942 + }, + { + "epoch": 2.6636385636367024, + "grad_norm": 0.2471046894788742, + "learning_rate": 1.720581951558043e-05, + "loss": 1.2377, + "step": 8943 + }, + { + "epoch": 2.6639364098363707, + "grad_norm": 0.2282712161540985, + "learning_rate": 1.7205150680892035e-05, + "loss": 1.2615, + "step": 8944 + }, + { + "epoch": 2.6642342560360395, + "grad_norm": 0.2226599007844925, + "learning_rate": 1.7204481779167762e-05, + "loss": 1.2393, + "step": 8945 + }, + { + "epoch": 2.6645321022357082, + "grad_norm": 0.23276494443416595, + "learning_rate": 1.7203812810413838e-05, + "loss": 1.2498, + "step": 8946 + }, + { + "epoch": 2.6648299484353766, + "grad_norm": 0.2419513314962387, + "learning_rate": 1.7203143774636485e-05, + "loss": 1.2484, + "step": 8947 + }, + { + "epoch": 2.6651277946350453, + "grad_norm": 0.22819313406944275, + "learning_rate": 1.7202474671841925e-05, + "loss": 1.2448, + "step": 8948 + }, + { + "epoch": 2.665425640834714, + "grad_norm": 0.24116648733615875, + "learning_rate": 1.7201805502036386e-05, + "loss": 1.2422, + "step": 8949 + }, + { + "epoch": 2.6657234870343824, + "grad_norm": 0.25838974118232727, + "learning_rate": 1.7201136265226093e-05, + "loss": 1.2455, + "step": 8950 + }, + { + "epoch": 2.666021333234051, + "grad_norm": 0.22826877236366272, + "learning_rate": 1.7200466961417272e-05, + "loss": 1.2466, + "step": 8951 + }, + { + "epoch": 2.66631917943372, + "grad_norm": 0.2496095895767212, + "learning_rate": 1.719979759061615e-05, + "loss": 1.2475, + "step": 8952 + }, + { + "epoch": 2.6666170256333883, + "grad_norm": 0.24863339960575104, + "learning_rate": 1.7199128152828958e-05, + "loss": 1.2442, + "step": 8953 + }, + { + "epoch": 2.666914871833057, + "grad_norm": 0.2691372334957123, + "learning_rate": 1.719845864806192e-05, + "loss": 1.247, + "step": 8954 + }, + { + "epoch": 2.667212718032726, + "grad_norm": 0.23048639297485352, + "learning_rate": 1.7197789076321267e-05, + "loss": 1.2415, + "step": 8955 + }, + { + "epoch": 2.6675105642323946, + "grad_norm": 0.24400363862514496, + "learning_rate": 1.7197119437613227e-05, + "loss": 1.255, + "step": 8956 + }, + { + "epoch": 2.6678084104320634, + "grad_norm": 0.2377277910709381, + "learning_rate": 1.7196449731944037e-05, + "loss": 1.2548, + "step": 8957 + }, + { + "epoch": 2.6681062566317317, + "grad_norm": 0.23374474048614502, + "learning_rate": 1.719577995931992e-05, + "loss": 1.2587, + "step": 8958 + }, + { + "epoch": 2.6684041028314005, + "grad_norm": 0.28326719999313354, + "learning_rate": 1.7195110119747106e-05, + "loss": 1.243, + "step": 8959 + }, + { + "epoch": 2.6687019490310693, + "grad_norm": 0.2740752696990967, + "learning_rate": 1.7194440213231836e-05, + "loss": 1.2416, + "step": 8960 + }, + { + "epoch": 2.6689997952307376, + "grad_norm": 0.23700512945652008, + "learning_rate": 1.7193770239780336e-05, + "loss": 1.246, + "step": 8961 + }, + { + "epoch": 2.6692976414304064, + "grad_norm": 0.23480559885501862, + "learning_rate": 1.7193100199398843e-05, + "loss": 1.2547, + "step": 8962 + }, + { + "epoch": 2.669595487630075, + "grad_norm": 0.2860928177833557, + "learning_rate": 1.7192430092093586e-05, + "loss": 1.2521, + "step": 8963 + }, + { + "epoch": 2.6698933338297435, + "grad_norm": 0.25524553656578064, + "learning_rate": 1.7191759917870805e-05, + "loss": 1.2596, + "step": 8964 + }, + { + "epoch": 2.6701911800294122, + "grad_norm": 0.25950267910957336, + "learning_rate": 1.719108967673673e-05, + "loss": 1.2354, + "step": 8965 + }, + { + "epoch": 2.670489026229081, + "grad_norm": 0.22618107497692108, + "learning_rate": 1.7190419368697605e-05, + "loss": 1.2401, + "step": 8966 + }, + { + "epoch": 2.6707868724287493, + "grad_norm": 0.24542158842086792, + "learning_rate": 1.718974899375966e-05, + "loss": 1.2799, + "step": 8967 + }, + { + "epoch": 2.671084718628418, + "grad_norm": 0.24449419975280762, + "learning_rate": 1.7189078551929133e-05, + "loss": 1.2433, + "step": 8968 + }, + { + "epoch": 2.671382564828087, + "grad_norm": 0.26330330967903137, + "learning_rate": 1.7188408043212258e-05, + "loss": 1.253, + "step": 8969 + }, + { + "epoch": 2.6716804110277557, + "grad_norm": 0.244807630777359, + "learning_rate": 1.7187737467615285e-05, + "loss": 1.2448, + "step": 8970 + }, + { + "epoch": 2.6719782572274244, + "grad_norm": 0.23932670056819916, + "learning_rate": 1.718706682514444e-05, + "loss": 1.2532, + "step": 8971 + }, + { + "epoch": 2.6722761034270928, + "grad_norm": 0.27525243163108826, + "learning_rate": 1.718639611580597e-05, + "loss": 1.2414, + "step": 8972 + }, + { + "epoch": 2.6725739496267615, + "grad_norm": 0.26601141691207886, + "learning_rate": 1.7185725339606116e-05, + "loss": 1.2491, + "step": 8973 + }, + { + "epoch": 2.6728717958264303, + "grad_norm": 0.2720913589000702, + "learning_rate": 1.718505449655111e-05, + "loss": 1.2546, + "step": 8974 + }, + { + "epoch": 2.6731696420260986, + "grad_norm": 0.31385520100593567, + "learning_rate": 1.71843835866472e-05, + "loss": 1.2641, + "step": 8975 + }, + { + "epoch": 2.6734674882257674, + "grad_norm": 0.23295383155345917, + "learning_rate": 1.7183712609900635e-05, + "loss": 1.2493, + "step": 8976 + }, + { + "epoch": 2.673765334425436, + "grad_norm": 0.262129545211792, + "learning_rate": 1.7183041566317643e-05, + "loss": 1.2486, + "step": 8977 + }, + { + "epoch": 2.6740631806251045, + "grad_norm": 0.22411711513996124, + "learning_rate": 1.7182370455904477e-05, + "loss": 1.243, + "step": 8978 + }, + { + "epoch": 2.6743610268247733, + "grad_norm": 0.31297796964645386, + "learning_rate": 1.7181699278667378e-05, + "loss": 1.2644, + "step": 8979 + }, + { + "epoch": 2.674658873024442, + "grad_norm": 0.23683494329452515, + "learning_rate": 1.7181028034612596e-05, + "loss": 1.2426, + "step": 8980 + }, + { + "epoch": 2.6749567192241104, + "grad_norm": 0.2417498379945755, + "learning_rate": 1.7180356723746365e-05, + "loss": 1.2402, + "step": 8981 + }, + { + "epoch": 2.675254565423779, + "grad_norm": 0.2354615032672882, + "learning_rate": 1.7179685346074938e-05, + "loss": 1.2343, + "step": 8982 + }, + { + "epoch": 2.675552411623448, + "grad_norm": 0.2346847802400589, + "learning_rate": 1.717901390160456e-05, + "loss": 1.2688, + "step": 8983 + }, + { + "epoch": 2.6758502578231167, + "grad_norm": 0.22196222841739655, + "learning_rate": 1.717834239034148e-05, + "loss": 1.2431, + "step": 8984 + }, + { + "epoch": 2.6761481040227855, + "grad_norm": 0.2591729164123535, + "learning_rate": 1.7177670812291946e-05, + "loss": 1.2365, + "step": 8985 + }, + { + "epoch": 2.676445950222454, + "grad_norm": 0.23349237442016602, + "learning_rate": 1.7176999167462204e-05, + "loss": 1.2561, + "step": 8986 + }, + { + "epoch": 2.6767437964221226, + "grad_norm": 0.25721341371536255, + "learning_rate": 1.7176327455858503e-05, + "loss": 1.2503, + "step": 8987 + }, + { + "epoch": 2.6770416426217913, + "grad_norm": 0.22645686566829681, + "learning_rate": 1.717565567748709e-05, + "loss": 1.2461, + "step": 8988 + }, + { + "epoch": 2.6773394888214597, + "grad_norm": 0.2434178739786148, + "learning_rate": 1.717498383235422e-05, + "loss": 1.2475, + "step": 8989 + }, + { + "epoch": 2.6776373350211284, + "grad_norm": 0.23041358590126038, + "learning_rate": 1.7174311920466143e-05, + "loss": 1.2682, + "step": 8990 + }, + { + "epoch": 2.677935181220797, + "grad_norm": 0.2594456076622009, + "learning_rate": 1.7173639941829105e-05, + "loss": 1.2538, + "step": 8991 + }, + { + "epoch": 2.6782330274204655, + "grad_norm": 0.23500095307826996, + "learning_rate": 1.7172967896449367e-05, + "loss": 1.2457, + "step": 8992 + }, + { + "epoch": 2.6785308736201343, + "grad_norm": 0.24464409053325653, + "learning_rate": 1.7172295784333174e-05, + "loss": 1.2386, + "step": 8993 + }, + { + "epoch": 2.678828719819803, + "grad_norm": 0.2371082305908203, + "learning_rate": 1.7171623605486784e-05, + "loss": 1.2359, + "step": 8994 + }, + { + "epoch": 2.679126566019472, + "grad_norm": 0.43126392364501953, + "learning_rate": 1.7170951359916447e-05, + "loss": 1.2472, + "step": 8995 + }, + { + "epoch": 2.67942441221914, + "grad_norm": 0.3418256938457489, + "learning_rate": 1.717027904762842e-05, + "loss": 1.2415, + "step": 8996 + }, + { + "epoch": 2.679722258418809, + "grad_norm": 0.33027663826942444, + "learning_rate": 1.716960666862896e-05, + "loss": 1.2323, + "step": 8997 + }, + { + "epoch": 2.6800201046184777, + "grad_norm": 0.5020977258682251, + "learning_rate": 1.7168934222924317e-05, + "loss": 1.2485, + "step": 8998 + }, + { + "epoch": 2.6803179508181465, + "grad_norm": 0.2648580074310303, + "learning_rate": 1.7168261710520753e-05, + "loss": 1.2492, + "step": 8999 + }, + { + "epoch": 2.680615797017815, + "grad_norm": 0.2582843601703644, + "learning_rate": 1.716758913142452e-05, + "loss": 1.2548, + "step": 9000 + }, + { + "epoch": 2.680615797017815, + "eval_loss": 1.33614182472229, + "eval_runtime": 19.8208, + "eval_samples_per_second": 87.484, + "eval_steps_per_second": 5.499, + "step": 9000 + }, + { + "epoch": 2.6809136432174836, + "grad_norm": 0.25006046891212463, + "learning_rate": 1.7166916485641882e-05, + "loss": 1.2476, + "step": 9001 + }, + { + "epoch": 2.6812114894171524, + "grad_norm": 0.23376592993736267, + "learning_rate": 1.716624377317909e-05, + "loss": 1.2347, + "step": 9002 + }, + { + "epoch": 2.6815093356168207, + "grad_norm": 0.3001675009727478, + "learning_rate": 1.716557099404241e-05, + "loss": 1.2519, + "step": 9003 + }, + { + "epoch": 2.6818071818164895, + "grad_norm": 0.2258872091770172, + "learning_rate": 1.7164898148238094e-05, + "loss": 1.2471, + "step": 9004 + }, + { + "epoch": 2.6821050280161582, + "grad_norm": 0.24445126950740814, + "learning_rate": 1.7164225235772406e-05, + "loss": 1.2362, + "step": 9005 + }, + { + "epoch": 2.6824028742158266, + "grad_norm": 0.25726720690727234, + "learning_rate": 1.7163552256651608e-05, + "loss": 1.2454, + "step": 9006 + }, + { + "epoch": 2.6827007204154953, + "grad_norm": 0.23210914433002472, + "learning_rate": 1.716287921088196e-05, + "loss": 1.2582, + "step": 9007 + }, + { + "epoch": 2.682998566615164, + "grad_norm": 0.29994168877601624, + "learning_rate": 1.7162206098469724e-05, + "loss": 1.2275, + "step": 9008 + }, + { + "epoch": 2.683296412814833, + "grad_norm": 0.24527543783187866, + "learning_rate": 1.716153291942116e-05, + "loss": 1.2275, + "step": 9009 + }, + { + "epoch": 2.6835942590145017, + "grad_norm": 0.2653655707836151, + "learning_rate": 1.7160859673742537e-05, + "loss": 1.2394, + "step": 9010 + }, + { + "epoch": 2.68389210521417, + "grad_norm": 0.2707829177379608, + "learning_rate": 1.7160186361440113e-05, + "loss": 1.2532, + "step": 9011 + }, + { + "epoch": 2.6841899514138388, + "grad_norm": 0.23000174760818481, + "learning_rate": 1.7159512982520153e-05, + "loss": 1.2502, + "step": 9012 + }, + { + "epoch": 2.6844877976135075, + "grad_norm": 0.23766934871673584, + "learning_rate": 1.7158839536988926e-05, + "loss": 1.2514, + "step": 9013 + }, + { + "epoch": 2.684785643813176, + "grad_norm": 0.2337920069694519, + "learning_rate": 1.7158166024852697e-05, + "loss": 1.2418, + "step": 9014 + }, + { + "epoch": 2.6850834900128446, + "grad_norm": 0.22401787340641022, + "learning_rate": 1.715749244611773e-05, + "loss": 1.2556, + "step": 9015 + }, + { + "epoch": 2.6853813362125134, + "grad_norm": 0.254700243473053, + "learning_rate": 1.7156818800790292e-05, + "loss": 1.2355, + "step": 9016 + }, + { + "epoch": 2.6856791824121817, + "grad_norm": 0.24101154506206512, + "learning_rate": 1.715614508887665e-05, + "loss": 1.2473, + "step": 9017 + }, + { + "epoch": 2.6859770286118505, + "grad_norm": 0.24811016023159027, + "learning_rate": 1.7155471310383073e-05, + "loss": 1.243, + "step": 9018 + }, + { + "epoch": 2.6862748748115193, + "grad_norm": 0.23110364377498627, + "learning_rate": 1.7154797465315834e-05, + "loss": 1.2407, + "step": 9019 + }, + { + "epoch": 2.6865727210111876, + "grad_norm": 0.22973401844501495, + "learning_rate": 1.7154123553681194e-05, + "loss": 1.2461, + "step": 9020 + }, + { + "epoch": 2.6868705672108564, + "grad_norm": 0.23058158159255981, + "learning_rate": 1.7153449575485428e-05, + "loss": 1.2422, + "step": 9021 + }, + { + "epoch": 2.687168413410525, + "grad_norm": 0.25269100069999695, + "learning_rate": 1.715277553073481e-05, + "loss": 1.256, + "step": 9022 + }, + { + "epoch": 2.687466259610194, + "grad_norm": 0.24133670330047607, + "learning_rate": 1.7152101419435602e-05, + "loss": 1.2458, + "step": 9023 + }, + { + "epoch": 2.6877641058098627, + "grad_norm": 0.24933025240898132, + "learning_rate": 1.715142724159408e-05, + "loss": 1.243, + "step": 9024 + }, + { + "epoch": 2.688061952009531, + "grad_norm": 0.24056760966777802, + "learning_rate": 1.7150752997216524e-05, + "loss": 1.2355, + "step": 9025 + }, + { + "epoch": 2.6883597982092, + "grad_norm": 0.2292553037405014, + "learning_rate": 1.7150078686309198e-05, + "loss": 1.2432, + "step": 9026 + }, + { + "epoch": 2.6886576444088686, + "grad_norm": 0.2316311001777649, + "learning_rate": 1.714940430887838e-05, + "loss": 1.2593, + "step": 9027 + }, + { + "epoch": 2.688955490608537, + "grad_norm": 0.23243388533592224, + "learning_rate": 1.7148729864930337e-05, + "loss": 1.2449, + "step": 9028 + }, + { + "epoch": 2.6892533368082057, + "grad_norm": 0.23164501786231995, + "learning_rate": 1.7148055354471355e-05, + "loss": 1.2512, + "step": 9029 + }, + { + "epoch": 2.6895511830078744, + "grad_norm": 0.2510435879230499, + "learning_rate": 1.7147380777507703e-05, + "loss": 1.2529, + "step": 9030 + }, + { + "epoch": 2.6898490292075428, + "grad_norm": 0.2775104343891144, + "learning_rate": 1.714670613404566e-05, + "loss": 1.2607, + "step": 9031 + }, + { + "epoch": 2.6901468754072115, + "grad_norm": 0.2796226143836975, + "learning_rate": 1.7146031424091497e-05, + "loss": 1.2476, + "step": 9032 + }, + { + "epoch": 2.6904447216068803, + "grad_norm": 0.2853524088859558, + "learning_rate": 1.7145356647651498e-05, + "loss": 1.2699, + "step": 9033 + }, + { + "epoch": 2.6907425678065486, + "grad_norm": 0.2359253168106079, + "learning_rate": 1.714468180473194e-05, + "loss": 1.233, + "step": 9034 + }, + { + "epoch": 2.6910404140062174, + "grad_norm": 0.42600083351135254, + "learning_rate": 1.7144006895339098e-05, + "loss": 1.2383, + "step": 9035 + }, + { + "epoch": 2.691338260205886, + "grad_norm": 0.28889214992523193, + "learning_rate": 1.7143331919479252e-05, + "loss": 1.2344, + "step": 9036 + }, + { + "epoch": 2.691636106405555, + "grad_norm": 0.2893359959125519, + "learning_rate": 1.7142656877158684e-05, + "loss": 1.2479, + "step": 9037 + }, + { + "epoch": 2.6919339526052237, + "grad_norm": 0.2502742409706116, + "learning_rate": 1.714198176838368e-05, + "loss": 1.2388, + "step": 9038 + }, + { + "epoch": 2.692231798804892, + "grad_norm": 0.31276842951774597, + "learning_rate": 1.7141306593160508e-05, + "loss": 1.2458, + "step": 9039 + }, + { + "epoch": 2.692529645004561, + "grad_norm": 0.22691497206687927, + "learning_rate": 1.714063135149546e-05, + "loss": 1.2313, + "step": 9040 + }, + { + "epoch": 2.6928274912042296, + "grad_norm": 0.23439256846904755, + "learning_rate": 1.7139956043394814e-05, + "loss": 1.2554, + "step": 9041 + }, + { + "epoch": 2.693125337403898, + "grad_norm": 0.24257373809814453, + "learning_rate": 1.7139280668864855e-05, + "loss": 1.254, + "step": 9042 + }, + { + "epoch": 2.6934231836035667, + "grad_norm": 0.23966045677661896, + "learning_rate": 1.713860522791187e-05, + "loss": 1.2383, + "step": 9043 + }, + { + "epoch": 2.6937210298032355, + "grad_norm": 0.24259738624095917, + "learning_rate": 1.7137929720542136e-05, + "loss": 1.2466, + "step": 9044 + }, + { + "epoch": 2.694018876002904, + "grad_norm": 0.2348119467496872, + "learning_rate": 1.7137254146761936e-05, + "loss": 1.2496, + "step": 9045 + }, + { + "epoch": 2.6943167222025726, + "grad_norm": 0.24347592890262604, + "learning_rate": 1.7136578506577562e-05, + "loss": 1.2474, + "step": 9046 + }, + { + "epoch": 2.6946145684022413, + "grad_norm": 0.2700434625148773, + "learning_rate": 1.7135902799995302e-05, + "loss": 1.2574, + "step": 9047 + }, + { + "epoch": 2.6949124146019097, + "grad_norm": 0.28286752104759216, + "learning_rate": 1.713522702702144e-05, + "loss": 1.2407, + "step": 9048 + }, + { + "epoch": 2.6952102608015784, + "grad_norm": 0.35509929060935974, + "learning_rate": 1.713455118766226e-05, + "loss": 1.2573, + "step": 9049 + }, + { + "epoch": 2.695508107001247, + "grad_norm": 0.6191769242286682, + "learning_rate": 1.7133875281924054e-05, + "loss": 1.236, + "step": 9050 + }, + { + "epoch": 2.695805953200916, + "grad_norm": 0.2674236595630646, + "learning_rate": 1.7133199309813106e-05, + "loss": 1.2502, + "step": 9051 + }, + { + "epoch": 2.6961037994005848, + "grad_norm": 0.6483727693557739, + "learning_rate": 1.7132523271335714e-05, + "loss": 1.2507, + "step": 9052 + }, + { + "epoch": 2.696401645600253, + "grad_norm": 0.3155398666858673, + "learning_rate": 1.7131847166498156e-05, + "loss": 1.2408, + "step": 9053 + }, + { + "epoch": 2.696699491799922, + "grad_norm": 0.27703872323036194, + "learning_rate": 1.713117099530673e-05, + "loss": 1.2381, + "step": 9054 + }, + { + "epoch": 2.6969973379995906, + "grad_norm": 0.24894583225250244, + "learning_rate": 1.7130494757767724e-05, + "loss": 1.2517, + "step": 9055 + }, + { + "epoch": 2.697295184199259, + "grad_norm": 0.24170449376106262, + "learning_rate": 1.7129818453887432e-05, + "loss": 1.2401, + "step": 9056 + }, + { + "epoch": 2.6975930303989277, + "grad_norm": 0.2369874268770218, + "learning_rate": 1.7129142083672147e-05, + "loss": 1.2461, + "step": 9057 + }, + { + "epoch": 2.6978908765985965, + "grad_norm": 0.2657559812068939, + "learning_rate": 1.712846564712816e-05, + "loss": 1.2578, + "step": 9058 + }, + { + "epoch": 2.698188722798265, + "grad_norm": 0.25176188349723816, + "learning_rate": 1.712778914426176e-05, + "loss": 1.2268, + "step": 9059 + }, + { + "epoch": 2.6984865689979336, + "grad_norm": 0.22299958765506744, + "learning_rate": 1.712711257507925e-05, + "loss": 1.2458, + "step": 9060 + }, + { + "epoch": 2.6987844151976024, + "grad_norm": 0.2352006584405899, + "learning_rate": 1.712643593958692e-05, + "loss": 1.2495, + "step": 9061 + }, + { + "epoch": 2.699082261397271, + "grad_norm": 0.22989533841609955, + "learning_rate": 1.7125759237791065e-05, + "loss": 1.2413, + "step": 9062 + }, + { + "epoch": 2.6993801075969395, + "grad_norm": 0.23323993384838104, + "learning_rate": 1.712508246969798e-05, + "loss": 1.2525, + "step": 9063 + }, + { + "epoch": 2.6996779537966082, + "grad_norm": 0.23476077616214752, + "learning_rate": 1.7124405635313964e-05, + "loss": 1.2624, + "step": 9064 + }, + { + "epoch": 2.699975799996277, + "grad_norm": 0.23075537383556366, + "learning_rate": 1.7123728734645316e-05, + "loss": 1.2441, + "step": 9065 + }, + { + "epoch": 2.700273646195946, + "grad_norm": 0.236238032579422, + "learning_rate": 1.712305176769833e-05, + "loss": 1.2417, + "step": 9066 + }, + { + "epoch": 2.700571492395614, + "grad_norm": 0.2502848505973816, + "learning_rate": 1.7122374734479304e-05, + "loss": 1.2418, + "step": 9067 + }, + { + "epoch": 2.700869338595283, + "grad_norm": 0.235133096575737, + "learning_rate": 1.712169763499454e-05, + "loss": 1.2432, + "step": 9068 + }, + { + "epoch": 2.7011671847949517, + "grad_norm": 0.23175573348999023, + "learning_rate": 1.7121020469250336e-05, + "loss": 1.2393, + "step": 9069 + }, + { + "epoch": 2.70146503099462, + "grad_norm": 0.22638940811157227, + "learning_rate": 1.7120343237252994e-05, + "loss": 1.2297, + "step": 9070 + }, + { + "epoch": 2.7017628771942888, + "grad_norm": 0.23068824410438538, + "learning_rate": 1.7119665939008808e-05, + "loss": 1.2597, + "step": 9071 + }, + { + "epoch": 2.7020607233939575, + "grad_norm": 0.23865114152431488, + "learning_rate": 1.7118988574524095e-05, + "loss": 1.2633, + "step": 9072 + }, + { + "epoch": 2.702358569593626, + "grad_norm": 0.22494173049926758, + "learning_rate": 1.711831114380514e-05, + "loss": 1.2515, + "step": 9073 + }, + { + "epoch": 2.7026564157932946, + "grad_norm": 0.2372240573167801, + "learning_rate": 1.7117633646858252e-05, + "loss": 1.2282, + "step": 9074 + }, + { + "epoch": 2.7029542619929634, + "grad_norm": 0.238014817237854, + "learning_rate": 1.7116956083689737e-05, + "loss": 1.2564, + "step": 9075 + }, + { + "epoch": 2.703252108192632, + "grad_norm": 0.2317420244216919, + "learning_rate": 1.7116278454305898e-05, + "loss": 1.2509, + "step": 9076 + }, + { + "epoch": 2.703549954392301, + "grad_norm": 0.23609192669391632, + "learning_rate": 1.711560075871304e-05, + "loss": 1.2327, + "step": 9077 + }, + { + "epoch": 2.7038478005919693, + "grad_norm": 0.2346174120903015, + "learning_rate": 1.7114922996917465e-05, + "loss": 1.2454, + "step": 9078 + }, + { + "epoch": 2.704145646791638, + "grad_norm": 0.2298157811164856, + "learning_rate": 1.711424516892548e-05, + "loss": 1.258, + "step": 9079 + }, + { + "epoch": 2.704443492991307, + "grad_norm": 0.22545742988586426, + "learning_rate": 1.711356727474339e-05, + "loss": 1.2577, + "step": 9080 + }, + { + "epoch": 2.704741339190975, + "grad_norm": 0.23808005452156067, + "learning_rate": 1.7112889314377508e-05, + "loss": 1.2388, + "step": 9081 + }, + { + "epoch": 2.705039185390644, + "grad_norm": 0.21936507523059845, + "learning_rate": 1.711221128783414e-05, + "loss": 1.2303, + "step": 9082 + }, + { + "epoch": 2.7053370315903127, + "grad_norm": 0.23477523028850555, + "learning_rate": 1.7111533195119586e-05, + "loss": 1.2479, + "step": 9083 + }, + { + "epoch": 2.705634877789981, + "grad_norm": 0.2425217479467392, + "learning_rate": 1.711085503624017e-05, + "loss": 1.244, + "step": 9084 + }, + { + "epoch": 2.70593272398965, + "grad_norm": 0.2296759933233261, + "learning_rate": 1.7110176811202185e-05, + "loss": 1.2626, + "step": 9085 + }, + { + "epoch": 2.7062305701893186, + "grad_norm": 0.22694678604602814, + "learning_rate": 1.710949852001195e-05, + "loss": 1.2469, + "step": 9086 + }, + { + "epoch": 2.706528416388987, + "grad_norm": 0.225328266620636, + "learning_rate": 1.7108820162675777e-05, + "loss": 1.2362, + "step": 9087 + }, + { + "epoch": 2.7068262625886557, + "grad_norm": 0.24670012295246124, + "learning_rate": 1.7108141739199973e-05, + "loss": 1.2357, + "step": 9088 + }, + { + "epoch": 2.7071241087883244, + "grad_norm": 0.23392459750175476, + "learning_rate": 1.7107463249590852e-05, + "loss": 1.241, + "step": 9089 + }, + { + "epoch": 2.707421954987993, + "grad_norm": 0.2339889109134674, + "learning_rate": 1.7106784693854726e-05, + "loss": 1.238, + "step": 9090 + }, + { + "epoch": 2.707719801187662, + "grad_norm": 0.23321010172367096, + "learning_rate": 1.710610607199791e-05, + "loss": 1.2428, + "step": 9091 + }, + { + "epoch": 2.7080176473873303, + "grad_norm": 0.2305080145597458, + "learning_rate": 1.7105427384026715e-05, + "loss": 1.2368, + "step": 9092 + }, + { + "epoch": 2.708315493586999, + "grad_norm": 0.23064188659191132, + "learning_rate": 1.7104748629947456e-05, + "loss": 1.2597, + "step": 9093 + }, + { + "epoch": 2.708613339786668, + "grad_norm": 0.2329948991537094, + "learning_rate": 1.710406980976645e-05, + "loss": 1.2711, + "step": 9094 + }, + { + "epoch": 2.708911185986336, + "grad_norm": 0.24623043835163116, + "learning_rate": 1.7103390923490013e-05, + "loss": 1.2457, + "step": 9095 + }, + { + "epoch": 2.709209032186005, + "grad_norm": 0.23980486392974854, + "learning_rate": 1.7102711971124458e-05, + "loss": 1.2447, + "step": 9096 + }, + { + "epoch": 2.7095068783856737, + "grad_norm": 0.23110947012901306, + "learning_rate": 1.7102032952676103e-05, + "loss": 1.2432, + "step": 9097 + }, + { + "epoch": 2.709804724585342, + "grad_norm": 0.23680636286735535, + "learning_rate": 1.7101353868151268e-05, + "loss": 1.2545, + "step": 9098 + }, + { + "epoch": 2.710102570785011, + "grad_norm": 0.2491663545370102, + "learning_rate": 1.7100674717556273e-05, + "loss": 1.239, + "step": 9099 + }, + { + "epoch": 2.7104004169846796, + "grad_norm": 0.22969883680343628, + "learning_rate": 1.7099995500897425e-05, + "loss": 1.2437, + "step": 9100 + }, + { + "epoch": 2.710698263184348, + "grad_norm": 0.23873046040534973, + "learning_rate": 1.709931621818106e-05, + "loss": 1.2533, + "step": 9101 + }, + { + "epoch": 2.7109961093840167, + "grad_norm": 0.24019987881183624, + "learning_rate": 1.7098636869413483e-05, + "loss": 1.2535, + "step": 9102 + }, + { + "epoch": 2.7112939555836855, + "grad_norm": 0.23425638675689697, + "learning_rate": 1.709795745460102e-05, + "loss": 1.2585, + "step": 9103 + }, + { + "epoch": 2.7115918017833542, + "grad_norm": 0.22539055347442627, + "learning_rate": 1.7097277973749998e-05, + "loss": 1.2401, + "step": 9104 + }, + { + "epoch": 2.711889647983023, + "grad_norm": 0.2362004667520523, + "learning_rate": 1.7096598426866732e-05, + "loss": 1.2391, + "step": 9105 + }, + { + "epoch": 2.7121874941826913, + "grad_norm": 0.22755102813243866, + "learning_rate": 1.7095918813957547e-05, + "loss": 1.2386, + "step": 9106 + }, + { + "epoch": 2.71248534038236, + "grad_norm": 0.2261369824409485, + "learning_rate": 1.7095239135028767e-05, + "loss": 1.2402, + "step": 9107 + }, + { + "epoch": 2.712783186582029, + "grad_norm": 0.23672804236412048, + "learning_rate": 1.709455939008671e-05, + "loss": 1.2347, + "step": 9108 + }, + { + "epoch": 2.713081032781697, + "grad_norm": 0.23127327859401703, + "learning_rate": 1.7093879579137705e-05, + "loss": 1.2493, + "step": 9109 + }, + { + "epoch": 2.713378878981366, + "grad_norm": 0.2461974322795868, + "learning_rate": 1.709319970218808e-05, + "loss": 1.248, + "step": 9110 + }, + { + "epoch": 2.7136767251810348, + "grad_norm": 0.24692919850349426, + "learning_rate": 1.7092519759244153e-05, + "loss": 1.2595, + "step": 9111 + }, + { + "epoch": 2.713974571380703, + "grad_norm": 0.23184597492218018, + "learning_rate": 1.7091839750312255e-05, + "loss": 1.2519, + "step": 9112 + }, + { + "epoch": 2.714272417580372, + "grad_norm": 0.2404884696006775, + "learning_rate": 1.7091159675398713e-05, + "loss": 1.2429, + "step": 9113 + }, + { + "epoch": 2.7145702637800406, + "grad_norm": 0.2422194927930832, + "learning_rate": 1.7090479534509853e-05, + "loss": 1.2648, + "step": 9114 + }, + { + "epoch": 2.714868109979709, + "grad_norm": 0.23007488250732422, + "learning_rate": 1.7089799327652003e-05, + "loss": 1.2389, + "step": 9115 + }, + { + "epoch": 2.7151659561793777, + "grad_norm": 0.23888500034809113, + "learning_rate": 1.7089119054831486e-05, + "loss": 1.2513, + "step": 9116 + }, + { + "epoch": 2.7154638023790465, + "grad_norm": 0.22726529836654663, + "learning_rate": 1.708843871605464e-05, + "loss": 1.2396, + "step": 9117 + }, + { + "epoch": 2.7157616485787153, + "grad_norm": 0.2422054260969162, + "learning_rate": 1.7087758311327794e-05, + "loss": 1.2355, + "step": 9118 + }, + { + "epoch": 2.716059494778384, + "grad_norm": 0.23579560220241547, + "learning_rate": 1.7087077840657274e-05, + "loss": 1.2475, + "step": 9119 + }, + { + "epoch": 2.7163573409780524, + "grad_norm": 0.22729834914207458, + "learning_rate": 1.7086397304049413e-05, + "loss": 1.2398, + "step": 9120 + }, + { + "epoch": 2.716655187177721, + "grad_norm": 0.23567086458206177, + "learning_rate": 1.708571670151054e-05, + "loss": 1.2448, + "step": 9121 + }, + { + "epoch": 2.71695303337739, + "grad_norm": 0.22253277897834778, + "learning_rate": 1.7085036033046996e-05, + "loss": 1.2379, + "step": 9122 + }, + { + "epoch": 2.7172508795770582, + "grad_norm": 0.22736045718193054, + "learning_rate": 1.7084355298665104e-05, + "loss": 1.2387, + "step": 9123 + }, + { + "epoch": 2.717548725776727, + "grad_norm": 0.23885032534599304, + "learning_rate": 1.70836744983712e-05, + "loss": 1.2433, + "step": 9124 + }, + { + "epoch": 2.717846571976396, + "grad_norm": 0.24466484785079956, + "learning_rate": 1.7082993632171622e-05, + "loss": 1.2455, + "step": 9125 + }, + { + "epoch": 2.718144418176064, + "grad_norm": 0.25239646434783936, + "learning_rate": 1.7082312700072697e-05, + "loss": 1.2287, + "step": 9126 + }, + { + "epoch": 2.718442264375733, + "grad_norm": 0.23017901182174683, + "learning_rate": 1.708163170208077e-05, + "loss": 1.2397, + "step": 9127 + }, + { + "epoch": 2.7187401105754017, + "grad_norm": 0.2403314709663391, + "learning_rate": 1.708095063820217e-05, + "loss": 1.2339, + "step": 9128 + }, + { + "epoch": 2.7190379567750704, + "grad_norm": 0.2271287739276886, + "learning_rate": 1.708026950844323e-05, + "loss": 1.2424, + "step": 9129 + }, + { + "epoch": 2.7193358029747388, + "grad_norm": 0.2517382502555847, + "learning_rate": 1.70795883128103e-05, + "loss": 1.2648, + "step": 9130 + }, + { + "epoch": 2.7196336491744075, + "grad_norm": 0.2596951723098755, + "learning_rate": 1.707890705130971e-05, + "loss": 1.2476, + "step": 9131 + }, + { + "epoch": 2.7199314953740763, + "grad_norm": 0.24782678484916687, + "learning_rate": 1.7078225723947798e-05, + "loss": 1.2248, + "step": 9132 + }, + { + "epoch": 2.720229341573745, + "grad_norm": 0.2564922571182251, + "learning_rate": 1.7077544330730903e-05, + "loss": 1.2654, + "step": 9133 + }, + { + "epoch": 2.7205271877734134, + "grad_norm": 0.2629776895046234, + "learning_rate": 1.707686287166537e-05, + "loss": 1.2564, + "step": 9134 + }, + { + "epoch": 2.720825033973082, + "grad_norm": 0.2627808451652527, + "learning_rate": 1.7076181346757527e-05, + "loss": 1.2343, + "step": 9135 + }, + { + "epoch": 2.721122880172751, + "grad_norm": 0.26616424322128296, + "learning_rate": 1.7075499756013728e-05, + "loss": 1.2442, + "step": 9136 + }, + { + "epoch": 2.7214207263724193, + "grad_norm": 0.3545578122138977, + "learning_rate": 1.7074818099440306e-05, + "loss": 1.2486, + "step": 9137 + }, + { + "epoch": 2.721718572572088, + "grad_norm": 0.24880005419254303, + "learning_rate": 1.707413637704361e-05, + "loss": 1.2425, + "step": 9138 + }, + { + "epoch": 2.722016418771757, + "grad_norm": 0.26369282603263855, + "learning_rate": 1.7073454588829976e-05, + "loss": 1.2397, + "step": 9139 + }, + { + "epoch": 2.722314264971425, + "grad_norm": 0.23869788646697998, + "learning_rate": 1.707277273480575e-05, + "loss": 1.2683, + "step": 9140 + }, + { + "epoch": 2.722612111171094, + "grad_norm": 0.2874651849269867, + "learning_rate": 1.7072090814977275e-05, + "loss": 1.2305, + "step": 9141 + }, + { + "epoch": 2.7229099573707627, + "grad_norm": 0.25917285680770874, + "learning_rate": 1.7071408829350896e-05, + "loss": 1.2485, + "step": 9142 + }, + { + "epoch": 2.7232078035704315, + "grad_norm": 0.2596162259578705, + "learning_rate": 1.707072677793296e-05, + "loss": 1.2544, + "step": 9143 + }, + { + "epoch": 2.7235056497701002, + "grad_norm": 0.26855939626693726, + "learning_rate": 1.7070044660729814e-05, + "loss": 1.2483, + "step": 9144 + }, + { + "epoch": 2.7238034959697686, + "grad_norm": 0.23040041327476501, + "learning_rate": 1.7069362477747798e-05, + "loss": 1.2545, + "step": 9145 + }, + { + "epoch": 2.7241013421694373, + "grad_norm": 0.2614309787750244, + "learning_rate": 1.7068680228993263e-05, + "loss": 1.2396, + "step": 9146 + }, + { + "epoch": 2.724399188369106, + "grad_norm": 0.23362092673778534, + "learning_rate": 1.7067997914472557e-05, + "loss": 1.2332, + "step": 9147 + }, + { + "epoch": 2.7246970345687744, + "grad_norm": 0.320008248090744, + "learning_rate": 1.7067315534192024e-05, + "loss": 1.2546, + "step": 9148 + }, + { + "epoch": 2.724994880768443, + "grad_norm": 0.25644442439079285, + "learning_rate": 1.7066633088158017e-05, + "loss": 1.2362, + "step": 9149 + }, + { + "epoch": 2.725292726968112, + "grad_norm": 0.24778707325458527, + "learning_rate": 1.7065950576376886e-05, + "loss": 1.2299, + "step": 9150 + }, + { + "epoch": 2.7255905731677803, + "grad_norm": 0.2621813118457794, + "learning_rate": 1.706526799885498e-05, + "loss": 1.2734, + "step": 9151 + }, + { + "epoch": 2.725888419367449, + "grad_norm": 0.2463354915380478, + "learning_rate": 1.706458535559865e-05, + "loss": 1.2475, + "step": 9152 + }, + { + "epoch": 2.726186265567118, + "grad_norm": 0.26674890518188477, + "learning_rate": 1.7063902646614242e-05, + "loss": 1.2425, + "step": 9153 + }, + { + "epoch": 2.726484111766786, + "grad_norm": 0.2545463442802429, + "learning_rate": 1.7063219871908118e-05, + "loss": 1.2567, + "step": 9154 + }, + { + "epoch": 2.726781957966455, + "grad_norm": 0.2650394141674042, + "learning_rate": 1.706253703148662e-05, + "loss": 1.2505, + "step": 9155 + }, + { + "epoch": 2.7270798041661237, + "grad_norm": 0.23900756239891052, + "learning_rate": 1.7061854125356107e-05, + "loss": 1.2443, + "step": 9156 + }, + { + "epoch": 2.7273776503657925, + "grad_norm": 0.22633816301822662, + "learning_rate": 1.7061171153522932e-05, + "loss": 1.2559, + "step": 9157 + }, + { + "epoch": 2.7276754965654613, + "grad_norm": 0.269744873046875, + "learning_rate": 1.7060488115993448e-05, + "loss": 1.2538, + "step": 9158 + }, + { + "epoch": 2.7279733427651296, + "grad_norm": 0.3422834277153015, + "learning_rate": 1.7059805012774008e-05, + "loss": 1.267, + "step": 9159 + }, + { + "epoch": 2.7282711889647984, + "grad_norm": 0.28951409459114075, + "learning_rate": 1.7059121843870975e-05, + "loss": 1.2645, + "step": 9160 + }, + { + "epoch": 2.728569035164467, + "grad_norm": 0.25171276926994324, + "learning_rate": 1.7058438609290697e-05, + "loss": 1.2359, + "step": 9161 + }, + { + "epoch": 2.7288668813641355, + "grad_norm": 0.3104507029056549, + "learning_rate": 1.7057755309039535e-05, + "loss": 1.2523, + "step": 9162 + }, + { + "epoch": 2.7291647275638042, + "grad_norm": 0.2615697383880615, + "learning_rate": 1.7057071943123845e-05, + "loss": 1.2636, + "step": 9163 + }, + { + "epoch": 2.729462573763473, + "grad_norm": 0.264263391494751, + "learning_rate": 1.7056388511549985e-05, + "loss": 1.2327, + "step": 9164 + }, + { + "epoch": 2.7297604199631413, + "grad_norm": 0.30043691396713257, + "learning_rate": 1.7055705014324313e-05, + "loss": 1.2422, + "step": 9165 + }, + { + "epoch": 2.73005826616281, + "grad_norm": 0.25919532775878906, + "learning_rate": 1.7055021451453188e-05, + "loss": 1.2371, + "step": 9166 + }, + { + "epoch": 2.730356112362479, + "grad_norm": 0.4882142245769501, + "learning_rate": 1.7054337822942976e-05, + "loss": 1.2396, + "step": 9167 + }, + { + "epoch": 2.730653958562147, + "grad_norm": 0.302798867225647, + "learning_rate": 1.7053654128800026e-05, + "loss": 1.2582, + "step": 9168 + }, + { + "epoch": 2.730951804761816, + "grad_norm": 0.2920604348182678, + "learning_rate": 1.705297036903071e-05, + "loss": 1.2511, + "step": 9169 + }, + { + "epoch": 2.7312496509614848, + "grad_norm": 0.2750656306743622, + "learning_rate": 1.705228654364138e-05, + "loss": 1.2484, + "step": 9170 + }, + { + "epoch": 2.7315474971611535, + "grad_norm": 0.3999350666999817, + "learning_rate": 1.7051602652638405e-05, + "loss": 1.2464, + "step": 9171 + }, + { + "epoch": 2.7318453433608223, + "grad_norm": 0.2648412883281708, + "learning_rate": 1.7050918696028147e-05, + "loss": 1.2611, + "step": 9172 + }, + { + "epoch": 2.7321431895604906, + "grad_norm": 0.23810319602489471, + "learning_rate": 1.7050234673816967e-05, + "loss": 1.2693, + "step": 9173 + }, + { + "epoch": 2.7324410357601594, + "grad_norm": 0.24768781661987305, + "learning_rate": 1.7049550586011234e-05, + "loss": 1.2419, + "step": 9174 + }, + { + "epoch": 2.732738881959828, + "grad_norm": 0.23804834485054016, + "learning_rate": 1.7048866432617303e-05, + "loss": 1.2585, + "step": 9175 + }, + { + "epoch": 2.7330367281594965, + "grad_norm": 0.23359544575214386, + "learning_rate": 1.7048182213641548e-05, + "loss": 1.2363, + "step": 9176 + }, + { + "epoch": 2.7333345743591653, + "grad_norm": 0.24701635539531708, + "learning_rate": 1.7047497929090332e-05, + "loss": 1.2334, + "step": 9177 + }, + { + "epoch": 2.733632420558834, + "grad_norm": 0.43969398736953735, + "learning_rate": 1.704681357897002e-05, + "loss": 1.246, + "step": 9178 + }, + { + "epoch": 2.7339302667585024, + "grad_norm": 0.27062731981277466, + "learning_rate": 1.7046129163286985e-05, + "loss": 1.257, + "step": 9179 + }, + { + "epoch": 2.734228112958171, + "grad_norm": 0.23545852303504944, + "learning_rate": 1.704544468204759e-05, + "loss": 1.2603, + "step": 9180 + }, + { + "epoch": 2.73452595915784, + "grad_norm": 0.23939058184623718, + "learning_rate": 1.7044760135258203e-05, + "loss": 1.2456, + "step": 9181 + }, + { + "epoch": 2.7348238053575082, + "grad_norm": 0.25682955980300903, + "learning_rate": 1.7044075522925192e-05, + "loss": 1.2545, + "step": 9182 + }, + { + "epoch": 2.735121651557177, + "grad_norm": 0.24427543580532074, + "learning_rate": 1.704339084505493e-05, + "loss": 1.2584, + "step": 9183 + }, + { + "epoch": 2.735419497756846, + "grad_norm": 0.33743566274642944, + "learning_rate": 1.7042706101653784e-05, + "loss": 1.2453, + "step": 9184 + }, + { + "epoch": 2.7357173439565146, + "grad_norm": 0.2339986264705658, + "learning_rate": 1.704202129272813e-05, + "loss": 1.2458, + "step": 9185 + }, + { + "epoch": 2.7360151901561833, + "grad_norm": 0.24717910587787628, + "learning_rate": 1.704133641828433e-05, + "loss": 1.2488, + "step": 9186 + }, + { + "epoch": 2.7363130363558517, + "grad_norm": 0.2506367564201355, + "learning_rate": 1.7040651478328765e-05, + "loss": 1.2379, + "step": 9187 + }, + { + "epoch": 2.7366108825555204, + "grad_norm": 0.24618619680404663, + "learning_rate": 1.7039966472867805e-05, + "loss": 1.2304, + "step": 9188 + }, + { + "epoch": 2.736908728755189, + "grad_norm": 0.24516215920448303, + "learning_rate": 1.7039281401907822e-05, + "loss": 1.2407, + "step": 9189 + }, + { + "epoch": 2.7372065749548575, + "grad_norm": 0.2300100177526474, + "learning_rate": 1.7038596265455188e-05, + "loss": 1.2432, + "step": 9190 + }, + { + "epoch": 2.7375044211545263, + "grad_norm": 0.25353389978408813, + "learning_rate": 1.7037911063516285e-05, + "loss": 1.2352, + "step": 9191 + }, + { + "epoch": 2.737802267354195, + "grad_norm": 0.23846863210201263, + "learning_rate": 1.703722579609748e-05, + "loss": 1.2391, + "step": 9192 + }, + { + "epoch": 2.7381001135538634, + "grad_norm": 0.2458648383617401, + "learning_rate": 1.703654046320515e-05, + "loss": 1.2521, + "step": 9193 + }, + { + "epoch": 2.738397959753532, + "grad_norm": 0.2297109216451645, + "learning_rate": 1.7035855064845672e-05, + "loss": 1.2378, + "step": 9194 + }, + { + "epoch": 2.738695805953201, + "grad_norm": 0.23073622584342957, + "learning_rate": 1.7035169601025426e-05, + "loss": 1.2488, + "step": 9195 + }, + { + "epoch": 2.7389936521528697, + "grad_norm": 0.24436400830745697, + "learning_rate": 1.7034484071750786e-05, + "loss": 1.2352, + "step": 9196 + }, + { + "epoch": 2.739291498352538, + "grad_norm": 0.24284270405769348, + "learning_rate": 1.703379847702813e-05, + "loss": 1.2321, + "step": 9197 + }, + { + "epoch": 2.739589344552207, + "grad_norm": 0.27496808767318726, + "learning_rate": 1.703311281686384e-05, + "loss": 1.2525, + "step": 9198 + }, + { + "epoch": 2.7398871907518756, + "grad_norm": 0.2907407581806183, + "learning_rate": 1.703242709126429e-05, + "loss": 1.2437, + "step": 9199 + }, + { + "epoch": 2.7401850369515444, + "grad_norm": 0.2722211182117462, + "learning_rate": 1.7031741300235863e-05, + "loss": 1.262, + "step": 9200 + }, + { + "epoch": 2.7404828831512127, + "grad_norm": 0.41545718908309937, + "learning_rate": 1.7031055443784943e-05, + "loss": 1.2439, + "step": 9201 + }, + { + "epoch": 2.7407807293508815, + "grad_norm": 0.2844906151294708, + "learning_rate": 1.7030369521917908e-05, + "loss": 1.2572, + "step": 9202 + }, + { + "epoch": 2.7410785755505502, + "grad_norm": 0.28576919436454773, + "learning_rate": 1.7029683534641136e-05, + "loss": 1.2475, + "step": 9203 + }, + { + "epoch": 2.7413764217502186, + "grad_norm": 0.30909910798072815, + "learning_rate": 1.7028997481961016e-05, + "loss": 1.2406, + "step": 9204 + }, + { + "epoch": 2.7416742679498873, + "grad_norm": 0.2515838146209717, + "learning_rate": 1.7028311363883925e-05, + "loss": 1.2579, + "step": 9205 + }, + { + "epoch": 2.741972114149556, + "grad_norm": 0.2506393492221832, + "learning_rate": 1.7027625180416247e-05, + "loss": 1.2601, + "step": 9206 + }, + { + "epoch": 2.7422699603492244, + "grad_norm": 0.2576870024204254, + "learning_rate": 1.7026938931564374e-05, + "loss": 1.2521, + "step": 9207 + }, + { + "epoch": 2.742567806548893, + "grad_norm": 0.24623283743858337, + "learning_rate": 1.7026252617334683e-05, + "loss": 1.2455, + "step": 9208 + }, + { + "epoch": 2.742865652748562, + "grad_norm": 0.30400317907333374, + "learning_rate": 1.702556623773356e-05, + "loss": 1.2555, + "step": 9209 + }, + { + "epoch": 2.7431634989482307, + "grad_norm": 0.2484186589717865, + "learning_rate": 1.7024879792767395e-05, + "loss": 1.2636, + "step": 9210 + }, + { + "epoch": 2.7434613451478995, + "grad_norm": 0.2822059392929077, + "learning_rate": 1.702419328244257e-05, + "loss": 1.2549, + "step": 9211 + }, + { + "epoch": 2.743759191347568, + "grad_norm": 0.24002540111541748, + "learning_rate": 1.7023506706765477e-05, + "loss": 1.2465, + "step": 9212 + }, + { + "epoch": 2.7440570375472366, + "grad_norm": 0.33280149102211, + "learning_rate": 1.70228200657425e-05, + "loss": 1.248, + "step": 9213 + }, + { + "epoch": 2.7443548837469054, + "grad_norm": 0.22955799102783203, + "learning_rate": 1.7022133359380028e-05, + "loss": 1.2651, + "step": 9214 + }, + { + "epoch": 2.7446527299465737, + "grad_norm": 0.2753639817237854, + "learning_rate": 1.702144658768445e-05, + "loss": 1.2535, + "step": 9215 + }, + { + "epoch": 2.7449505761462425, + "grad_norm": 0.25297975540161133, + "learning_rate": 1.7020759750662156e-05, + "loss": 1.2441, + "step": 9216 + }, + { + "epoch": 2.7452484223459113, + "grad_norm": 0.31673431396484375, + "learning_rate": 1.702007284831954e-05, + "loss": 1.2441, + "step": 9217 + }, + { + "epoch": 2.7455462685455796, + "grad_norm": 0.24094048142433167, + "learning_rate": 1.7019385880662985e-05, + "loss": 1.2575, + "step": 9218 + }, + { + "epoch": 2.7458441147452484, + "grad_norm": 0.2683247923851013, + "learning_rate": 1.7018698847698893e-05, + "loss": 1.268, + "step": 9219 + }, + { + "epoch": 2.746141960944917, + "grad_norm": 0.2574305236339569, + "learning_rate": 1.7018011749433646e-05, + "loss": 1.2405, + "step": 9220 + }, + { + "epoch": 2.7464398071445855, + "grad_norm": 0.28552350401878357, + "learning_rate": 1.701732458587364e-05, + "loss": 1.2457, + "step": 9221 + }, + { + "epoch": 2.7467376533442542, + "grad_norm": 0.2695595920085907, + "learning_rate": 1.701663735702527e-05, + "loss": 1.2521, + "step": 9222 + }, + { + "epoch": 2.747035499543923, + "grad_norm": 0.27349793910980225, + "learning_rate": 1.7015950062894928e-05, + "loss": 1.2538, + "step": 9223 + }, + { + "epoch": 2.747333345743592, + "grad_norm": 0.27242761850357056, + "learning_rate": 1.701526270348901e-05, + "loss": 1.2426, + "step": 9224 + }, + { + "epoch": 2.7476311919432606, + "grad_norm": 0.2456509917974472, + "learning_rate": 1.701457527881391e-05, + "loss": 1.2544, + "step": 9225 + }, + { + "epoch": 2.747929038142929, + "grad_norm": 0.27910783886909485, + "learning_rate": 1.7013887788876025e-05, + "loss": 1.2508, + "step": 9226 + }, + { + "epoch": 2.7482268843425977, + "grad_norm": 0.26685217022895813, + "learning_rate": 1.7013200233681752e-05, + "loss": 1.2436, + "step": 9227 + }, + { + "epoch": 2.7485247305422664, + "grad_norm": 0.2567109763622284, + "learning_rate": 1.7012512613237488e-05, + "loss": 1.2398, + "step": 9228 + }, + { + "epoch": 2.7488225767419348, + "grad_norm": 0.24262216687202454, + "learning_rate": 1.701182492754962e-05, + "loss": 1.2368, + "step": 9229 + }, + { + "epoch": 2.7491204229416035, + "grad_norm": 0.2572483420372009, + "learning_rate": 1.7011137176624562e-05, + "loss": 1.2506, + "step": 9230 + }, + { + "epoch": 2.7494182691412723, + "grad_norm": 0.24260063469409943, + "learning_rate": 1.7010449360468704e-05, + "loss": 1.257, + "step": 9231 + }, + { + "epoch": 2.7497161153409406, + "grad_norm": 0.24273715913295746, + "learning_rate": 1.700976147908845e-05, + "loss": 1.2426, + "step": 9232 + }, + { + "epoch": 2.7500139615406094, + "grad_norm": 0.2752387225627899, + "learning_rate": 1.7009073532490195e-05, + "loss": 1.2475, + "step": 9233 + }, + { + "epoch": 2.750311807740278, + "grad_norm": 0.22555620968341827, + "learning_rate": 1.700838552068034e-05, + "loss": 1.2351, + "step": 9234 + }, + { + "epoch": 2.7506096539399465, + "grad_norm": 0.26723402738571167, + "learning_rate": 1.7007697443665292e-05, + "loss": 1.2399, + "step": 9235 + }, + { + "epoch": 2.7509075001396153, + "grad_norm": 0.27106043696403503, + "learning_rate": 1.7007009301451446e-05, + "loss": 1.2398, + "step": 9236 + }, + { + "epoch": 2.751205346339284, + "grad_norm": 0.2399388551712036, + "learning_rate": 1.7006321094045205e-05, + "loss": 1.2508, + "step": 9237 + }, + { + "epoch": 2.751503192538953, + "grad_norm": 0.2277744561433792, + "learning_rate": 1.700563282145298e-05, + "loss": 1.2556, + "step": 9238 + }, + { + "epoch": 2.7518010387386216, + "grad_norm": 0.23171816766262054, + "learning_rate": 1.7004944483681164e-05, + "loss": 1.2339, + "step": 9239 + }, + { + "epoch": 2.75209888493829, + "grad_norm": 0.24868685007095337, + "learning_rate": 1.7004256080736167e-05, + "loss": 1.255, + "step": 9240 + }, + { + "epoch": 2.7523967311379587, + "grad_norm": 0.2417503297328949, + "learning_rate": 1.7003567612624393e-05, + "loss": 1.2489, + "step": 9241 + }, + { + "epoch": 2.7526945773376275, + "grad_norm": 0.27626389265060425, + "learning_rate": 1.7002879079352247e-05, + "loss": 1.247, + "step": 9242 + }, + { + "epoch": 2.752992423537296, + "grad_norm": 0.30410346388816833, + "learning_rate": 1.7002190480926138e-05, + "loss": 1.237, + "step": 9243 + }, + { + "epoch": 2.7532902697369646, + "grad_norm": 0.2708747982978821, + "learning_rate": 1.7001501817352468e-05, + "loss": 1.2487, + "step": 9244 + }, + { + "epoch": 2.7535881159366333, + "grad_norm": 0.2584022581577301, + "learning_rate": 1.7000813088637645e-05, + "loss": 1.2526, + "step": 9245 + }, + { + "epoch": 2.7538859621363017, + "grad_norm": 0.4966140687465668, + "learning_rate": 1.7000124294788078e-05, + "loss": 1.2393, + "step": 9246 + }, + { + "epoch": 2.7541838083359704, + "grad_norm": 0.32315561175346375, + "learning_rate": 1.6999435435810175e-05, + "loss": 1.2388, + "step": 9247 + }, + { + "epoch": 2.754481654535639, + "grad_norm": 0.2769085168838501, + "learning_rate": 1.699874651171035e-05, + "loss": 1.2641, + "step": 9248 + }, + { + "epoch": 2.7547795007353075, + "grad_norm": 0.2551030218601227, + "learning_rate": 1.6998057522495002e-05, + "loss": 1.2456, + "step": 9249 + }, + { + "epoch": 2.7550773469349763, + "grad_norm": 0.24251501262187958, + "learning_rate": 1.699736846817055e-05, + "loss": 1.2481, + "step": 9250 + }, + { + "epoch": 2.755375193134645, + "grad_norm": 0.3502722382545471, + "learning_rate": 1.6996679348743402e-05, + "loss": 1.236, + "step": 9251 + }, + { + "epoch": 2.755673039334314, + "grad_norm": 0.24152779579162598, + "learning_rate": 1.6995990164219973e-05, + "loss": 1.2549, + "step": 9252 + }, + { + "epoch": 2.7559708855339826, + "grad_norm": 0.23586568236351013, + "learning_rate": 1.699530091460667e-05, + "loss": 1.2281, + "step": 9253 + }, + { + "epoch": 2.756268731733651, + "grad_norm": 0.25303852558135986, + "learning_rate": 1.6994611599909907e-05, + "loss": 1.255, + "step": 9254 + }, + { + "epoch": 2.7565665779333197, + "grad_norm": 0.26098665595054626, + "learning_rate": 1.6993922220136098e-05, + "loss": 1.2457, + "step": 9255 + }, + { + "epoch": 2.7568644241329885, + "grad_norm": 0.23625260591506958, + "learning_rate": 1.6993232775291658e-05, + "loss": 1.2349, + "step": 9256 + }, + { + "epoch": 2.757162270332657, + "grad_norm": 0.28086790442466736, + "learning_rate": 1.6992543265382996e-05, + "loss": 1.245, + "step": 9257 + }, + { + "epoch": 2.7574601165323256, + "grad_norm": 0.2951943576335907, + "learning_rate": 1.6991853690416535e-05, + "loss": 1.254, + "step": 9258 + }, + { + "epoch": 2.7577579627319944, + "grad_norm": 0.2557200789451599, + "learning_rate": 1.6991164050398686e-05, + "loss": 1.2471, + "step": 9259 + }, + { + "epoch": 2.7580558089316627, + "grad_norm": 0.2863323390483856, + "learning_rate": 1.6990474345335866e-05, + "loss": 1.2658, + "step": 9260 + }, + { + "epoch": 2.7583536551313315, + "grad_norm": 0.25151994824409485, + "learning_rate": 1.6989784575234495e-05, + "loss": 1.2448, + "step": 9261 + }, + { + "epoch": 2.7586515013310002, + "grad_norm": 0.2653193771839142, + "learning_rate": 1.6989094740100987e-05, + "loss": 1.2325, + "step": 9262 + }, + { + "epoch": 2.758949347530669, + "grad_norm": 0.30489224195480347, + "learning_rate": 1.6988404839941763e-05, + "loss": 1.2447, + "step": 9263 + }, + { + "epoch": 2.7592471937303373, + "grad_norm": 0.2491319179534912, + "learning_rate": 1.6987714874763236e-05, + "loss": 1.2418, + "step": 9264 + }, + { + "epoch": 2.759545039930006, + "grad_norm": 0.28793221712112427, + "learning_rate": 1.698702484457183e-05, + "loss": 1.2545, + "step": 9265 + }, + { + "epoch": 2.759842886129675, + "grad_norm": 0.2668450176715851, + "learning_rate": 1.6986334749373965e-05, + "loss": 1.2442, + "step": 9266 + }, + { + "epoch": 2.7601407323293436, + "grad_norm": 0.23432527482509613, + "learning_rate": 1.698564458917606e-05, + "loss": 1.23, + "step": 9267 + }, + { + "epoch": 2.760438578529012, + "grad_norm": 0.3287263512611389, + "learning_rate": 1.6984954363984537e-05, + "loss": 1.2459, + "step": 9268 + }, + { + "epoch": 2.7607364247286807, + "grad_norm": 0.24850362539291382, + "learning_rate": 1.698426407380582e-05, + "loss": 1.2465, + "step": 9269 + }, + { + "epoch": 2.7610342709283495, + "grad_norm": 0.2640020251274109, + "learning_rate": 1.6983573718646328e-05, + "loss": 1.2357, + "step": 9270 + }, + { + "epoch": 2.761332117128018, + "grad_norm": 0.27306121587753296, + "learning_rate": 1.6982883298512483e-05, + "loss": 1.2486, + "step": 9271 + }, + { + "epoch": 2.7616299633276866, + "grad_norm": 0.24212612211704254, + "learning_rate": 1.6982192813410713e-05, + "loss": 1.2317, + "step": 9272 + }, + { + "epoch": 2.7619278095273554, + "grad_norm": 0.2920074164867401, + "learning_rate": 1.6981502263347438e-05, + "loss": 1.2299, + "step": 9273 + }, + { + "epoch": 2.7622256557270237, + "grad_norm": 0.25239890813827515, + "learning_rate": 1.6980811648329086e-05, + "loss": 1.2516, + "step": 9274 + }, + { + "epoch": 2.7625235019266925, + "grad_norm": 0.2624033987522125, + "learning_rate": 1.698012096836208e-05, + "loss": 1.2487, + "step": 9275 + }, + { + "epoch": 2.7628213481263613, + "grad_norm": 0.2670723497867584, + "learning_rate": 1.697943022345285e-05, + "loss": 1.2631, + "step": 9276 + }, + { + "epoch": 2.76311919432603, + "grad_norm": 0.24647146463394165, + "learning_rate": 1.6978739413607815e-05, + "loss": 1.2384, + "step": 9277 + }, + { + "epoch": 2.763417040525699, + "grad_norm": 0.27723342180252075, + "learning_rate": 1.697804853883341e-05, + "loss": 1.255, + "step": 9278 + }, + { + "epoch": 2.763714886725367, + "grad_norm": 0.2784757614135742, + "learning_rate": 1.6977357599136057e-05, + "loss": 1.2453, + "step": 9279 + }, + { + "epoch": 2.764012732925036, + "grad_norm": 0.27276426553726196, + "learning_rate": 1.697666659452219e-05, + "loss": 1.2494, + "step": 9280 + }, + { + "epoch": 2.7643105791247047, + "grad_norm": 0.2571217715740204, + "learning_rate": 1.6975975524998234e-05, + "loss": 1.2457, + "step": 9281 + }, + { + "epoch": 2.764608425324373, + "grad_norm": 0.23558710515499115, + "learning_rate": 1.6975284390570622e-05, + "loss": 1.2638, + "step": 9282 + }, + { + "epoch": 2.764906271524042, + "grad_norm": 0.2826679050922394, + "learning_rate": 1.6974593191245778e-05, + "loss": 1.2606, + "step": 9283 + }, + { + "epoch": 2.7652041177237106, + "grad_norm": 0.2285085916519165, + "learning_rate": 1.697390192703014e-05, + "loss": 1.2543, + "step": 9284 + }, + { + "epoch": 2.765501963923379, + "grad_norm": 0.2481565624475479, + "learning_rate": 1.6973210597930135e-05, + "loss": 1.2418, + "step": 9285 + }, + { + "epoch": 2.7657998101230477, + "grad_norm": 0.24361467361450195, + "learning_rate": 1.6972519203952194e-05, + "loss": 1.2509, + "step": 9286 + }, + { + "epoch": 2.7660976563227164, + "grad_norm": 0.24368710815906525, + "learning_rate": 1.6971827745102754e-05, + "loss": 1.2552, + "step": 9287 + }, + { + "epoch": 2.7663955025223848, + "grad_norm": 0.258975625038147, + "learning_rate": 1.697113622138825e-05, + "loss": 1.2506, + "step": 9288 + }, + { + "epoch": 2.7666933487220535, + "grad_norm": 0.23841984570026398, + "learning_rate": 1.6970444632815106e-05, + "loss": 1.2537, + "step": 9289 + }, + { + "epoch": 2.7669911949217223, + "grad_norm": 0.24847932159900665, + "learning_rate": 1.6969752979389763e-05, + "loss": 1.2323, + "step": 9290 + }, + { + "epoch": 2.767289041121391, + "grad_norm": 0.24395668506622314, + "learning_rate": 1.6969061261118658e-05, + "loss": 1.2449, + "step": 9291 + }, + { + "epoch": 2.76758688732106, + "grad_norm": 0.29188939929008484, + "learning_rate": 1.6968369478008224e-05, + "loss": 1.2464, + "step": 9292 + }, + { + "epoch": 2.767884733520728, + "grad_norm": 0.24827846884727478, + "learning_rate": 1.69676776300649e-05, + "loss": 1.2365, + "step": 9293 + }, + { + "epoch": 2.768182579720397, + "grad_norm": 0.27385246753692627, + "learning_rate": 1.6966985717295114e-05, + "loss": 1.235, + "step": 9294 + }, + { + "epoch": 2.7684804259200657, + "grad_norm": 0.30806779861450195, + "learning_rate": 1.6966293739705316e-05, + "loss": 1.2468, + "step": 9295 + }, + { + "epoch": 2.768778272119734, + "grad_norm": 0.23335260152816772, + "learning_rate": 1.6965601697301935e-05, + "loss": 1.2356, + "step": 9296 + }, + { + "epoch": 2.769076118319403, + "grad_norm": 0.23949144780635834, + "learning_rate": 1.6964909590091414e-05, + "loss": 1.2563, + "step": 9297 + }, + { + "epoch": 2.7693739645190716, + "grad_norm": 0.24809135496616364, + "learning_rate": 1.696421741808019e-05, + "loss": 1.2463, + "step": 9298 + }, + { + "epoch": 2.76967181071874, + "grad_norm": 0.29287445545196533, + "learning_rate": 1.6963525181274706e-05, + "loss": 1.2589, + "step": 9299 + }, + { + "epoch": 2.7699696569184087, + "grad_norm": 0.23110057413578033, + "learning_rate": 1.6962832879681396e-05, + "loss": 1.2331, + "step": 9300 + }, + { + "epoch": 2.7702675031180775, + "grad_norm": 0.26416778564453125, + "learning_rate": 1.6962140513306707e-05, + "loss": 1.2619, + "step": 9301 + }, + { + "epoch": 2.770565349317746, + "grad_norm": 0.27834850549697876, + "learning_rate": 1.696144808215708e-05, + "loss": 1.2324, + "step": 9302 + }, + { + "epoch": 2.7708631955174146, + "grad_norm": 0.2294827103614807, + "learning_rate": 1.696075558623896e-05, + "loss": 1.2399, + "step": 9303 + }, + { + "epoch": 2.7711610417170833, + "grad_norm": 0.24087277054786682, + "learning_rate": 1.6960063025558778e-05, + "loss": 1.2491, + "step": 9304 + }, + { + "epoch": 2.771458887916752, + "grad_norm": 0.2239975780248642, + "learning_rate": 1.6959370400122993e-05, + "loss": 1.231, + "step": 9305 + }, + { + "epoch": 2.771756734116421, + "grad_norm": 0.24244676530361176, + "learning_rate": 1.6958677709938037e-05, + "loss": 1.2539, + "step": 9306 + }, + { + "epoch": 2.772054580316089, + "grad_norm": 0.24967491626739502, + "learning_rate": 1.695798495501036e-05, + "loss": 1.2625, + "step": 9307 + }, + { + "epoch": 2.772352426515758, + "grad_norm": 0.23692266643047333, + "learning_rate": 1.695729213534641e-05, + "loss": 1.234, + "step": 9308 + }, + { + "epoch": 2.7726502727154267, + "grad_norm": 0.24569763243198395, + "learning_rate": 1.6956599250952627e-05, + "loss": 1.2449, + "step": 9309 + }, + { + "epoch": 2.772948118915095, + "grad_norm": 0.24894258379936218, + "learning_rate": 1.6955906301835465e-05, + "loss": 1.2534, + "step": 9310 + }, + { + "epoch": 2.773245965114764, + "grad_norm": 0.23010030388832092, + "learning_rate": 1.6955213288001362e-05, + "loss": 1.2241, + "step": 9311 + }, + { + "epoch": 2.7735438113144326, + "grad_norm": 0.23309226334095, + "learning_rate": 1.695452020945677e-05, + "loss": 1.2369, + "step": 9312 + }, + { + "epoch": 2.773841657514101, + "grad_norm": 0.23277804255485535, + "learning_rate": 1.6953827066208138e-05, + "loss": 1.2417, + "step": 9313 + }, + { + "epoch": 2.7741395037137697, + "grad_norm": 0.24053636193275452, + "learning_rate": 1.6953133858261916e-05, + "loss": 1.2355, + "step": 9314 + }, + { + "epoch": 2.7744373499134385, + "grad_norm": 0.2710404396057129, + "learning_rate": 1.6952440585624553e-05, + "loss": 1.2484, + "step": 9315 + }, + { + "epoch": 2.7747351961131073, + "grad_norm": 0.27087676525115967, + "learning_rate": 1.6951747248302495e-05, + "loss": 1.2536, + "step": 9316 + }, + { + "epoch": 2.7750330423127756, + "grad_norm": 0.24413885176181793, + "learning_rate": 1.6951053846302198e-05, + "loss": 1.2407, + "step": 9317 + }, + { + "epoch": 2.7753308885124444, + "grad_norm": 0.5456312298774719, + "learning_rate": 1.695036037963011e-05, + "loss": 1.2433, + "step": 9318 + }, + { + "epoch": 2.775628734712113, + "grad_norm": 0.3109140694141388, + "learning_rate": 1.6949666848292683e-05, + "loss": 1.2629, + "step": 9319 + }, + { + "epoch": 2.775926580911782, + "grad_norm": 0.2805221974849701, + "learning_rate": 1.6948973252296376e-05, + "loss": 1.2384, + "step": 9320 + }, + { + "epoch": 2.7762244271114502, + "grad_norm": 0.26506713032722473, + "learning_rate": 1.694827959164763e-05, + "loss": 1.2481, + "step": 9321 + }, + { + "epoch": 2.776522273311119, + "grad_norm": 0.24361567199230194, + "learning_rate": 1.694758586635291e-05, + "loss": 1.2376, + "step": 9322 + }, + { + "epoch": 2.7768201195107878, + "grad_norm": 0.24067051708698273, + "learning_rate": 1.6946892076418665e-05, + "loss": 1.2508, + "step": 9323 + }, + { + "epoch": 2.777117965710456, + "grad_norm": 0.27331697940826416, + "learning_rate": 1.6946198221851348e-05, + "loss": 1.2426, + "step": 9324 + }, + { + "epoch": 2.777415811910125, + "grad_norm": 0.2714940011501312, + "learning_rate": 1.6945504302657418e-05, + "loss": 1.2588, + "step": 9325 + }, + { + "epoch": 2.7777136581097936, + "grad_norm": 0.2522963583469391, + "learning_rate": 1.6944810318843332e-05, + "loss": 1.2494, + "step": 9326 + }, + { + "epoch": 2.778011504309462, + "grad_norm": 0.2336743324995041, + "learning_rate": 1.6944116270415546e-05, + "loss": 1.2546, + "step": 9327 + }, + { + "epoch": 2.7783093505091307, + "grad_norm": 0.23527158796787262, + "learning_rate": 1.6943422157380515e-05, + "loss": 1.2401, + "step": 9328 + }, + { + "epoch": 2.7786071967087995, + "grad_norm": 0.24809874594211578, + "learning_rate": 1.6942727979744697e-05, + "loss": 1.2405, + "step": 9329 + }, + { + "epoch": 2.7789050429084683, + "grad_norm": 0.24884772300720215, + "learning_rate": 1.694203373751455e-05, + "loss": 1.2594, + "step": 9330 + }, + { + "epoch": 2.7792028891081366, + "grad_norm": 0.2442944049835205, + "learning_rate": 1.6941339430696545e-05, + "loss": 1.2556, + "step": 9331 + }, + { + "epoch": 2.7795007353078054, + "grad_norm": 0.22943776845932007, + "learning_rate": 1.6940645059297122e-05, + "loss": 1.2518, + "step": 9332 + }, + { + "epoch": 2.779798581507474, + "grad_norm": 0.245171919465065, + "learning_rate": 1.6939950623322757e-05, + "loss": 1.231, + "step": 9333 + }, + { + "epoch": 2.780096427707143, + "grad_norm": 0.23680929839611053, + "learning_rate": 1.6939256122779904e-05, + "loss": 1.2571, + "step": 9334 + }, + { + "epoch": 2.7803942739068113, + "grad_norm": 0.2431255429983139, + "learning_rate": 1.6938561557675024e-05, + "loss": 1.2583, + "step": 9335 + }, + { + "epoch": 2.78069212010648, + "grad_norm": 0.22574183344841003, + "learning_rate": 1.6937866928014582e-05, + "loss": 1.2372, + "step": 9336 + }, + { + "epoch": 2.780989966306149, + "grad_norm": 0.25322556495666504, + "learning_rate": 1.693717223380504e-05, + "loss": 1.2282, + "step": 9337 + }, + { + "epoch": 2.781287812505817, + "grad_norm": 0.23538853228092194, + "learning_rate": 1.6936477475052862e-05, + "loss": 1.2342, + "step": 9338 + }, + { + "epoch": 2.781585658705486, + "grad_norm": 0.23108044266700745, + "learning_rate": 1.6935782651764506e-05, + "loss": 1.2423, + "step": 9339 + }, + { + "epoch": 2.7818835049051547, + "grad_norm": 0.2336004227399826, + "learning_rate": 1.6935087763946446e-05, + "loss": 1.2539, + "step": 9340 + }, + { + "epoch": 2.782181351104823, + "grad_norm": 0.23251007497310638, + "learning_rate": 1.6934392811605144e-05, + "loss": 1.2472, + "step": 9341 + }, + { + "epoch": 2.782479197304492, + "grad_norm": 0.23549939692020416, + "learning_rate": 1.6933697794747062e-05, + "loss": 1.2491, + "step": 9342 + }, + { + "epoch": 2.7827770435041606, + "grad_norm": 0.23823584616184235, + "learning_rate": 1.6933002713378667e-05, + "loss": 1.2329, + "step": 9343 + }, + { + "epoch": 2.7830748897038293, + "grad_norm": 0.22635841369628906, + "learning_rate": 1.6932307567506433e-05, + "loss": 1.2418, + "step": 9344 + }, + { + "epoch": 2.783372735903498, + "grad_norm": 0.23029498755931854, + "learning_rate": 1.6931612357136817e-05, + "loss": 1.2368, + "step": 9345 + }, + { + "epoch": 2.7836705821031664, + "grad_norm": 0.23168355226516724, + "learning_rate": 1.6930917082276295e-05, + "loss": 1.2332, + "step": 9346 + }, + { + "epoch": 2.783968428302835, + "grad_norm": 0.2235199213027954, + "learning_rate": 1.6930221742931334e-05, + "loss": 1.2566, + "step": 9347 + }, + { + "epoch": 2.784266274502504, + "grad_norm": 0.22563199698925018, + "learning_rate": 1.69295263391084e-05, + "loss": 1.2341, + "step": 9348 + }, + { + "epoch": 2.7845641207021723, + "grad_norm": 0.23209375143051147, + "learning_rate": 1.6928830870813965e-05, + "loss": 1.26, + "step": 9349 + }, + { + "epoch": 2.784861966901841, + "grad_norm": 0.252802312374115, + "learning_rate": 1.6928135338054502e-05, + "loss": 1.2449, + "step": 9350 + }, + { + "epoch": 2.78515981310151, + "grad_norm": 0.22846385836601257, + "learning_rate": 1.6927439740836483e-05, + "loss": 1.2584, + "step": 9351 + }, + { + "epoch": 2.785457659301178, + "grad_norm": 0.23619228601455688, + "learning_rate": 1.6926744079166375e-05, + "loss": 1.2397, + "step": 9352 + }, + { + "epoch": 2.785755505500847, + "grad_norm": 0.2411826252937317, + "learning_rate": 1.692604835305065e-05, + "loss": 1.2514, + "step": 9353 + }, + { + "epoch": 2.7860533517005157, + "grad_norm": 0.237585186958313, + "learning_rate": 1.6925352562495784e-05, + "loss": 1.2458, + "step": 9354 + }, + { + "epoch": 2.786351197900184, + "grad_norm": 0.23259398341178894, + "learning_rate": 1.6924656707508254e-05, + "loss": 1.243, + "step": 9355 + }, + { + "epoch": 2.786649044099853, + "grad_norm": 0.252854585647583, + "learning_rate": 1.6923960788094524e-05, + "loss": 1.2571, + "step": 9356 + }, + { + "epoch": 2.7869468902995216, + "grad_norm": 0.2337467074394226, + "learning_rate": 1.692326480426108e-05, + "loss": 1.2497, + "step": 9357 + }, + { + "epoch": 2.7872447364991904, + "grad_norm": 0.25307777523994446, + "learning_rate": 1.692256875601439e-05, + "loss": 1.2414, + "step": 9358 + }, + { + "epoch": 2.787542582698859, + "grad_norm": 0.23664496839046478, + "learning_rate": 1.692187264336093e-05, + "loss": 1.2317, + "step": 9359 + }, + { + "epoch": 2.7878404288985275, + "grad_norm": 0.28514930605888367, + "learning_rate": 1.692117646630718e-05, + "loss": 1.2497, + "step": 9360 + }, + { + "epoch": 2.7881382750981962, + "grad_norm": 0.23014235496520996, + "learning_rate": 1.6920480224859618e-05, + "loss": 1.2328, + "step": 9361 + }, + { + "epoch": 2.788436121297865, + "grad_norm": 0.2634677290916443, + "learning_rate": 1.6919783919024717e-05, + "loss": 1.2427, + "step": 9362 + }, + { + "epoch": 2.7887339674975333, + "grad_norm": 0.23677754402160645, + "learning_rate": 1.691908754880896e-05, + "loss": 1.2506, + "step": 9363 + }, + { + "epoch": 2.789031813697202, + "grad_norm": 0.3485448658466339, + "learning_rate": 1.691839111421882e-05, + "loss": 1.2555, + "step": 9364 + }, + { + "epoch": 2.789329659896871, + "grad_norm": 0.24845841526985168, + "learning_rate": 1.6917694615260785e-05, + "loss": 1.2627, + "step": 9365 + }, + { + "epoch": 2.789627506096539, + "grad_norm": 0.2844274938106537, + "learning_rate": 1.6916998051941326e-05, + "loss": 1.2422, + "step": 9366 + }, + { + "epoch": 2.789925352296208, + "grad_norm": 0.2269563376903534, + "learning_rate": 1.6916301424266933e-05, + "loss": 1.243, + "step": 9367 + }, + { + "epoch": 2.7902231984958767, + "grad_norm": 0.29760172963142395, + "learning_rate": 1.691560473224408e-05, + "loss": 1.2218, + "step": 9368 + }, + { + "epoch": 2.790521044695545, + "grad_norm": 0.24669700860977173, + "learning_rate": 1.691490797587925e-05, + "loss": 1.255, + "step": 9369 + }, + { + "epoch": 2.790818890895214, + "grad_norm": 0.28839683532714844, + "learning_rate": 1.691421115517893e-05, + "loss": 1.2516, + "step": 9370 + }, + { + "epoch": 2.7911167370948826, + "grad_norm": 0.23452536761760712, + "learning_rate": 1.69135142701496e-05, + "loss": 1.2457, + "step": 9371 + }, + { + "epoch": 2.7914145832945514, + "grad_norm": 0.45230355858802795, + "learning_rate": 1.6912817320797742e-05, + "loss": 1.2415, + "step": 9372 + }, + { + "epoch": 2.79171242949422, + "grad_norm": 0.2951183617115021, + "learning_rate": 1.6912120307129845e-05, + "loss": 1.2297, + "step": 9373 + }, + { + "epoch": 2.7920102756938885, + "grad_norm": 0.29169294238090515, + "learning_rate": 1.691142322915239e-05, + "loss": 1.2703, + "step": 9374 + }, + { + "epoch": 2.7923081218935573, + "grad_norm": 0.2725238800048828, + "learning_rate": 1.6910726086871863e-05, + "loss": 1.2458, + "step": 9375 + }, + { + "epoch": 2.792605968093226, + "grad_norm": 0.28612467646598816, + "learning_rate": 1.6910028880294748e-05, + "loss": 1.257, + "step": 9376 + }, + { + "epoch": 2.7929038142928944, + "grad_norm": 0.28999632596969604, + "learning_rate": 1.6909331609427536e-05, + "loss": 1.2396, + "step": 9377 + }, + { + "epoch": 2.793201660492563, + "grad_norm": 0.23493905365467072, + "learning_rate": 1.6908634274276718e-05, + "loss": 1.2394, + "step": 9378 + }, + { + "epoch": 2.793499506692232, + "grad_norm": 0.2562492787837982, + "learning_rate": 1.6907936874848774e-05, + "loss": 1.2443, + "step": 9379 + }, + { + "epoch": 2.7937973528919002, + "grad_norm": 0.2693139612674713, + "learning_rate": 1.6907239411150192e-05, + "loss": 1.2583, + "step": 9380 + }, + { + "epoch": 2.794095199091569, + "grad_norm": 0.2592118978500366, + "learning_rate": 1.6906541883187468e-05, + "loss": 1.2534, + "step": 9381 + }, + { + "epoch": 2.7943930452912378, + "grad_norm": 0.2605198621749878, + "learning_rate": 1.6905844290967087e-05, + "loss": 1.2462, + "step": 9382 + }, + { + "epoch": 2.7946908914909065, + "grad_norm": 0.28104788064956665, + "learning_rate": 1.6905146634495543e-05, + "loss": 1.2401, + "step": 9383 + }, + { + "epoch": 2.794988737690575, + "grad_norm": 0.24958474934101105, + "learning_rate": 1.6904448913779325e-05, + "loss": 1.2553, + "step": 9384 + }, + { + "epoch": 2.7952865838902436, + "grad_norm": 0.27100545167922974, + "learning_rate": 1.6903751128824917e-05, + "loss": 1.2469, + "step": 9385 + }, + { + "epoch": 2.7955844300899124, + "grad_norm": 0.2590443193912506, + "learning_rate": 1.6903053279638826e-05, + "loss": 1.2494, + "step": 9386 + }, + { + "epoch": 2.795882276289581, + "grad_norm": 0.262961208820343, + "learning_rate": 1.6902355366227535e-05, + "loss": 1.2557, + "step": 9387 + }, + { + "epoch": 2.7961801224892495, + "grad_norm": 0.31127819418907166, + "learning_rate": 1.6901657388597534e-05, + "loss": 1.2347, + "step": 9388 + }, + { + "epoch": 2.7964779686889183, + "grad_norm": 0.2874647378921509, + "learning_rate": 1.6900959346755327e-05, + "loss": 1.2609, + "step": 9389 + }, + { + "epoch": 2.796775814888587, + "grad_norm": 0.29039305448532104, + "learning_rate": 1.6900261240707402e-05, + "loss": 1.2289, + "step": 9390 + }, + { + "epoch": 2.7970736610882554, + "grad_norm": 0.24280205368995667, + "learning_rate": 1.6899563070460255e-05, + "loss": 1.2159, + "step": 9391 + }, + { + "epoch": 2.797371507287924, + "grad_norm": 0.32225850224494934, + "learning_rate": 1.6898864836020384e-05, + "loss": 1.2248, + "step": 9392 + }, + { + "epoch": 2.797669353487593, + "grad_norm": 0.23303130269050598, + "learning_rate": 1.689816653739428e-05, + "loss": 1.2548, + "step": 9393 + }, + { + "epoch": 2.7979671996872613, + "grad_norm": 0.256865918636322, + "learning_rate": 1.6897468174588447e-05, + "loss": 1.2558, + "step": 9394 + }, + { + "epoch": 2.79826504588693, + "grad_norm": 0.24131427705287933, + "learning_rate": 1.6896769747609378e-05, + "loss": 1.2383, + "step": 9395 + }, + { + "epoch": 2.798562892086599, + "grad_norm": 0.23806166648864746, + "learning_rate": 1.689607125646357e-05, + "loss": 1.2578, + "step": 9396 + }, + { + "epoch": 2.7988607382862676, + "grad_norm": 0.2850431501865387, + "learning_rate": 1.6895372701157527e-05, + "loss": 1.2362, + "step": 9397 + }, + { + "epoch": 2.7991585844859364, + "grad_norm": 0.2668682932853699, + "learning_rate": 1.6894674081697746e-05, + "loss": 1.2443, + "step": 9398 + }, + { + "epoch": 2.7994564306856047, + "grad_norm": 0.25615400075912476, + "learning_rate": 1.6893975398090723e-05, + "loss": 1.2526, + "step": 9399 + }, + { + "epoch": 2.7997542768852735, + "grad_norm": 0.22435817122459412, + "learning_rate": 1.689327665034296e-05, + "loss": 1.2431, + "step": 9400 + }, + { + "epoch": 2.8000521230849422, + "grad_norm": 0.2976968288421631, + "learning_rate": 1.6892577838460962e-05, + "loss": 1.2483, + "step": 9401 + }, + { + "epoch": 2.8003499692846106, + "grad_norm": 0.24760279059410095, + "learning_rate": 1.689187896245123e-05, + "loss": 1.2275, + "step": 9402 + }, + { + "epoch": 2.8006478154842793, + "grad_norm": 0.3072332441806793, + "learning_rate": 1.689118002232026e-05, + "loss": 1.2455, + "step": 9403 + }, + { + "epoch": 2.800945661683948, + "grad_norm": 0.2671337127685547, + "learning_rate": 1.6890481018074557e-05, + "loss": 1.2407, + "step": 9404 + }, + { + "epoch": 2.8012435078836164, + "grad_norm": 0.30636167526245117, + "learning_rate": 1.6889781949720632e-05, + "loss": 1.251, + "step": 9405 + }, + { + "epoch": 2.801541354083285, + "grad_norm": 0.2866518199443817, + "learning_rate": 1.6889082817264982e-05, + "loss": 1.2559, + "step": 9406 + }, + { + "epoch": 2.801839200282954, + "grad_norm": 0.2870074212551117, + "learning_rate": 1.6888383620714112e-05, + "loss": 1.2558, + "step": 9407 + }, + { + "epoch": 2.8021370464826223, + "grad_norm": 0.26505526900291443, + "learning_rate": 1.688768436007453e-05, + "loss": 1.2554, + "step": 9408 + }, + { + "epoch": 2.802434892682291, + "grad_norm": 0.2759658396244049, + "learning_rate": 1.688698503535274e-05, + "loss": 1.2309, + "step": 9409 + }, + { + "epoch": 2.80273273888196, + "grad_norm": 0.24766267836093903, + "learning_rate": 1.6886285646555248e-05, + "loss": 1.2385, + "step": 9410 + }, + { + "epoch": 2.8030305850816286, + "grad_norm": 0.2621782124042511, + "learning_rate": 1.6885586193688562e-05, + "loss": 1.2417, + "step": 9411 + }, + { + "epoch": 2.8033284312812974, + "grad_norm": 0.24561628699302673, + "learning_rate": 1.6884886676759193e-05, + "loss": 1.2317, + "step": 9412 + }, + { + "epoch": 2.8036262774809657, + "grad_norm": 0.24416610598564148, + "learning_rate": 1.6884187095773644e-05, + "loss": 1.243, + "step": 9413 + }, + { + "epoch": 2.8039241236806345, + "grad_norm": 0.24400001764297485, + "learning_rate": 1.688348745073842e-05, + "loss": 1.2522, + "step": 9414 + }, + { + "epoch": 2.8042219698803033, + "grad_norm": 0.24346937239170074, + "learning_rate": 1.6882787741660044e-05, + "loss": 1.2334, + "step": 9415 + }, + { + "epoch": 2.8045198160799716, + "grad_norm": 0.22662796080112457, + "learning_rate": 1.6882087968545014e-05, + "loss": 1.2643, + "step": 9416 + }, + { + "epoch": 2.8048176622796404, + "grad_norm": 0.2588282823562622, + "learning_rate": 1.6881388131399846e-05, + "loss": 1.2569, + "step": 9417 + }, + { + "epoch": 2.805115508479309, + "grad_norm": 0.2453807145357132, + "learning_rate": 1.688068823023105e-05, + "loss": 1.2205, + "step": 9418 + }, + { + "epoch": 2.8054133546789775, + "grad_norm": 0.2423257827758789, + "learning_rate": 1.687998826504514e-05, + "loss": 1.2279, + "step": 9419 + }, + { + "epoch": 2.8057112008786462, + "grad_norm": 0.24520215392112732, + "learning_rate": 1.6879288235848616e-05, + "loss": 1.2503, + "step": 9420 + }, + { + "epoch": 2.806009047078315, + "grad_norm": 0.23110315203666687, + "learning_rate": 1.687858814264801e-05, + "loss": 1.2406, + "step": 9421 + }, + { + "epoch": 2.8063068932779833, + "grad_norm": 0.23562389612197876, + "learning_rate": 1.6877887985449824e-05, + "loss": 1.2335, + "step": 9422 + }, + { + "epoch": 2.806604739477652, + "grad_norm": 0.23645806312561035, + "learning_rate": 1.6877187764260575e-05, + "loss": 1.2421, + "step": 9423 + }, + { + "epoch": 2.806902585677321, + "grad_norm": 0.2370063066482544, + "learning_rate": 1.6876487479086776e-05, + "loss": 1.2576, + "step": 9424 + }, + { + "epoch": 2.8072004318769896, + "grad_norm": 0.2854611575603485, + "learning_rate": 1.6875787129934945e-05, + "loss": 1.2474, + "step": 9425 + }, + { + "epoch": 2.8074982780766584, + "grad_norm": 0.2618495523929596, + "learning_rate": 1.6875086716811598e-05, + "loss": 1.245, + "step": 9426 + }, + { + "epoch": 2.8077961242763267, + "grad_norm": 0.24259112775325775, + "learning_rate": 1.6874386239723247e-05, + "loss": 1.2327, + "step": 9427 + }, + { + "epoch": 2.8080939704759955, + "grad_norm": 0.23218576610088348, + "learning_rate": 1.6873685698676417e-05, + "loss": 1.2481, + "step": 9428 + }, + { + "epoch": 2.8083918166756643, + "grad_norm": 0.29377731680870056, + "learning_rate": 1.6872985093677617e-05, + "loss": 1.2475, + "step": 9429 + }, + { + "epoch": 2.8086896628753326, + "grad_norm": 0.29341045022010803, + "learning_rate": 1.6872284424733373e-05, + "loss": 1.2406, + "step": 9430 + }, + { + "epoch": 2.8089875090750014, + "grad_norm": 0.2425510734319687, + "learning_rate": 1.6871583691850202e-05, + "loss": 1.2419, + "step": 9431 + }, + { + "epoch": 2.80928535527467, + "grad_norm": 0.2322937250137329, + "learning_rate": 1.6870882895034618e-05, + "loss": 1.2328, + "step": 9432 + }, + { + "epoch": 2.8095832014743385, + "grad_norm": 0.2481030374765396, + "learning_rate": 1.687018203429315e-05, + "loss": 1.2613, + "step": 9433 + }, + { + "epoch": 2.8098810476740073, + "grad_norm": 0.2392769157886505, + "learning_rate": 1.6869481109632308e-05, + "loss": 1.2473, + "step": 9434 + }, + { + "epoch": 2.810178893873676, + "grad_norm": 0.23280951380729675, + "learning_rate": 1.6868780121058622e-05, + "loss": 1.2301, + "step": 9435 + }, + { + "epoch": 2.8104767400733444, + "grad_norm": 0.226578027009964, + "learning_rate": 1.6868079068578614e-05, + "loss": 1.2368, + "step": 9436 + }, + { + "epoch": 2.810774586273013, + "grad_norm": 0.23894715309143066, + "learning_rate": 1.6867377952198797e-05, + "loss": 1.2347, + "step": 9437 + }, + { + "epoch": 2.811072432472682, + "grad_norm": 0.26301658153533936, + "learning_rate": 1.6866676771925706e-05, + "loss": 1.2508, + "step": 9438 + }, + { + "epoch": 2.8113702786723507, + "grad_norm": 0.274747371673584, + "learning_rate": 1.686597552776586e-05, + "loss": 1.2224, + "step": 9439 + }, + { + "epoch": 2.8116681248720194, + "grad_norm": 0.26488688588142395, + "learning_rate": 1.686527421972578e-05, + "loss": 1.2567, + "step": 9440 + }, + { + "epoch": 2.8119659710716878, + "grad_norm": 0.24709691107273102, + "learning_rate": 1.6864572847811995e-05, + "loss": 1.2406, + "step": 9441 + }, + { + "epoch": 2.8122638172713565, + "grad_norm": 0.25358718633651733, + "learning_rate": 1.686387141203103e-05, + "loss": 1.2619, + "step": 9442 + }, + { + "epoch": 2.8125616634710253, + "grad_norm": 0.450490802526474, + "learning_rate": 1.686316991238941e-05, + "loss": 1.2303, + "step": 9443 + }, + { + "epoch": 2.8128595096706936, + "grad_norm": 0.3292935788631439, + "learning_rate": 1.686246834889366e-05, + "loss": 1.2535, + "step": 9444 + }, + { + "epoch": 2.8131573558703624, + "grad_norm": 0.2989583909511566, + "learning_rate": 1.686176672155031e-05, + "loss": 1.2321, + "step": 9445 + }, + { + "epoch": 2.813455202070031, + "grad_norm": 0.2783900797367096, + "learning_rate": 1.686106503036589e-05, + "loss": 1.2499, + "step": 9446 + }, + { + "epoch": 2.8137530482696995, + "grad_norm": 0.5152345895767212, + "learning_rate": 1.6860363275346922e-05, + "loss": 1.2637, + "step": 9447 + }, + { + "epoch": 2.8140508944693683, + "grad_norm": 0.2280631810426712, + "learning_rate": 1.685966145649994e-05, + "loss": 1.2355, + "step": 9448 + }, + { + "epoch": 2.814348740669037, + "grad_norm": 0.2405257523059845, + "learning_rate": 1.685895957383147e-05, + "loss": 1.2655, + "step": 9449 + }, + { + "epoch": 2.814646586868706, + "grad_norm": 0.23572790622711182, + "learning_rate": 1.685825762734805e-05, + "loss": 1.2391, + "step": 9450 + }, + { + "epoch": 2.814944433068374, + "grad_norm": 0.7869846224784851, + "learning_rate": 1.68575556170562e-05, + "loss": 1.2549, + "step": 9451 + }, + { + "epoch": 2.815242279268043, + "grad_norm": 0.23556984961032867, + "learning_rate": 1.685685354296246e-05, + "loss": 1.2379, + "step": 9452 + }, + { + "epoch": 2.8155401254677117, + "grad_norm": 0.24267776310443878, + "learning_rate": 1.6856151405073357e-05, + "loss": 1.2398, + "step": 9453 + }, + { + "epoch": 2.8158379716673805, + "grad_norm": 0.24611473083496094, + "learning_rate": 1.6855449203395425e-05, + "loss": 1.2371, + "step": 9454 + }, + { + "epoch": 2.816135817867049, + "grad_norm": 0.23646798729896545, + "learning_rate": 1.6854746937935197e-05, + "loss": 1.262, + "step": 9455 + }, + { + "epoch": 2.8164336640667176, + "grad_norm": 0.24050743877887726, + "learning_rate": 1.6854044608699206e-05, + "loss": 1.256, + "step": 9456 + }, + { + "epoch": 2.8167315102663864, + "grad_norm": 0.2404586374759674, + "learning_rate": 1.6853342215693994e-05, + "loss": 1.2494, + "step": 9457 + }, + { + "epoch": 2.8170293564660547, + "grad_norm": 0.23830142617225647, + "learning_rate": 1.6852639758926086e-05, + "loss": 1.2421, + "step": 9458 + }, + { + "epoch": 2.8173272026657235, + "grad_norm": 0.2322382777929306, + "learning_rate": 1.685193723840202e-05, + "loss": 1.2447, + "step": 9459 + }, + { + "epoch": 2.8176250488653922, + "grad_norm": 0.23787865042686462, + "learning_rate": 1.6851234654128334e-05, + "loss": 1.2381, + "step": 9460 + }, + { + "epoch": 2.8179228950650606, + "grad_norm": 0.2354193925857544, + "learning_rate": 1.6850532006111568e-05, + "loss": 1.242, + "step": 9461 + }, + { + "epoch": 2.8182207412647293, + "grad_norm": 0.23611146211624146, + "learning_rate": 1.684982929435825e-05, + "loss": 1.2408, + "step": 9462 + }, + { + "epoch": 2.818518587464398, + "grad_norm": 0.23927001655101776, + "learning_rate": 1.6849126518874923e-05, + "loss": 1.2625, + "step": 9463 + }, + { + "epoch": 2.818816433664067, + "grad_norm": 0.22399203479290009, + "learning_rate": 1.684842367966813e-05, + "loss": 1.2658, + "step": 9464 + }, + { + "epoch": 2.8191142798637356, + "grad_norm": 0.24528461694717407, + "learning_rate": 1.6847720776744404e-05, + "loss": 1.2563, + "step": 9465 + }, + { + "epoch": 2.819412126063404, + "grad_norm": 0.23209093511104584, + "learning_rate": 1.684701781011029e-05, + "loss": 1.2479, + "step": 9466 + }, + { + "epoch": 2.8197099722630727, + "grad_norm": 0.2589212656021118, + "learning_rate": 1.684631477977232e-05, + "loss": 1.2366, + "step": 9467 + }, + { + "epoch": 2.8200078184627415, + "grad_norm": 0.22971342504024506, + "learning_rate": 1.6845611685737044e-05, + "loss": 1.2532, + "step": 9468 + }, + { + "epoch": 2.82030566466241, + "grad_norm": 0.2506147623062134, + "learning_rate": 1.6844908528011e-05, + "loss": 1.236, + "step": 9469 + }, + { + "epoch": 2.8206035108620786, + "grad_norm": 0.2360735684633255, + "learning_rate": 1.6844205306600727e-05, + "loss": 1.2402, + "step": 9470 + }, + { + "epoch": 2.8209013570617474, + "grad_norm": 0.23734350502490997, + "learning_rate": 1.6843502021512774e-05, + "loss": 1.2565, + "step": 9471 + }, + { + "epoch": 2.8211992032614157, + "grad_norm": 0.2421860247850418, + "learning_rate": 1.6842798672753677e-05, + "loss": 1.2343, + "step": 9472 + }, + { + "epoch": 2.8214970494610845, + "grad_norm": 0.23866495490074158, + "learning_rate": 1.6842095260329988e-05, + "loss": 1.2541, + "step": 9473 + }, + { + "epoch": 2.8217948956607533, + "grad_norm": 0.26501211524009705, + "learning_rate": 1.6841391784248246e-05, + "loss": 1.2583, + "step": 9474 + }, + { + "epoch": 2.8220927418604216, + "grad_norm": 0.22950270771980286, + "learning_rate": 1.684068824451499e-05, + "loss": 1.2452, + "step": 9475 + }, + { + "epoch": 2.8223905880600904, + "grad_norm": 0.24660466611385345, + "learning_rate": 1.683998464113678e-05, + "loss": 1.2573, + "step": 9476 + }, + { + "epoch": 2.822688434259759, + "grad_norm": 0.23101870715618134, + "learning_rate": 1.6839280974120153e-05, + "loss": 1.2362, + "step": 9477 + }, + { + "epoch": 2.822986280459428, + "grad_norm": 0.29405537247657776, + "learning_rate": 1.6838577243471657e-05, + "loss": 1.2485, + "step": 9478 + }, + { + "epoch": 2.8232841266590967, + "grad_norm": 0.3243562877178192, + "learning_rate": 1.683787344919784e-05, + "loss": 1.2392, + "step": 9479 + }, + { + "epoch": 2.823581972858765, + "grad_norm": 0.23325003683567047, + "learning_rate": 1.6837169591305254e-05, + "loss": 1.252, + "step": 9480 + }, + { + "epoch": 2.8238798190584338, + "grad_norm": 0.2814958691596985, + "learning_rate": 1.6836465669800442e-05, + "loss": 1.2445, + "step": 9481 + }, + { + "epoch": 2.8241776652581025, + "grad_norm": 0.2902512550354004, + "learning_rate": 1.6835761684689954e-05, + "loss": 1.2623, + "step": 9482 + }, + { + "epoch": 2.824475511457771, + "grad_norm": 0.24863779544830322, + "learning_rate": 1.683505763598034e-05, + "loss": 1.2478, + "step": 9483 + }, + { + "epoch": 2.8247733576574396, + "grad_norm": 0.63090580701828, + "learning_rate": 1.6834353523678154e-05, + "loss": 1.2607, + "step": 9484 + }, + { + "epoch": 2.8250712038571084, + "grad_norm": 0.2958478629589081, + "learning_rate": 1.683364934778994e-05, + "loss": 1.2441, + "step": 9485 + }, + { + "epoch": 2.8253690500567767, + "grad_norm": 0.2794075608253479, + "learning_rate": 1.683294510832226e-05, + "loss": 1.2454, + "step": 9486 + }, + { + "epoch": 2.8256668962564455, + "grad_norm": 0.24498534202575684, + "learning_rate": 1.6832240805281654e-05, + "loss": 1.2481, + "step": 9487 + }, + { + "epoch": 2.8259647424561143, + "grad_norm": 0.23207266628742218, + "learning_rate": 1.6831536438674685e-05, + "loss": 1.2408, + "step": 9488 + }, + { + "epoch": 2.8262625886557826, + "grad_norm": 0.24805179238319397, + "learning_rate": 1.68308320085079e-05, + "loss": 1.2397, + "step": 9489 + }, + { + "epoch": 2.8265604348554514, + "grad_norm": 0.24835854768753052, + "learning_rate": 1.6830127514787856e-05, + "loss": 1.2409, + "step": 9490 + }, + { + "epoch": 2.82685828105512, + "grad_norm": 0.25730326771736145, + "learning_rate": 1.6829422957521106e-05, + "loss": 1.2664, + "step": 9491 + }, + { + "epoch": 2.827156127254789, + "grad_norm": 0.2531678378582001, + "learning_rate": 1.6828718336714204e-05, + "loss": 1.2582, + "step": 9492 + }, + { + "epoch": 2.8274539734544577, + "grad_norm": 0.2367067188024521, + "learning_rate": 1.682801365237371e-05, + "loss": 1.2395, + "step": 9493 + }, + { + "epoch": 2.827751819654126, + "grad_norm": 0.2317972034215927, + "learning_rate": 1.6827308904506175e-05, + "loss": 1.2383, + "step": 9494 + }, + { + "epoch": 2.828049665853795, + "grad_norm": 0.2436864972114563, + "learning_rate": 1.6826604093118164e-05, + "loss": 1.2677, + "step": 9495 + }, + { + "epoch": 2.8283475120534636, + "grad_norm": 0.24664410948753357, + "learning_rate": 1.6825899218216224e-05, + "loss": 1.2388, + "step": 9496 + }, + { + "epoch": 2.828645358253132, + "grad_norm": 0.23872210085391998, + "learning_rate": 1.682519427980692e-05, + "loss": 1.258, + "step": 9497 + }, + { + "epoch": 2.8289432044528007, + "grad_norm": 0.23003363609313965, + "learning_rate": 1.6824489277896807e-05, + "loss": 1.2481, + "step": 9498 + }, + { + "epoch": 2.8292410506524694, + "grad_norm": 0.22335799038410187, + "learning_rate": 1.6823784212492448e-05, + "loss": 1.2492, + "step": 9499 + }, + { + "epoch": 2.8295388968521378, + "grad_norm": 0.23637713491916656, + "learning_rate": 1.68230790836004e-05, + "loss": 1.2483, + "step": 9500 + }, + { + "epoch": 2.8295388968521378, + "eval_loss": 1.3350661993026733, + "eval_runtime": 20.7853, + "eval_samples_per_second": 83.424, + "eval_steps_per_second": 5.244, + "step": 9500 + }, + { + "epoch": 2.8298367430518065, + "grad_norm": 0.23937050998210907, + "learning_rate": 1.6822373891227223e-05, + "loss": 1.2397, + "step": 9501 + }, + { + "epoch": 2.8301345892514753, + "grad_norm": 0.23899784684181213, + "learning_rate": 1.6821668635379486e-05, + "loss": 1.2504, + "step": 9502 + }, + { + "epoch": 2.8304324354511436, + "grad_norm": 0.2262616753578186, + "learning_rate": 1.6820963316063733e-05, + "loss": 1.2707, + "step": 9503 + }, + { + "epoch": 2.8307302816508124, + "grad_norm": 0.23564161360263824, + "learning_rate": 1.6820257933286544e-05, + "loss": 1.2389, + "step": 9504 + }, + { + "epoch": 2.831028127850481, + "grad_norm": 0.22219909727573395, + "learning_rate": 1.6819552487054474e-05, + "loss": 1.2446, + "step": 9505 + }, + { + "epoch": 2.83132597405015, + "grad_norm": 0.23048289120197296, + "learning_rate": 1.6818846977374087e-05, + "loss": 1.228, + "step": 9506 + }, + { + "epoch": 2.8316238202498187, + "grad_norm": 0.24245646595954895, + "learning_rate": 1.681814140425195e-05, + "loss": 1.2585, + "step": 9507 + }, + { + "epoch": 2.831921666449487, + "grad_norm": 0.23912543058395386, + "learning_rate": 1.6817435767694622e-05, + "loss": 1.2476, + "step": 9508 + }, + { + "epoch": 2.832219512649156, + "grad_norm": 0.23558421432971954, + "learning_rate": 1.681673006770867e-05, + "loss": 1.2515, + "step": 9509 + }, + { + "epoch": 2.8325173588488246, + "grad_norm": 0.23728139698505402, + "learning_rate": 1.681602430430066e-05, + "loss": 1.2538, + "step": 9510 + }, + { + "epoch": 2.832815205048493, + "grad_norm": 0.24715609848499298, + "learning_rate": 1.6815318477477166e-05, + "loss": 1.2464, + "step": 9511 + }, + { + "epoch": 2.8331130512481617, + "grad_norm": 0.2315111607313156, + "learning_rate": 1.6814612587244743e-05, + "loss": 1.24, + "step": 9512 + }, + { + "epoch": 2.8334108974478305, + "grad_norm": 0.22887147963047028, + "learning_rate": 1.6813906633609964e-05, + "loss": 1.2469, + "step": 9513 + }, + { + "epoch": 2.833708743647499, + "grad_norm": 0.23694492876529694, + "learning_rate": 1.6813200616579396e-05, + "loss": 1.2523, + "step": 9514 + }, + { + "epoch": 2.8340065898471676, + "grad_norm": 0.2390718162059784, + "learning_rate": 1.681249453615961e-05, + "loss": 1.2363, + "step": 9515 + }, + { + "epoch": 2.8343044360468364, + "grad_norm": 0.23013268411159515, + "learning_rate": 1.6811788392357175e-05, + "loss": 1.2423, + "step": 9516 + }, + { + "epoch": 2.834602282246505, + "grad_norm": 0.23531004786491394, + "learning_rate": 1.6811082185178658e-05, + "loss": 1.2487, + "step": 9517 + }, + { + "epoch": 2.8349001284461735, + "grad_norm": 0.24330763518810272, + "learning_rate": 1.681037591463063e-05, + "loss": 1.2301, + "step": 9518 + }, + { + "epoch": 2.8351979746458422, + "grad_norm": 0.24473343789577484, + "learning_rate": 1.6809669580719664e-05, + "loss": 1.2649, + "step": 9519 + }, + { + "epoch": 2.835495820845511, + "grad_norm": 0.22644692659378052, + "learning_rate": 1.680896318345233e-05, + "loss": 1.2358, + "step": 9520 + }, + { + "epoch": 2.8357936670451798, + "grad_norm": 0.23970761895179749, + "learning_rate": 1.6808256722835202e-05, + "loss": 1.2454, + "step": 9521 + }, + { + "epoch": 2.836091513244848, + "grad_norm": 0.23349694907665253, + "learning_rate": 1.680755019887485e-05, + "loss": 1.2553, + "step": 9522 + }, + { + "epoch": 2.836389359444517, + "grad_norm": 0.2522037923336029, + "learning_rate": 1.680684361157785e-05, + "loss": 1.2487, + "step": 9523 + }, + { + "epoch": 2.8366872056441856, + "grad_norm": 0.24215124547481537, + "learning_rate": 1.6806136960950778e-05, + "loss": 1.2278, + "step": 9524 + }, + { + "epoch": 2.836985051843854, + "grad_norm": 0.23304861783981323, + "learning_rate": 1.6805430247000203e-05, + "loss": 1.2435, + "step": 9525 + }, + { + "epoch": 2.8372828980435227, + "grad_norm": 0.2447548359632492, + "learning_rate": 1.6804723469732704e-05, + "loss": 1.2606, + "step": 9526 + }, + { + "epoch": 2.8375807442431915, + "grad_norm": 0.23060426115989685, + "learning_rate": 1.6804016629154854e-05, + "loss": 1.2403, + "step": 9527 + }, + { + "epoch": 2.83787859044286, + "grad_norm": 0.22644715011119843, + "learning_rate": 1.6803309725273235e-05, + "loss": 1.2252, + "step": 9528 + }, + { + "epoch": 2.8381764366425286, + "grad_norm": 0.2479182928800583, + "learning_rate": 1.6802602758094416e-05, + "loss": 1.2555, + "step": 9529 + }, + { + "epoch": 2.8384742828421974, + "grad_norm": 0.24726583063602448, + "learning_rate": 1.680189572762498e-05, + "loss": 1.24, + "step": 9530 + }, + { + "epoch": 2.838772129041866, + "grad_norm": 0.24094510078430176, + "learning_rate": 1.6801188633871507e-05, + "loss": 1.254, + "step": 9531 + }, + { + "epoch": 2.839069975241535, + "grad_norm": 0.23302826285362244, + "learning_rate": 1.680048147684057e-05, + "loss": 1.2621, + "step": 9532 + }, + { + "epoch": 2.8393678214412033, + "grad_norm": 0.22267597913742065, + "learning_rate": 1.679977425653875e-05, + "loss": 1.2357, + "step": 9533 + }, + { + "epoch": 2.839665667640872, + "grad_norm": 0.23312880098819733, + "learning_rate": 1.6799066972972628e-05, + "loss": 1.2624, + "step": 9534 + }, + { + "epoch": 2.839963513840541, + "grad_norm": 0.22978711128234863, + "learning_rate": 1.6798359626148787e-05, + "loss": 1.2534, + "step": 9535 + }, + { + "epoch": 2.840261360040209, + "grad_norm": 0.24274848401546478, + "learning_rate": 1.67976522160738e-05, + "loss": 1.2629, + "step": 9536 + }, + { + "epoch": 2.840559206239878, + "grad_norm": 0.23926518857479095, + "learning_rate": 1.679694474275426e-05, + "loss": 1.2362, + "step": 9537 + }, + { + "epoch": 2.8408570524395467, + "grad_norm": 0.23229800164699554, + "learning_rate": 1.679623720619674e-05, + "loss": 1.244, + "step": 9538 + }, + { + "epoch": 2.841154898639215, + "grad_norm": 0.22292126715183258, + "learning_rate": 1.6795529606407822e-05, + "loss": 1.2394, + "step": 9539 + }, + { + "epoch": 2.8414527448388838, + "grad_norm": 0.2399204522371292, + "learning_rate": 1.6794821943394097e-05, + "loss": 1.2399, + "step": 9540 + }, + { + "epoch": 2.8417505910385525, + "grad_norm": 0.23259685933589935, + "learning_rate": 1.6794114217162145e-05, + "loss": 1.2324, + "step": 9541 + }, + { + "epoch": 2.842048437238221, + "grad_norm": 0.22789466381072998, + "learning_rate": 1.6793406427718554e-05, + "loss": 1.2245, + "step": 9542 + }, + { + "epoch": 2.8423462834378896, + "grad_norm": 0.23625433444976807, + "learning_rate": 1.6792698575069906e-05, + "loss": 1.2465, + "step": 9543 + }, + { + "epoch": 2.8426441296375584, + "grad_norm": 0.23433490097522736, + "learning_rate": 1.6791990659222782e-05, + "loss": 1.235, + "step": 9544 + }, + { + "epoch": 2.842941975837227, + "grad_norm": 0.24511510133743286, + "learning_rate": 1.6791282680183778e-05, + "loss": 1.242, + "step": 9545 + }, + { + "epoch": 2.843239822036896, + "grad_norm": 0.23765471577644348, + "learning_rate": 1.6790574637959473e-05, + "loss": 1.2391, + "step": 9546 + }, + { + "epoch": 2.8435376682365643, + "grad_norm": 0.23262514173984528, + "learning_rate": 1.6789866532556462e-05, + "loss": 1.2363, + "step": 9547 + }, + { + "epoch": 2.843835514436233, + "grad_norm": 0.23182271420955658, + "learning_rate": 1.6789158363981327e-05, + "loss": 1.2435, + "step": 9548 + }, + { + "epoch": 2.844133360635902, + "grad_norm": 0.247736856341362, + "learning_rate": 1.6788450132240658e-05, + "loss": 1.2321, + "step": 9549 + }, + { + "epoch": 2.84443120683557, + "grad_norm": 0.24930934607982635, + "learning_rate": 1.678774183734104e-05, + "loss": 1.2445, + "step": 9550 + }, + { + "epoch": 2.844729053035239, + "grad_norm": 0.24114583432674408, + "learning_rate": 1.6787033479289077e-05, + "loss": 1.2331, + "step": 9551 + }, + { + "epoch": 2.8450268992349077, + "grad_norm": 0.22924873232841492, + "learning_rate": 1.6786325058091346e-05, + "loss": 1.238, + "step": 9552 + }, + { + "epoch": 2.845324745434576, + "grad_norm": 0.25451821088790894, + "learning_rate": 1.6785616573754444e-05, + "loss": 1.2356, + "step": 9553 + }, + { + "epoch": 2.845622591634245, + "grad_norm": 0.23745770752429962, + "learning_rate": 1.6784908026284958e-05, + "loss": 1.2539, + "step": 9554 + }, + { + "epoch": 2.8459204378339136, + "grad_norm": 0.25693124532699585, + "learning_rate": 1.6784199415689488e-05, + "loss": 1.2456, + "step": 9555 + }, + { + "epoch": 2.846218284033582, + "grad_norm": 0.2471061795949936, + "learning_rate": 1.678349074197462e-05, + "loss": 1.2238, + "step": 9556 + }, + { + "epoch": 2.8465161302332507, + "grad_norm": 0.2600265443325043, + "learning_rate": 1.678278200514695e-05, + "loss": 1.2429, + "step": 9557 + }, + { + "epoch": 2.8468139764329194, + "grad_norm": 0.23116445541381836, + "learning_rate": 1.678207320521307e-05, + "loss": 1.2502, + "step": 9558 + }, + { + "epoch": 2.847111822632588, + "grad_norm": 0.2419513314962387, + "learning_rate": 1.678136434217958e-05, + "loss": 1.2478, + "step": 9559 + }, + { + "epoch": 2.847409668832257, + "grad_norm": 0.23644323647022247, + "learning_rate": 1.6780655416053067e-05, + "loss": 1.2479, + "step": 9560 + }, + { + "epoch": 2.8477075150319253, + "grad_norm": 0.23632511496543884, + "learning_rate": 1.6779946426840134e-05, + "loss": 1.2474, + "step": 9561 + }, + { + "epoch": 2.848005361231594, + "grad_norm": 0.22760628163814545, + "learning_rate": 1.677923737454737e-05, + "loss": 1.2512, + "step": 9562 + }, + { + "epoch": 2.848303207431263, + "grad_norm": 0.2346828281879425, + "learning_rate": 1.677852825918138e-05, + "loss": 1.2428, + "step": 9563 + }, + { + "epoch": 2.848601053630931, + "grad_norm": 0.2310081124305725, + "learning_rate": 1.6777819080748758e-05, + "loss": 1.2424, + "step": 9564 + }, + { + "epoch": 2.8488988998306, + "grad_norm": 0.23505978286266327, + "learning_rate": 1.67771098392561e-05, + "loss": 1.2535, + "step": 9565 + }, + { + "epoch": 2.8491967460302687, + "grad_norm": 0.22905951738357544, + "learning_rate": 1.677640053471001e-05, + "loss": 1.2335, + "step": 9566 + }, + { + "epoch": 2.849494592229937, + "grad_norm": 0.23601317405700684, + "learning_rate": 1.677569116711708e-05, + "loss": 1.2329, + "step": 9567 + }, + { + "epoch": 2.849792438429606, + "grad_norm": 0.2386091947555542, + "learning_rate": 1.677498173648392e-05, + "loss": 1.2542, + "step": 9568 + }, + { + "epoch": 2.8500902846292746, + "grad_norm": 0.24330885708332062, + "learning_rate": 1.6774272242817122e-05, + "loss": 1.2576, + "step": 9569 + }, + { + "epoch": 2.850388130828943, + "grad_norm": 0.25018224120140076, + "learning_rate": 1.6773562686123285e-05, + "loss": 1.2348, + "step": 9570 + }, + { + "epoch": 2.8506859770286117, + "grad_norm": 0.24150174856185913, + "learning_rate": 1.677285306640902e-05, + "loss": 1.2293, + "step": 9571 + }, + { + "epoch": 2.8509838232282805, + "grad_norm": 0.31612199544906616, + "learning_rate": 1.677214338368092e-05, + "loss": 1.2493, + "step": 9572 + }, + { + "epoch": 2.8512816694279493, + "grad_norm": 0.2653801441192627, + "learning_rate": 1.67714336379456e-05, + "loss": 1.25, + "step": 9573 + }, + { + "epoch": 2.851579515627618, + "grad_norm": 0.24127154052257538, + "learning_rate": 1.6770723829209648e-05, + "loss": 1.2442, + "step": 9574 + }, + { + "epoch": 2.8518773618272864, + "grad_norm": 0.2629128694534302, + "learning_rate": 1.6770013957479677e-05, + "loss": 1.2571, + "step": 9575 + }, + { + "epoch": 2.852175208026955, + "grad_norm": 0.23995302617549896, + "learning_rate": 1.6769304022762292e-05, + "loss": 1.2426, + "step": 9576 + }, + { + "epoch": 2.852473054226624, + "grad_norm": 0.228716641664505, + "learning_rate": 1.6768594025064095e-05, + "loss": 1.2414, + "step": 9577 + }, + { + "epoch": 2.8527709004262922, + "grad_norm": 0.23739491403102875, + "learning_rate": 1.6767883964391692e-05, + "loss": 1.2336, + "step": 9578 + }, + { + "epoch": 2.853068746625961, + "grad_norm": 0.26312458515167236, + "learning_rate": 1.676717384075169e-05, + "loss": 1.2566, + "step": 9579 + }, + { + "epoch": 2.8533665928256298, + "grad_norm": 0.24080444872379303, + "learning_rate": 1.67664636541507e-05, + "loss": 1.2411, + "step": 9580 + }, + { + "epoch": 2.853664439025298, + "grad_norm": 0.2274615615606308, + "learning_rate": 1.6765753404595322e-05, + "loss": 1.2481, + "step": 9581 + }, + { + "epoch": 2.853962285224967, + "grad_norm": 0.2274451106786728, + "learning_rate": 1.6765043092092167e-05, + "loss": 1.2453, + "step": 9582 + }, + { + "epoch": 2.8542601314246356, + "grad_norm": 0.23121044039726257, + "learning_rate": 1.676433271664785e-05, + "loss": 1.2562, + "step": 9583 + }, + { + "epoch": 2.8545579776243044, + "grad_norm": 0.23248712718486786, + "learning_rate": 1.6763622278268968e-05, + "loss": 1.2601, + "step": 9584 + }, + { + "epoch": 2.8548558238239727, + "grad_norm": 0.24338187277317047, + "learning_rate": 1.676291177696214e-05, + "loss": 1.2459, + "step": 9585 + }, + { + "epoch": 2.8551536700236415, + "grad_norm": 0.2362908124923706, + "learning_rate": 1.6762201212733974e-05, + "loss": 1.2413, + "step": 9586 + }, + { + "epoch": 2.8554515162233103, + "grad_norm": 0.3114035427570343, + "learning_rate": 1.676149058559108e-05, + "loss": 1.2266, + "step": 9587 + }, + { + "epoch": 2.855749362422979, + "grad_norm": 0.30946192145347595, + "learning_rate": 1.676077989554007e-05, + "loss": 1.249, + "step": 9588 + }, + { + "epoch": 2.8560472086226474, + "grad_norm": 0.2479553520679474, + "learning_rate": 1.6760069142587562e-05, + "loss": 1.2431, + "step": 9589 + }, + { + "epoch": 2.856345054822316, + "grad_norm": 0.5116506218910217, + "learning_rate": 1.6759358326740157e-05, + "loss": 1.2438, + "step": 9590 + }, + { + "epoch": 2.856642901021985, + "grad_norm": 0.3225812017917633, + "learning_rate": 1.675864744800448e-05, + "loss": 1.2424, + "step": 9591 + }, + { + "epoch": 2.8569407472216533, + "grad_norm": 0.2772776782512665, + "learning_rate": 1.6757936506387134e-05, + "loss": 1.2536, + "step": 9592 + }, + { + "epoch": 2.857238593421322, + "grad_norm": 0.238128662109375, + "learning_rate": 1.6757225501894744e-05, + "loss": 1.2472, + "step": 9593 + }, + { + "epoch": 2.857536439620991, + "grad_norm": 0.23039375245571136, + "learning_rate": 1.6756514434533915e-05, + "loss": 1.2291, + "step": 9594 + }, + { + "epoch": 2.857834285820659, + "grad_norm": 0.3229292035102844, + "learning_rate": 1.6755803304311272e-05, + "loss": 1.2391, + "step": 9595 + }, + { + "epoch": 2.858132132020328, + "grad_norm": 0.25468021631240845, + "learning_rate": 1.675509211123343e-05, + "loss": 1.2454, + "step": 9596 + }, + { + "epoch": 2.8584299782199967, + "grad_norm": 0.22903305292129517, + "learning_rate": 1.6754380855306998e-05, + "loss": 1.2355, + "step": 9597 + }, + { + "epoch": 2.8587278244196654, + "grad_norm": 0.23688064515590668, + "learning_rate": 1.67536695365386e-05, + "loss": 1.2426, + "step": 9598 + }, + { + "epoch": 2.859025670619334, + "grad_norm": 0.2441447377204895, + "learning_rate": 1.6752958154934854e-05, + "loss": 1.2289, + "step": 9599 + }, + { + "epoch": 2.8593235168190025, + "grad_norm": 0.2363729327917099, + "learning_rate": 1.6752246710502377e-05, + "loss": 1.2306, + "step": 9600 + }, + { + "epoch": 2.8596213630186713, + "grad_norm": 0.25469234585762024, + "learning_rate": 1.6751535203247785e-05, + "loss": 1.2572, + "step": 9601 + }, + { + "epoch": 2.85991920921834, + "grad_norm": 0.23895514011383057, + "learning_rate": 1.6750823633177703e-05, + "loss": 1.2399, + "step": 9602 + }, + { + "epoch": 2.8602170554180084, + "grad_norm": 0.2803517282009125, + "learning_rate": 1.6750112000298752e-05, + "loss": 1.2436, + "step": 9603 + }, + { + "epoch": 2.860514901617677, + "grad_norm": 0.2815166115760803, + "learning_rate": 1.674940030461755e-05, + "loss": 1.235, + "step": 9604 + }, + { + "epoch": 2.860812747817346, + "grad_norm": 0.2441072165966034, + "learning_rate": 1.6748688546140717e-05, + "loss": 1.2388, + "step": 9605 + }, + { + "epoch": 2.8611105940170143, + "grad_norm": 0.25432687997817993, + "learning_rate": 1.674797672487488e-05, + "loss": 1.2639, + "step": 9606 + }, + { + "epoch": 2.861408440216683, + "grad_norm": 0.26335468888282776, + "learning_rate": 1.674726484082666e-05, + "loss": 1.2202, + "step": 9607 + }, + { + "epoch": 2.861706286416352, + "grad_norm": 0.2342562973499298, + "learning_rate": 1.674655289400267e-05, + "loss": 1.2534, + "step": 9608 + }, + { + "epoch": 2.86200413261602, + "grad_norm": 0.294867604970932, + "learning_rate": 1.674584088440955e-05, + "loss": 1.2438, + "step": 9609 + }, + { + "epoch": 2.862301978815689, + "grad_norm": 0.23600003123283386, + "learning_rate": 1.674512881205392e-05, + "loss": 1.2456, + "step": 9610 + }, + { + "epoch": 2.8625998250153577, + "grad_norm": 0.2641909718513489, + "learning_rate": 1.6744416676942398e-05, + "loss": 1.2253, + "step": 9611 + }, + { + "epoch": 2.8628976712150265, + "grad_norm": 0.24446533620357513, + "learning_rate": 1.6743704479081616e-05, + "loss": 1.2466, + "step": 9612 + }, + { + "epoch": 2.8631955174146952, + "grad_norm": 0.25604259967803955, + "learning_rate": 1.6742992218478197e-05, + "loss": 1.2358, + "step": 9613 + }, + { + "epoch": 2.8634933636143636, + "grad_norm": 0.25585129857063293, + "learning_rate": 1.674227989513877e-05, + "loss": 1.2427, + "step": 9614 + }, + { + "epoch": 2.8637912098140323, + "grad_norm": 0.23947425186634064, + "learning_rate": 1.674156750906996e-05, + "loss": 1.2284, + "step": 9615 + }, + { + "epoch": 2.864089056013701, + "grad_norm": 0.25628355145454407, + "learning_rate": 1.6740855060278396e-05, + "loss": 1.2357, + "step": 9616 + }, + { + "epoch": 2.8643869022133694, + "grad_norm": 0.23210285604000092, + "learning_rate": 1.674014254877071e-05, + "loss": 1.2336, + "step": 9617 + }, + { + "epoch": 2.864684748413038, + "grad_norm": 0.29729771614074707, + "learning_rate": 1.6739429974553527e-05, + "loss": 1.2457, + "step": 9618 + }, + { + "epoch": 2.864982594612707, + "grad_norm": 0.24117979407310486, + "learning_rate": 1.6738717337633478e-05, + "loss": 1.2576, + "step": 9619 + }, + { + "epoch": 2.8652804408123753, + "grad_norm": 0.25205516815185547, + "learning_rate": 1.6738004638017192e-05, + "loss": 1.2615, + "step": 9620 + }, + { + "epoch": 2.865578287012044, + "grad_norm": 0.2526441514492035, + "learning_rate": 1.6737291875711303e-05, + "loss": 1.2409, + "step": 9621 + }, + { + "epoch": 2.865876133211713, + "grad_norm": 0.26580315828323364, + "learning_rate": 1.6736579050722438e-05, + "loss": 1.2576, + "step": 9622 + }, + { + "epoch": 2.866173979411381, + "grad_norm": 0.26286303997039795, + "learning_rate": 1.6735866163057234e-05, + "loss": 1.2408, + "step": 9623 + }, + { + "epoch": 2.86647182561105, + "grad_norm": 0.25381582975387573, + "learning_rate": 1.673515321272232e-05, + "loss": 1.2359, + "step": 9624 + }, + { + "epoch": 2.8667696718107187, + "grad_norm": 0.2711793780326843, + "learning_rate": 1.6734440199724328e-05, + "loss": 1.2412, + "step": 9625 + }, + { + "epoch": 2.8670675180103875, + "grad_norm": 0.23535223305225372, + "learning_rate": 1.6733727124069896e-05, + "loss": 1.2361, + "step": 9626 + }, + { + "epoch": 2.8673653642100563, + "grad_norm": 0.3429569900035858, + "learning_rate": 1.6733013985765658e-05, + "loss": 1.2486, + "step": 9627 + }, + { + "epoch": 2.8676632104097246, + "grad_norm": 0.2517103850841522, + "learning_rate": 1.6732300784818244e-05, + "loss": 1.2309, + "step": 9628 + }, + { + "epoch": 2.8679610566093934, + "grad_norm": 0.27148956060409546, + "learning_rate": 1.6731587521234296e-05, + "loss": 1.2542, + "step": 9629 + }, + { + "epoch": 2.868258902809062, + "grad_norm": 0.2414814680814743, + "learning_rate": 1.6730874195020447e-05, + "loss": 1.2581, + "step": 9630 + }, + { + "epoch": 2.8685567490087305, + "grad_norm": 0.35265201330184937, + "learning_rate": 1.673016080618333e-05, + "loss": 1.2499, + "step": 9631 + }, + { + "epoch": 2.8688545952083992, + "grad_norm": 0.22770635783672333, + "learning_rate": 1.672944735472959e-05, + "loss": 1.2165, + "step": 9632 + }, + { + "epoch": 2.869152441408068, + "grad_norm": 0.26756811141967773, + "learning_rate": 1.672873384066586e-05, + "loss": 1.2399, + "step": 9633 + }, + { + "epoch": 2.8694502876077363, + "grad_norm": 0.24693620204925537, + "learning_rate": 1.672802026399878e-05, + "loss": 1.2505, + "step": 9634 + }, + { + "epoch": 2.869748133807405, + "grad_norm": 0.36966007947921753, + "learning_rate": 1.672730662473499e-05, + "loss": 1.2275, + "step": 9635 + }, + { + "epoch": 2.870045980007074, + "grad_norm": 0.2499130368232727, + "learning_rate": 1.6726592922881124e-05, + "loss": 1.2419, + "step": 9636 + }, + { + "epoch": 2.870343826206742, + "grad_norm": 0.27664774656295776, + "learning_rate": 1.6725879158443826e-05, + "loss": 1.2279, + "step": 9637 + }, + { + "epoch": 2.870641672406411, + "grad_norm": 0.2373204231262207, + "learning_rate": 1.6725165331429743e-05, + "loss": 1.2387, + "step": 9638 + }, + { + "epoch": 2.8709395186060798, + "grad_norm": 0.3052253723144531, + "learning_rate": 1.6724451441845502e-05, + "loss": 1.2499, + "step": 9639 + }, + { + "epoch": 2.8712373648057485, + "grad_norm": 0.2708706259727478, + "learning_rate": 1.672373748969776e-05, + "loss": 1.2603, + "step": 9640 + }, + { + "epoch": 2.8715352110054173, + "grad_norm": 0.24858544766902924, + "learning_rate": 1.672302347499315e-05, + "loss": 1.2466, + "step": 9641 + }, + { + "epoch": 2.8718330572050856, + "grad_norm": 0.298337459564209, + "learning_rate": 1.6722309397738322e-05, + "loss": 1.2356, + "step": 9642 + }, + { + "epoch": 2.8721309034047544, + "grad_norm": 0.22833894193172455, + "learning_rate": 1.672159525793991e-05, + "loss": 1.2413, + "step": 9643 + }, + { + "epoch": 2.872428749604423, + "grad_norm": 0.23819833993911743, + "learning_rate": 1.6720881055604565e-05, + "loss": 1.2289, + "step": 9644 + }, + { + "epoch": 2.8727265958040915, + "grad_norm": 0.24357788264751434, + "learning_rate": 1.6720166790738934e-05, + "loss": 1.241, + "step": 9645 + }, + { + "epoch": 2.8730244420037603, + "grad_norm": 0.26105859875679016, + "learning_rate": 1.671945246334966e-05, + "loss": 1.2417, + "step": 9646 + }, + { + "epoch": 2.873322288203429, + "grad_norm": 0.22648964822292328, + "learning_rate": 1.671873807344338e-05, + "loss": 1.2446, + "step": 9647 + }, + { + "epoch": 2.8736201344030974, + "grad_norm": 0.26460540294647217, + "learning_rate": 1.6718023621026757e-05, + "loss": 1.2576, + "step": 9648 + }, + { + "epoch": 2.873917980602766, + "grad_norm": 0.25131869316101074, + "learning_rate": 1.6717309106106426e-05, + "loss": 1.2362, + "step": 9649 + }, + { + "epoch": 2.874215826802435, + "grad_norm": 0.42317044734954834, + "learning_rate": 1.671659452868904e-05, + "loss": 1.2244, + "step": 9650 + }, + { + "epoch": 2.8745136730021037, + "grad_norm": 0.2991759777069092, + "learning_rate": 1.671587988878125e-05, + "loss": 1.2495, + "step": 9651 + }, + { + "epoch": 2.874811519201772, + "grad_norm": 0.3073268532752991, + "learning_rate": 1.671516518638969e-05, + "loss": 1.23, + "step": 9652 + }, + { + "epoch": 2.875109365401441, + "grad_norm": 0.23857441544532776, + "learning_rate": 1.671445042152103e-05, + "loss": 1.2352, + "step": 9653 + }, + { + "epoch": 2.8754072116011096, + "grad_norm": 0.5188261270523071, + "learning_rate": 1.671373559418191e-05, + "loss": 1.2468, + "step": 9654 + }, + { + "epoch": 2.8757050578007783, + "grad_norm": 0.26395002007484436, + "learning_rate": 1.671302070437898e-05, + "loss": 1.2334, + "step": 9655 + }, + { + "epoch": 2.8760029040004467, + "grad_norm": 0.24812112748622894, + "learning_rate": 1.6712305752118894e-05, + "loss": 1.2381, + "step": 9656 + }, + { + "epoch": 2.8763007502001154, + "grad_norm": 0.24590472877025604, + "learning_rate": 1.6711590737408297e-05, + "loss": 1.2276, + "step": 9657 + }, + { + "epoch": 2.876598596399784, + "grad_norm": 0.23740769922733307, + "learning_rate": 1.6710875660253852e-05, + "loss": 1.2413, + "step": 9658 + }, + { + "epoch": 2.8768964425994525, + "grad_norm": 0.23263880610466003, + "learning_rate": 1.6710160520662207e-05, + "loss": 1.2511, + "step": 9659 + }, + { + "epoch": 2.8771942887991213, + "grad_norm": 0.24080054461956024, + "learning_rate": 1.6709445318640015e-05, + "loss": 1.2656, + "step": 9660 + }, + { + "epoch": 2.87749213499879, + "grad_norm": 0.2370612770318985, + "learning_rate": 1.670873005419393e-05, + "loss": 1.2597, + "step": 9661 + }, + { + "epoch": 2.8777899811984584, + "grad_norm": 0.2382216602563858, + "learning_rate": 1.6708014727330605e-05, + "loss": 1.2182, + "step": 9662 + }, + { + "epoch": 2.878087827398127, + "grad_norm": 0.23752722144126892, + "learning_rate": 1.67072993380567e-05, + "loss": 1.2448, + "step": 9663 + }, + { + "epoch": 2.878385673597796, + "grad_norm": 0.23380430042743683, + "learning_rate": 1.6706583886378868e-05, + "loss": 1.2295, + "step": 9664 + }, + { + "epoch": 2.8786835197974647, + "grad_norm": 0.2371366024017334, + "learning_rate": 1.6705868372303765e-05, + "loss": 1.2584, + "step": 9665 + }, + { + "epoch": 2.8789813659971335, + "grad_norm": 0.238836407661438, + "learning_rate": 1.670515279583805e-05, + "loss": 1.2522, + "step": 9666 + }, + { + "epoch": 2.879279212196802, + "grad_norm": 0.23018185794353485, + "learning_rate": 1.670443715698838e-05, + "loss": 1.2475, + "step": 9667 + }, + { + "epoch": 2.8795770583964706, + "grad_norm": 0.23417013883590698, + "learning_rate": 1.670372145576141e-05, + "loss": 1.2362, + "step": 9668 + }, + { + "epoch": 2.8798749045961394, + "grad_norm": 0.24360010027885437, + "learning_rate": 1.6703005692163804e-05, + "loss": 1.2639, + "step": 9669 + }, + { + "epoch": 2.8801727507958077, + "grad_norm": 0.2241598516702652, + "learning_rate": 1.670228986620222e-05, + "loss": 1.244, + "step": 9670 + }, + { + "epoch": 2.8804705969954765, + "grad_norm": 0.23797936737537384, + "learning_rate": 1.6701573977883314e-05, + "loss": 1.2357, + "step": 9671 + }, + { + "epoch": 2.8807684431951452, + "grad_norm": 0.23201867938041687, + "learning_rate": 1.6700858027213755e-05, + "loss": 1.2564, + "step": 9672 + }, + { + "epoch": 2.8810662893948136, + "grad_norm": 0.22328731417655945, + "learning_rate": 1.6700142014200197e-05, + "loss": 1.2547, + "step": 9673 + }, + { + "epoch": 2.8813641355944823, + "grad_norm": 0.23917587101459503, + "learning_rate": 1.66994259388493e-05, + "loss": 1.2463, + "step": 9674 + }, + { + "epoch": 2.881661981794151, + "grad_norm": 0.2283514440059662, + "learning_rate": 1.669870980116773e-05, + "loss": 1.2639, + "step": 9675 + }, + { + "epoch": 2.8819598279938194, + "grad_norm": 0.23538215458393097, + "learning_rate": 1.6697993601162152e-05, + "loss": 1.234, + "step": 9676 + }, + { + "epoch": 2.882257674193488, + "grad_norm": 0.23896881937980652, + "learning_rate": 1.6697277338839227e-05, + "loss": 1.2432, + "step": 9677 + }, + { + "epoch": 2.882555520393157, + "grad_norm": 0.23404815793037415, + "learning_rate": 1.6696561014205615e-05, + "loss": 1.237, + "step": 9678 + }, + { + "epoch": 2.8828533665928258, + "grad_norm": 0.22328928112983704, + "learning_rate": 1.669584462726799e-05, + "loss": 1.2493, + "step": 9679 + }, + { + "epoch": 2.8831512127924945, + "grad_norm": 0.22693878412246704, + "learning_rate": 1.669512817803301e-05, + "loss": 1.2638, + "step": 9680 + }, + { + "epoch": 2.883449058992163, + "grad_norm": 0.23578110337257385, + "learning_rate": 1.6694411666507343e-05, + "loss": 1.2557, + "step": 9681 + }, + { + "epoch": 2.8837469051918316, + "grad_norm": 0.23133975267410278, + "learning_rate": 1.669369509269765e-05, + "loss": 1.2467, + "step": 9682 + }, + { + "epoch": 2.8840447513915004, + "grad_norm": 0.2348644733428955, + "learning_rate": 1.6692978456610607e-05, + "loss": 1.2256, + "step": 9683 + }, + { + "epoch": 2.8843425975911687, + "grad_norm": 0.22899694740772247, + "learning_rate": 1.669226175825288e-05, + "loss": 1.2205, + "step": 9684 + }, + { + "epoch": 2.8846404437908375, + "grad_norm": 0.23254863917827606, + "learning_rate": 1.669154499763113e-05, + "loss": 1.2274, + "step": 9685 + }, + { + "epoch": 2.8849382899905063, + "grad_norm": 0.24034923315048218, + "learning_rate": 1.669082817475203e-05, + "loss": 1.2576, + "step": 9686 + }, + { + "epoch": 2.8852361361901746, + "grad_norm": 0.23759359121322632, + "learning_rate": 1.6690111289622254e-05, + "loss": 1.2472, + "step": 9687 + }, + { + "epoch": 2.8855339823898434, + "grad_norm": 0.23265060782432556, + "learning_rate": 1.668939434224846e-05, + "loss": 1.2261, + "step": 9688 + }, + { + "epoch": 2.885831828589512, + "grad_norm": 0.2382369488477707, + "learning_rate": 1.668867733263733e-05, + "loss": 1.2542, + "step": 9689 + }, + { + "epoch": 2.8861296747891805, + "grad_norm": 0.22997981309890747, + "learning_rate": 1.668796026079553e-05, + "loss": 1.2596, + "step": 9690 + }, + { + "epoch": 2.8864275209888492, + "grad_norm": 0.23344625532627106, + "learning_rate": 1.668724312672973e-05, + "loss": 1.2658, + "step": 9691 + }, + { + "epoch": 2.886725367188518, + "grad_norm": 0.23847095668315887, + "learning_rate": 1.668652593044661e-05, + "loss": 1.2479, + "step": 9692 + }, + { + "epoch": 2.887023213388187, + "grad_norm": 0.2525794506072998, + "learning_rate": 1.6685808671952827e-05, + "loss": 1.2473, + "step": 9693 + }, + { + "epoch": 2.8873210595878556, + "grad_norm": 0.24680624902248383, + "learning_rate": 1.6685091351255072e-05, + "loss": 1.2422, + "step": 9694 + }, + { + "epoch": 2.887618905787524, + "grad_norm": 0.24009333550930023, + "learning_rate": 1.6684373968360007e-05, + "loss": 1.2432, + "step": 9695 + }, + { + "epoch": 2.8879167519871927, + "grad_norm": 0.23311027884483337, + "learning_rate": 1.6683656523274316e-05, + "loss": 1.2285, + "step": 9696 + }, + { + "epoch": 2.8882145981868614, + "grad_norm": 0.24567042291164398, + "learning_rate": 1.6682939016004663e-05, + "loss": 1.2203, + "step": 9697 + }, + { + "epoch": 2.8885124443865298, + "grad_norm": 0.25748467445373535, + "learning_rate": 1.668222144655773e-05, + "loss": 1.2479, + "step": 9698 + }, + { + "epoch": 2.8888102905861985, + "grad_norm": 0.26577097177505493, + "learning_rate": 1.6681503814940195e-05, + "loss": 1.2561, + "step": 9699 + }, + { + "epoch": 2.8891081367858673, + "grad_norm": 0.24358929693698883, + "learning_rate": 1.668078612115873e-05, + "loss": 1.2498, + "step": 9700 + }, + { + "epoch": 2.8894059829855356, + "grad_norm": 0.24860145151615143, + "learning_rate": 1.6680068365220013e-05, + "loss": 1.2476, + "step": 9701 + }, + { + "epoch": 2.8897038291852044, + "grad_norm": 0.23227480053901672, + "learning_rate": 1.6679350547130725e-05, + "loss": 1.2411, + "step": 9702 + }, + { + "epoch": 2.890001675384873, + "grad_norm": 0.24423043429851532, + "learning_rate": 1.667863266689754e-05, + "loss": 1.2468, + "step": 9703 + }, + { + "epoch": 2.8902995215845415, + "grad_norm": 0.23363593220710754, + "learning_rate": 1.6677914724527145e-05, + "loss": 1.2563, + "step": 9704 + }, + { + "epoch": 2.8905973677842103, + "grad_norm": 0.23444494605064392, + "learning_rate": 1.667719672002621e-05, + "loss": 1.2416, + "step": 9705 + }, + { + "epoch": 2.890895213983879, + "grad_norm": 0.23438218235969543, + "learning_rate": 1.667647865340142e-05, + "loss": 1.2299, + "step": 9706 + }, + { + "epoch": 2.891193060183548, + "grad_norm": 0.22477249801158905, + "learning_rate": 1.667576052465946e-05, + "loss": 1.2562, + "step": 9707 + }, + { + "epoch": 2.8914909063832166, + "grad_norm": 0.22039125859737396, + "learning_rate": 1.6675042333807004e-05, + "loss": 1.2288, + "step": 9708 + }, + { + "epoch": 2.891788752582885, + "grad_norm": 0.23163697123527527, + "learning_rate": 1.6674324080850738e-05, + "loss": 1.2629, + "step": 9709 + }, + { + "epoch": 2.8920865987825537, + "grad_norm": 0.23662646114826202, + "learning_rate": 1.667360576579734e-05, + "loss": 1.239, + "step": 9710 + }, + { + "epoch": 2.8923844449822225, + "grad_norm": 0.23235507309436798, + "learning_rate": 1.6672887388653497e-05, + "loss": 1.2331, + "step": 9711 + }, + { + "epoch": 2.892682291181891, + "grad_norm": 0.23247478902339935, + "learning_rate": 1.6672168949425897e-05, + "loss": 1.2475, + "step": 9712 + }, + { + "epoch": 2.8929801373815596, + "grad_norm": 0.22635920345783234, + "learning_rate": 1.6671450448121217e-05, + "loss": 1.2385, + "step": 9713 + }, + { + "epoch": 2.8932779835812283, + "grad_norm": 0.2372814565896988, + "learning_rate": 1.667073188474614e-05, + "loss": 1.2609, + "step": 9714 + }, + { + "epoch": 2.8935758297808967, + "grad_norm": 0.2527531087398529, + "learning_rate": 1.6670013259307363e-05, + "loss": 1.2417, + "step": 9715 + }, + { + "epoch": 2.8938736759805654, + "grad_norm": 0.24779640138149261, + "learning_rate": 1.666929457181156e-05, + "loss": 1.256, + "step": 9716 + }, + { + "epoch": 2.894171522180234, + "grad_norm": 0.23665443062782288, + "learning_rate": 1.6668575822265426e-05, + "loss": 1.2405, + "step": 9717 + }, + { + "epoch": 2.894469368379903, + "grad_norm": 0.2807767689228058, + "learning_rate": 1.666785701067564e-05, + "loss": 1.2478, + "step": 9718 + }, + { + "epoch": 2.8947672145795713, + "grad_norm": 0.25625476241111755, + "learning_rate": 1.6667138137048896e-05, + "loss": 1.2354, + "step": 9719 + }, + { + "epoch": 2.89506506077924, + "grad_norm": 0.22842606902122498, + "learning_rate": 1.6666419201391884e-05, + "loss": 1.2347, + "step": 9720 + }, + { + "epoch": 2.895362906978909, + "grad_norm": 0.271775484085083, + "learning_rate": 1.6665700203711287e-05, + "loss": 1.2421, + "step": 9721 + }, + { + "epoch": 2.8956607531785776, + "grad_norm": 0.24809524416923523, + "learning_rate": 1.6664981144013794e-05, + "loss": 1.2492, + "step": 9722 + }, + { + "epoch": 2.895958599378246, + "grad_norm": 0.2469061017036438, + "learning_rate": 1.6664262022306103e-05, + "loss": 1.2309, + "step": 9723 + }, + { + "epoch": 2.8962564455779147, + "grad_norm": 0.33535274863243103, + "learning_rate": 1.6663542838594895e-05, + "loss": 1.2476, + "step": 9724 + }, + { + "epoch": 2.8965542917775835, + "grad_norm": 0.30277758836746216, + "learning_rate": 1.6662823592886866e-05, + "loss": 1.242, + "step": 9725 + }, + { + "epoch": 2.896852137977252, + "grad_norm": 0.23677143454551697, + "learning_rate": 1.6662104285188714e-05, + "loss": 1.2373, + "step": 9726 + }, + { + "epoch": 2.8971499841769206, + "grad_norm": 0.26896932721138, + "learning_rate": 1.666138491550712e-05, + "loss": 1.2305, + "step": 9727 + }, + { + "epoch": 2.8974478303765894, + "grad_norm": 0.245836541056633, + "learning_rate": 1.6660665483848783e-05, + "loss": 1.2537, + "step": 9728 + }, + { + "epoch": 2.8977456765762577, + "grad_norm": 0.2385386824607849, + "learning_rate": 1.6659945990220393e-05, + "loss": 1.2649, + "step": 9729 + }, + { + "epoch": 2.8980435227759265, + "grad_norm": 0.24647222459316254, + "learning_rate": 1.665922643462865e-05, + "loss": 1.2528, + "step": 9730 + }, + { + "epoch": 2.8983413689755952, + "grad_norm": 0.23912402987480164, + "learning_rate": 1.6658506817080246e-05, + "loss": 1.2577, + "step": 9731 + }, + { + "epoch": 2.898639215175264, + "grad_norm": 0.2441381812095642, + "learning_rate": 1.665778713758187e-05, + "loss": 1.2274, + "step": 9732 + }, + { + "epoch": 2.898937061374933, + "grad_norm": 0.2690194845199585, + "learning_rate": 1.6657067396140226e-05, + "loss": 1.2369, + "step": 9733 + }, + { + "epoch": 2.899234907574601, + "grad_norm": 0.2437698096036911, + "learning_rate": 1.665634759276201e-05, + "loss": 1.2469, + "step": 9734 + }, + { + "epoch": 2.89953275377427, + "grad_norm": 0.24350915849208832, + "learning_rate": 1.6655627727453912e-05, + "loss": 1.2442, + "step": 9735 + }, + { + "epoch": 2.8998305999739387, + "grad_norm": 0.27060574293136597, + "learning_rate": 1.6654907800222638e-05, + "loss": 1.2428, + "step": 9736 + }, + { + "epoch": 2.900128446173607, + "grad_norm": 0.2574555575847626, + "learning_rate": 1.665418781107488e-05, + "loss": 1.2533, + "step": 9737 + }, + { + "epoch": 2.9004262923732758, + "grad_norm": 0.23176690936088562, + "learning_rate": 1.665346776001734e-05, + "loss": 1.2595, + "step": 9738 + }, + { + "epoch": 2.9007241385729445, + "grad_norm": 0.2270326465368271, + "learning_rate": 1.6652747647056714e-05, + "loss": 1.2263, + "step": 9739 + }, + { + "epoch": 2.901021984772613, + "grad_norm": 0.2572081685066223, + "learning_rate": 1.665202747219971e-05, + "loss": 1.2456, + "step": 9740 + }, + { + "epoch": 2.9013198309722816, + "grad_norm": 0.24711395800113678, + "learning_rate": 1.6651307235453015e-05, + "loss": 1.2354, + "step": 9741 + }, + { + "epoch": 2.9016176771719504, + "grad_norm": 0.23309578001499176, + "learning_rate": 1.665058693682334e-05, + "loss": 1.244, + "step": 9742 + }, + { + "epoch": 2.9019155233716187, + "grad_norm": 0.25636541843414307, + "learning_rate": 1.6649866576317387e-05, + "loss": 1.2501, + "step": 9743 + }, + { + "epoch": 2.9022133695712875, + "grad_norm": 0.23264876008033752, + "learning_rate": 1.6649146153941854e-05, + "loss": 1.2478, + "step": 9744 + }, + { + "epoch": 2.9025112157709563, + "grad_norm": 0.2976943552494049, + "learning_rate": 1.6648425669703442e-05, + "loss": 1.2496, + "step": 9745 + }, + { + "epoch": 2.902809061970625, + "grad_norm": 0.31915852427482605, + "learning_rate": 1.664770512360886e-05, + "loss": 1.2558, + "step": 9746 + }, + { + "epoch": 2.903106908170294, + "grad_norm": 0.2434307187795639, + "learning_rate": 1.6646984515664806e-05, + "loss": 1.2483, + "step": 9747 + }, + { + "epoch": 2.903404754369962, + "grad_norm": 0.7460813522338867, + "learning_rate": 1.6646263845877993e-05, + "loss": 1.23, + "step": 9748 + }, + { + "epoch": 2.903702600569631, + "grad_norm": 0.30449822545051575, + "learning_rate": 1.6645543114255115e-05, + "loss": 1.2504, + "step": 9749 + }, + { + "epoch": 2.9040004467692997, + "grad_norm": 0.30233049392700195, + "learning_rate": 1.664482232080289e-05, + "loss": 1.2721, + "step": 9750 + }, + { + "epoch": 2.904298292968968, + "grad_norm": 0.2563141882419586, + "learning_rate": 1.664410146552801e-05, + "loss": 1.2322, + "step": 9751 + }, + { + "epoch": 2.904596139168637, + "grad_norm": 0.24350601434707642, + "learning_rate": 1.664338054843719e-05, + "loss": 1.2413, + "step": 9752 + }, + { + "epoch": 2.9048939853683056, + "grad_norm": 0.23631763458251953, + "learning_rate": 1.664265956953714e-05, + "loss": 1.2587, + "step": 9753 + }, + { + "epoch": 2.905191831567974, + "grad_norm": 0.26359960436820984, + "learning_rate": 1.6641938528834566e-05, + "loss": 1.2554, + "step": 9754 + }, + { + "epoch": 2.9054896777676427, + "grad_norm": 0.2576713562011719, + "learning_rate": 1.6641217426336167e-05, + "loss": 1.2564, + "step": 9755 + }, + { + "epoch": 2.9057875239673114, + "grad_norm": 0.253892183303833, + "learning_rate": 1.6640496262048663e-05, + "loss": 1.2217, + "step": 9756 + }, + { + "epoch": 2.9060853701669798, + "grad_norm": 0.23795241117477417, + "learning_rate": 1.6639775035978766e-05, + "loss": 1.2464, + "step": 9757 + }, + { + "epoch": 2.9063832163666485, + "grad_norm": 0.22954751551151276, + "learning_rate": 1.6639053748133176e-05, + "loss": 1.2469, + "step": 9758 + }, + { + "epoch": 2.9066810625663173, + "grad_norm": 0.24584701657295227, + "learning_rate": 1.6638332398518607e-05, + "loss": 1.2331, + "step": 9759 + }, + { + "epoch": 2.906978908765986, + "grad_norm": 0.2480352371931076, + "learning_rate": 1.6637610987141774e-05, + "loss": 1.2469, + "step": 9760 + }, + { + "epoch": 2.907276754965655, + "grad_norm": 0.24403853714466095, + "learning_rate": 1.6636889514009387e-05, + "loss": 1.2408, + "step": 9761 + }, + { + "epoch": 2.907574601165323, + "grad_norm": 0.24723169207572937, + "learning_rate": 1.6636167979128157e-05, + "loss": 1.2423, + "step": 9762 + }, + { + "epoch": 2.907872447364992, + "grad_norm": 0.2441592812538147, + "learning_rate": 1.66354463825048e-05, + "loss": 1.2566, + "step": 9763 + }, + { + "epoch": 2.9081702935646607, + "grad_norm": 0.2384587973356247, + "learning_rate": 1.6634724724146028e-05, + "loss": 1.2588, + "step": 9764 + }, + { + "epoch": 2.908468139764329, + "grad_norm": 0.2357000708580017, + "learning_rate": 1.6634003004058553e-05, + "loss": 1.2239, + "step": 9765 + }, + { + "epoch": 2.908765985963998, + "grad_norm": 0.2468337118625641, + "learning_rate": 1.6633281222249092e-05, + "loss": 1.2347, + "step": 9766 + }, + { + "epoch": 2.9090638321636666, + "grad_norm": 0.22989989817142487, + "learning_rate": 1.6632559378724364e-05, + "loss": 1.2435, + "step": 9767 + }, + { + "epoch": 2.909361678363335, + "grad_norm": 0.23285478353500366, + "learning_rate": 1.6631837473491075e-05, + "loss": 1.2466, + "step": 9768 + }, + { + "epoch": 2.9096595245630037, + "grad_norm": 0.23242908716201782, + "learning_rate": 1.6631115506555953e-05, + "loss": 1.2317, + "step": 9769 + }, + { + "epoch": 2.9099573707626725, + "grad_norm": 0.2319924533367157, + "learning_rate": 1.663039347792571e-05, + "loss": 1.2345, + "step": 9770 + }, + { + "epoch": 2.910255216962341, + "grad_norm": 0.23280765116214752, + "learning_rate": 1.6629671387607062e-05, + "loss": 1.2381, + "step": 9771 + }, + { + "epoch": 2.9105530631620096, + "grad_norm": 0.23628489673137665, + "learning_rate": 1.662894923560673e-05, + "loss": 1.2329, + "step": 9772 + }, + { + "epoch": 2.9108509093616783, + "grad_norm": 0.23635146021842957, + "learning_rate": 1.662822702193143e-05, + "loss": 1.2503, + "step": 9773 + }, + { + "epoch": 2.911148755561347, + "grad_norm": 0.2271934151649475, + "learning_rate": 1.6627504746587885e-05, + "loss": 1.246, + "step": 9774 + }, + { + "epoch": 2.911446601761016, + "grad_norm": 0.2384863942861557, + "learning_rate": 1.662678240958281e-05, + "loss": 1.2476, + "step": 9775 + }, + { + "epoch": 2.911744447960684, + "grad_norm": 0.23095139861106873, + "learning_rate": 1.6626060010922927e-05, + "loss": 1.2491, + "step": 9776 + }, + { + "epoch": 2.912042294160353, + "grad_norm": 0.23577161133289337, + "learning_rate": 1.6625337550614963e-05, + "loss": 1.2349, + "step": 9777 + }, + { + "epoch": 2.9123401403600218, + "grad_norm": 0.2283676713705063, + "learning_rate": 1.6624615028665636e-05, + "loss": 1.2447, + "step": 9778 + }, + { + "epoch": 2.91263798655969, + "grad_norm": 0.22701942920684814, + "learning_rate": 1.6623892445081665e-05, + "loss": 1.2444, + "step": 9779 + }, + { + "epoch": 2.912935832759359, + "grad_norm": 0.22096702456474304, + "learning_rate": 1.662316979986978e-05, + "loss": 1.2516, + "step": 9780 + }, + { + "epoch": 2.9132336789590276, + "grad_norm": 0.2332075834274292, + "learning_rate": 1.6622447093036697e-05, + "loss": 1.251, + "step": 9781 + }, + { + "epoch": 2.913531525158696, + "grad_norm": 0.23211286962032318, + "learning_rate": 1.662172432458914e-05, + "loss": 1.2372, + "step": 9782 + }, + { + "epoch": 2.9138293713583647, + "grad_norm": 0.22454820573329926, + "learning_rate": 1.662100149453384e-05, + "loss": 1.2394, + "step": 9783 + }, + { + "epoch": 2.9141272175580335, + "grad_norm": 0.23275113105773926, + "learning_rate": 1.6620278602877517e-05, + "loss": 1.2469, + "step": 9784 + }, + { + "epoch": 2.9144250637577023, + "grad_norm": 0.23850785195827484, + "learning_rate": 1.6619555649626894e-05, + "loss": 1.2473, + "step": 9785 + }, + { + "epoch": 2.9147229099573706, + "grad_norm": 0.23938463628292084, + "learning_rate": 1.661883263478871e-05, + "loss": 1.2342, + "step": 9786 + }, + { + "epoch": 2.9150207561570394, + "grad_norm": 0.22498579323291779, + "learning_rate": 1.6618109558369676e-05, + "loss": 1.2506, + "step": 9787 + }, + { + "epoch": 2.915318602356708, + "grad_norm": 0.23149646818637848, + "learning_rate": 1.6617386420376532e-05, + "loss": 1.2331, + "step": 9788 + }, + { + "epoch": 2.915616448556377, + "grad_norm": 0.2308996319770813, + "learning_rate": 1.6616663220815996e-05, + "loss": 1.2339, + "step": 9789 + }, + { + "epoch": 2.9159142947560452, + "grad_norm": 0.23279112577438354, + "learning_rate": 1.6615939959694805e-05, + "loss": 1.2537, + "step": 9790 + }, + { + "epoch": 2.916212140955714, + "grad_norm": 0.22437404096126556, + "learning_rate": 1.6615216637019683e-05, + "loss": 1.2215, + "step": 9791 + }, + { + "epoch": 2.916509987155383, + "grad_norm": 0.22464318573474884, + "learning_rate": 1.6614493252797365e-05, + "loss": 1.2589, + "step": 9792 + }, + { + "epoch": 2.916807833355051, + "grad_norm": 0.2366705983877182, + "learning_rate": 1.661376980703457e-05, + "loss": 1.2355, + "step": 9793 + }, + { + "epoch": 2.91710567955472, + "grad_norm": 0.22409525513648987, + "learning_rate": 1.661304629973804e-05, + "loss": 1.2458, + "step": 9794 + }, + { + "epoch": 2.9174035257543887, + "grad_norm": 0.23578611016273499, + "learning_rate": 1.6612322730914505e-05, + "loss": 1.2289, + "step": 9795 + }, + { + "epoch": 2.917701371954057, + "grad_norm": 0.24363459646701813, + "learning_rate": 1.661159910057069e-05, + "loss": 1.2426, + "step": 9796 + }, + { + "epoch": 2.9179992181537258, + "grad_norm": 0.2294311672449112, + "learning_rate": 1.6610875408713335e-05, + "loss": 1.2305, + "step": 9797 + }, + { + "epoch": 2.9182970643533945, + "grad_norm": 0.2467951625585556, + "learning_rate": 1.6610151655349173e-05, + "loss": 1.2366, + "step": 9798 + }, + { + "epoch": 2.9185949105530633, + "grad_norm": 0.22517724335193634, + "learning_rate": 1.660942784048493e-05, + "loss": 1.2428, + "step": 9799 + }, + { + "epoch": 2.918892756752732, + "grad_norm": 0.2336469143629074, + "learning_rate": 1.660870396412735e-05, + "loss": 1.2372, + "step": 9800 + }, + { + "epoch": 2.9191906029524004, + "grad_norm": 0.22752323746681213, + "learning_rate": 1.660798002628316e-05, + "loss": 1.2208, + "step": 9801 + }, + { + "epoch": 2.919488449152069, + "grad_norm": 0.22840507328510284, + "learning_rate": 1.66072560269591e-05, + "loss": 1.247, + "step": 9802 + }, + { + "epoch": 2.919786295351738, + "grad_norm": 0.2431611567735672, + "learning_rate": 1.6606531966161906e-05, + "loss": 1.2537, + "step": 9803 + }, + { + "epoch": 2.9200841415514063, + "grad_norm": 0.24127916991710663, + "learning_rate": 1.660580784389831e-05, + "loss": 1.2346, + "step": 9804 + }, + { + "epoch": 2.920381987751075, + "grad_norm": 0.21927498281002045, + "learning_rate": 1.6605083660175055e-05, + "loss": 1.2371, + "step": 9805 + }, + { + "epoch": 2.920679833950744, + "grad_norm": 0.23330777883529663, + "learning_rate": 1.6604359414998877e-05, + "loss": 1.258, + "step": 9806 + }, + { + "epoch": 2.920977680150412, + "grad_norm": 0.23680800199508667, + "learning_rate": 1.660363510837651e-05, + "loss": 1.2515, + "step": 9807 + }, + { + "epoch": 2.921275526350081, + "grad_norm": 0.23028503358364105, + "learning_rate": 1.66029107403147e-05, + "loss": 1.2444, + "step": 9808 + }, + { + "epoch": 2.9215733725497497, + "grad_norm": 0.24483390152454376, + "learning_rate": 1.660218631082018e-05, + "loss": 1.2477, + "step": 9809 + }, + { + "epoch": 2.921871218749418, + "grad_norm": 0.23751649260520935, + "learning_rate": 1.6601461819899694e-05, + "loss": 1.2474, + "step": 9810 + }, + { + "epoch": 2.922169064949087, + "grad_norm": 0.22798416018486023, + "learning_rate": 1.660073726755998e-05, + "loss": 1.2276, + "step": 9811 + }, + { + "epoch": 2.9224669111487556, + "grad_norm": 0.23340913653373718, + "learning_rate": 1.660001265380778e-05, + "loss": 1.2574, + "step": 9812 + }, + { + "epoch": 2.9227647573484243, + "grad_norm": 0.22620655596256256, + "learning_rate": 1.659928797864984e-05, + "loss": 1.2226, + "step": 9813 + }, + { + "epoch": 2.923062603548093, + "grad_norm": 0.23551420867443085, + "learning_rate": 1.6598563242092895e-05, + "loss": 1.2439, + "step": 9814 + }, + { + "epoch": 2.9233604497477614, + "grad_norm": 0.24146650731563568, + "learning_rate": 1.6597838444143697e-05, + "loss": 1.2468, + "step": 9815 + }, + { + "epoch": 2.92365829594743, + "grad_norm": 0.24275468289852142, + "learning_rate": 1.659711358480898e-05, + "loss": 1.2258, + "step": 9816 + }, + { + "epoch": 2.923956142147099, + "grad_norm": 0.2402866780757904, + "learning_rate": 1.6596388664095487e-05, + "loss": 1.2566, + "step": 9817 + }, + { + "epoch": 2.9242539883467673, + "grad_norm": 0.23758667707443237, + "learning_rate": 1.6595663682009973e-05, + "loss": 1.2603, + "step": 9818 + }, + { + "epoch": 2.924551834546436, + "grad_norm": 0.23961032927036285, + "learning_rate": 1.659493863855918e-05, + "loss": 1.2449, + "step": 9819 + }, + { + "epoch": 2.924849680746105, + "grad_norm": 0.23793166875839233, + "learning_rate": 1.6594213533749846e-05, + "loss": 1.2412, + "step": 9820 + }, + { + "epoch": 2.925147526945773, + "grad_norm": 0.22784459590911865, + "learning_rate": 1.659348836758872e-05, + "loss": 1.2201, + "step": 9821 + }, + { + "epoch": 2.925445373145442, + "grad_norm": 0.23748725652694702, + "learning_rate": 1.6592763140082556e-05, + "loss": 1.2613, + "step": 9822 + }, + { + "epoch": 2.9257432193451107, + "grad_norm": 0.2378886193037033, + "learning_rate": 1.6592037851238097e-05, + "loss": 1.232, + "step": 9823 + }, + { + "epoch": 2.926041065544779, + "grad_norm": 0.2596132159233093, + "learning_rate": 1.659131250106209e-05, + "loss": 1.2399, + "step": 9824 + }, + { + "epoch": 2.926338911744448, + "grad_norm": 0.23729299008846283, + "learning_rate": 1.6590587089561287e-05, + "loss": 1.2499, + "step": 9825 + }, + { + "epoch": 2.9266367579441166, + "grad_norm": 0.2374809831380844, + "learning_rate": 1.6589861616742434e-05, + "loss": 1.2425, + "step": 9826 + }, + { + "epoch": 2.9269346041437854, + "grad_norm": 0.23805709183216095, + "learning_rate": 1.6589136082612277e-05, + "loss": 1.2432, + "step": 9827 + }, + { + "epoch": 2.927232450343454, + "grad_norm": 0.24178627133369446, + "learning_rate": 1.6588410487177572e-05, + "loss": 1.2409, + "step": 9828 + }, + { + "epoch": 2.9275302965431225, + "grad_norm": 0.24564237892627716, + "learning_rate": 1.6587684830445074e-05, + "loss": 1.2289, + "step": 9829 + }, + { + "epoch": 2.9278281427427912, + "grad_norm": 0.22558070719242096, + "learning_rate": 1.6586959112421523e-05, + "loss": 1.2531, + "step": 9830 + }, + { + "epoch": 2.92812598894246, + "grad_norm": 0.23690807819366455, + "learning_rate": 1.6586233333113678e-05, + "loss": 1.2414, + "step": 9831 + }, + { + "epoch": 2.9284238351421283, + "grad_norm": 0.22581611573696136, + "learning_rate": 1.658550749252829e-05, + "loss": 1.2254, + "step": 9832 + }, + { + "epoch": 2.928721681341797, + "grad_norm": 0.2420835644006729, + "learning_rate": 1.6584781590672116e-05, + "loss": 1.2655, + "step": 9833 + }, + { + "epoch": 2.929019527541466, + "grad_norm": 0.23327945172786713, + "learning_rate": 1.6584055627551904e-05, + "loss": 1.2352, + "step": 9834 + }, + { + "epoch": 2.929317373741134, + "grad_norm": 0.2318052500486374, + "learning_rate": 1.658332960317441e-05, + "loss": 1.238, + "step": 9835 + }, + { + "epoch": 2.929615219940803, + "grad_norm": 0.23198439180850983, + "learning_rate": 1.6582603517546388e-05, + "loss": 1.2388, + "step": 9836 + }, + { + "epoch": 2.9299130661404718, + "grad_norm": 0.22870133817195892, + "learning_rate": 1.6581877370674596e-05, + "loss": 1.2363, + "step": 9837 + }, + { + "epoch": 2.93021091234014, + "grad_norm": 0.22588759660720825, + "learning_rate": 1.6581151162565788e-05, + "loss": 1.2404, + "step": 9838 + }, + { + "epoch": 2.930508758539809, + "grad_norm": 0.23626545071601868, + "learning_rate": 1.6580424893226723e-05, + "loss": 1.2428, + "step": 9839 + }, + { + "epoch": 2.9308066047394776, + "grad_norm": 0.23212206363677979, + "learning_rate": 1.6579698562664157e-05, + "loss": 1.2289, + "step": 9840 + }, + { + "epoch": 2.9311044509391464, + "grad_norm": 0.2351151555776596, + "learning_rate": 1.6578972170884843e-05, + "loss": 1.2396, + "step": 9841 + }, + { + "epoch": 2.931402297138815, + "grad_norm": 0.24143719673156738, + "learning_rate": 1.657824571789555e-05, + "loss": 1.2353, + "step": 9842 + }, + { + "epoch": 2.9317001433384835, + "grad_norm": 0.23602788150310516, + "learning_rate": 1.6577519203703025e-05, + "loss": 1.2361, + "step": 9843 + }, + { + "epoch": 2.9319979895381523, + "grad_norm": 0.23900477588176727, + "learning_rate": 1.6576792628314033e-05, + "loss": 1.2599, + "step": 9844 + }, + { + "epoch": 2.932295835737821, + "grad_norm": 0.22072771191596985, + "learning_rate": 1.6576065991735336e-05, + "loss": 1.248, + "step": 9845 + }, + { + "epoch": 2.9325936819374894, + "grad_norm": 0.22451645135879517, + "learning_rate": 1.657533929397369e-05, + "loss": 1.2394, + "step": 9846 + }, + { + "epoch": 2.932891528137158, + "grad_norm": 0.23405350744724274, + "learning_rate": 1.6574612535035857e-05, + "loss": 1.2491, + "step": 9847 + }, + { + "epoch": 2.933189374336827, + "grad_norm": 0.23247672617435455, + "learning_rate": 1.6573885714928604e-05, + "loss": 1.2508, + "step": 9848 + }, + { + "epoch": 2.9334872205364952, + "grad_norm": 0.23231729865074158, + "learning_rate": 1.6573158833658688e-05, + "loss": 1.2512, + "step": 9849 + }, + { + "epoch": 2.933785066736164, + "grad_norm": 0.23208829760551453, + "learning_rate": 1.657243189123287e-05, + "loss": 1.2394, + "step": 9850 + }, + { + "epoch": 2.934082912935833, + "grad_norm": 0.22789599001407623, + "learning_rate": 1.657170488765792e-05, + "loss": 1.222, + "step": 9851 + }, + { + "epoch": 2.9343807591355016, + "grad_norm": 0.22650061547756195, + "learning_rate": 1.65709778229406e-05, + "loss": 1.2518, + "step": 9852 + }, + { + "epoch": 2.93467860533517, + "grad_norm": 0.22586821019649506, + "learning_rate": 1.6570250697087668e-05, + "loss": 1.2375, + "step": 9853 + }, + { + "epoch": 2.9349764515348387, + "grad_norm": 0.2344573438167572, + "learning_rate": 1.6569523510105898e-05, + "loss": 1.2535, + "step": 9854 + }, + { + "epoch": 2.9352742977345074, + "grad_norm": 0.2413981705904007, + "learning_rate": 1.6568796262002048e-05, + "loss": 1.2379, + "step": 9855 + }, + { + "epoch": 2.935572143934176, + "grad_norm": 0.2279159277677536, + "learning_rate": 1.656806895278289e-05, + "loss": 1.2452, + "step": 9856 + }, + { + "epoch": 2.9358699901338445, + "grad_norm": 0.23538503050804138, + "learning_rate": 1.656734158245519e-05, + "loss": 1.2397, + "step": 9857 + }, + { + "epoch": 2.9361678363335133, + "grad_norm": 0.22665321826934814, + "learning_rate": 1.656661415102571e-05, + "loss": 1.2455, + "step": 9858 + }, + { + "epoch": 2.936465682533182, + "grad_norm": 0.23031972348690033, + "learning_rate": 1.6565886658501226e-05, + "loss": 1.2423, + "step": 9859 + }, + { + "epoch": 2.9367635287328504, + "grad_norm": 0.22960679233074188, + "learning_rate": 1.65651591048885e-05, + "loss": 1.2505, + "step": 9860 + }, + { + "epoch": 2.937061374932519, + "grad_norm": 0.2389422059059143, + "learning_rate": 1.6564431490194306e-05, + "loss": 1.234, + "step": 9861 + }, + { + "epoch": 2.937359221132188, + "grad_norm": 0.23131327331066132, + "learning_rate": 1.6563703814425408e-05, + "loss": 1.25, + "step": 9862 + }, + { + "epoch": 2.9376570673318563, + "grad_norm": 0.23465389013290405, + "learning_rate": 1.6562976077588582e-05, + "loss": 1.2497, + "step": 9863 + }, + { + "epoch": 2.937954913531525, + "grad_norm": 0.2397429347038269, + "learning_rate": 1.6562248279690594e-05, + "loss": 1.2384, + "step": 9864 + }, + { + "epoch": 2.938252759731194, + "grad_norm": 0.2289528101682663, + "learning_rate": 1.656152042073822e-05, + "loss": 1.241, + "step": 9865 + }, + { + "epoch": 2.9385506059308626, + "grad_norm": 0.22949634492397308, + "learning_rate": 1.656079250073823e-05, + "loss": 1.2297, + "step": 9866 + }, + { + "epoch": 2.9388484521305314, + "grad_norm": 0.22586174309253693, + "learning_rate": 1.6560064519697393e-05, + "loss": 1.2611, + "step": 9867 + }, + { + "epoch": 2.9391462983301997, + "grad_norm": 0.22224444150924683, + "learning_rate": 1.6559336477622486e-05, + "loss": 1.2322, + "step": 9868 + }, + { + "epoch": 2.9394441445298685, + "grad_norm": 0.22912226617336273, + "learning_rate": 1.655860837452028e-05, + "loss": 1.2504, + "step": 9869 + }, + { + "epoch": 2.9397419907295372, + "grad_norm": 0.2326570451259613, + "learning_rate": 1.655788021039755e-05, + "loss": 1.247, + "step": 9870 + }, + { + "epoch": 2.9400398369292056, + "grad_norm": 0.2364460974931717, + "learning_rate": 1.6557151985261074e-05, + "loss": 1.2426, + "step": 9871 + }, + { + "epoch": 2.9403376831288743, + "grad_norm": 0.22613415122032166, + "learning_rate": 1.6556423699117626e-05, + "loss": 1.2329, + "step": 9872 + }, + { + "epoch": 2.940635529328543, + "grad_norm": 0.23236924409866333, + "learning_rate": 1.655569535197398e-05, + "loss": 1.2511, + "step": 9873 + }, + { + "epoch": 2.9409333755282114, + "grad_norm": 0.24027229845523834, + "learning_rate": 1.6554966943836914e-05, + "loss": 1.2489, + "step": 9874 + }, + { + "epoch": 2.94123122172788, + "grad_norm": 0.23759745061397552, + "learning_rate": 1.6554238474713204e-05, + "loss": 1.2559, + "step": 9875 + }, + { + "epoch": 2.941529067927549, + "grad_norm": 0.23697206377983093, + "learning_rate": 1.6553509944609626e-05, + "loss": 1.2519, + "step": 9876 + }, + { + "epoch": 2.9418269141272173, + "grad_norm": 0.23689374327659607, + "learning_rate": 1.655278135353296e-05, + "loss": 1.253, + "step": 9877 + }, + { + "epoch": 2.942124760326886, + "grad_norm": 0.22923915088176727, + "learning_rate": 1.6552052701489985e-05, + "loss": 1.2367, + "step": 9878 + }, + { + "epoch": 2.942422606526555, + "grad_norm": 0.226065993309021, + "learning_rate": 1.655132398848748e-05, + "loss": 1.2276, + "step": 9879 + }, + { + "epoch": 2.9427204527262236, + "grad_norm": 0.23301160335540771, + "learning_rate": 1.6550595214532226e-05, + "loss": 1.2392, + "step": 9880 + }, + { + "epoch": 2.9430182989258924, + "grad_norm": 0.22971084713935852, + "learning_rate": 1.6549866379631005e-05, + "loss": 1.2288, + "step": 9881 + }, + { + "epoch": 2.9433161451255607, + "grad_norm": 0.22882910072803497, + "learning_rate": 1.6549137483790588e-05, + "loss": 1.2432, + "step": 9882 + }, + { + "epoch": 2.9436139913252295, + "grad_norm": 0.2380112111568451, + "learning_rate": 1.6548408527017768e-05, + "loss": 1.2654, + "step": 9883 + }, + { + "epoch": 2.9439118375248983, + "grad_norm": 0.2446528822183609, + "learning_rate": 1.6547679509319322e-05, + "loss": 1.2594, + "step": 9884 + }, + { + "epoch": 2.9442096837245666, + "grad_norm": 0.22983220219612122, + "learning_rate": 1.6546950430702036e-05, + "loss": 1.2399, + "step": 9885 + }, + { + "epoch": 2.9445075299242354, + "grad_norm": 0.23347225785255432, + "learning_rate": 1.6546221291172687e-05, + "loss": 1.2367, + "step": 9886 + }, + { + "epoch": 2.944805376123904, + "grad_norm": 0.24225640296936035, + "learning_rate": 1.6545492090738067e-05, + "loss": 1.2312, + "step": 9887 + }, + { + "epoch": 2.9451032223235725, + "grad_norm": 0.23472385108470917, + "learning_rate": 1.6544762829404956e-05, + "loss": 1.2397, + "step": 9888 + }, + { + "epoch": 2.9454010685232412, + "grad_norm": 0.22931373119354248, + "learning_rate": 1.6544033507180133e-05, + "loss": 1.2454, + "step": 9889 + }, + { + "epoch": 2.94569891472291, + "grad_norm": 0.23852002620697021, + "learning_rate": 1.6543304124070397e-05, + "loss": 1.2512, + "step": 9890 + }, + { + "epoch": 2.9459967609225783, + "grad_norm": 0.24039921164512634, + "learning_rate": 1.6542574680082525e-05, + "loss": 1.2504, + "step": 9891 + }, + { + "epoch": 2.946294607122247, + "grad_norm": 0.24208903312683105, + "learning_rate": 1.6541845175223305e-05, + "loss": 1.247, + "step": 9892 + }, + { + "epoch": 2.946592453321916, + "grad_norm": 0.23167331516742706, + "learning_rate": 1.654111560949952e-05, + "loss": 1.2448, + "step": 9893 + }, + { + "epoch": 2.9468902995215847, + "grad_norm": 0.2375795543193817, + "learning_rate": 1.6540385982917968e-05, + "loss": 1.2339, + "step": 9894 + }, + { + "epoch": 2.9471881457212534, + "grad_norm": 0.21858654916286469, + "learning_rate": 1.653965629548543e-05, + "loss": 1.233, + "step": 9895 + }, + { + "epoch": 2.9474859919209218, + "grad_norm": 0.24020157754421234, + "learning_rate": 1.6538926547208695e-05, + "loss": 1.2513, + "step": 9896 + }, + { + "epoch": 2.9477838381205905, + "grad_norm": 0.2464299201965332, + "learning_rate": 1.653819673809456e-05, + "loss": 1.2628, + "step": 9897 + }, + { + "epoch": 2.9480816843202593, + "grad_norm": 0.23754477500915527, + "learning_rate": 1.65374668681498e-05, + "loss": 1.2486, + "step": 9898 + }, + { + "epoch": 2.9483795305199276, + "grad_norm": 0.2259082794189453, + "learning_rate": 1.653673693738122e-05, + "loss": 1.2404, + "step": 9899 + }, + { + "epoch": 2.9486773767195964, + "grad_norm": 0.2358454018831253, + "learning_rate": 1.653600694579561e-05, + "loss": 1.2525, + "step": 9900 + }, + { + "epoch": 2.948975222919265, + "grad_norm": 0.22510115802288055, + "learning_rate": 1.6535276893399753e-05, + "loss": 1.2303, + "step": 9901 + }, + { + "epoch": 2.9492730691189335, + "grad_norm": 0.22290019690990448, + "learning_rate": 1.6534546780200447e-05, + "loss": 1.2473, + "step": 9902 + }, + { + "epoch": 2.9495709153186023, + "grad_norm": 0.2295389622449875, + "learning_rate": 1.6533816606204483e-05, + "loss": 1.237, + "step": 9903 + }, + { + "epoch": 2.949868761518271, + "grad_norm": 0.24453958868980408, + "learning_rate": 1.6533086371418656e-05, + "loss": 1.2511, + "step": 9904 + }, + { + "epoch": 2.95016660771794, + "grad_norm": 0.22952069342136383, + "learning_rate": 1.653235607584976e-05, + "loss": 1.2452, + "step": 9905 + }, + { + "epoch": 2.950464453917608, + "grad_norm": 0.2298763394355774, + "learning_rate": 1.653162571950459e-05, + "loss": 1.2374, + "step": 9906 + }, + { + "epoch": 2.950762300117277, + "grad_norm": 0.23416483402252197, + "learning_rate": 1.6530895302389936e-05, + "loss": 1.2336, + "step": 9907 + }, + { + "epoch": 2.9510601463169457, + "grad_norm": 0.2344265580177307, + "learning_rate": 1.6530164824512606e-05, + "loss": 1.2541, + "step": 9908 + }, + { + "epoch": 2.9513579925166145, + "grad_norm": 0.23769165575504303, + "learning_rate": 1.652943428587938e-05, + "loss": 1.2336, + "step": 9909 + }, + { + "epoch": 2.951655838716283, + "grad_norm": 0.22906984388828278, + "learning_rate": 1.652870368649707e-05, + "loss": 1.2537, + "step": 9910 + }, + { + "epoch": 2.9519536849159516, + "grad_norm": 0.2274005115032196, + "learning_rate": 1.652797302637246e-05, + "loss": 1.2478, + "step": 9911 + }, + { + "epoch": 2.9522515311156203, + "grad_norm": 0.22509224712848663, + "learning_rate": 1.6527242305512358e-05, + "loss": 1.235, + "step": 9912 + }, + { + "epoch": 2.9525493773152887, + "grad_norm": 0.22659392654895782, + "learning_rate": 1.6526511523923558e-05, + "loss": 1.2266, + "step": 9913 + }, + { + "epoch": 2.9528472235149574, + "grad_norm": 0.22773674130439758, + "learning_rate": 1.6525780681612863e-05, + "loss": 1.2315, + "step": 9914 + }, + { + "epoch": 2.953145069714626, + "grad_norm": 0.23700878024101257, + "learning_rate": 1.6525049778587067e-05, + "loss": 1.2478, + "step": 9915 + }, + { + "epoch": 2.9534429159142945, + "grad_norm": 0.22286926209926605, + "learning_rate": 1.6524318814852973e-05, + "loss": 1.24, + "step": 9916 + }, + { + "epoch": 2.9537407621139633, + "grad_norm": 0.2216832935810089, + "learning_rate": 1.6523587790417384e-05, + "loss": 1.2419, + "step": 9917 + }, + { + "epoch": 2.954038608313632, + "grad_norm": 0.23151254653930664, + "learning_rate": 1.6522856705287097e-05, + "loss": 1.222, + "step": 9918 + }, + { + "epoch": 2.954336454513301, + "grad_norm": 0.23313990235328674, + "learning_rate": 1.6522125559468918e-05, + "loss": 1.2469, + "step": 9919 + }, + { + "epoch": 2.954634300712969, + "grad_norm": 0.2344045490026474, + "learning_rate": 1.6521394352969645e-05, + "loss": 1.2443, + "step": 9920 + }, + { + "epoch": 2.954932146912638, + "grad_norm": 0.22582167387008667, + "learning_rate": 1.6520663085796087e-05, + "loss": 1.246, + "step": 9921 + }, + { + "epoch": 2.9552299931123067, + "grad_norm": 0.23233038187026978, + "learning_rate": 1.651993175795504e-05, + "loss": 1.2395, + "step": 9922 + }, + { + "epoch": 2.9555278393119755, + "grad_norm": 0.2369987815618515, + "learning_rate": 1.6519200369453314e-05, + "loss": 1.2418, + "step": 9923 + }, + { + "epoch": 2.955825685511644, + "grad_norm": 0.23401440680027008, + "learning_rate": 1.6518468920297713e-05, + "loss": 1.2275, + "step": 9924 + }, + { + "epoch": 2.9561235317113126, + "grad_norm": 0.22939825057983398, + "learning_rate": 1.6517737410495044e-05, + "loss": 1.2331, + "step": 9925 + }, + { + "epoch": 2.9564213779109814, + "grad_norm": 0.2362087219953537, + "learning_rate": 1.6517005840052107e-05, + "loss": 1.2517, + "step": 9926 + }, + { + "epoch": 2.9567192241106497, + "grad_norm": 0.2244790494441986, + "learning_rate": 1.6516274208975712e-05, + "loss": 1.2336, + "step": 9927 + }, + { + "epoch": 2.9570170703103185, + "grad_norm": 0.2383660078048706, + "learning_rate": 1.6515542517272666e-05, + "loss": 1.2298, + "step": 9928 + }, + { + "epoch": 2.9573149165099872, + "grad_norm": 0.226767897605896, + "learning_rate": 1.6514810764949777e-05, + "loss": 1.228, + "step": 9929 + }, + { + "epoch": 2.9576127627096556, + "grad_norm": 0.23022127151489258, + "learning_rate": 1.6514078952013853e-05, + "loss": 1.2577, + "step": 9930 + }, + { + "epoch": 2.9579106089093243, + "grad_norm": 0.23604728281497955, + "learning_rate": 1.6513347078471706e-05, + "loss": 1.2478, + "step": 9931 + }, + { + "epoch": 2.958208455108993, + "grad_norm": 0.22337941825389862, + "learning_rate": 1.6512615144330134e-05, + "loss": 1.2392, + "step": 9932 + }, + { + "epoch": 2.958506301308662, + "grad_norm": 0.2234209179878235, + "learning_rate": 1.651188314959596e-05, + "loss": 1.246, + "step": 9933 + }, + { + "epoch": 2.9588041475083307, + "grad_norm": 0.22856219112873077, + "learning_rate": 1.6511151094275985e-05, + "loss": 1.2547, + "step": 9934 + }, + { + "epoch": 2.959101993707999, + "grad_norm": 0.2265620231628418, + "learning_rate": 1.6510418978377027e-05, + "loss": 1.2484, + "step": 9935 + }, + { + "epoch": 2.9593998399076678, + "grad_norm": 0.22914516925811768, + "learning_rate": 1.6509686801905892e-05, + "loss": 1.2492, + "step": 9936 + }, + { + "epoch": 2.9596976861073365, + "grad_norm": 0.23201251029968262, + "learning_rate": 1.6508954564869398e-05, + "loss": 1.2441, + "step": 9937 + }, + { + "epoch": 2.959995532307005, + "grad_norm": 0.23081888258457184, + "learning_rate": 1.650822226727435e-05, + "loss": 1.2398, + "step": 9938 + }, + { + "epoch": 2.9602933785066736, + "grad_norm": 0.23917965590953827, + "learning_rate": 1.6507489909127564e-05, + "loss": 1.2339, + "step": 9939 + }, + { + "epoch": 2.9605912247063424, + "grad_norm": 0.23118039965629578, + "learning_rate": 1.650675749043586e-05, + "loss": 1.2522, + "step": 9940 + }, + { + "epoch": 2.9608890709060107, + "grad_norm": 0.23159939050674438, + "learning_rate": 1.6506025011206045e-05, + "loss": 1.2628, + "step": 9941 + }, + { + "epoch": 2.9611869171056795, + "grad_norm": 0.2369726151227951, + "learning_rate": 1.6505292471444936e-05, + "loss": 1.2414, + "step": 9942 + }, + { + "epoch": 2.9614847633053483, + "grad_norm": 0.2323664128780365, + "learning_rate": 1.6504559871159348e-05, + "loss": 1.2513, + "step": 9943 + }, + { + "epoch": 2.9617826095050166, + "grad_norm": 0.23310616612434387, + "learning_rate": 1.65038272103561e-05, + "loss": 1.2316, + "step": 9944 + }, + { + "epoch": 2.9620804557046854, + "grad_norm": 0.2253895252943039, + "learning_rate": 1.6503094489042005e-05, + "loss": 1.2237, + "step": 9945 + }, + { + "epoch": 2.962378301904354, + "grad_norm": 0.228556826710701, + "learning_rate": 1.650236170722388e-05, + "loss": 1.2499, + "step": 9946 + }, + { + "epoch": 2.962676148104023, + "grad_norm": 0.2337954193353653, + "learning_rate": 1.6501628864908546e-05, + "loss": 1.242, + "step": 9947 + }, + { + "epoch": 2.9629739943036917, + "grad_norm": 0.23642444610595703, + "learning_rate": 1.650089596210282e-05, + "loss": 1.2356, + "step": 9948 + }, + { + "epoch": 2.96327184050336, + "grad_norm": 0.23544715344905853, + "learning_rate": 1.6500162998813517e-05, + "loss": 1.2414, + "step": 9949 + }, + { + "epoch": 2.963569686703029, + "grad_norm": 0.239651620388031, + "learning_rate": 1.6499429975047462e-05, + "loss": 1.228, + "step": 9950 + }, + { + "epoch": 2.9638675329026976, + "grad_norm": 0.23488640785217285, + "learning_rate": 1.6498696890811473e-05, + "loss": 1.2401, + "step": 9951 + }, + { + "epoch": 2.964165379102366, + "grad_norm": 0.23645220696926117, + "learning_rate": 1.6497963746112372e-05, + "loss": 1.2485, + "step": 9952 + }, + { + "epoch": 2.9644632253020347, + "grad_norm": 0.24776875972747803, + "learning_rate": 1.6497230540956976e-05, + "loss": 1.2586, + "step": 9953 + }, + { + "epoch": 2.9647610715017034, + "grad_norm": 0.2415783405303955, + "learning_rate": 1.6496497275352107e-05, + "loss": 1.2551, + "step": 9954 + }, + { + "epoch": 2.9650589177013718, + "grad_norm": 0.2303195595741272, + "learning_rate": 1.6495763949304594e-05, + "loss": 1.2505, + "step": 9955 + }, + { + "epoch": 2.9653567639010405, + "grad_norm": 0.23608770966529846, + "learning_rate": 1.649503056282125e-05, + "loss": 1.2421, + "step": 9956 + }, + { + "epoch": 2.9656546101007093, + "grad_norm": 0.2326173037290573, + "learning_rate": 1.6494297115908907e-05, + "loss": 1.2416, + "step": 9957 + }, + { + "epoch": 2.9659524563003776, + "grad_norm": 0.2311028689146042, + "learning_rate": 1.6493563608574385e-05, + "loss": 1.2425, + "step": 9958 + }, + { + "epoch": 2.9662503025000464, + "grad_norm": 0.23269657790660858, + "learning_rate": 1.6492830040824506e-05, + "loss": 1.2367, + "step": 9959 + }, + { + "epoch": 2.966548148699715, + "grad_norm": 0.24212895333766937, + "learning_rate": 1.64920964126661e-05, + "loss": 1.2466, + "step": 9960 + }, + { + "epoch": 2.966845994899384, + "grad_norm": 0.22782926261425018, + "learning_rate": 1.6491362724105985e-05, + "loss": 1.2348, + "step": 9961 + }, + { + "epoch": 2.9671438410990527, + "grad_norm": 0.24124827980995178, + "learning_rate": 1.6490628975150998e-05, + "loss": 1.2454, + "step": 9962 + }, + { + "epoch": 2.967441687298721, + "grad_norm": 0.2469920665025711, + "learning_rate": 1.6489895165807962e-05, + "loss": 1.2412, + "step": 9963 + }, + { + "epoch": 2.96773953349839, + "grad_norm": 0.24608944356441498, + "learning_rate": 1.64891612960837e-05, + "loss": 1.2487, + "step": 9964 + }, + { + "epoch": 2.9680373796980586, + "grad_norm": 0.24406841397285461, + "learning_rate": 1.648842736598504e-05, + "loss": 1.2477, + "step": 9965 + }, + { + "epoch": 2.968335225897727, + "grad_norm": 0.2579127848148346, + "learning_rate": 1.6487693375518815e-05, + "loss": 1.2541, + "step": 9966 + }, + { + "epoch": 2.9686330720973957, + "grad_norm": 0.24070177972316742, + "learning_rate": 1.648695932469185e-05, + "loss": 1.2532, + "step": 9967 + }, + { + "epoch": 2.9689309182970645, + "grad_norm": 0.24999187886714935, + "learning_rate": 1.6486225213510975e-05, + "loss": 1.2431, + "step": 9968 + }, + { + "epoch": 2.969228764496733, + "grad_norm": 0.2467721551656723, + "learning_rate": 1.6485491041983027e-05, + "loss": 1.2389, + "step": 9969 + }, + { + "epoch": 2.9695266106964016, + "grad_norm": 0.2413964718580246, + "learning_rate": 1.6484756810114825e-05, + "loss": 1.2456, + "step": 9970 + }, + { + "epoch": 2.9698244568960703, + "grad_norm": 0.25925734639167786, + "learning_rate": 1.6484022517913207e-05, + "loss": 1.2325, + "step": 9971 + }, + { + "epoch": 2.970122303095739, + "grad_norm": 0.2431621253490448, + "learning_rate": 1.6483288165385008e-05, + "loss": 1.2285, + "step": 9972 + }, + { + "epoch": 2.9704201492954074, + "grad_norm": 0.2343003749847412, + "learning_rate": 1.648255375253705e-05, + "loss": 1.2302, + "step": 9973 + }, + { + "epoch": 2.970717995495076, + "grad_norm": 0.23708860576152802, + "learning_rate": 1.6481819279376176e-05, + "loss": 1.2539, + "step": 9974 + }, + { + "epoch": 2.971015841694745, + "grad_norm": 0.23648568987846375, + "learning_rate": 1.6481084745909214e-05, + "loss": 1.2507, + "step": 9975 + }, + { + "epoch": 2.9713136878944137, + "grad_norm": 0.23991839587688446, + "learning_rate": 1.6480350152143e-05, + "loss": 1.2349, + "step": 9976 + }, + { + "epoch": 2.971611534094082, + "grad_norm": 0.24438275396823883, + "learning_rate": 1.647961549808437e-05, + "loss": 1.2539, + "step": 9977 + }, + { + "epoch": 2.971909380293751, + "grad_norm": 0.24863120913505554, + "learning_rate": 1.6478880783740153e-05, + "loss": 1.2439, + "step": 9978 + }, + { + "epoch": 2.9722072264934196, + "grad_norm": 0.23804743587970734, + "learning_rate": 1.6478146009117195e-05, + "loss": 1.2279, + "step": 9979 + }, + { + "epoch": 2.972505072693088, + "grad_norm": 0.2433329075574875, + "learning_rate": 1.6477411174222323e-05, + "loss": 1.2343, + "step": 9980 + }, + { + "epoch": 2.9728029188927567, + "grad_norm": 0.24504166841506958, + "learning_rate": 1.6476676279062376e-05, + "loss": 1.2495, + "step": 9981 + }, + { + "epoch": 2.9731007650924255, + "grad_norm": 0.23672258853912354, + "learning_rate": 1.6475941323644192e-05, + "loss": 1.248, + "step": 9982 + }, + { + "epoch": 2.973398611292094, + "grad_norm": 0.24193841218948364, + "learning_rate": 1.647520630797461e-05, + "loss": 1.2482, + "step": 9983 + }, + { + "epoch": 2.9736964574917626, + "grad_norm": 0.22066807746887207, + "learning_rate": 1.6474471232060468e-05, + "loss": 1.2481, + "step": 9984 + }, + { + "epoch": 2.9739943036914314, + "grad_norm": 0.29701849818229675, + "learning_rate": 1.6473736095908605e-05, + "loss": 1.2438, + "step": 9985 + }, + { + "epoch": 2.9742921498911, + "grad_norm": 0.2828117907047272, + "learning_rate": 1.6473000899525862e-05, + "loss": 1.2446, + "step": 9986 + }, + { + "epoch": 2.974589996090769, + "grad_norm": 0.26809558272361755, + "learning_rate": 1.6472265642919077e-05, + "loss": 1.2423, + "step": 9987 + }, + { + "epoch": 2.9748878422904372, + "grad_norm": 0.28550103306770325, + "learning_rate": 1.647153032609509e-05, + "loss": 1.2449, + "step": 9988 + }, + { + "epoch": 2.975185688490106, + "grad_norm": 0.22642813622951508, + "learning_rate": 1.6470794949060748e-05, + "loss": 1.2275, + "step": 9989 + }, + { + "epoch": 2.975483534689775, + "grad_norm": 0.2423151582479477, + "learning_rate": 1.6470059511822882e-05, + "loss": 1.2458, + "step": 9990 + }, + { + "epoch": 2.975781380889443, + "grad_norm": 0.23246872425079346, + "learning_rate": 1.6469324014388343e-05, + "loss": 1.2529, + "step": 9991 + }, + { + "epoch": 2.976079227089112, + "grad_norm": 0.2409932166337967, + "learning_rate": 1.6468588456763974e-05, + "loss": 1.2341, + "step": 9992 + }, + { + "epoch": 2.9763770732887807, + "grad_norm": 0.2303859293460846, + "learning_rate": 1.6467852838956618e-05, + "loss": 1.2365, + "step": 9993 + }, + { + "epoch": 2.976674919488449, + "grad_norm": 0.2380605936050415, + "learning_rate": 1.6467117160973116e-05, + "loss": 1.2535, + "step": 9994 + }, + { + "epoch": 2.9769727656881178, + "grad_norm": 0.24125492572784424, + "learning_rate": 1.646638142282031e-05, + "loss": 1.2511, + "step": 9995 + }, + { + "epoch": 2.9772706118877865, + "grad_norm": 0.23002773523330688, + "learning_rate": 1.6465645624505056e-05, + "loss": 1.2329, + "step": 9996 + }, + { + "epoch": 2.977568458087455, + "grad_norm": 0.23827829957008362, + "learning_rate": 1.6464909766034192e-05, + "loss": 1.2401, + "step": 9997 + }, + { + "epoch": 2.9778663042871236, + "grad_norm": 0.26324141025543213, + "learning_rate": 1.6464173847414563e-05, + "loss": 1.241, + "step": 9998 + }, + { + "epoch": 2.9781641504867924, + "grad_norm": 0.28758561611175537, + "learning_rate": 1.646343786865302e-05, + "loss": 1.2317, + "step": 9999 + }, + { + "epoch": 2.978461996686461, + "grad_norm": 0.23155654966831207, + "learning_rate": 1.646270182975641e-05, + "loss": 1.2442, + "step": 10000 + }, + { + "epoch": 2.978461996686461, + "eval_loss": 1.3382065296173096, + "eval_runtime": 21.0972, + "eval_samples_per_second": 82.191, + "eval_steps_per_second": 5.167, + "step": 10000 + }, + { + "epoch": 2.97875984288613, + "grad_norm": 0.4102727770805359, + "learning_rate": 1.6461965730731577e-05, + "loss": 1.2258, + "step": 10001 + }, + { + "epoch": 2.9790576890857983, + "grad_norm": 0.30318817496299744, + "learning_rate": 1.646122957158537e-05, + "loss": 1.2344, + "step": 10002 + }, + { + "epoch": 2.979355535285467, + "grad_norm": 0.3035779893398285, + "learning_rate": 1.6460493352324652e-05, + "loss": 1.2306, + "step": 10003 + }, + { + "epoch": 2.979653381485136, + "grad_norm": 0.24000269174575806, + "learning_rate": 1.6459757072956252e-05, + "loss": 1.2533, + "step": 10004 + }, + { + "epoch": 2.979951227684804, + "grad_norm": 0.25043413043022156, + "learning_rate": 1.6459020733487033e-05, + "loss": 1.244, + "step": 10005 + }, + { + "epoch": 2.980249073884473, + "grad_norm": 0.29275935888290405, + "learning_rate": 1.645828433392384e-05, + "loss": 1.2406, + "step": 10006 + }, + { + "epoch": 2.9805469200841417, + "grad_norm": 0.2531569302082062, + "learning_rate": 1.645754787427353e-05, + "loss": 1.2479, + "step": 10007 + }, + { + "epoch": 2.98084476628381, + "grad_norm": 0.24172085523605347, + "learning_rate": 1.6456811354542948e-05, + "loss": 1.2395, + "step": 10008 + }, + { + "epoch": 2.981142612483479, + "grad_norm": 0.23907482624053955, + "learning_rate": 1.6456074774738955e-05, + "loss": 1.2268, + "step": 10009 + }, + { + "epoch": 2.9814404586831476, + "grad_norm": 0.251799076795578, + "learning_rate": 1.6455338134868396e-05, + "loss": 1.2493, + "step": 10010 + }, + { + "epoch": 2.981738304882816, + "grad_norm": 0.2671225666999817, + "learning_rate": 1.645460143493813e-05, + "loss": 1.2318, + "step": 10011 + }, + { + "epoch": 2.9820361510824847, + "grad_norm": 0.2631953954696655, + "learning_rate": 1.645386467495501e-05, + "loss": 1.2437, + "step": 10012 + }, + { + "epoch": 2.9823339972821534, + "grad_norm": 0.22674435377120972, + "learning_rate": 1.645312785492589e-05, + "loss": 1.2618, + "step": 10013 + }, + { + "epoch": 2.982631843481822, + "grad_norm": 0.23676465451717377, + "learning_rate": 1.645239097485762e-05, + "loss": 1.2477, + "step": 10014 + }, + { + "epoch": 2.982929689681491, + "grad_norm": 0.23759299516677856, + "learning_rate": 1.6451654034757066e-05, + "loss": 1.2551, + "step": 10015 + }, + { + "epoch": 2.9832275358811593, + "grad_norm": 0.25936782360076904, + "learning_rate": 1.6450917034631076e-05, + "loss": 1.2643, + "step": 10016 + }, + { + "epoch": 2.983525382080828, + "grad_norm": 0.23249895870685577, + "learning_rate": 1.6450179974486516e-05, + "loss": 1.252, + "step": 10017 + }, + { + "epoch": 2.983823228280497, + "grad_norm": 0.2421620935201645, + "learning_rate": 1.6449442854330235e-05, + "loss": 1.2437, + "step": 10018 + }, + { + "epoch": 2.984121074480165, + "grad_norm": 0.22764573991298676, + "learning_rate": 1.644870567416909e-05, + "loss": 1.2379, + "step": 10019 + }, + { + "epoch": 2.984418920679834, + "grad_norm": 0.25051426887512207, + "learning_rate": 1.644796843400995e-05, + "loss": 1.2374, + "step": 10020 + }, + { + "epoch": 2.9847167668795027, + "grad_norm": 0.2495751678943634, + "learning_rate": 1.644723113385966e-05, + "loss": 1.2337, + "step": 10021 + }, + { + "epoch": 2.985014613079171, + "grad_norm": 0.24617891013622284, + "learning_rate": 1.6446493773725095e-05, + "loss": 1.2517, + "step": 10022 + }, + { + "epoch": 2.98531245927884, + "grad_norm": 0.2377837598323822, + "learning_rate": 1.6445756353613105e-05, + "loss": 1.264, + "step": 10023 + }, + { + "epoch": 2.9856103054785086, + "grad_norm": 0.23834186792373657, + "learning_rate": 1.6445018873530552e-05, + "loss": 1.2531, + "step": 10024 + }, + { + "epoch": 2.985908151678177, + "grad_norm": 0.23706509172916412, + "learning_rate": 1.64442813334843e-05, + "loss": 1.2448, + "step": 10025 + }, + { + "epoch": 2.9862059978778457, + "grad_norm": 0.25559601187705994, + "learning_rate": 1.644354373348121e-05, + "loss": 1.2515, + "step": 10026 + }, + { + "epoch": 2.9865038440775145, + "grad_norm": 0.23977580666542053, + "learning_rate": 1.644280607352815e-05, + "loss": 1.2304, + "step": 10027 + }, + { + "epoch": 2.9868016902771832, + "grad_norm": 0.22511330246925354, + "learning_rate": 1.644206835363197e-05, + "loss": 1.2362, + "step": 10028 + }, + { + "epoch": 2.987099536476852, + "grad_norm": 0.2313072681427002, + "learning_rate": 1.6441330573799546e-05, + "loss": 1.2339, + "step": 10029 + }, + { + "epoch": 2.9873973826765203, + "grad_norm": 0.23325109481811523, + "learning_rate": 1.644059273403774e-05, + "loss": 1.2507, + "step": 10030 + }, + { + "epoch": 2.987695228876189, + "grad_norm": 0.23916980624198914, + "learning_rate": 1.6439854834353412e-05, + "loss": 1.2473, + "step": 10031 + }, + { + "epoch": 2.987993075075858, + "grad_norm": 0.23768900334835052, + "learning_rate": 1.6439116874753426e-05, + "loss": 1.2488, + "step": 10032 + }, + { + "epoch": 2.988290921275526, + "grad_norm": 0.2517053186893463, + "learning_rate": 1.6438378855244655e-05, + "loss": 1.2495, + "step": 10033 + }, + { + "epoch": 2.988588767475195, + "grad_norm": 0.23658375442028046, + "learning_rate": 1.6437640775833963e-05, + "loss": 1.2394, + "step": 10034 + }, + { + "epoch": 2.9888866136748637, + "grad_norm": 0.23645053803920746, + "learning_rate": 1.6436902636528215e-05, + "loss": 1.2206, + "step": 10035 + }, + { + "epoch": 2.989184459874532, + "grad_norm": 0.2451772540807724, + "learning_rate": 1.643616443733428e-05, + "loss": 1.2519, + "step": 10036 + }, + { + "epoch": 2.989482306074201, + "grad_norm": 0.22844362258911133, + "learning_rate": 1.6435426178259025e-05, + "loss": 1.2361, + "step": 10037 + }, + { + "epoch": 2.9897801522738696, + "grad_norm": 0.24487826228141785, + "learning_rate": 1.643468785930932e-05, + "loss": 1.2463, + "step": 10038 + }, + { + "epoch": 2.9900779984735384, + "grad_norm": 0.24516311287879944, + "learning_rate": 1.6433949480492032e-05, + "loss": 1.2239, + "step": 10039 + }, + { + "epoch": 2.9903758446732067, + "grad_norm": 0.2422989457845688, + "learning_rate": 1.6433211041814036e-05, + "loss": 1.2561, + "step": 10040 + }, + { + "epoch": 2.9906736908728755, + "grad_norm": 0.22691424190998077, + "learning_rate": 1.6432472543282195e-05, + "loss": 1.2112, + "step": 10041 + }, + { + "epoch": 2.9909715370725443, + "grad_norm": 0.22487223148345947, + "learning_rate": 1.6431733984903386e-05, + "loss": 1.2274, + "step": 10042 + }, + { + "epoch": 2.991269383272213, + "grad_norm": 0.22842620313167572, + "learning_rate": 1.6430995366684478e-05, + "loss": 1.23, + "step": 10043 + }, + { + "epoch": 2.9915672294718814, + "grad_norm": 0.2280990481376648, + "learning_rate": 1.6430256688632345e-05, + "loss": 1.2465, + "step": 10044 + }, + { + "epoch": 2.99186507567155, + "grad_norm": 0.24010218679904938, + "learning_rate": 1.642951795075386e-05, + "loss": 1.2387, + "step": 10045 + }, + { + "epoch": 2.992162921871219, + "grad_norm": 0.23180556297302246, + "learning_rate": 1.6428779153055886e-05, + "loss": 1.2317, + "step": 10046 + }, + { + "epoch": 2.9924607680708872, + "grad_norm": 0.24207353591918945, + "learning_rate": 1.6428040295545308e-05, + "loss": 1.245, + "step": 10047 + }, + { + "epoch": 2.992758614270556, + "grad_norm": 0.23159845173358917, + "learning_rate": 1.6427301378229e-05, + "loss": 1.2436, + "step": 10048 + }, + { + "epoch": 2.9930564604702248, + "grad_norm": 0.24207107722759247, + "learning_rate": 1.642656240111383e-05, + "loss": 1.2449, + "step": 10049 + }, + { + "epoch": 2.993354306669893, + "grad_norm": 0.22876659035682678, + "learning_rate": 1.642582336420668e-05, + "loss": 1.2268, + "step": 10050 + }, + { + "epoch": 2.993652152869562, + "grad_norm": 0.23025888204574585, + "learning_rate": 1.642508426751442e-05, + "loss": 1.2499, + "step": 10051 + }, + { + "epoch": 2.9939499990692306, + "grad_norm": 0.2378217875957489, + "learning_rate": 1.642434511104393e-05, + "loss": 1.2434, + "step": 10052 + }, + { + "epoch": 2.9942478452688994, + "grad_norm": 0.24848827719688416, + "learning_rate": 1.642360589480209e-05, + "loss": 1.2405, + "step": 10053 + }, + { + "epoch": 2.994545691468568, + "grad_norm": 0.26267918944358826, + "learning_rate": 1.642286661879577e-05, + "loss": 1.2495, + "step": 10054 + }, + { + "epoch": 2.9948435376682365, + "grad_norm": 0.22974130511283875, + "learning_rate": 1.642212728303185e-05, + "loss": 1.2466, + "step": 10055 + }, + { + "epoch": 2.9951413838679053, + "grad_norm": 0.39148205518722534, + "learning_rate": 1.6421387887517215e-05, + "loss": 1.2371, + "step": 10056 + }, + { + "epoch": 2.995439230067574, + "grad_norm": 0.2892146706581116, + "learning_rate": 1.6420648432258743e-05, + "loss": 1.2605, + "step": 10057 + }, + { + "epoch": 2.9957370762672424, + "grad_norm": 0.284047394990921, + "learning_rate": 1.6419908917263305e-05, + "loss": 1.2328, + "step": 10058 + }, + { + "epoch": 2.996034922466911, + "grad_norm": 0.23381011188030243, + "learning_rate": 1.641916934253779e-05, + "loss": 1.2447, + "step": 10059 + }, + { + "epoch": 2.99633276866658, + "grad_norm": 0.34840264916419983, + "learning_rate": 1.6418429708089076e-05, + "loss": 1.2442, + "step": 10060 + }, + { + "epoch": 2.9966306148662483, + "grad_norm": 0.2386302649974823, + "learning_rate": 1.6417690013924046e-05, + "loss": 1.242, + "step": 10061 + }, + { + "epoch": 2.996928461065917, + "grad_norm": 0.26766037940979004, + "learning_rate": 1.641695026004958e-05, + "loss": 1.2283, + "step": 10062 + }, + { + "epoch": 2.997226307265586, + "grad_norm": 0.23906347155570984, + "learning_rate": 1.6416210446472555e-05, + "loss": 1.2335, + "step": 10063 + }, + { + "epoch": 2.997524153465254, + "grad_norm": 0.26001253724098206, + "learning_rate": 1.6415470573199867e-05, + "loss": 1.2677, + "step": 10064 + }, + { + "epoch": 2.997821999664923, + "grad_norm": 0.24827906489372253, + "learning_rate": 1.641473064023839e-05, + "loss": 1.2384, + "step": 10065 + }, + { + "epoch": 2.9981198458645917, + "grad_norm": 0.24536694586277008, + "learning_rate": 1.6413990647595016e-05, + "loss": 1.2615, + "step": 10066 + }, + { + "epoch": 2.9984176920642605, + "grad_norm": 0.2591598927974701, + "learning_rate": 1.6413250595276623e-05, + "loss": 1.2352, + "step": 10067 + }, + { + "epoch": 2.9987155382639292, + "grad_norm": 0.25052574276924133, + "learning_rate": 1.6412510483290098e-05, + "loss": 1.2383, + "step": 10068 + }, + { + "epoch": 2.9990133844635976, + "grad_norm": 0.24174103140830994, + "learning_rate": 1.6411770311642326e-05, + "loss": 1.2475, + "step": 10069 + }, + { + "epoch": 2.9993112306632663, + "grad_norm": 0.23268310725688934, + "learning_rate": 1.6411030080340195e-05, + "loss": 1.2196, + "step": 10070 + }, + { + "epoch": 2.999609076862935, + "grad_norm": 0.2571922540664673, + "learning_rate": 1.6410289789390598e-05, + "loss": 1.2401, + "step": 10071 + }, + { + "epoch": 2.9999069230626034, + "grad_norm": 0.24435901641845703, + "learning_rate": 1.6409549438800407e-05, + "loss": 1.238, + "step": 10072 + }, + { + "epoch": 3.000204769262272, + "grad_norm": 0.38965585827827454, + "learning_rate": 1.6408809028576526e-05, + "loss": 1.2529, + "step": 10073 + }, + { + "epoch": 3.000502615461941, + "grad_norm": 0.32282426953315735, + "learning_rate": 1.6408068558725835e-05, + "loss": 1.233, + "step": 10074 + }, + { + "epoch": 3.0008004616616093, + "grad_norm": 0.28637224435806274, + "learning_rate": 1.6407328029255225e-05, + "loss": 1.2493, + "step": 10075 + }, + { + "epoch": 3.001098307861278, + "grad_norm": 0.24057908356189728, + "learning_rate": 1.640658744017159e-05, + "loss": 1.2342, + "step": 10076 + }, + { + "epoch": 3.001396154060947, + "grad_norm": 0.43580493330955505, + "learning_rate": 1.6405846791481813e-05, + "loss": 1.2304, + "step": 10077 + }, + { + "epoch": 3.0016940002606156, + "grad_norm": 0.2577560544013977, + "learning_rate": 1.640510608319279e-05, + "loss": 1.225, + "step": 10078 + }, + { + "epoch": 3.001991846460284, + "grad_norm": 0.2544635534286499, + "learning_rate": 1.6404365315311412e-05, + "loss": 1.2572, + "step": 10079 + }, + { + "epoch": 3.0022896926599527, + "grad_norm": 0.24499472975730896, + "learning_rate": 1.640362448784457e-05, + "loss": 1.2353, + "step": 10080 + }, + { + "epoch": 3.0025875388596215, + "grad_norm": 0.231914684176445, + "learning_rate": 1.6402883600799153e-05, + "loss": 1.2213, + "step": 10081 + }, + { + "epoch": 3.00288538505929, + "grad_norm": 0.2565773129463196, + "learning_rate": 1.640214265418206e-05, + "loss": 1.2523, + "step": 10082 + }, + { + "epoch": 3.0031832312589586, + "grad_norm": 0.2541150450706482, + "learning_rate": 1.6401401648000182e-05, + "loss": 1.2475, + "step": 10083 + }, + { + "epoch": 3.0034810774586274, + "grad_norm": 0.2428370863199234, + "learning_rate": 1.640066058226042e-05, + "loss": 1.2408, + "step": 10084 + }, + { + "epoch": 3.003778923658296, + "grad_norm": 0.22767463326454163, + "learning_rate": 1.6399919456969654e-05, + "loss": 1.2381, + "step": 10085 + }, + { + "epoch": 3.0040767698579645, + "grad_norm": 0.24175630509853363, + "learning_rate": 1.6399178272134793e-05, + "loss": 1.2309, + "step": 10086 + }, + { + "epoch": 3.0043746160576332, + "grad_norm": 0.233686164021492, + "learning_rate": 1.6398437027762725e-05, + "loss": 1.2292, + "step": 10087 + }, + { + "epoch": 3.004672462257302, + "grad_norm": 0.236064612865448, + "learning_rate": 1.639769572386035e-05, + "loss": 1.232, + "step": 10088 + }, + { + "epoch": 3.0049703084569703, + "grad_norm": 0.23852574825286865, + "learning_rate": 1.6396954360434566e-05, + "loss": 1.2288, + "step": 10089 + }, + { + "epoch": 3.005268154656639, + "grad_norm": 0.23065021634101868, + "learning_rate": 1.6396212937492265e-05, + "loss": 1.2365, + "step": 10090 + }, + { + "epoch": 3.005566000856308, + "grad_norm": 0.23177464306354523, + "learning_rate": 1.639547145504035e-05, + "loss": 1.2563, + "step": 10091 + }, + { + "epoch": 3.0058638470559766, + "grad_norm": 0.24095739424228668, + "learning_rate": 1.6394729913085722e-05, + "loss": 1.227, + "step": 10092 + }, + { + "epoch": 3.006161693255645, + "grad_norm": 0.2505260705947876, + "learning_rate": 1.6393988311635273e-05, + "loss": 1.2523, + "step": 10093 + }, + { + "epoch": 3.0064595394553137, + "grad_norm": 0.2472001165151596, + "learning_rate": 1.6393246650695907e-05, + "loss": 1.2516, + "step": 10094 + }, + { + "epoch": 3.0067573856549825, + "grad_norm": 0.23386529088020325, + "learning_rate": 1.6392504930274528e-05, + "loss": 1.2361, + "step": 10095 + }, + { + "epoch": 3.007055231854651, + "grad_norm": 0.23205740749835968, + "learning_rate": 1.639176315037803e-05, + "loss": 1.244, + "step": 10096 + }, + { + "epoch": 3.0073530780543196, + "grad_norm": 0.24119286239147186, + "learning_rate": 1.6391021311013314e-05, + "loss": 1.2468, + "step": 10097 + }, + { + "epoch": 3.0076509242539884, + "grad_norm": 0.24198414385318756, + "learning_rate": 1.639027941218729e-05, + "loss": 1.2408, + "step": 10098 + }, + { + "epoch": 3.007948770453657, + "grad_norm": 0.2378912717103958, + "learning_rate": 1.6389537453906854e-05, + "loss": 1.2351, + "step": 10099 + }, + { + "epoch": 3.0082466166533255, + "grad_norm": 0.22698108851909637, + "learning_rate": 1.638879543617891e-05, + "loss": 1.229, + "step": 10100 + }, + { + "epoch": 3.0085444628529943, + "grad_norm": 0.23869843780994415, + "learning_rate": 1.6388053359010362e-05, + "loss": 1.2456, + "step": 10101 + }, + { + "epoch": 3.008842309052663, + "grad_norm": 0.22983010113239288, + "learning_rate": 1.638731122240812e-05, + "loss": 1.2401, + "step": 10102 + }, + { + "epoch": 3.0091401552523314, + "grad_norm": 0.23344950377941132, + "learning_rate": 1.6386569026379074e-05, + "loss": 1.2348, + "step": 10103 + }, + { + "epoch": 3.009438001452, + "grad_norm": 0.24137677252292633, + "learning_rate": 1.6385826770930147e-05, + "loss": 1.2311, + "step": 10104 + }, + { + "epoch": 3.009735847651669, + "grad_norm": 0.2299661934375763, + "learning_rate": 1.638508445606823e-05, + "loss": 1.2294, + "step": 10105 + }, + { + "epoch": 3.0100336938513377, + "grad_norm": 0.24330636858940125, + "learning_rate": 1.638434208180024e-05, + "loss": 1.2447, + "step": 10106 + }, + { + "epoch": 3.010331540051006, + "grad_norm": 0.23655346035957336, + "learning_rate": 1.6383599648133078e-05, + "loss": 1.2402, + "step": 10107 + }, + { + "epoch": 3.0106293862506748, + "grad_norm": 0.24257723987102509, + "learning_rate": 1.6382857155073658e-05, + "loss": 1.233, + "step": 10108 + }, + { + "epoch": 3.0109272324503435, + "grad_norm": 0.23627245426177979, + "learning_rate": 1.638211460262888e-05, + "loss": 1.2493, + "step": 10109 + }, + { + "epoch": 3.0112250786500123, + "grad_norm": 0.25192874670028687, + "learning_rate": 1.6381371990805656e-05, + "loss": 1.2466, + "step": 10110 + }, + { + "epoch": 3.0115229248496806, + "grad_norm": 0.24103054404258728, + "learning_rate": 1.6380629319610894e-05, + "loss": 1.2347, + "step": 10111 + }, + { + "epoch": 3.0118207710493494, + "grad_norm": 0.24143441021442413, + "learning_rate": 1.6379886589051506e-05, + "loss": 1.2557, + "step": 10112 + }, + { + "epoch": 3.012118617249018, + "grad_norm": 0.2334558069705963, + "learning_rate": 1.63791437991344e-05, + "loss": 1.2257, + "step": 10113 + }, + { + "epoch": 3.0124164634486865, + "grad_norm": 0.23910120129585266, + "learning_rate": 1.6378400949866493e-05, + "loss": 1.2467, + "step": 10114 + }, + { + "epoch": 3.0127143096483553, + "grad_norm": 0.24178540706634521, + "learning_rate": 1.6377658041254688e-05, + "loss": 1.2319, + "step": 10115 + }, + { + "epoch": 3.013012155848024, + "grad_norm": 0.2362511307001114, + "learning_rate": 1.6376915073305904e-05, + "loss": 1.237, + "step": 10116 + }, + { + "epoch": 3.013310002047693, + "grad_norm": 0.23667585849761963, + "learning_rate": 1.6376172046027043e-05, + "loss": 1.2372, + "step": 10117 + }, + { + "epoch": 3.013607848247361, + "grad_norm": 0.2358350157737732, + "learning_rate": 1.637542895942503e-05, + "loss": 1.2308, + "step": 10118 + }, + { + "epoch": 3.01390569444703, + "grad_norm": 0.2408997267484665, + "learning_rate": 1.6374685813506773e-05, + "loss": 1.2306, + "step": 10119 + }, + { + "epoch": 3.0142035406466987, + "grad_norm": 0.23724953830242157, + "learning_rate": 1.637394260827919e-05, + "loss": 1.2438, + "step": 10120 + }, + { + "epoch": 3.014501386846367, + "grad_norm": 0.2508860230445862, + "learning_rate": 1.6373199343749187e-05, + "loss": 1.2254, + "step": 10121 + }, + { + "epoch": 3.014799233046036, + "grad_norm": 0.24599908292293549, + "learning_rate": 1.6372456019923685e-05, + "loss": 1.2498, + "step": 10122 + }, + { + "epoch": 3.0150970792457046, + "grad_norm": 0.24873362481594086, + "learning_rate": 1.6371712636809603e-05, + "loss": 1.2336, + "step": 10123 + }, + { + "epoch": 3.0153949254453734, + "grad_norm": 0.24091993272304535, + "learning_rate": 1.637096919441385e-05, + "loss": 1.2315, + "step": 10124 + }, + { + "epoch": 3.0156927716450417, + "grad_norm": 0.247350811958313, + "learning_rate": 1.6370225692743348e-05, + "loss": 1.2399, + "step": 10125 + }, + { + "epoch": 3.0159906178447105, + "grad_norm": 0.2587589621543884, + "learning_rate": 1.6369482131805015e-05, + "loss": 1.2417, + "step": 10126 + }, + { + "epoch": 3.0162884640443792, + "grad_norm": 0.24647526443004608, + "learning_rate": 1.6368738511605764e-05, + "loss": 1.2353, + "step": 10127 + }, + { + "epoch": 3.0165863102440476, + "grad_norm": 0.23632420599460602, + "learning_rate": 1.636799483215252e-05, + "loss": 1.2449, + "step": 10128 + }, + { + "epoch": 3.0168841564437163, + "grad_norm": 0.24009020626544952, + "learning_rate": 1.6367251093452197e-05, + "loss": 1.2358, + "step": 10129 + }, + { + "epoch": 3.017182002643385, + "grad_norm": 0.2408895343542099, + "learning_rate": 1.6366507295511715e-05, + "loss": 1.2273, + "step": 10130 + }, + { + "epoch": 3.017479848843054, + "grad_norm": 0.2386654168367386, + "learning_rate": 1.6365763438338e-05, + "loss": 1.2413, + "step": 10131 + }, + { + "epoch": 3.017777695042722, + "grad_norm": 0.23126019537448883, + "learning_rate": 1.6365019521937964e-05, + "loss": 1.2347, + "step": 10132 + }, + { + "epoch": 3.018075541242391, + "grad_norm": 0.23686255514621735, + "learning_rate": 1.6364275546318535e-05, + "loss": 1.2486, + "step": 10133 + }, + { + "epoch": 3.0183733874420597, + "grad_norm": 0.23731625080108643, + "learning_rate": 1.6363531511486634e-05, + "loss": 1.2391, + "step": 10134 + }, + { + "epoch": 3.018671233641728, + "grad_norm": 0.2382894605398178, + "learning_rate": 1.636278741744918e-05, + "loss": 1.245, + "step": 10135 + }, + { + "epoch": 3.018969079841397, + "grad_norm": 0.23662148416042328, + "learning_rate": 1.6362043264213098e-05, + "loss": 1.2647, + "step": 10136 + }, + { + "epoch": 3.0192669260410656, + "grad_norm": 0.23939202725887299, + "learning_rate": 1.636129905178531e-05, + "loss": 1.2407, + "step": 10137 + }, + { + "epoch": 3.0195647722407344, + "grad_norm": 0.2415553778409958, + "learning_rate": 1.6360554780172745e-05, + "loss": 1.2289, + "step": 10138 + }, + { + "epoch": 3.0198626184404027, + "grad_norm": 0.25028204917907715, + "learning_rate": 1.635981044938232e-05, + "loss": 1.2224, + "step": 10139 + }, + { + "epoch": 3.0201604646400715, + "grad_norm": 0.26298055052757263, + "learning_rate": 1.6359066059420968e-05, + "loss": 1.2614, + "step": 10140 + }, + { + "epoch": 3.0204583108397403, + "grad_norm": 0.24997131526470184, + "learning_rate": 1.635832161029561e-05, + "loss": 1.2273, + "step": 10141 + }, + { + "epoch": 3.0207561570394086, + "grad_norm": 0.28070592880249023, + "learning_rate": 1.6357577102013173e-05, + "loss": 1.2347, + "step": 10142 + }, + { + "epoch": 3.0210540032390774, + "grad_norm": 0.2561105191707611, + "learning_rate": 1.6356832534580585e-05, + "loss": 1.2505, + "step": 10143 + }, + { + "epoch": 3.021351849438746, + "grad_norm": 0.2369934618473053, + "learning_rate": 1.6356087908004773e-05, + "loss": 1.2311, + "step": 10144 + }, + { + "epoch": 3.021649695638415, + "grad_norm": 0.2471408247947693, + "learning_rate": 1.6355343222292664e-05, + "loss": 1.2433, + "step": 10145 + }, + { + "epoch": 3.0219475418380832, + "grad_norm": 0.2605624496936798, + "learning_rate": 1.6354598477451187e-05, + "loss": 1.2439, + "step": 10146 + }, + { + "epoch": 3.022245388037752, + "grad_norm": 0.2423917055130005, + "learning_rate": 1.635385367348727e-05, + "loss": 1.2386, + "step": 10147 + }, + { + "epoch": 3.0225432342374208, + "grad_norm": 0.2808777689933777, + "learning_rate": 1.6353108810407845e-05, + "loss": 1.2583, + "step": 10148 + }, + { + "epoch": 3.022841080437089, + "grad_norm": 0.2584322988986969, + "learning_rate": 1.6352363888219838e-05, + "loss": 1.2342, + "step": 10149 + }, + { + "epoch": 3.023138926636758, + "grad_norm": 0.25437480211257935, + "learning_rate": 1.6351618906930188e-05, + "loss": 1.2257, + "step": 10150 + }, + { + "epoch": 3.0234367728364266, + "grad_norm": 0.3657657504081726, + "learning_rate": 1.6350873866545814e-05, + "loss": 1.2374, + "step": 10151 + }, + { + "epoch": 3.0237346190360954, + "grad_norm": 0.2797352075576782, + "learning_rate": 1.6350128767073655e-05, + "loss": 1.2313, + "step": 10152 + }, + { + "epoch": 3.0240324652357637, + "grad_norm": 0.30090221762657166, + "learning_rate": 1.6349383608520646e-05, + "loss": 1.2509, + "step": 10153 + }, + { + "epoch": 3.0243303114354325, + "grad_norm": 0.30294233560562134, + "learning_rate": 1.6348638390893717e-05, + "loss": 1.2392, + "step": 10154 + }, + { + "epoch": 3.0246281576351013, + "grad_norm": 0.2502883970737457, + "learning_rate": 1.6347893114199795e-05, + "loss": 1.2271, + "step": 10155 + }, + { + "epoch": 3.0249260038347696, + "grad_norm": 0.23533587157726288, + "learning_rate": 1.6347147778445823e-05, + "loss": 1.2297, + "step": 10156 + }, + { + "epoch": 3.0252238500344384, + "grad_norm": 0.25665155053138733, + "learning_rate": 1.6346402383638734e-05, + "loss": 1.2412, + "step": 10157 + }, + { + "epoch": 3.025521696234107, + "grad_norm": 0.25076520442962646, + "learning_rate": 1.6345656929785462e-05, + "loss": 1.2345, + "step": 10158 + }, + { + "epoch": 3.025819542433776, + "grad_norm": 0.2445412129163742, + "learning_rate": 1.634491141689294e-05, + "loss": 1.2363, + "step": 10159 + }, + { + "epoch": 3.0261173886334443, + "grad_norm": 0.2960551679134369, + "learning_rate": 1.6344165844968104e-05, + "loss": 1.2396, + "step": 10160 + }, + { + "epoch": 3.026415234833113, + "grad_norm": 0.2761836349964142, + "learning_rate": 1.63434202140179e-05, + "loss": 1.2329, + "step": 10161 + }, + { + "epoch": 3.026713081032782, + "grad_norm": 0.28803741931915283, + "learning_rate": 1.634267452404925e-05, + "loss": 1.2384, + "step": 10162 + }, + { + "epoch": 3.02701092723245, + "grad_norm": 0.2996866703033447, + "learning_rate": 1.6341928775069106e-05, + "loss": 1.2492, + "step": 10163 + }, + { + "epoch": 3.027308773432119, + "grad_norm": 0.2283649444580078, + "learning_rate": 1.6341182967084397e-05, + "loss": 1.243, + "step": 10164 + }, + { + "epoch": 3.0276066196317877, + "grad_norm": 0.24121533334255219, + "learning_rate": 1.6340437100102067e-05, + "loss": 1.2613, + "step": 10165 + }, + { + "epoch": 3.0279044658314564, + "grad_norm": 0.23092056810855865, + "learning_rate": 1.6339691174129053e-05, + "loss": 1.2385, + "step": 10166 + }, + { + "epoch": 3.0282023120311248, + "grad_norm": 0.25815844535827637, + "learning_rate": 1.6338945189172297e-05, + "loss": 1.2383, + "step": 10167 + }, + { + "epoch": 3.0285001582307935, + "grad_norm": 0.2433355301618576, + "learning_rate": 1.6338199145238737e-05, + "loss": 1.2436, + "step": 10168 + }, + { + "epoch": 3.0287980044304623, + "grad_norm": 0.24668675661087036, + "learning_rate": 1.6337453042335315e-05, + "loss": 1.2438, + "step": 10169 + }, + { + "epoch": 3.0290958506301306, + "grad_norm": 0.28404033184051514, + "learning_rate": 1.6336706880468972e-05, + "loss": 1.2417, + "step": 10170 + }, + { + "epoch": 3.0293936968297994, + "grad_norm": 0.2578742206096649, + "learning_rate": 1.6335960659646655e-05, + "loss": 1.2413, + "step": 10171 + }, + { + "epoch": 3.029691543029468, + "grad_norm": 0.23545892536640167, + "learning_rate": 1.63352143798753e-05, + "loss": 1.2427, + "step": 10172 + }, + { + "epoch": 3.029989389229137, + "grad_norm": 0.2703379988670349, + "learning_rate": 1.6334468041161854e-05, + "loss": 1.2585, + "step": 10173 + }, + { + "epoch": 3.0302872354288053, + "grad_norm": 0.24328207969665527, + "learning_rate": 1.633372164351326e-05, + "loss": 1.2272, + "step": 10174 + }, + { + "epoch": 3.030585081628474, + "grad_norm": 0.252121239900589, + "learning_rate": 1.6332975186936464e-05, + "loss": 1.2368, + "step": 10175 + }, + { + "epoch": 3.030882927828143, + "grad_norm": 0.23458684980869293, + "learning_rate": 1.633222867143841e-05, + "loss": 1.2332, + "step": 10176 + }, + { + "epoch": 3.0311807740278116, + "grad_norm": 0.26045963168144226, + "learning_rate": 1.633148209702604e-05, + "loss": 1.2394, + "step": 10177 + }, + { + "epoch": 3.03147862022748, + "grad_norm": 0.2555118203163147, + "learning_rate": 1.6330735463706305e-05, + "loss": 1.2362, + "step": 10178 + }, + { + "epoch": 3.0317764664271487, + "grad_norm": 0.23761750757694244, + "learning_rate": 1.6329988771486148e-05, + "loss": 1.2389, + "step": 10179 + }, + { + "epoch": 3.0320743126268175, + "grad_norm": 0.24389159679412842, + "learning_rate": 1.632924202037252e-05, + "loss": 1.2347, + "step": 10180 + }, + { + "epoch": 3.032372158826486, + "grad_norm": 0.24950288236141205, + "learning_rate": 1.6328495210372363e-05, + "loss": 1.2641, + "step": 10181 + }, + { + "epoch": 3.0326700050261546, + "grad_norm": 0.24158208072185516, + "learning_rate": 1.6327748341492633e-05, + "loss": 1.247, + "step": 10182 + }, + { + "epoch": 3.0329678512258234, + "grad_norm": 0.47347182035446167, + "learning_rate": 1.632700141374027e-05, + "loss": 1.2471, + "step": 10183 + }, + { + "epoch": 3.033265697425492, + "grad_norm": 0.41325369477272034, + "learning_rate": 1.6326254427122236e-05, + "loss": 1.2486, + "step": 10184 + }, + { + "epoch": 3.0335635436251605, + "grad_norm": 0.32898035645484924, + "learning_rate": 1.6325507381645464e-05, + "loss": 1.2249, + "step": 10185 + }, + { + "epoch": 3.0338613898248292, + "grad_norm": 0.27398014068603516, + "learning_rate": 1.6324760277316917e-05, + "loss": 1.2395, + "step": 10186 + }, + { + "epoch": 3.034159236024498, + "grad_norm": 0.3908424377441406, + "learning_rate": 1.632401311414354e-05, + "loss": 1.2287, + "step": 10187 + }, + { + "epoch": 3.0344570822241663, + "grad_norm": 0.24559953808784485, + "learning_rate": 1.632326589213229e-05, + "loss": 1.2402, + "step": 10188 + }, + { + "epoch": 3.034754928423835, + "grad_norm": 0.25984713435173035, + "learning_rate": 1.632251861129011e-05, + "loss": 1.2438, + "step": 10189 + }, + { + "epoch": 3.035052774623504, + "grad_norm": 0.25673434138298035, + "learning_rate": 1.632177127162396e-05, + "loss": 1.2219, + "step": 10190 + }, + { + "epoch": 3.0353506208231726, + "grad_norm": 0.23939383029937744, + "learning_rate": 1.6321023873140798e-05, + "loss": 1.2314, + "step": 10191 + }, + { + "epoch": 3.035648467022841, + "grad_norm": 0.24382399022579193, + "learning_rate": 1.6320276415847564e-05, + "loss": 1.2501, + "step": 10192 + }, + { + "epoch": 3.0359463132225097, + "grad_norm": 0.23208534717559814, + "learning_rate": 1.631952889975122e-05, + "loss": 1.2307, + "step": 10193 + }, + { + "epoch": 3.0362441594221785, + "grad_norm": 0.24375663697719574, + "learning_rate": 1.6318781324858723e-05, + "loss": 1.235, + "step": 10194 + }, + { + "epoch": 3.036542005621847, + "grad_norm": 0.2524307370185852, + "learning_rate": 1.6318033691177024e-05, + "loss": 1.2519, + "step": 10195 + }, + { + "epoch": 3.0368398518215156, + "grad_norm": 0.25700730085372925, + "learning_rate": 1.631728599871308e-05, + "loss": 1.2216, + "step": 10196 + }, + { + "epoch": 3.0371376980211844, + "grad_norm": 0.23987676203250885, + "learning_rate": 1.631653824747385e-05, + "loss": 1.2316, + "step": 10197 + }, + { + "epoch": 3.037435544220853, + "grad_norm": 0.23080593347549438, + "learning_rate": 1.6315790437466286e-05, + "loss": 1.2343, + "step": 10198 + }, + { + "epoch": 3.0377333904205215, + "grad_norm": 0.24482198059558868, + "learning_rate": 1.631504256869735e-05, + "loss": 1.2266, + "step": 10199 + }, + { + "epoch": 3.0380312366201903, + "grad_norm": 0.2541637122631073, + "learning_rate": 1.6314294641174e-05, + "loss": 1.2489, + "step": 10200 + }, + { + "epoch": 3.038329082819859, + "grad_norm": 0.23120158910751343, + "learning_rate": 1.631354665490319e-05, + "loss": 1.243, + "step": 10201 + }, + { + "epoch": 3.0386269290195274, + "grad_norm": 0.2334720492362976, + "learning_rate": 1.6312798609891883e-05, + "loss": 1.2342, + "step": 10202 + }, + { + "epoch": 3.038924775219196, + "grad_norm": 0.24195466935634613, + "learning_rate": 1.631205050614704e-05, + "loss": 1.2298, + "step": 10203 + }, + { + "epoch": 3.039222621418865, + "grad_norm": 0.24499407410621643, + "learning_rate": 1.6311302343675615e-05, + "loss": 1.2444, + "step": 10204 + }, + { + "epoch": 3.0395204676185337, + "grad_norm": 0.2399866133928299, + "learning_rate": 1.631055412248458e-05, + "loss": 1.2526, + "step": 10205 + }, + { + "epoch": 3.039818313818202, + "grad_norm": 0.2301747053861618, + "learning_rate": 1.6309805842580882e-05, + "loss": 1.2284, + "step": 10206 + }, + { + "epoch": 3.0401161600178708, + "grad_norm": 0.2331252545118332, + "learning_rate": 1.6309057503971497e-05, + "loss": 1.211, + "step": 10207 + }, + { + "epoch": 3.0404140062175395, + "grad_norm": 0.23599116504192352, + "learning_rate": 1.6308309106663375e-05, + "loss": 1.2338, + "step": 10208 + }, + { + "epoch": 3.040711852417208, + "grad_norm": 0.23467372357845306, + "learning_rate": 1.6307560650663487e-05, + "loss": 1.228, + "step": 10209 + }, + { + "epoch": 3.0410096986168766, + "grad_norm": 0.2402782142162323, + "learning_rate": 1.6306812135978794e-05, + "loss": 1.2308, + "step": 10210 + }, + { + "epoch": 3.0413075448165454, + "grad_norm": 0.24587568640708923, + "learning_rate": 1.6306063562616263e-05, + "loss": 1.2382, + "step": 10211 + }, + { + "epoch": 3.041605391016214, + "grad_norm": 0.22987627983093262, + "learning_rate": 1.6305314930582857e-05, + "loss": 1.2438, + "step": 10212 + }, + { + "epoch": 3.0419032372158825, + "grad_norm": 0.23999477922916412, + "learning_rate": 1.6304566239885535e-05, + "loss": 1.2467, + "step": 10213 + }, + { + "epoch": 3.0422010834155513, + "grad_norm": 0.23999232053756714, + "learning_rate": 1.6303817490531272e-05, + "loss": 1.251, + "step": 10214 + }, + { + "epoch": 3.04249892961522, + "grad_norm": 0.2424769252538681, + "learning_rate": 1.630306868252703e-05, + "loss": 1.2478, + "step": 10215 + }, + { + "epoch": 3.0427967758148884, + "grad_norm": 0.22676314413547516, + "learning_rate": 1.6302319815879773e-05, + "loss": 1.2393, + "step": 10216 + }, + { + "epoch": 3.043094622014557, + "grad_norm": 0.23452128469944, + "learning_rate": 1.6301570890596473e-05, + "loss": 1.2498, + "step": 10217 + }, + { + "epoch": 3.043392468214226, + "grad_norm": 0.23384414613246918, + "learning_rate": 1.63008219066841e-05, + "loss": 1.2365, + "step": 10218 + }, + { + "epoch": 3.0436903144138947, + "grad_norm": 0.24246850609779358, + "learning_rate": 1.6300072864149613e-05, + "loss": 1.2443, + "step": 10219 + }, + { + "epoch": 3.043988160613563, + "grad_norm": 0.2528890371322632, + "learning_rate": 1.629932376299999e-05, + "loss": 1.2387, + "step": 10220 + }, + { + "epoch": 3.044286006813232, + "grad_norm": 0.24717077612876892, + "learning_rate": 1.62985746032422e-05, + "loss": 1.253, + "step": 10221 + }, + { + "epoch": 3.0445838530129006, + "grad_norm": 0.25595220923423767, + "learning_rate": 1.6297825384883206e-05, + "loss": 1.2348, + "step": 10222 + }, + { + "epoch": 3.044881699212569, + "grad_norm": 0.22884708642959595, + "learning_rate": 1.6297076107929983e-05, + "loss": 1.2235, + "step": 10223 + }, + { + "epoch": 3.0451795454122377, + "grad_norm": 0.2743631899356842, + "learning_rate": 1.6296326772389507e-05, + "loss": 1.2154, + "step": 10224 + }, + { + "epoch": 3.0454773916119064, + "grad_norm": 0.22574536502361298, + "learning_rate": 1.629557737826874e-05, + "loss": 1.2348, + "step": 10225 + }, + { + "epoch": 3.045775237811575, + "grad_norm": 0.23804114758968353, + "learning_rate": 1.6294827925574663e-05, + "loss": 1.2326, + "step": 10226 + }, + { + "epoch": 3.0460730840112435, + "grad_norm": 0.24106575548648834, + "learning_rate": 1.6294078414314244e-05, + "loss": 1.2226, + "step": 10227 + }, + { + "epoch": 3.0463709302109123, + "grad_norm": 0.24153001606464386, + "learning_rate": 1.6293328844494456e-05, + "loss": 1.2385, + "step": 10228 + }, + { + "epoch": 3.046668776410581, + "grad_norm": 0.2359992265701294, + "learning_rate": 1.6292579216122276e-05, + "loss": 1.2453, + "step": 10229 + }, + { + "epoch": 3.0469666226102494, + "grad_norm": 0.2369110882282257, + "learning_rate": 1.6291829529204676e-05, + "loss": 1.2548, + "step": 10230 + }, + { + "epoch": 3.047264468809918, + "grad_norm": 0.27357247471809387, + "learning_rate": 1.6291079783748632e-05, + "loss": 1.2468, + "step": 10231 + }, + { + "epoch": 3.047562315009587, + "grad_norm": 0.3440316617488861, + "learning_rate": 1.629032997976112e-05, + "loss": 1.2342, + "step": 10232 + }, + { + "epoch": 3.0478601612092557, + "grad_norm": 0.2712406814098358, + "learning_rate": 1.6289580117249115e-05, + "loss": 1.2188, + "step": 10233 + }, + { + "epoch": 3.048158007408924, + "grad_norm": 0.27531933784484863, + "learning_rate": 1.6288830196219595e-05, + "loss": 1.2502, + "step": 10234 + }, + { + "epoch": 3.048455853608593, + "grad_norm": 0.34455469250679016, + "learning_rate": 1.6288080216679535e-05, + "loss": 1.2447, + "step": 10235 + }, + { + "epoch": 3.0487536998082616, + "grad_norm": 0.2437841147184372, + "learning_rate": 1.6287330178635916e-05, + "loss": 1.2342, + "step": 10236 + }, + { + "epoch": 3.04905154600793, + "grad_norm": 0.2584114074707031, + "learning_rate": 1.628658008209571e-05, + "loss": 1.2418, + "step": 10237 + }, + { + "epoch": 3.0493493922075987, + "grad_norm": 0.23838254809379578, + "learning_rate": 1.6285829927065907e-05, + "loss": 1.2346, + "step": 10238 + }, + { + "epoch": 3.0496472384072675, + "grad_norm": 0.2907816469669342, + "learning_rate": 1.6285079713553474e-05, + "loss": 1.2458, + "step": 10239 + }, + { + "epoch": 3.0499450846069363, + "grad_norm": 0.2896682620048523, + "learning_rate": 1.62843294415654e-05, + "loss": 1.2405, + "step": 10240 + }, + { + "epoch": 3.0502429308066046, + "grad_norm": 0.24636253714561462, + "learning_rate": 1.628357911110866e-05, + "loss": 1.251, + "step": 10241 + }, + { + "epoch": 3.0505407770062734, + "grad_norm": 0.26349523663520813, + "learning_rate": 1.6282828722190234e-05, + "loss": 1.2449, + "step": 10242 + }, + { + "epoch": 3.050838623205942, + "grad_norm": 0.23846521973609924, + "learning_rate": 1.628207827481711e-05, + "loss": 1.2286, + "step": 10243 + }, + { + "epoch": 3.051136469405611, + "grad_norm": 0.2381698191165924, + "learning_rate": 1.6281327768996266e-05, + "loss": 1.2255, + "step": 10244 + }, + { + "epoch": 3.0514343156052792, + "grad_norm": 0.24267499148845673, + "learning_rate": 1.6280577204734682e-05, + "loss": 1.2217, + "step": 10245 + }, + { + "epoch": 3.051732161804948, + "grad_norm": 0.2777910530567169, + "learning_rate": 1.6279826582039348e-05, + "loss": 1.2409, + "step": 10246 + }, + { + "epoch": 3.0520300080046168, + "grad_norm": 0.32253384590148926, + "learning_rate": 1.627907590091724e-05, + "loss": 1.2316, + "step": 10247 + }, + { + "epoch": 3.052327854204285, + "grad_norm": 0.2844820022583008, + "learning_rate": 1.627832516137535e-05, + "loss": 1.2502, + "step": 10248 + }, + { + "epoch": 3.052625700403954, + "grad_norm": 0.5343673825263977, + "learning_rate": 1.627757436342066e-05, + "loss": 1.2533, + "step": 10249 + }, + { + "epoch": 3.0529235466036226, + "grad_norm": 0.30444401502609253, + "learning_rate": 1.6276823507060152e-05, + "loss": 1.231, + "step": 10250 + }, + { + "epoch": 3.0532213928032914, + "grad_norm": 0.2842743992805481, + "learning_rate": 1.627607259230081e-05, + "loss": 1.238, + "step": 10251 + }, + { + "epoch": 3.0535192390029597, + "grad_norm": 0.2443382441997528, + "learning_rate": 1.627532161914963e-05, + "loss": 1.2513, + "step": 10252 + }, + { + "epoch": 3.0538170852026285, + "grad_norm": 0.25539764761924744, + "learning_rate": 1.6274570587613592e-05, + "loss": 1.2262, + "step": 10253 + }, + { + "epoch": 3.0541149314022973, + "grad_norm": 0.2958544194698334, + "learning_rate": 1.6273819497699682e-05, + "loss": 1.2302, + "step": 10254 + }, + { + "epoch": 3.0544127776019656, + "grad_norm": 0.24095271527767181, + "learning_rate": 1.6273068349414898e-05, + "loss": 1.2275, + "step": 10255 + }, + { + "epoch": 3.0547106238016344, + "grad_norm": 0.24353492259979248, + "learning_rate": 1.6272317142766217e-05, + "loss": 1.2339, + "step": 10256 + }, + { + "epoch": 3.055008470001303, + "grad_norm": 0.25018948316574097, + "learning_rate": 1.6271565877760632e-05, + "loss": 1.2237, + "step": 10257 + }, + { + "epoch": 3.055306316200972, + "grad_norm": 0.247747004032135, + "learning_rate": 1.6270814554405133e-05, + "loss": 1.2347, + "step": 10258 + }, + { + "epoch": 3.0556041624006403, + "grad_norm": 0.25322288274765015, + "learning_rate": 1.627006317270672e-05, + "loss": 1.2277, + "step": 10259 + }, + { + "epoch": 3.055902008600309, + "grad_norm": 0.27417194843292236, + "learning_rate": 1.6269311732672363e-05, + "loss": 1.2407, + "step": 10260 + }, + { + "epoch": 3.056199854799978, + "grad_norm": 0.23231066763401031, + "learning_rate": 1.626856023430907e-05, + "loss": 1.2221, + "step": 10261 + }, + { + "epoch": 3.056497700999646, + "grad_norm": 0.24387730658054352, + "learning_rate": 1.626780867762383e-05, + "loss": 1.2457, + "step": 10262 + }, + { + "epoch": 3.056795547199315, + "grad_norm": 0.26461726427078247, + "learning_rate": 1.6267057062623627e-05, + "loss": 1.2462, + "step": 10263 + }, + { + "epoch": 3.0570933933989837, + "grad_norm": 0.22684389352798462, + "learning_rate": 1.6266305389315463e-05, + "loss": 1.2299, + "step": 10264 + }, + { + "epoch": 3.0573912395986524, + "grad_norm": 0.2952120900154114, + "learning_rate": 1.626555365770633e-05, + "loss": 1.2412, + "step": 10265 + }, + { + "epoch": 3.0576890857983208, + "grad_norm": 0.25628674030303955, + "learning_rate": 1.6264801867803218e-05, + "loss": 1.2338, + "step": 10266 + }, + { + "epoch": 3.0579869319979895, + "grad_norm": 0.2316766083240509, + "learning_rate": 1.6264050019613125e-05, + "loss": 1.2487, + "step": 10267 + }, + { + "epoch": 3.0582847781976583, + "grad_norm": 0.25355660915374756, + "learning_rate": 1.6263298113143044e-05, + "loss": 1.2368, + "step": 10268 + }, + { + "epoch": 3.0585826243973266, + "grad_norm": 0.26539427042007446, + "learning_rate": 1.6262546148399977e-05, + "loss": 1.2378, + "step": 10269 + }, + { + "epoch": 3.0588804705969954, + "grad_norm": 0.23979556560516357, + "learning_rate": 1.626179412539091e-05, + "loss": 1.2353, + "step": 10270 + }, + { + "epoch": 3.059178316796664, + "grad_norm": 0.24969260394573212, + "learning_rate": 1.6261042044122845e-05, + "loss": 1.2402, + "step": 10271 + }, + { + "epoch": 3.059476162996333, + "grad_norm": 0.2942097783088684, + "learning_rate": 1.626028990460278e-05, + "loss": 1.2314, + "step": 10272 + }, + { + "epoch": 3.0597740091960013, + "grad_norm": 0.24863791465759277, + "learning_rate": 1.6259537706837712e-05, + "loss": 1.2532, + "step": 10273 + }, + { + "epoch": 3.06007185539567, + "grad_norm": 0.230848029255867, + "learning_rate": 1.6258785450834638e-05, + "loss": 1.215, + "step": 10274 + }, + { + "epoch": 3.060369701595339, + "grad_norm": 0.25193142890930176, + "learning_rate": 1.6258033136600556e-05, + "loss": 1.2307, + "step": 10275 + }, + { + "epoch": 3.060667547795007, + "grad_norm": 0.23902875185012817, + "learning_rate": 1.6257280764142472e-05, + "loss": 1.2429, + "step": 10276 + }, + { + "epoch": 3.060965393994676, + "grad_norm": 0.26887065172195435, + "learning_rate": 1.625652833346738e-05, + "loss": 1.2439, + "step": 10277 + }, + { + "epoch": 3.0612632401943447, + "grad_norm": 0.24252690374851227, + "learning_rate": 1.6255775844582284e-05, + "loss": 1.2458, + "step": 10278 + }, + { + "epoch": 3.0615610863940135, + "grad_norm": 0.2371646612882614, + "learning_rate": 1.6255023297494182e-05, + "loss": 1.2358, + "step": 10279 + }, + { + "epoch": 3.061858932593682, + "grad_norm": 0.2452089786529541, + "learning_rate": 1.6254270692210076e-05, + "loss": 1.2297, + "step": 10280 + }, + { + "epoch": 3.0621567787933506, + "grad_norm": 0.24095968902111053, + "learning_rate": 1.6253518028736967e-05, + "loss": 1.2482, + "step": 10281 + }, + { + "epoch": 3.0624546249930193, + "grad_norm": 0.31817057728767395, + "learning_rate": 1.625276530708186e-05, + "loss": 1.2335, + "step": 10282 + }, + { + "epoch": 3.0627524711926877, + "grad_norm": 0.30245932936668396, + "learning_rate": 1.625201252725176e-05, + "loss": 1.2211, + "step": 10283 + }, + { + "epoch": 3.0630503173923564, + "grad_norm": 0.2332882583141327, + "learning_rate": 1.625125968925367e-05, + "loss": 1.2401, + "step": 10284 + }, + { + "epoch": 3.063348163592025, + "grad_norm": 0.31895169615745544, + "learning_rate": 1.625050679309459e-05, + "loss": 1.2324, + "step": 10285 + }, + { + "epoch": 3.063646009791694, + "grad_norm": 0.3017706871032715, + "learning_rate": 1.624975383878153e-05, + "loss": 1.2357, + "step": 10286 + }, + { + "epoch": 3.0639438559913623, + "grad_norm": 0.2633976638317108, + "learning_rate": 1.624900082632149e-05, + "loss": 1.2568, + "step": 10287 + }, + { + "epoch": 3.064241702191031, + "grad_norm": 0.2533052861690521, + "learning_rate": 1.624824775572148e-05, + "loss": 1.2254, + "step": 10288 + }, + { + "epoch": 3.0645395483907, + "grad_norm": 0.25451239943504333, + "learning_rate": 1.624749462698851e-05, + "loss": 1.2414, + "step": 10289 + }, + { + "epoch": 3.064837394590368, + "grad_norm": 0.23318979144096375, + "learning_rate": 1.6246741440129575e-05, + "loss": 1.2284, + "step": 10290 + }, + { + "epoch": 3.065135240790037, + "grad_norm": 0.2820476293563843, + "learning_rate": 1.6245988195151696e-05, + "loss": 1.2481, + "step": 10291 + }, + { + "epoch": 3.0654330869897057, + "grad_norm": 0.27426910400390625, + "learning_rate": 1.624523489206187e-05, + "loss": 1.2402, + "step": 10292 + }, + { + "epoch": 3.0657309331893745, + "grad_norm": 0.22696912288665771, + "learning_rate": 1.6244481530867117e-05, + "loss": 1.2286, + "step": 10293 + }, + { + "epoch": 3.066028779389043, + "grad_norm": 0.2459140568971634, + "learning_rate": 1.6243728111574437e-05, + "loss": 1.2324, + "step": 10294 + }, + { + "epoch": 3.0663266255887116, + "grad_norm": 0.24593108892440796, + "learning_rate": 1.6242974634190846e-05, + "loss": 1.2344, + "step": 10295 + }, + { + "epoch": 3.0666244717883804, + "grad_norm": 0.24092045426368713, + "learning_rate": 1.6242221098723346e-05, + "loss": 1.2329, + "step": 10296 + }, + { + "epoch": 3.0669223179880487, + "grad_norm": 0.24796484410762787, + "learning_rate": 1.6241467505178957e-05, + "loss": 1.2413, + "step": 10297 + }, + { + "epoch": 3.0672201641877175, + "grad_norm": 0.2608395516872406, + "learning_rate": 1.6240713853564683e-05, + "loss": 1.218, + "step": 10298 + }, + { + "epoch": 3.0675180103873863, + "grad_norm": 0.2577863335609436, + "learning_rate": 1.623996014388754e-05, + "loss": 1.2273, + "step": 10299 + }, + { + "epoch": 3.067815856587055, + "grad_norm": 0.2353697419166565, + "learning_rate": 1.6239206376154543e-05, + "loss": 1.2224, + "step": 10300 + }, + { + "epoch": 3.0681137027867234, + "grad_norm": 0.2622057795524597, + "learning_rate": 1.6238452550372698e-05, + "loss": 1.2271, + "step": 10301 + }, + { + "epoch": 3.068411548986392, + "grad_norm": 0.2638491094112396, + "learning_rate": 1.6237698666549023e-05, + "loss": 1.2537, + "step": 10302 + }, + { + "epoch": 3.068709395186061, + "grad_norm": 0.24457314610481262, + "learning_rate": 1.623694472469053e-05, + "loss": 1.2341, + "step": 10303 + }, + { + "epoch": 3.0690072413857292, + "grad_norm": 0.276833176612854, + "learning_rate": 1.6236190724804238e-05, + "loss": 1.2308, + "step": 10304 + }, + { + "epoch": 3.069305087585398, + "grad_norm": 0.3370343744754791, + "learning_rate": 1.6235436666897153e-05, + "loss": 1.2445, + "step": 10305 + }, + { + "epoch": 3.0696029337850668, + "grad_norm": 0.39063307642936707, + "learning_rate": 1.62346825509763e-05, + "loss": 1.2447, + "step": 10306 + }, + { + "epoch": 3.0699007799847355, + "grad_norm": 0.3648836612701416, + "learning_rate": 1.623392837704869e-05, + "loss": 1.2614, + "step": 10307 + }, + { + "epoch": 3.070198626184404, + "grad_norm": 0.24658054113388062, + "learning_rate": 1.6233174145121346e-05, + "loss": 1.2472, + "step": 10308 + }, + { + "epoch": 3.0704964723840726, + "grad_norm": 0.6062577962875366, + "learning_rate": 1.6232419855201275e-05, + "loss": 1.2417, + "step": 10309 + }, + { + "epoch": 3.0707943185837414, + "grad_norm": 0.32666686177253723, + "learning_rate": 1.6231665507295503e-05, + "loss": 1.2496, + "step": 10310 + }, + { + "epoch": 3.07109216478341, + "grad_norm": 0.26822617650032043, + "learning_rate": 1.6230911101411048e-05, + "loss": 1.2204, + "step": 10311 + }, + { + "epoch": 3.0713900109830785, + "grad_norm": 0.2530498802661896, + "learning_rate": 1.6230156637554925e-05, + "loss": 1.2329, + "step": 10312 + }, + { + "epoch": 3.0716878571827473, + "grad_norm": 0.23866677284240723, + "learning_rate": 1.6229402115734157e-05, + "loss": 1.2285, + "step": 10313 + }, + { + "epoch": 3.071985703382416, + "grad_norm": 0.2549847364425659, + "learning_rate": 1.6228647535955758e-05, + "loss": 1.2365, + "step": 10314 + }, + { + "epoch": 3.0722835495820844, + "grad_norm": 0.2633138597011566, + "learning_rate": 1.6227892898226754e-05, + "loss": 1.2539, + "step": 10315 + }, + { + "epoch": 3.072581395781753, + "grad_norm": 0.25155434012413025, + "learning_rate": 1.6227138202554167e-05, + "loss": 1.2258, + "step": 10316 + }, + { + "epoch": 3.072879241981422, + "grad_norm": 0.23175309598445892, + "learning_rate": 1.6226383448945014e-05, + "loss": 1.2388, + "step": 10317 + }, + { + "epoch": 3.0731770881810907, + "grad_norm": 0.23968221247196198, + "learning_rate": 1.6225628637406322e-05, + "loss": 1.2301, + "step": 10318 + }, + { + "epoch": 3.073474934380759, + "grad_norm": 0.24314630031585693, + "learning_rate": 1.622487376794511e-05, + "loss": 1.2632, + "step": 10319 + }, + { + "epoch": 3.073772780580428, + "grad_norm": 0.2491050362586975, + "learning_rate": 1.62241188405684e-05, + "loss": 1.2327, + "step": 10320 + }, + { + "epoch": 3.0740706267800966, + "grad_norm": 0.2475767433643341, + "learning_rate": 1.622336385528322e-05, + "loss": 1.2517, + "step": 10321 + }, + { + "epoch": 3.074368472979765, + "grad_norm": 0.24345384538173676, + "learning_rate": 1.6222608812096594e-05, + "loss": 1.2258, + "step": 10322 + }, + { + "epoch": 3.0746663191794337, + "grad_norm": 0.25330597162246704, + "learning_rate": 1.6221853711015546e-05, + "loss": 1.2298, + "step": 10323 + }, + { + "epoch": 3.0749641653791024, + "grad_norm": 0.24096539616584778, + "learning_rate": 1.62210985520471e-05, + "loss": 1.2502, + "step": 10324 + }, + { + "epoch": 3.075262011578771, + "grad_norm": 0.2532306909561157, + "learning_rate": 1.6220343335198278e-05, + "loss": 1.2268, + "step": 10325 + }, + { + "epoch": 3.0755598577784395, + "grad_norm": 0.25411057472229004, + "learning_rate": 1.6219588060476116e-05, + "loss": 1.2301, + "step": 10326 + }, + { + "epoch": 3.0758577039781083, + "grad_norm": 0.24418389797210693, + "learning_rate": 1.6218832727887635e-05, + "loss": 1.2322, + "step": 10327 + }, + { + "epoch": 3.076155550177777, + "grad_norm": 0.2418355941772461, + "learning_rate": 1.621807733743986e-05, + "loss": 1.2527, + "step": 10328 + }, + { + "epoch": 3.0764533963774454, + "grad_norm": 0.24275583028793335, + "learning_rate": 1.6217321889139828e-05, + "loss": 1.2129, + "step": 10329 + }, + { + "epoch": 3.076751242577114, + "grad_norm": 0.23158468306064606, + "learning_rate": 1.621656638299456e-05, + "loss": 1.2479, + "step": 10330 + }, + { + "epoch": 3.077049088776783, + "grad_norm": 0.2408546507358551, + "learning_rate": 1.6215810819011087e-05, + "loss": 1.2292, + "step": 10331 + }, + { + "epoch": 3.0773469349764517, + "grad_norm": 0.2383338361978531, + "learning_rate": 1.621505519719644e-05, + "loss": 1.2286, + "step": 10332 + }, + { + "epoch": 3.07764478117612, + "grad_norm": 0.23233291506767273, + "learning_rate": 1.6214299517557648e-05, + "loss": 1.2297, + "step": 10333 + }, + { + "epoch": 3.077942627375789, + "grad_norm": 0.24617774784564972, + "learning_rate": 1.6213543780101743e-05, + "loss": 1.2401, + "step": 10334 + }, + { + "epoch": 3.0782404735754576, + "grad_norm": 0.23213894665241241, + "learning_rate": 1.621278798483575e-05, + "loss": 1.2369, + "step": 10335 + }, + { + "epoch": 3.078538319775126, + "grad_norm": 0.23981331288814545, + "learning_rate": 1.621203213176671e-05, + "loss": 1.2322, + "step": 10336 + }, + { + "epoch": 3.0788361659747947, + "grad_norm": 0.23576320707798004, + "learning_rate": 1.6211276220901655e-05, + "loss": 1.2377, + "step": 10337 + }, + { + "epoch": 3.0791340121744635, + "grad_norm": 0.24305744469165802, + "learning_rate": 1.621052025224761e-05, + "loss": 1.2433, + "step": 10338 + }, + { + "epoch": 3.0794318583741322, + "grad_norm": 0.23215025663375854, + "learning_rate": 1.6209764225811615e-05, + "loss": 1.2458, + "step": 10339 + }, + { + "epoch": 3.0797297045738006, + "grad_norm": 0.23149891197681427, + "learning_rate": 1.62090081416007e-05, + "loss": 1.2241, + "step": 10340 + }, + { + "epoch": 3.0800275507734693, + "grad_norm": 0.23072408139705658, + "learning_rate": 1.6208251999621902e-05, + "loss": 1.2215, + "step": 10341 + }, + { + "epoch": 3.080325396973138, + "grad_norm": 0.22406615316867828, + "learning_rate": 1.6207495799882255e-05, + "loss": 1.2484, + "step": 10342 + }, + { + "epoch": 3.0806232431728064, + "grad_norm": 0.253277987241745, + "learning_rate": 1.6206739542388795e-05, + "loss": 1.2602, + "step": 10343 + }, + { + "epoch": 3.080921089372475, + "grad_norm": 0.2351500242948532, + "learning_rate": 1.6205983227148562e-05, + "loss": 1.248, + "step": 10344 + }, + { + "epoch": 3.081218935572144, + "grad_norm": 0.2387404590845108, + "learning_rate": 1.6205226854168583e-05, + "loss": 1.2587, + "step": 10345 + }, + { + "epoch": 3.0815167817718128, + "grad_norm": 0.24336200952529907, + "learning_rate": 1.6204470423455902e-05, + "loss": 1.2345, + "step": 10346 + }, + { + "epoch": 3.081814627971481, + "grad_norm": 0.23612387478351593, + "learning_rate": 1.620371393501756e-05, + "loss": 1.2376, + "step": 10347 + }, + { + "epoch": 3.08211247417115, + "grad_norm": 0.24053697288036346, + "learning_rate": 1.6202957388860588e-05, + "loss": 1.2375, + "step": 10348 + }, + { + "epoch": 3.0824103203708186, + "grad_norm": 0.22427183389663696, + "learning_rate": 1.6202200784992025e-05, + "loss": 1.2409, + "step": 10349 + }, + { + "epoch": 3.082708166570487, + "grad_norm": 0.22489367425441742, + "learning_rate": 1.620144412341892e-05, + "loss": 1.2421, + "step": 10350 + }, + { + "epoch": 3.0830060127701557, + "grad_norm": 0.23169724643230438, + "learning_rate": 1.62006874041483e-05, + "loss": 1.2574, + "step": 10351 + }, + { + "epoch": 3.0833038589698245, + "grad_norm": 0.22526168823242188, + "learning_rate": 1.6199930627187215e-05, + "loss": 1.2298, + "step": 10352 + }, + { + "epoch": 3.0836017051694933, + "grad_norm": 0.2435971051454544, + "learning_rate": 1.61991737925427e-05, + "loss": 1.2262, + "step": 10353 + }, + { + "epoch": 3.0838995513691616, + "grad_norm": 0.23552916944026947, + "learning_rate": 1.61984169002218e-05, + "loss": 1.2548, + "step": 10354 + }, + { + "epoch": 3.0841973975688304, + "grad_norm": 0.23748822510242462, + "learning_rate": 1.6197659950231556e-05, + "loss": 1.239, + "step": 10355 + }, + { + "epoch": 3.084495243768499, + "grad_norm": 0.24067635834217072, + "learning_rate": 1.6196902942579012e-05, + "loss": 1.2358, + "step": 10356 + }, + { + "epoch": 3.0847930899681675, + "grad_norm": 0.23165231943130493, + "learning_rate": 1.619614587727121e-05, + "loss": 1.2315, + "step": 10357 + }, + { + "epoch": 3.0850909361678363, + "grad_norm": 0.23566889762878418, + "learning_rate": 1.619538875431519e-05, + "loss": 1.2312, + "step": 10358 + }, + { + "epoch": 3.085388782367505, + "grad_norm": 0.26848578453063965, + "learning_rate": 1.6194631573718e-05, + "loss": 1.2412, + "step": 10359 + }, + { + "epoch": 3.085686628567174, + "grad_norm": 0.24758121371269226, + "learning_rate": 1.6193874335486687e-05, + "loss": 1.2355, + "step": 10360 + }, + { + "epoch": 3.085984474766842, + "grad_norm": 0.2501175105571747, + "learning_rate": 1.6193117039628293e-05, + "loss": 1.2436, + "step": 10361 + }, + { + "epoch": 3.086282320966511, + "grad_norm": 0.2576286494731903, + "learning_rate": 1.6192359686149863e-05, + "loss": 1.229, + "step": 10362 + }, + { + "epoch": 3.0865801671661797, + "grad_norm": 0.24462206661701202, + "learning_rate": 1.6191602275058444e-05, + "loss": 1.2467, + "step": 10363 + }, + { + "epoch": 3.0868780133658484, + "grad_norm": 0.3026152551174164, + "learning_rate": 1.619084480636109e-05, + "loss": 1.2202, + "step": 10364 + }, + { + "epoch": 3.0871758595655168, + "grad_norm": 0.2340417504310608, + "learning_rate": 1.6190087280064834e-05, + "loss": 1.2348, + "step": 10365 + }, + { + "epoch": 3.0874737057651855, + "grad_norm": 0.3344682455062866, + "learning_rate": 1.6189329696176735e-05, + "loss": 1.2428, + "step": 10366 + }, + { + "epoch": 3.0877715519648543, + "grad_norm": 0.28237810730934143, + "learning_rate": 1.6188572054703837e-05, + "loss": 1.2158, + "step": 10367 + }, + { + "epoch": 3.0880693981645226, + "grad_norm": 0.27612945437431335, + "learning_rate": 1.6187814355653193e-05, + "loss": 1.2359, + "step": 10368 + }, + { + "epoch": 3.0883672443641914, + "grad_norm": 0.2473704218864441, + "learning_rate": 1.6187056599031844e-05, + "loss": 1.2402, + "step": 10369 + }, + { + "epoch": 3.08866509056386, + "grad_norm": 0.3062663972377777, + "learning_rate": 1.6186298784846854e-05, + "loss": 1.2367, + "step": 10370 + }, + { + "epoch": 3.0889629367635285, + "grad_norm": 0.2383635938167572, + "learning_rate": 1.618554091310526e-05, + "loss": 1.245, + "step": 10371 + }, + { + "epoch": 3.0892607829631973, + "grad_norm": 0.2796862721443176, + "learning_rate": 1.618478298381412e-05, + "loss": 1.2474, + "step": 10372 + }, + { + "epoch": 3.089558629162866, + "grad_norm": 0.26445525884628296, + "learning_rate": 1.6184024996980485e-05, + "loss": 1.237, + "step": 10373 + }, + { + "epoch": 3.089856475362535, + "grad_norm": 0.27199339866638184, + "learning_rate": 1.6183266952611405e-05, + "loss": 1.2605, + "step": 10374 + }, + { + "epoch": 3.090154321562203, + "grad_norm": 0.30469194054603577, + "learning_rate": 1.6182508850713937e-05, + "loss": 1.2297, + "step": 10375 + }, + { + "epoch": 3.090452167761872, + "grad_norm": 0.24834252893924713, + "learning_rate": 1.618175069129513e-05, + "loss": 1.2371, + "step": 10376 + }, + { + "epoch": 3.0907500139615407, + "grad_norm": 0.2820184528827667, + "learning_rate": 1.618099247436204e-05, + "loss": 1.2239, + "step": 10377 + }, + { + "epoch": 3.0910478601612095, + "grad_norm": 0.2574171721935272, + "learning_rate": 1.618023419992172e-05, + "loss": 1.2436, + "step": 10378 + }, + { + "epoch": 3.091345706360878, + "grad_norm": 0.2358943074941635, + "learning_rate": 1.6179475867981225e-05, + "loss": 1.2441, + "step": 10379 + }, + { + "epoch": 3.0916435525605466, + "grad_norm": 0.3953135013580322, + "learning_rate": 1.6178717478547613e-05, + "loss": 1.2473, + "step": 10380 + }, + { + "epoch": 3.0919413987602153, + "grad_norm": 0.24281415343284607, + "learning_rate": 1.6177959031627937e-05, + "loss": 1.249, + "step": 10381 + }, + { + "epoch": 3.0922392449598837, + "grad_norm": 0.29101502895355225, + "learning_rate": 1.6177200527229256e-05, + "loss": 1.2495, + "step": 10382 + }, + { + "epoch": 3.0925370911595524, + "grad_norm": 0.2608293294906616, + "learning_rate": 1.6176441965358624e-05, + "loss": 1.2364, + "step": 10383 + }, + { + "epoch": 3.092834937359221, + "grad_norm": 0.273247092962265, + "learning_rate": 1.61756833460231e-05, + "loss": 1.2452, + "step": 10384 + }, + { + "epoch": 3.09313278355889, + "grad_norm": 0.3250589966773987, + "learning_rate": 1.6174924669229746e-05, + "loss": 1.238, + "step": 10385 + }, + { + "epoch": 3.0934306297585583, + "grad_norm": 0.24804839491844177, + "learning_rate": 1.6174165934985612e-05, + "loss": 1.2258, + "step": 10386 + }, + { + "epoch": 3.093728475958227, + "grad_norm": 0.2775920331478119, + "learning_rate": 1.6173407143297767e-05, + "loss": 1.2304, + "step": 10387 + }, + { + "epoch": 3.094026322157896, + "grad_norm": 0.2390071302652359, + "learning_rate": 1.6172648294173265e-05, + "loss": 1.2396, + "step": 10388 + }, + { + "epoch": 3.094324168357564, + "grad_norm": 0.2723798453807831, + "learning_rate": 1.6171889387619163e-05, + "loss": 1.2346, + "step": 10389 + }, + { + "epoch": 3.094622014557233, + "grad_norm": 0.2721167802810669, + "learning_rate": 1.617113042364253e-05, + "loss": 1.2442, + "step": 10390 + }, + { + "epoch": 3.0949198607569017, + "grad_norm": 0.2453288584947586, + "learning_rate": 1.6170371402250418e-05, + "loss": 1.235, + "step": 10391 + }, + { + "epoch": 3.0952177069565705, + "grad_norm": 0.2587590217590332, + "learning_rate": 1.61696123234499e-05, + "loss": 1.2282, + "step": 10392 + }, + { + "epoch": 3.095515553156239, + "grad_norm": 0.25568780303001404, + "learning_rate": 1.616885318724803e-05, + "loss": 1.2463, + "step": 10393 + }, + { + "epoch": 3.0958133993559076, + "grad_norm": 0.26561030745506287, + "learning_rate": 1.6168093993651873e-05, + "loss": 1.2367, + "step": 10394 + }, + { + "epoch": 3.0961112455555764, + "grad_norm": 0.2805534899234772, + "learning_rate": 1.6167334742668493e-05, + "loss": 1.2284, + "step": 10395 + }, + { + "epoch": 3.0964090917552447, + "grad_norm": 0.24491840600967407, + "learning_rate": 1.6166575434304953e-05, + "loss": 1.2498, + "step": 10396 + }, + { + "epoch": 3.0967069379549135, + "grad_norm": 0.2690185308456421, + "learning_rate": 1.616581606856832e-05, + "loss": 1.2286, + "step": 10397 + }, + { + "epoch": 3.0970047841545822, + "grad_norm": 0.23889923095703125, + "learning_rate": 1.6165056645465657e-05, + "loss": 1.2396, + "step": 10398 + }, + { + "epoch": 3.097302630354251, + "grad_norm": 0.23897404968738556, + "learning_rate": 1.616429716500403e-05, + "loss": 1.2348, + "step": 10399 + }, + { + "epoch": 3.0976004765539193, + "grad_norm": 0.2513929009437561, + "learning_rate": 1.6163537627190506e-05, + "loss": 1.229, + "step": 10400 + }, + { + "epoch": 3.097898322753588, + "grad_norm": 0.2689685523509979, + "learning_rate": 1.6162778032032147e-05, + "loss": 1.2385, + "step": 10401 + }, + { + "epoch": 3.098196168953257, + "grad_norm": 0.2547231912612915, + "learning_rate": 1.6162018379536027e-05, + "loss": 1.2441, + "step": 10402 + }, + { + "epoch": 3.098494015152925, + "grad_norm": 0.26805955171585083, + "learning_rate": 1.6161258669709208e-05, + "loss": 1.2324, + "step": 10403 + }, + { + "epoch": 3.098791861352594, + "grad_norm": 0.22966517508029938, + "learning_rate": 1.6160498902558762e-05, + "loss": 1.227, + "step": 10404 + }, + { + "epoch": 3.0990897075522628, + "grad_norm": 0.2782258987426758, + "learning_rate": 1.615973907809176e-05, + "loss": 1.2354, + "step": 10405 + }, + { + "epoch": 3.0993875537519315, + "grad_norm": 0.23317642509937286, + "learning_rate": 1.6158979196315266e-05, + "loss": 1.2362, + "step": 10406 + }, + { + "epoch": 3.0996853999516, + "grad_norm": 0.2576300799846649, + "learning_rate": 1.6158219257236346e-05, + "loss": 1.2264, + "step": 10407 + }, + { + "epoch": 3.0999832461512686, + "grad_norm": 0.23924222588539124, + "learning_rate": 1.6157459260862082e-05, + "loss": 1.2451, + "step": 10408 + }, + { + "epoch": 3.1002810923509374, + "grad_norm": 0.26835858821868896, + "learning_rate": 1.615669920719954e-05, + "loss": 1.2473, + "step": 10409 + }, + { + "epoch": 3.1005789385506057, + "grad_norm": 0.2458593100309372, + "learning_rate": 1.615593909625579e-05, + "loss": 1.2437, + "step": 10410 + }, + { + "epoch": 3.1008767847502745, + "grad_norm": 0.22977596521377563, + "learning_rate": 1.6155178928037904e-05, + "loss": 1.2337, + "step": 10411 + }, + { + "epoch": 3.1011746309499433, + "grad_norm": 0.24524131417274475, + "learning_rate": 1.6154418702552953e-05, + "loss": 1.225, + "step": 10412 + }, + { + "epoch": 3.101472477149612, + "grad_norm": 0.25615182518959045, + "learning_rate": 1.6153658419808014e-05, + "loss": 1.2275, + "step": 10413 + }, + { + "epoch": 3.1017703233492804, + "grad_norm": 0.23454606533050537, + "learning_rate": 1.615289807981016e-05, + "loss": 1.237, + "step": 10414 + }, + { + "epoch": 3.102068169548949, + "grad_norm": 0.28615736961364746, + "learning_rate": 1.615213768256646e-05, + "loss": 1.2139, + "step": 10415 + }, + { + "epoch": 3.102366015748618, + "grad_norm": 0.2971478998661041, + "learning_rate": 1.6151377228083994e-05, + "loss": 1.2346, + "step": 10416 + }, + { + "epoch": 3.1026638619482863, + "grad_norm": 0.23978576064109802, + "learning_rate": 1.6150616716369832e-05, + "loss": 1.2468, + "step": 10417 + }, + { + "epoch": 3.102961708147955, + "grad_norm": 0.2721368372440338, + "learning_rate": 1.614985614743106e-05, + "loss": 1.2357, + "step": 10418 + }, + { + "epoch": 3.103259554347624, + "grad_norm": 0.23517760634422302, + "learning_rate": 1.6149095521274746e-05, + "loss": 1.2485, + "step": 10419 + }, + { + "epoch": 3.1035574005472926, + "grad_norm": 0.31406596302986145, + "learning_rate": 1.6148334837907965e-05, + "loss": 1.2359, + "step": 10420 + }, + { + "epoch": 3.103855246746961, + "grad_norm": 0.31216081976890564, + "learning_rate": 1.61475740973378e-05, + "loss": 1.2379, + "step": 10421 + }, + { + "epoch": 3.1041530929466297, + "grad_norm": 0.2626805901527405, + "learning_rate": 1.614681329957133e-05, + "loss": 1.2457, + "step": 10422 + }, + { + "epoch": 3.1044509391462984, + "grad_norm": 0.4179708659648895, + "learning_rate": 1.6146052444615624e-05, + "loss": 1.2328, + "step": 10423 + }, + { + "epoch": 3.1047487853459668, + "grad_norm": 0.3220103085041046, + "learning_rate": 1.614529153247777e-05, + "loss": 1.2401, + "step": 10424 + }, + { + "epoch": 3.1050466315456355, + "grad_norm": 0.25905534625053406, + "learning_rate": 1.614453056316484e-05, + "loss": 1.2439, + "step": 10425 + }, + { + "epoch": 3.1053444777453043, + "grad_norm": 0.3609839081764221, + "learning_rate": 1.6143769536683926e-05, + "loss": 1.2387, + "step": 10426 + }, + { + "epoch": 3.105642323944973, + "grad_norm": 0.26535311341285706, + "learning_rate": 1.6143008453042094e-05, + "loss": 1.2566, + "step": 10427 + }, + { + "epoch": 3.1059401701446414, + "grad_norm": 0.26972994208335876, + "learning_rate": 1.6142247312246432e-05, + "loss": 1.2366, + "step": 10428 + }, + { + "epoch": 3.10623801634431, + "grad_norm": 0.2537599802017212, + "learning_rate": 1.6141486114304026e-05, + "loss": 1.23, + "step": 10429 + }, + { + "epoch": 3.106535862543979, + "grad_norm": 0.2508467137813568, + "learning_rate": 1.6140724859221946e-05, + "loss": 1.2542, + "step": 10430 + }, + { + "epoch": 3.1068337087436477, + "grad_norm": 0.2471814900636673, + "learning_rate": 1.6139963547007288e-05, + "loss": 1.2352, + "step": 10431 + }, + { + "epoch": 3.107131554943316, + "grad_norm": 0.25865426659584045, + "learning_rate": 1.6139202177667128e-05, + "loss": 1.2445, + "step": 10432 + }, + { + "epoch": 3.107429401142985, + "grad_norm": 0.25801217555999756, + "learning_rate": 1.6138440751208554e-05, + "loss": 1.236, + "step": 10433 + }, + { + "epoch": 3.1077272473426536, + "grad_norm": 0.23547405004501343, + "learning_rate": 1.6137679267638642e-05, + "loss": 1.2303, + "step": 10434 + }, + { + "epoch": 3.108025093542322, + "grad_norm": 0.23247171938419342, + "learning_rate": 1.613691772696448e-05, + "loss": 1.2418, + "step": 10435 + }, + { + "epoch": 3.1083229397419907, + "grad_norm": 0.24636529386043549, + "learning_rate": 1.613615612919316e-05, + "loss": 1.2471, + "step": 10436 + }, + { + "epoch": 3.1086207859416595, + "grad_norm": 0.25281500816345215, + "learning_rate": 1.6135394474331764e-05, + "loss": 1.2402, + "step": 10437 + }, + { + "epoch": 3.108918632141328, + "grad_norm": 0.24236537516117096, + "learning_rate": 1.6134632762387373e-05, + "loss": 1.24, + "step": 10438 + }, + { + "epoch": 3.1092164783409966, + "grad_norm": 0.23605842888355255, + "learning_rate": 1.6133870993367077e-05, + "loss": 1.2196, + "step": 10439 + }, + { + "epoch": 3.1095143245406653, + "grad_norm": 0.23824907839298248, + "learning_rate": 1.6133109167277973e-05, + "loss": 1.2444, + "step": 10440 + }, + { + "epoch": 3.109812170740334, + "grad_norm": 0.24137970805168152, + "learning_rate": 1.6132347284127133e-05, + "loss": 1.2342, + "step": 10441 + }, + { + "epoch": 3.1101100169400024, + "grad_norm": 0.25432923436164856, + "learning_rate": 1.6131585343921654e-05, + "loss": 1.2226, + "step": 10442 + }, + { + "epoch": 3.110407863139671, + "grad_norm": 0.23455332219600677, + "learning_rate": 1.6130823346668628e-05, + "loss": 1.2453, + "step": 10443 + }, + { + "epoch": 3.11070570933934, + "grad_norm": 0.3594229519367218, + "learning_rate": 1.6130061292375133e-05, + "loss": 1.2408, + "step": 10444 + }, + { + "epoch": 3.1110035555390088, + "grad_norm": 0.3436545729637146, + "learning_rate": 1.6129299181048273e-05, + "loss": 1.2425, + "step": 10445 + }, + { + "epoch": 3.111301401738677, + "grad_norm": 0.2786564230918884, + "learning_rate": 1.612853701269513e-05, + "loss": 1.2225, + "step": 10446 + }, + { + "epoch": 3.111599247938346, + "grad_norm": 0.45427578687667847, + "learning_rate": 1.6127774787322797e-05, + "loss": 1.2307, + "step": 10447 + }, + { + "epoch": 3.1118970941380146, + "grad_norm": 0.2831638753414154, + "learning_rate": 1.6127012504938366e-05, + "loss": 1.2417, + "step": 10448 + }, + { + "epoch": 3.112194940337683, + "grad_norm": 0.3059787154197693, + "learning_rate": 1.6126250165548932e-05, + "loss": 1.2278, + "step": 10449 + }, + { + "epoch": 3.1124927865373517, + "grad_norm": 0.28319501876831055, + "learning_rate": 1.612548776916158e-05, + "loss": 1.244, + "step": 10450 + }, + { + "epoch": 3.1127906327370205, + "grad_norm": 0.2845174968242645, + "learning_rate": 1.612472531578341e-05, + "loss": 1.2483, + "step": 10451 + }, + { + "epoch": 3.1130884789366893, + "grad_norm": 0.30333369970321655, + "learning_rate": 1.6123962805421515e-05, + "loss": 1.2265, + "step": 10452 + }, + { + "epoch": 3.1133863251363576, + "grad_norm": 0.24139992892742157, + "learning_rate": 1.612320023808299e-05, + "loss": 1.241, + "step": 10453 + }, + { + "epoch": 3.1136841713360264, + "grad_norm": 0.36457669734954834, + "learning_rate": 1.6122437613774925e-05, + "loss": 1.2475, + "step": 10454 + }, + { + "epoch": 3.113982017535695, + "grad_norm": 0.26496487855911255, + "learning_rate": 1.612167493250442e-05, + "loss": 1.2339, + "step": 10455 + }, + { + "epoch": 3.1142798637353635, + "grad_norm": 0.28735244274139404, + "learning_rate": 1.6120912194278566e-05, + "loss": 1.2275, + "step": 10456 + }, + { + "epoch": 3.1145777099350322, + "grad_norm": 0.2546387016773224, + "learning_rate": 1.6120149399104465e-05, + "loss": 1.2378, + "step": 10457 + }, + { + "epoch": 3.114875556134701, + "grad_norm": 0.3327142596244812, + "learning_rate": 1.6119386546989214e-05, + "loss": 1.2443, + "step": 10458 + }, + { + "epoch": 3.11517340233437, + "grad_norm": 0.30269357562065125, + "learning_rate": 1.6118623637939904e-05, + "loss": 1.2258, + "step": 10459 + }, + { + "epoch": 3.115471248534038, + "grad_norm": 0.29396873712539673, + "learning_rate": 1.6117860671963642e-05, + "loss": 1.2415, + "step": 10460 + }, + { + "epoch": 3.115769094733707, + "grad_norm": 0.3337930142879486, + "learning_rate": 1.6117097649067517e-05, + "loss": 1.2348, + "step": 10461 + }, + { + "epoch": 3.1160669409333757, + "grad_norm": 0.23203662037849426, + "learning_rate": 1.6116334569258633e-05, + "loss": 1.2292, + "step": 10462 + }, + { + "epoch": 3.116364787133044, + "grad_norm": 0.2526288628578186, + "learning_rate": 1.6115571432544093e-05, + "loss": 1.2363, + "step": 10463 + }, + { + "epoch": 3.1166626333327128, + "grad_norm": 0.26672297716140747, + "learning_rate": 1.611480823893099e-05, + "loss": 1.2188, + "step": 10464 + }, + { + "epoch": 3.1169604795323815, + "grad_norm": 0.2600806653499603, + "learning_rate": 1.611404498842643e-05, + "loss": 1.2358, + "step": 10465 + }, + { + "epoch": 3.1172583257320503, + "grad_norm": 0.2883581221103668, + "learning_rate": 1.611328168103751e-05, + "loss": 1.2414, + "step": 10466 + }, + { + "epoch": 3.1175561719317186, + "grad_norm": 0.2521454393863678, + "learning_rate": 1.611251831677134e-05, + "loss": 1.2409, + "step": 10467 + }, + { + "epoch": 3.1178540181313874, + "grad_norm": 0.28055715560913086, + "learning_rate": 1.611175489563501e-05, + "loss": 1.2257, + "step": 10468 + }, + { + "epoch": 3.118151864331056, + "grad_norm": 0.22997577488422394, + "learning_rate": 1.6110991417635633e-05, + "loss": 1.2432, + "step": 10469 + }, + { + "epoch": 3.1184497105307245, + "grad_norm": 0.3932369649410248, + "learning_rate": 1.6110227882780307e-05, + "loss": 1.2493, + "step": 10470 + }, + { + "epoch": 3.1187475567303933, + "grad_norm": 0.30058619379997253, + "learning_rate": 1.6109464291076137e-05, + "loss": 1.2401, + "step": 10471 + }, + { + "epoch": 3.119045402930062, + "grad_norm": 0.28633445501327515, + "learning_rate": 1.610870064253023e-05, + "loss": 1.2464, + "step": 10472 + }, + { + "epoch": 3.119343249129731, + "grad_norm": 0.2538171708583832, + "learning_rate": 1.6107936937149684e-05, + "loss": 1.2413, + "step": 10473 + }, + { + "epoch": 3.119641095329399, + "grad_norm": 0.37864917516708374, + "learning_rate": 1.6107173174941614e-05, + "loss": 1.2419, + "step": 10474 + }, + { + "epoch": 3.119938941529068, + "grad_norm": 0.2942824363708496, + "learning_rate": 1.6106409355913117e-05, + "loss": 1.2283, + "step": 10475 + }, + { + "epoch": 3.1202367877287367, + "grad_norm": 0.29793575406074524, + "learning_rate": 1.6105645480071305e-05, + "loss": 1.2412, + "step": 10476 + }, + { + "epoch": 3.120534633928405, + "grad_norm": 0.2526813745498657, + "learning_rate": 1.6104881547423286e-05, + "loss": 1.2465, + "step": 10477 + }, + { + "epoch": 3.120832480128074, + "grad_norm": 0.3530627489089966, + "learning_rate": 1.610411755797616e-05, + "loss": 1.2215, + "step": 10478 + }, + { + "epoch": 3.1211303263277426, + "grad_norm": 0.26530155539512634, + "learning_rate": 1.6103353511737046e-05, + "loss": 1.2516, + "step": 10479 + }, + { + "epoch": 3.1214281725274113, + "grad_norm": 0.2525983154773712, + "learning_rate": 1.6102589408713042e-05, + "loss": 1.2424, + "step": 10480 + }, + { + "epoch": 3.1217260187270797, + "grad_norm": 0.2890133261680603, + "learning_rate": 1.6101825248911264e-05, + "loss": 1.2417, + "step": 10481 + }, + { + "epoch": 3.1220238649267484, + "grad_norm": 0.2439301609992981, + "learning_rate": 1.6101061032338817e-05, + "loss": 1.2248, + "step": 10482 + }, + { + "epoch": 3.122321711126417, + "grad_norm": 0.372954398393631, + "learning_rate": 1.6100296759002817e-05, + "loss": 1.232, + "step": 10483 + }, + { + "epoch": 3.1226195573260855, + "grad_norm": 0.2565677762031555, + "learning_rate": 1.6099532428910367e-05, + "loss": 1.2246, + "step": 10484 + }, + { + "epoch": 3.1229174035257543, + "grad_norm": 0.29265743494033813, + "learning_rate": 1.6098768042068587e-05, + "loss": 1.2416, + "step": 10485 + }, + { + "epoch": 3.123215249725423, + "grad_norm": 0.25104451179504395, + "learning_rate": 1.6098003598484582e-05, + "loss": 1.2654, + "step": 10486 + }, + { + "epoch": 3.123513095925092, + "grad_norm": 0.43743887543678284, + "learning_rate": 1.6097239098165466e-05, + "loss": 1.2483, + "step": 10487 + }, + { + "epoch": 3.12381094212476, + "grad_norm": 0.24702873826026917, + "learning_rate": 1.609647454111835e-05, + "loss": 1.2509, + "step": 10488 + }, + { + "epoch": 3.124108788324429, + "grad_norm": 0.2761531174182892, + "learning_rate": 1.6095709927350357e-05, + "loss": 1.2474, + "step": 10489 + }, + { + "epoch": 3.1244066345240977, + "grad_norm": 0.2544757127761841, + "learning_rate": 1.609494525686859e-05, + "loss": 1.2211, + "step": 10490 + }, + { + "epoch": 3.124704480723766, + "grad_norm": 0.2518884539604187, + "learning_rate": 1.6094180529680166e-05, + "loss": 1.2381, + "step": 10491 + }, + { + "epoch": 3.125002326923435, + "grad_norm": 0.27676719427108765, + "learning_rate": 1.60934157457922e-05, + "loss": 1.2485, + "step": 10492 + }, + { + "epoch": 3.1253001731231036, + "grad_norm": 0.24191485345363617, + "learning_rate": 1.609265090521181e-05, + "loss": 1.2344, + "step": 10493 + }, + { + "epoch": 3.1255980193227724, + "grad_norm": 0.23960517346858978, + "learning_rate": 1.6091886007946114e-05, + "loss": 1.2341, + "step": 10494 + }, + { + "epoch": 3.1258958655224407, + "grad_norm": 0.25541895627975464, + "learning_rate": 1.609112105400222e-05, + "loss": 1.2313, + "step": 10495 + }, + { + "epoch": 3.1261937117221095, + "grad_norm": 0.23986327648162842, + "learning_rate": 1.609035604338725e-05, + "loss": 1.247, + "step": 10496 + }, + { + "epoch": 3.1264915579217782, + "grad_norm": 0.2870851755142212, + "learning_rate": 1.6089590976108326e-05, + "loss": 1.2471, + "step": 10497 + }, + { + "epoch": 3.126789404121447, + "grad_norm": 0.26950156688690186, + "learning_rate": 1.6088825852172556e-05, + "loss": 1.2406, + "step": 10498 + }, + { + "epoch": 3.1270872503211153, + "grad_norm": 0.2400730550289154, + "learning_rate": 1.6088060671587067e-05, + "loss": 1.2593, + "step": 10499 + }, + { + "epoch": 3.127385096520784, + "grad_norm": 0.23325183987617493, + "learning_rate": 1.6087295434358977e-05, + "loss": 1.2426, + "step": 10500 + }, + { + "epoch": 3.127385096520784, + "eval_loss": 1.3344223499298096, + "eval_runtime": 21.0599, + "eval_samples_per_second": 82.337, + "eval_steps_per_second": 5.176, + "step": 10500 + }, + { + "epoch": 3.127682942720453, + "grad_norm": 0.24747851490974426, + "learning_rate": 1.60865301404954e-05, + "loss": 1.2233, + "step": 10501 + }, + { + "epoch": 3.127980788920121, + "grad_norm": 0.24357332289218903, + "learning_rate": 1.6085764790003465e-05, + "loss": 1.2412, + "step": 10502 + }, + { + "epoch": 3.12827863511979, + "grad_norm": 0.24626581370830536, + "learning_rate": 1.6084999382890287e-05, + "loss": 1.2391, + "step": 10503 + }, + { + "epoch": 3.1285764813194588, + "grad_norm": 0.23055605590343475, + "learning_rate": 1.6084233919162988e-05, + "loss": 1.2247, + "step": 10504 + }, + { + "epoch": 3.128874327519127, + "grad_norm": 0.2400212287902832, + "learning_rate": 1.6083468398828687e-05, + "loss": 1.2394, + "step": 10505 + }, + { + "epoch": 3.129172173718796, + "grad_norm": 0.24657058715820312, + "learning_rate": 1.608270282189451e-05, + "loss": 1.2261, + "step": 10506 + }, + { + "epoch": 3.1294700199184646, + "grad_norm": 0.23674990236759186, + "learning_rate": 1.6081937188367582e-05, + "loss": 1.2436, + "step": 10507 + }, + { + "epoch": 3.1297678661181334, + "grad_norm": 0.24882350862026215, + "learning_rate": 1.6081171498255024e-05, + "loss": 1.2608, + "step": 10508 + }, + { + "epoch": 3.1300657123178017, + "grad_norm": 0.2526969909667969, + "learning_rate": 1.608040575156396e-05, + "loss": 1.2457, + "step": 10509 + }, + { + "epoch": 3.1303635585174705, + "grad_norm": 0.2874167859554291, + "learning_rate": 1.607963994830151e-05, + "loss": 1.2234, + "step": 10510 + }, + { + "epoch": 3.1306614047171393, + "grad_norm": 0.23955944180488586, + "learning_rate": 1.607887408847481e-05, + "loss": 1.2525, + "step": 10511 + }, + { + "epoch": 3.130959250916808, + "grad_norm": 0.4108958840370178, + "learning_rate": 1.607810817209097e-05, + "loss": 1.2191, + "step": 10512 + }, + { + "epoch": 3.1312570971164764, + "grad_norm": 0.3459058403968811, + "learning_rate": 1.6077342199157125e-05, + "loss": 1.244, + "step": 10513 + }, + { + "epoch": 3.131554943316145, + "grad_norm": 0.3005227744579315, + "learning_rate": 1.6076576169680404e-05, + "loss": 1.2411, + "step": 10514 + }, + { + "epoch": 3.131852789515814, + "grad_norm": 0.40811359882354736, + "learning_rate": 1.6075810083667933e-05, + "loss": 1.2507, + "step": 10515 + }, + { + "epoch": 3.1321506357154822, + "grad_norm": 0.24512659013271332, + "learning_rate": 1.607504394112683e-05, + "loss": 1.2354, + "step": 10516 + }, + { + "epoch": 3.132448481915151, + "grad_norm": 0.26784753799438477, + "learning_rate": 1.6074277742064237e-05, + "loss": 1.2302, + "step": 10517 + }, + { + "epoch": 3.13274632811482, + "grad_norm": 0.25369390845298767, + "learning_rate": 1.6073511486487276e-05, + "loss": 1.246, + "step": 10518 + }, + { + "epoch": 3.1330441743144886, + "grad_norm": 0.27158766984939575, + "learning_rate": 1.6072745174403073e-05, + "loss": 1.2461, + "step": 10519 + }, + { + "epoch": 3.133342020514157, + "grad_norm": 0.2626797556877136, + "learning_rate": 1.6071978805818765e-05, + "loss": 1.2269, + "step": 10520 + }, + { + "epoch": 3.1336398667138257, + "grad_norm": 0.26225483417510986, + "learning_rate": 1.6071212380741475e-05, + "loss": 1.2436, + "step": 10521 + }, + { + "epoch": 3.1339377129134944, + "grad_norm": 0.30332276225090027, + "learning_rate": 1.607044589917834e-05, + "loss": 1.2383, + "step": 10522 + }, + { + "epoch": 3.1342355591131628, + "grad_norm": 0.25240257382392883, + "learning_rate": 1.6069679361136484e-05, + "loss": 1.2286, + "step": 10523 + }, + { + "epoch": 3.1345334053128315, + "grad_norm": 0.2635928690433502, + "learning_rate": 1.6068912766623043e-05, + "loss": 1.245, + "step": 10524 + }, + { + "epoch": 3.1348312515125003, + "grad_norm": 0.25835949182510376, + "learning_rate": 1.6068146115645156e-05, + "loss": 1.2267, + "step": 10525 + }, + { + "epoch": 3.135129097712169, + "grad_norm": 0.23610354959964752, + "learning_rate": 1.6067379408209945e-05, + "loss": 1.2332, + "step": 10526 + }, + { + "epoch": 3.1354269439118374, + "grad_norm": 0.23394012451171875, + "learning_rate": 1.6066612644324545e-05, + "loss": 1.2418, + "step": 10527 + }, + { + "epoch": 3.135724790111506, + "grad_norm": 0.23350240290164948, + "learning_rate": 1.6065845823996095e-05, + "loss": 1.2325, + "step": 10528 + }, + { + "epoch": 3.136022636311175, + "grad_norm": 0.2493225485086441, + "learning_rate": 1.6065078947231727e-05, + "loss": 1.2402, + "step": 10529 + }, + { + "epoch": 3.1363204825108433, + "grad_norm": 0.24471496045589447, + "learning_rate": 1.6064312014038576e-05, + "loss": 1.2323, + "step": 10530 + }, + { + "epoch": 3.136618328710512, + "grad_norm": 0.23390144109725952, + "learning_rate": 1.606354502442377e-05, + "loss": 1.2367, + "step": 10531 + }, + { + "epoch": 3.136916174910181, + "grad_norm": 0.24722252786159515, + "learning_rate": 1.606277797839446e-05, + "loss": 1.2428, + "step": 10532 + }, + { + "epoch": 3.1372140211098496, + "grad_norm": 0.24313843250274658, + "learning_rate": 1.6062010875957774e-05, + "loss": 1.2283, + "step": 10533 + }, + { + "epoch": 3.137511867309518, + "grad_norm": 0.23428331315517426, + "learning_rate": 1.606124371712085e-05, + "loss": 1.2348, + "step": 10534 + }, + { + "epoch": 3.1378097135091867, + "grad_norm": 0.2426571398973465, + "learning_rate": 1.6060476501890824e-05, + "loss": 1.2467, + "step": 10535 + }, + { + "epoch": 3.1381075597088555, + "grad_norm": 0.2308879941701889, + "learning_rate": 1.6059709230274837e-05, + "loss": 1.2375, + "step": 10536 + }, + { + "epoch": 3.138405405908524, + "grad_norm": 0.2395615577697754, + "learning_rate": 1.605894190228002e-05, + "loss": 1.2537, + "step": 10537 + }, + { + "epoch": 3.1387032521081926, + "grad_norm": 0.2675611674785614, + "learning_rate": 1.6058174517913525e-05, + "loss": 1.2334, + "step": 10538 + }, + { + "epoch": 3.1390010983078613, + "grad_norm": 0.3454257547855377, + "learning_rate": 1.605740707718248e-05, + "loss": 1.257, + "step": 10539 + }, + { + "epoch": 3.13929894450753, + "grad_norm": 0.2523891031742096, + "learning_rate": 1.6056639580094032e-05, + "loss": 1.2475, + "step": 10540 + }, + { + "epoch": 3.1395967907071984, + "grad_norm": 0.36959531903266907, + "learning_rate": 1.605587202665532e-05, + "loss": 1.2261, + "step": 10541 + }, + { + "epoch": 3.139894636906867, + "grad_norm": 0.34983065724372864, + "learning_rate": 1.6055104416873485e-05, + "loss": 1.2399, + "step": 10542 + }, + { + "epoch": 3.140192483106536, + "grad_norm": 0.30975499749183655, + "learning_rate": 1.6054336750755667e-05, + "loss": 1.2525, + "step": 10543 + }, + { + "epoch": 3.1404903293062043, + "grad_norm": 0.49515029788017273, + "learning_rate": 1.605356902830901e-05, + "loss": 1.2196, + "step": 10544 + }, + { + "epoch": 3.140788175505873, + "grad_norm": 0.28890347480773926, + "learning_rate": 1.6052801249540656e-05, + "loss": 1.2392, + "step": 10545 + }, + { + "epoch": 3.141086021705542, + "grad_norm": 0.2818622291088104, + "learning_rate": 1.6052033414457753e-05, + "loss": 1.2316, + "step": 10546 + }, + { + "epoch": 3.1413838679052106, + "grad_norm": 0.2801220715045929, + "learning_rate": 1.6051265523067437e-05, + "loss": 1.2478, + "step": 10547 + }, + { + "epoch": 3.141681714104879, + "grad_norm": 0.28101953864097595, + "learning_rate": 1.6050497575376857e-05, + "loss": 1.238, + "step": 10548 + }, + { + "epoch": 3.1419795603045477, + "grad_norm": 0.24674144387245178, + "learning_rate": 1.6049729571393155e-05, + "loss": 1.2242, + "step": 10549 + }, + { + "epoch": 3.1422774065042165, + "grad_norm": 0.2553524672985077, + "learning_rate": 1.6048961511123484e-05, + "loss": 1.2394, + "step": 10550 + }, + { + "epoch": 3.142575252703885, + "grad_norm": 0.2494676113128662, + "learning_rate": 1.6048193394574978e-05, + "loss": 1.2347, + "step": 10551 + }, + { + "epoch": 3.1428730989035536, + "grad_norm": 0.26495182514190674, + "learning_rate": 1.6047425221754794e-05, + "loss": 1.2294, + "step": 10552 + }, + { + "epoch": 3.1431709451032224, + "grad_norm": 0.24896548688411713, + "learning_rate": 1.6046656992670074e-05, + "loss": 1.2491, + "step": 10553 + }, + { + "epoch": 3.143468791302891, + "grad_norm": 0.24919770658016205, + "learning_rate": 1.6045888707327967e-05, + "loss": 1.2173, + "step": 10554 + }, + { + "epoch": 3.1437666375025595, + "grad_norm": 0.24065832793712616, + "learning_rate": 1.6045120365735618e-05, + "loss": 1.2394, + "step": 10555 + }, + { + "epoch": 3.1440644837022282, + "grad_norm": 0.2379603236913681, + "learning_rate": 1.6044351967900183e-05, + "loss": 1.237, + "step": 10556 + }, + { + "epoch": 3.144362329901897, + "grad_norm": 0.2357507199048996, + "learning_rate": 1.60435835138288e-05, + "loss": 1.2368, + "step": 10557 + }, + { + "epoch": 3.1446601761015653, + "grad_norm": 0.23212207853794098, + "learning_rate": 1.6042815003528627e-05, + "loss": 1.2521, + "step": 10558 + }, + { + "epoch": 3.144958022301234, + "grad_norm": 0.24604001641273499, + "learning_rate": 1.6042046437006814e-05, + "loss": 1.2277, + "step": 10559 + }, + { + "epoch": 3.145255868500903, + "grad_norm": 0.23578615486621857, + "learning_rate": 1.6041277814270508e-05, + "loss": 1.2409, + "step": 10560 + }, + { + "epoch": 3.1455537147005717, + "grad_norm": 0.27423205971717834, + "learning_rate": 1.604050913532686e-05, + "loss": 1.24, + "step": 10561 + }, + { + "epoch": 3.14585156090024, + "grad_norm": 0.29923346638679504, + "learning_rate": 1.6039740400183024e-05, + "loss": 1.2299, + "step": 10562 + }, + { + "epoch": 3.1461494070999088, + "grad_norm": 0.24445225298404694, + "learning_rate": 1.6038971608846155e-05, + "loss": 1.222, + "step": 10563 + }, + { + "epoch": 3.1464472532995775, + "grad_norm": 0.46557796001434326, + "learning_rate": 1.6038202761323398e-05, + "loss": 1.2244, + "step": 10564 + }, + { + "epoch": 3.1467450994992463, + "grad_norm": 0.33012205362319946, + "learning_rate": 1.6037433857621913e-05, + "loss": 1.2332, + "step": 10565 + }, + { + "epoch": 3.1470429456989146, + "grad_norm": 0.2876785099506378, + "learning_rate": 1.6036664897748852e-05, + "loss": 1.2284, + "step": 10566 + }, + { + "epoch": 3.1473407918985834, + "grad_norm": 0.24542902410030365, + "learning_rate": 1.6035895881711367e-05, + "loss": 1.2457, + "step": 10567 + }, + { + "epoch": 3.147638638098252, + "grad_norm": 0.32998529076576233, + "learning_rate": 1.6035126809516614e-05, + "loss": 1.2433, + "step": 10568 + }, + { + "epoch": 3.1479364842979205, + "grad_norm": 0.23939268290996552, + "learning_rate": 1.603435768117175e-05, + "loss": 1.227, + "step": 10569 + }, + { + "epoch": 3.1482343304975893, + "grad_norm": 0.24985511600971222, + "learning_rate": 1.6033588496683927e-05, + "loss": 1.2424, + "step": 10570 + }, + { + "epoch": 3.148532176697258, + "grad_norm": 0.26416710019111633, + "learning_rate": 1.603281925606031e-05, + "loss": 1.2471, + "step": 10571 + }, + { + "epoch": 3.1488300228969264, + "grad_norm": 0.24465322494506836, + "learning_rate": 1.6032049959308044e-05, + "loss": 1.2423, + "step": 10572 + }, + { + "epoch": 3.149127869096595, + "grad_norm": 0.28843262791633606, + "learning_rate": 1.6031280606434298e-05, + "loss": 1.2557, + "step": 10573 + }, + { + "epoch": 3.149425715296264, + "grad_norm": 0.2554221451282501, + "learning_rate": 1.603051119744622e-05, + "loss": 1.2416, + "step": 10574 + }, + { + "epoch": 3.1497235614959327, + "grad_norm": 0.2663499414920807, + "learning_rate": 1.6029741732350973e-05, + "loss": 1.2289, + "step": 10575 + }, + { + "epoch": 3.150021407695601, + "grad_norm": 0.24723687767982483, + "learning_rate": 1.602897221115572e-05, + "loss": 1.2591, + "step": 10576 + }, + { + "epoch": 3.15031925389527, + "grad_norm": 0.2523896396160126, + "learning_rate": 1.6028202633867615e-05, + "loss": 1.2279, + "step": 10577 + }, + { + "epoch": 3.1506171000949386, + "grad_norm": 0.23409320414066315, + "learning_rate": 1.602743300049382e-05, + "loss": 1.2491, + "step": 10578 + }, + { + "epoch": 3.1509149462946073, + "grad_norm": 0.2615102231502533, + "learning_rate": 1.6026663311041492e-05, + "loss": 1.2392, + "step": 10579 + }, + { + "epoch": 3.1512127924942757, + "grad_norm": 0.24396833777427673, + "learning_rate": 1.60258935655178e-05, + "loss": 1.2382, + "step": 10580 + }, + { + "epoch": 3.1515106386939444, + "grad_norm": 0.24034400284290314, + "learning_rate": 1.6025123763929894e-05, + "loss": 1.2151, + "step": 10581 + }, + { + "epoch": 3.151808484893613, + "grad_norm": 0.24459712207317352, + "learning_rate": 1.602435390628495e-05, + "loss": 1.2251, + "step": 10582 + }, + { + "epoch": 3.1521063310932815, + "grad_norm": 0.26458051800727844, + "learning_rate": 1.6023583992590118e-05, + "loss": 1.2436, + "step": 10583 + }, + { + "epoch": 3.1524041772929503, + "grad_norm": 0.2689766585826874, + "learning_rate": 1.6022814022852573e-05, + "loss": 1.2425, + "step": 10584 + }, + { + "epoch": 3.152702023492619, + "grad_norm": 0.2262040078639984, + "learning_rate": 1.6022043997079468e-05, + "loss": 1.2391, + "step": 10585 + }, + { + "epoch": 3.152999869692288, + "grad_norm": 0.263418585062027, + "learning_rate": 1.6021273915277973e-05, + "loss": 1.2237, + "step": 10586 + }, + { + "epoch": 3.153297715891956, + "grad_norm": 0.29484131932258606, + "learning_rate": 1.6020503777455254e-05, + "loss": 1.2376, + "step": 10587 + }, + { + "epoch": 3.153595562091625, + "grad_norm": 0.25313621759414673, + "learning_rate": 1.601973358361847e-05, + "loss": 1.2397, + "step": 10588 + }, + { + "epoch": 3.1538934082912937, + "grad_norm": 0.2395484745502472, + "learning_rate": 1.6018963333774792e-05, + "loss": 1.239, + "step": 10589 + }, + { + "epoch": 3.154191254490962, + "grad_norm": 0.2361753135919571, + "learning_rate": 1.6018193027931385e-05, + "loss": 1.2323, + "step": 10590 + }, + { + "epoch": 3.154489100690631, + "grad_norm": 0.2424056977033615, + "learning_rate": 1.6017422666095417e-05, + "loss": 1.2449, + "step": 10591 + }, + { + "epoch": 3.1547869468902996, + "grad_norm": 0.2801514267921448, + "learning_rate": 1.6016652248274055e-05, + "loss": 1.2446, + "step": 10592 + }, + { + "epoch": 3.1550847930899684, + "grad_norm": 0.358261376619339, + "learning_rate": 1.6015881774474463e-05, + "loss": 1.2503, + "step": 10593 + }, + { + "epoch": 3.1553826392896367, + "grad_norm": 0.3260704576969147, + "learning_rate": 1.6015111244703812e-05, + "loss": 1.2271, + "step": 10594 + }, + { + "epoch": 3.1556804854893055, + "grad_norm": 0.25165343284606934, + "learning_rate": 1.6014340658969274e-05, + "loss": 1.244, + "step": 10595 + }, + { + "epoch": 3.1559783316889742, + "grad_norm": 0.41584092378616333, + "learning_rate": 1.601357001727802e-05, + "loss": 1.2474, + "step": 10596 + }, + { + "epoch": 3.1562761778886426, + "grad_norm": 0.26856347918510437, + "learning_rate": 1.6012799319637208e-05, + "loss": 1.2424, + "step": 10597 + }, + { + "epoch": 3.1565740240883113, + "grad_norm": 0.28092923760414124, + "learning_rate": 1.6012028566054018e-05, + "loss": 1.2372, + "step": 10598 + }, + { + "epoch": 3.15687187028798, + "grad_norm": 0.2309153825044632, + "learning_rate": 1.6011257756535618e-05, + "loss": 1.2288, + "step": 10599 + }, + { + "epoch": 3.157169716487649, + "grad_norm": 0.5773800611495972, + "learning_rate": 1.6010486891089184e-05, + "loss": 1.2306, + "step": 10600 + }, + { + "epoch": 3.157467562687317, + "grad_norm": 0.3086642622947693, + "learning_rate": 1.6009715969721883e-05, + "loss": 1.2397, + "step": 10601 + }, + { + "epoch": 3.157765408886986, + "grad_norm": 0.2767390012741089, + "learning_rate": 1.600894499244089e-05, + "loss": 1.2353, + "step": 10602 + }, + { + "epoch": 3.1580632550866548, + "grad_norm": 0.2537829875946045, + "learning_rate": 1.6008173959253378e-05, + "loss": 1.219, + "step": 10603 + }, + { + "epoch": 3.158361101286323, + "grad_norm": 0.23458589613437653, + "learning_rate": 1.600740287016652e-05, + "loss": 1.2382, + "step": 10604 + }, + { + "epoch": 3.158658947485992, + "grad_norm": 0.2574675977230072, + "learning_rate": 1.600663172518749e-05, + "loss": 1.2451, + "step": 10605 + }, + { + "epoch": 3.1589567936856606, + "grad_norm": 0.23827262222766876, + "learning_rate": 1.600586052432346e-05, + "loss": 1.2456, + "step": 10606 + }, + { + "epoch": 3.1592546398853294, + "grad_norm": 0.2524144649505615, + "learning_rate": 1.600508926758161e-05, + "loss": 1.2392, + "step": 10607 + }, + { + "epoch": 3.1595524860849977, + "grad_norm": 0.22839228808879852, + "learning_rate": 1.6004317954969114e-05, + "loss": 1.2532, + "step": 10608 + }, + { + "epoch": 3.1598503322846665, + "grad_norm": 0.23768730461597443, + "learning_rate": 1.600354658649315e-05, + "loss": 1.2378, + "step": 10609 + }, + { + "epoch": 3.1601481784843353, + "grad_norm": 0.2440677136182785, + "learning_rate": 1.600277516216089e-05, + "loss": 1.2295, + "step": 10610 + }, + { + "epoch": 3.1604460246840036, + "grad_norm": 0.24208486080169678, + "learning_rate": 1.6002003681979513e-05, + "loss": 1.2439, + "step": 10611 + }, + { + "epoch": 3.1607438708836724, + "grad_norm": 0.2367459386587143, + "learning_rate": 1.60012321459562e-05, + "loss": 1.2267, + "step": 10612 + }, + { + "epoch": 3.161041717083341, + "grad_norm": 0.22994917631149292, + "learning_rate": 1.6000460554098126e-05, + "loss": 1.23, + "step": 10613 + }, + { + "epoch": 3.16133956328301, + "grad_norm": 0.23039232194423676, + "learning_rate": 1.599968890641247e-05, + "loss": 1.2238, + "step": 10614 + }, + { + "epoch": 3.1616374094826782, + "grad_norm": 0.24741627275943756, + "learning_rate": 1.5998917202906414e-05, + "loss": 1.2337, + "step": 10615 + }, + { + "epoch": 3.161935255682347, + "grad_norm": 0.24553634226322174, + "learning_rate": 1.599814544358713e-05, + "loss": 1.2307, + "step": 10616 + }, + { + "epoch": 3.162233101882016, + "grad_norm": 0.2533511519432068, + "learning_rate": 1.599737362846181e-05, + "loss": 1.2557, + "step": 10617 + }, + { + "epoch": 3.162530948081684, + "grad_norm": 0.2352992445230484, + "learning_rate": 1.599660175753763e-05, + "loss": 1.2315, + "step": 10618 + }, + { + "epoch": 3.162828794281353, + "grad_norm": 0.23641620576381683, + "learning_rate": 1.5995829830821766e-05, + "loss": 1.2307, + "step": 10619 + }, + { + "epoch": 3.1631266404810217, + "grad_norm": 0.23822817206382751, + "learning_rate": 1.599505784832141e-05, + "loss": 1.2334, + "step": 10620 + }, + { + "epoch": 3.1634244866806904, + "grad_norm": 0.25682422518730164, + "learning_rate": 1.5994285810043733e-05, + "loss": 1.2388, + "step": 10621 + }, + { + "epoch": 3.1637223328803588, + "grad_norm": 0.23904050886631012, + "learning_rate": 1.5993513715995925e-05, + "loss": 1.2381, + "step": 10622 + }, + { + "epoch": 3.1640201790800275, + "grad_norm": 0.23546943068504333, + "learning_rate": 1.599274156618517e-05, + "loss": 1.243, + "step": 10623 + }, + { + "epoch": 3.1643180252796963, + "grad_norm": 0.24305890500545502, + "learning_rate": 1.5991969360618647e-05, + "loss": 1.2377, + "step": 10624 + }, + { + "epoch": 3.1646158714793646, + "grad_norm": 0.23653298616409302, + "learning_rate": 1.5991197099303546e-05, + "loss": 1.2373, + "step": 10625 + }, + { + "epoch": 3.1649137176790334, + "grad_norm": 0.2401796281337738, + "learning_rate": 1.599042478224705e-05, + "loss": 1.2428, + "step": 10626 + }, + { + "epoch": 3.165211563878702, + "grad_norm": 0.23065708577632904, + "learning_rate": 1.5989652409456346e-05, + "loss": 1.239, + "step": 10627 + }, + { + "epoch": 3.165509410078371, + "grad_norm": 0.2518618702888489, + "learning_rate": 1.5988879980938615e-05, + "loss": 1.2256, + "step": 10628 + }, + { + "epoch": 3.1658072562780393, + "grad_norm": 0.24840891361236572, + "learning_rate": 1.5988107496701045e-05, + "loss": 1.2454, + "step": 10629 + }, + { + "epoch": 3.166105102477708, + "grad_norm": 0.23902809619903564, + "learning_rate": 1.598733495675083e-05, + "loss": 1.239, + "step": 10630 + }, + { + "epoch": 3.166402948677377, + "grad_norm": 0.2358437478542328, + "learning_rate": 1.5986562361095153e-05, + "loss": 1.2446, + "step": 10631 + }, + { + "epoch": 3.1667007948770456, + "grad_norm": 0.2648789584636688, + "learning_rate": 1.5985789709741197e-05, + "loss": 1.2413, + "step": 10632 + }, + { + "epoch": 3.166998641076714, + "grad_norm": 0.2546963393688202, + "learning_rate": 1.598501700269616e-05, + "loss": 1.2379, + "step": 10633 + }, + { + "epoch": 3.1672964872763827, + "grad_norm": 0.2382189929485321, + "learning_rate": 1.5984244239967225e-05, + "loss": 1.2242, + "step": 10634 + }, + { + "epoch": 3.1675943334760515, + "grad_norm": 0.25194206833839417, + "learning_rate": 1.5983471421561584e-05, + "loss": 1.2239, + "step": 10635 + }, + { + "epoch": 3.16789217967572, + "grad_norm": 0.24583874642848969, + "learning_rate": 1.5982698547486423e-05, + "loss": 1.24, + "step": 10636 + }, + { + "epoch": 3.1681900258753886, + "grad_norm": 0.23640339076519012, + "learning_rate": 1.598192561774894e-05, + "loss": 1.2345, + "step": 10637 + }, + { + "epoch": 3.1684878720750573, + "grad_norm": 0.24777191877365112, + "learning_rate": 1.5981152632356324e-05, + "loss": 1.2611, + "step": 10638 + }, + { + "epoch": 3.1687857182747257, + "grad_norm": 0.24595105648040771, + "learning_rate": 1.598037959131576e-05, + "loss": 1.2318, + "step": 10639 + }, + { + "epoch": 3.1690835644743944, + "grad_norm": 0.23297661542892456, + "learning_rate": 1.597960649463445e-05, + "loss": 1.2389, + "step": 10640 + }, + { + "epoch": 3.169381410674063, + "grad_norm": 0.23024040460586548, + "learning_rate": 1.597883334231958e-05, + "loss": 1.2273, + "step": 10641 + }, + { + "epoch": 3.169679256873732, + "grad_norm": 0.24169017374515533, + "learning_rate": 1.5978060134378348e-05, + "loss": 1.2303, + "step": 10642 + }, + { + "epoch": 3.1699771030734003, + "grad_norm": 0.2444298267364502, + "learning_rate": 1.5977286870817943e-05, + "loss": 1.2454, + "step": 10643 + }, + { + "epoch": 3.170274949273069, + "grad_norm": 0.24222426116466522, + "learning_rate": 1.5976513551645563e-05, + "loss": 1.2487, + "step": 10644 + }, + { + "epoch": 3.170572795472738, + "grad_norm": 0.2733427882194519, + "learning_rate": 1.59757401768684e-05, + "loss": 1.2317, + "step": 10645 + }, + { + "epoch": 3.1708706416724066, + "grad_norm": 0.24904723465442657, + "learning_rate": 1.5974966746493656e-05, + "loss": 1.2417, + "step": 10646 + }, + { + "epoch": 3.171168487872075, + "grad_norm": 0.28826475143432617, + "learning_rate": 1.5974193260528518e-05, + "loss": 1.2374, + "step": 10647 + }, + { + "epoch": 3.1714663340717437, + "grad_norm": 0.28618383407592773, + "learning_rate": 1.5973419718980187e-05, + "loss": 1.2489, + "step": 10648 + }, + { + "epoch": 3.1717641802714125, + "grad_norm": 0.2855173349380493, + "learning_rate": 1.597264612185586e-05, + "loss": 1.2383, + "step": 10649 + }, + { + "epoch": 3.172062026471081, + "grad_norm": 0.27851954102516174, + "learning_rate": 1.5971872469162732e-05, + "loss": 1.2414, + "step": 10650 + }, + { + "epoch": 3.1723598726707496, + "grad_norm": 0.24966329336166382, + "learning_rate": 1.5971098760908007e-05, + "loss": 1.2286, + "step": 10651 + }, + { + "epoch": 3.1726577188704184, + "grad_norm": 0.25896716117858887, + "learning_rate": 1.5970324997098875e-05, + "loss": 1.2321, + "step": 10652 + }, + { + "epoch": 3.172955565070087, + "grad_norm": 0.24297773838043213, + "learning_rate": 1.5969551177742542e-05, + "loss": 1.2283, + "step": 10653 + }, + { + "epoch": 3.1732534112697555, + "grad_norm": 0.2746482193470001, + "learning_rate": 1.5968777302846204e-05, + "loss": 1.2415, + "step": 10654 + }, + { + "epoch": 3.1735512574694242, + "grad_norm": 0.2508457899093628, + "learning_rate": 1.596800337241706e-05, + "loss": 1.2466, + "step": 10655 + }, + { + "epoch": 3.173849103669093, + "grad_norm": 0.259013295173645, + "learning_rate": 1.5967229386462314e-05, + "loss": 1.235, + "step": 10656 + }, + { + "epoch": 3.1741469498687613, + "grad_norm": 0.2628565728664398, + "learning_rate": 1.5966455344989168e-05, + "loss": 1.2392, + "step": 10657 + }, + { + "epoch": 3.17444479606843, + "grad_norm": 0.23000118136405945, + "learning_rate": 1.596568124800482e-05, + "loss": 1.2404, + "step": 10658 + }, + { + "epoch": 3.174742642268099, + "grad_norm": 0.3210986256599426, + "learning_rate": 1.596490709551647e-05, + "loss": 1.2302, + "step": 10659 + }, + { + "epoch": 3.1750404884677677, + "grad_norm": 0.2435050904750824, + "learning_rate": 1.5964132887531327e-05, + "loss": 1.2289, + "step": 10660 + }, + { + "epoch": 3.175338334667436, + "grad_norm": 0.26220378279685974, + "learning_rate": 1.596335862405659e-05, + "loss": 1.2306, + "step": 10661 + }, + { + "epoch": 3.1756361808671048, + "grad_norm": 0.23695498704910278, + "learning_rate": 1.5962584305099463e-05, + "loss": 1.2269, + "step": 10662 + }, + { + "epoch": 3.1759340270667735, + "grad_norm": 0.26109692454338074, + "learning_rate": 1.5961809930667152e-05, + "loss": 1.2385, + "step": 10663 + }, + { + "epoch": 3.176231873266442, + "grad_norm": 0.24443170428276062, + "learning_rate": 1.596103550076686e-05, + "loss": 1.2492, + "step": 10664 + }, + { + "epoch": 3.1765297194661106, + "grad_norm": 0.2502957880496979, + "learning_rate": 1.5960261015405795e-05, + "loss": 1.2291, + "step": 10665 + }, + { + "epoch": 3.1768275656657794, + "grad_norm": 0.2379729002714157, + "learning_rate": 1.5959486474591158e-05, + "loss": 1.2359, + "step": 10666 + }, + { + "epoch": 3.177125411865448, + "grad_norm": 0.276965469121933, + "learning_rate": 1.5958711878330158e-05, + "loss": 1.2421, + "step": 10667 + }, + { + "epoch": 3.1774232580651165, + "grad_norm": 0.24670004844665527, + "learning_rate": 1.595793722663e-05, + "loss": 1.246, + "step": 10668 + }, + { + "epoch": 3.1777211042647853, + "grad_norm": 0.2542455792427063, + "learning_rate": 1.5957162519497897e-05, + "loss": 1.2483, + "step": 10669 + }, + { + "epoch": 3.178018950464454, + "grad_norm": 0.23900531232357025, + "learning_rate": 1.595638775694105e-05, + "loss": 1.2383, + "step": 10670 + }, + { + "epoch": 3.1783167966641224, + "grad_norm": 0.2516971230506897, + "learning_rate": 1.5955612938966667e-05, + "loss": 1.2575, + "step": 10671 + }, + { + "epoch": 3.178614642863791, + "grad_norm": 0.25015169382095337, + "learning_rate": 1.5954838065581962e-05, + "loss": 1.228, + "step": 10672 + }, + { + "epoch": 3.17891248906346, + "grad_norm": 0.23840080201625824, + "learning_rate": 1.5954063136794143e-05, + "loss": 1.2269, + "step": 10673 + }, + { + "epoch": 3.1792103352631287, + "grad_norm": 0.2403010129928589, + "learning_rate": 1.5953288152610416e-05, + "loss": 1.2285, + "step": 10674 + }, + { + "epoch": 3.179508181462797, + "grad_norm": 0.23212282359600067, + "learning_rate": 1.5952513113037996e-05, + "loss": 1.2315, + "step": 10675 + }, + { + "epoch": 3.179806027662466, + "grad_norm": 0.25795066356658936, + "learning_rate": 1.595173801808409e-05, + "loss": 1.2421, + "step": 10676 + }, + { + "epoch": 3.1801038738621346, + "grad_norm": 0.2992171347141266, + "learning_rate": 1.5950962867755916e-05, + "loss": 1.2389, + "step": 10677 + }, + { + "epoch": 3.180401720061803, + "grad_norm": 0.2883983254432678, + "learning_rate": 1.595018766206068e-05, + "loss": 1.2252, + "step": 10678 + }, + { + "epoch": 3.1806995662614717, + "grad_norm": 0.22906909883022308, + "learning_rate": 1.5949412401005594e-05, + "loss": 1.2426, + "step": 10679 + }, + { + "epoch": 3.1809974124611404, + "grad_norm": 0.24934309720993042, + "learning_rate": 1.5948637084597873e-05, + "loss": 1.2319, + "step": 10680 + }, + { + "epoch": 3.181295258660809, + "grad_norm": 0.33420678973197937, + "learning_rate": 1.594786171284473e-05, + "loss": 1.2409, + "step": 10681 + }, + { + "epoch": 3.1815931048604775, + "grad_norm": 0.31714025139808655, + "learning_rate": 1.594708628575338e-05, + "loss": 1.2292, + "step": 10682 + }, + { + "epoch": 3.1818909510601463, + "grad_norm": 0.25144240260124207, + "learning_rate": 1.5946310803331035e-05, + "loss": 1.2403, + "step": 10683 + }, + { + "epoch": 3.182188797259815, + "grad_norm": 0.4745495319366455, + "learning_rate": 1.594553526558491e-05, + "loss": 1.2337, + "step": 10684 + }, + { + "epoch": 3.182486643459484, + "grad_norm": 0.35296693444252014, + "learning_rate": 1.5944759672522227e-05, + "loss": 1.2524, + "step": 10685 + }, + { + "epoch": 3.182784489659152, + "grad_norm": 0.3100326657295227, + "learning_rate": 1.5943984024150195e-05, + "loss": 1.2409, + "step": 10686 + }, + { + "epoch": 3.183082335858821, + "grad_norm": 0.3078671991825104, + "learning_rate": 1.5943208320476032e-05, + "loss": 1.2203, + "step": 10687 + }, + { + "epoch": 3.1833801820584897, + "grad_norm": 0.2713058888912201, + "learning_rate": 1.5942432561506956e-05, + "loss": 1.2566, + "step": 10688 + }, + { + "epoch": 3.183678028258158, + "grad_norm": 0.24255773425102234, + "learning_rate": 1.5941656747250186e-05, + "loss": 1.2302, + "step": 10689 + }, + { + "epoch": 3.183975874457827, + "grad_norm": 0.2739361822605133, + "learning_rate": 1.5940880877712935e-05, + "loss": 1.2347, + "step": 10690 + }, + { + "epoch": 3.1842737206574956, + "grad_norm": 0.24218833446502686, + "learning_rate": 1.5940104952902427e-05, + "loss": 1.2394, + "step": 10691 + }, + { + "epoch": 3.184571566857164, + "grad_norm": 0.295357882976532, + "learning_rate": 1.593932897282588e-05, + "loss": 1.2416, + "step": 10692 + }, + { + "epoch": 3.1848694130568327, + "grad_norm": 0.23636461794376373, + "learning_rate": 1.5938552937490512e-05, + "loss": 1.2442, + "step": 10693 + }, + { + "epoch": 3.1851672592565015, + "grad_norm": 0.25766557455062866, + "learning_rate": 1.593777684690354e-05, + "loss": 1.2243, + "step": 10694 + }, + { + "epoch": 3.1854651054561702, + "grad_norm": 0.23682431876659393, + "learning_rate": 1.5937000701072193e-05, + "loss": 1.2317, + "step": 10695 + }, + { + "epoch": 3.1857629516558386, + "grad_norm": 0.28741541504859924, + "learning_rate": 1.593622450000369e-05, + "loss": 1.2417, + "step": 10696 + }, + { + "epoch": 3.1860607978555073, + "grad_norm": 0.24891297519207, + "learning_rate": 1.5935448243705244e-05, + "loss": 1.2336, + "step": 10697 + }, + { + "epoch": 3.186358644055176, + "grad_norm": 0.27397966384887695, + "learning_rate": 1.5934671932184088e-05, + "loss": 1.2407, + "step": 10698 + }, + { + "epoch": 3.186656490254845, + "grad_norm": 0.24041998386383057, + "learning_rate": 1.5933895565447438e-05, + "loss": 1.2267, + "step": 10699 + }, + { + "epoch": 3.186954336454513, + "grad_norm": 0.44697391986846924, + "learning_rate": 1.593311914350252e-05, + "loss": 1.2516, + "step": 10700 + }, + { + "epoch": 3.187252182654182, + "grad_norm": 0.28662437200546265, + "learning_rate": 1.593234266635656e-05, + "loss": 1.2548, + "step": 10701 + }, + { + "epoch": 3.1875500288538507, + "grad_norm": 0.29262790083885193, + "learning_rate": 1.5931566134016776e-05, + "loss": 1.2208, + "step": 10702 + }, + { + "epoch": 3.187847875053519, + "grad_norm": 0.2651198208332062, + "learning_rate": 1.5930789546490397e-05, + "loss": 1.2393, + "step": 10703 + }, + { + "epoch": 3.188145721253188, + "grad_norm": 0.3279496729373932, + "learning_rate": 1.5930012903784647e-05, + "loss": 1.2232, + "step": 10704 + }, + { + "epoch": 3.1884435674528566, + "grad_norm": 0.32991090416908264, + "learning_rate": 1.5929236205906752e-05, + "loss": 1.2306, + "step": 10705 + }, + { + "epoch": 3.188741413652525, + "grad_norm": 0.2659478783607483, + "learning_rate": 1.5928459452863942e-05, + "loss": 1.2562, + "step": 10706 + }, + { + "epoch": 3.1890392598521937, + "grad_norm": 0.29939010739326477, + "learning_rate": 1.5927682644663438e-05, + "loss": 1.2342, + "step": 10707 + }, + { + "epoch": 3.1893371060518625, + "grad_norm": 0.256277859210968, + "learning_rate": 1.592690578131247e-05, + "loss": 1.2411, + "step": 10708 + }, + { + "epoch": 3.1896349522515313, + "grad_norm": 0.3427721858024597, + "learning_rate": 1.5926128862818267e-05, + "loss": 1.2419, + "step": 10709 + }, + { + "epoch": 3.1899327984511996, + "grad_norm": 0.24358530342578888, + "learning_rate": 1.5925351889188053e-05, + "loss": 1.2348, + "step": 10710 + }, + { + "epoch": 3.1902306446508684, + "grad_norm": 0.27404895424842834, + "learning_rate": 1.5924574860429064e-05, + "loss": 1.2291, + "step": 10711 + }, + { + "epoch": 3.190528490850537, + "grad_norm": 0.24499614536762238, + "learning_rate": 1.592379777654852e-05, + "loss": 1.2249, + "step": 10712 + }, + { + "epoch": 3.190826337050206, + "grad_norm": 0.25218456983566284, + "learning_rate": 1.592302063755366e-05, + "loss": 1.2261, + "step": 10713 + }, + { + "epoch": 3.1911241832498742, + "grad_norm": 0.2580556273460388, + "learning_rate": 1.592224344345171e-05, + "loss": 1.244, + "step": 10714 + }, + { + "epoch": 3.191422029449543, + "grad_norm": 0.25089213252067566, + "learning_rate": 1.59214661942499e-05, + "loss": 1.2316, + "step": 10715 + }, + { + "epoch": 3.191719875649212, + "grad_norm": 0.2475767582654953, + "learning_rate": 1.5920688889955463e-05, + "loss": 1.2315, + "step": 10716 + }, + { + "epoch": 3.19201772184888, + "grad_norm": 0.2405940592288971, + "learning_rate": 1.5919911530575634e-05, + "loss": 1.2361, + "step": 10717 + }, + { + "epoch": 3.192315568048549, + "grad_norm": 0.23215091228485107, + "learning_rate": 1.591913411611764e-05, + "loss": 1.2113, + "step": 10718 + }, + { + "epoch": 3.1926134142482177, + "grad_norm": 0.22682683169841766, + "learning_rate": 1.591835664658872e-05, + "loss": 1.2321, + "step": 10719 + }, + { + "epoch": 3.1929112604478864, + "grad_norm": 0.23340408504009247, + "learning_rate": 1.5917579121996096e-05, + "loss": 1.235, + "step": 10720 + }, + { + "epoch": 3.1932091066475548, + "grad_norm": 0.22769591212272644, + "learning_rate": 1.5916801542347013e-05, + "loss": 1.224, + "step": 10721 + }, + { + "epoch": 3.1935069528472235, + "grad_norm": 0.24305878579616547, + "learning_rate": 1.5916023907648703e-05, + "loss": 1.2164, + "step": 10722 + }, + { + "epoch": 3.1938047990468923, + "grad_norm": 0.24259690940380096, + "learning_rate": 1.5915246217908403e-05, + "loss": 1.2328, + "step": 10723 + }, + { + "epoch": 3.1941026452465606, + "grad_norm": 0.2590729296207428, + "learning_rate": 1.5914468473133343e-05, + "loss": 1.2271, + "step": 10724 + }, + { + "epoch": 3.1944004914462294, + "grad_norm": 0.36325037479400635, + "learning_rate": 1.591369067333076e-05, + "loss": 1.2415, + "step": 10725 + }, + { + "epoch": 3.194698337645898, + "grad_norm": 0.2736909091472626, + "learning_rate": 1.5912912818507897e-05, + "loss": 1.2337, + "step": 10726 + }, + { + "epoch": 3.194996183845567, + "grad_norm": 0.26318758726119995, + "learning_rate": 1.5912134908671985e-05, + "loss": 1.2378, + "step": 10727 + }, + { + "epoch": 3.1952940300452353, + "grad_norm": 0.3223790228366852, + "learning_rate": 1.5911356943830264e-05, + "loss": 1.2276, + "step": 10728 + }, + { + "epoch": 3.195591876244904, + "grad_norm": 0.23630988597869873, + "learning_rate": 1.591057892398997e-05, + "loss": 1.2438, + "step": 10729 + }, + { + "epoch": 3.195889722444573, + "grad_norm": 0.2934418320655823, + "learning_rate": 1.590980084915834e-05, + "loss": 1.222, + "step": 10730 + }, + { + "epoch": 3.196187568644241, + "grad_norm": 0.2269815057516098, + "learning_rate": 1.590902271934262e-05, + "loss": 1.2147, + "step": 10731 + }, + { + "epoch": 3.19648541484391, + "grad_norm": 0.3533363938331604, + "learning_rate": 1.590824453455004e-05, + "loss": 1.2396, + "step": 10732 + }, + { + "epoch": 3.1967832610435787, + "grad_norm": 0.3048786222934723, + "learning_rate": 1.5907466294787852e-05, + "loss": 1.2195, + "step": 10733 + }, + { + "epoch": 3.1970811072432475, + "grad_norm": 0.2923799455165863, + "learning_rate": 1.590668800006329e-05, + "loss": 1.2354, + "step": 10734 + }, + { + "epoch": 3.197378953442916, + "grad_norm": 0.4716499149799347, + "learning_rate": 1.590590965038359e-05, + "loss": 1.233, + "step": 10735 + }, + { + "epoch": 3.1976767996425846, + "grad_norm": 0.308133602142334, + "learning_rate": 1.5905131245756004e-05, + "loss": 1.2179, + "step": 10736 + }, + { + "epoch": 3.1979746458422533, + "grad_norm": 0.29060718417167664, + "learning_rate": 1.590435278618777e-05, + "loss": 1.2341, + "step": 10737 + }, + { + "epoch": 3.1982724920419217, + "grad_norm": 0.3171718120574951, + "learning_rate": 1.5903574271686126e-05, + "loss": 1.2442, + "step": 10738 + }, + { + "epoch": 3.1985703382415904, + "grad_norm": 0.25217702984809875, + "learning_rate": 1.590279570225832e-05, + "loss": 1.2133, + "step": 10739 + }, + { + "epoch": 3.198868184441259, + "grad_norm": 0.28292983770370483, + "learning_rate": 1.5902017077911596e-05, + "loss": 1.2306, + "step": 10740 + }, + { + "epoch": 3.199166030640928, + "grad_norm": 0.26874682307243347, + "learning_rate": 1.5901238398653197e-05, + "loss": 1.2335, + "step": 10741 + }, + { + "epoch": 3.1994638768405963, + "grad_norm": 0.3263053894042969, + "learning_rate": 1.590045966449037e-05, + "loss": 1.2385, + "step": 10742 + }, + { + "epoch": 3.199761723040265, + "grad_norm": 0.23493176698684692, + "learning_rate": 1.5899680875430355e-05, + "loss": 1.2344, + "step": 10743 + }, + { + "epoch": 3.200059569239934, + "grad_norm": 0.25662684440612793, + "learning_rate": 1.5898902031480403e-05, + "loss": 1.236, + "step": 10744 + }, + { + "epoch": 3.200357415439602, + "grad_norm": 0.2345069795846939, + "learning_rate": 1.5898123132647757e-05, + "loss": 1.2223, + "step": 10745 + }, + { + "epoch": 3.200655261639271, + "grad_norm": 0.2831743061542511, + "learning_rate": 1.5897344178939666e-05, + "loss": 1.2452, + "step": 10746 + }, + { + "epoch": 3.2009531078389397, + "grad_norm": 0.24404355883598328, + "learning_rate": 1.5896565170363375e-05, + "loss": 1.2286, + "step": 10747 + }, + { + "epoch": 3.2012509540386085, + "grad_norm": 0.30020880699157715, + "learning_rate": 1.5895786106926135e-05, + "loss": 1.2309, + "step": 10748 + }, + { + "epoch": 3.201548800238277, + "grad_norm": 0.23508435487747192, + "learning_rate": 1.5895006988635195e-05, + "loss": 1.2301, + "step": 10749 + }, + { + "epoch": 3.2018466464379456, + "grad_norm": 0.38590341806411743, + "learning_rate": 1.5894227815497797e-05, + "loss": 1.2362, + "step": 10750 + }, + { + "epoch": 3.2021444926376144, + "grad_norm": 0.3294226825237274, + "learning_rate": 1.5893448587521196e-05, + "loss": 1.2446, + "step": 10751 + }, + { + "epoch": 3.202442338837283, + "grad_norm": 0.29960083961486816, + "learning_rate": 1.589266930471264e-05, + "loss": 1.2398, + "step": 10752 + }, + { + "epoch": 3.2027401850369515, + "grad_norm": 0.3394383490085602, + "learning_rate": 1.589188996707938e-05, + "loss": 1.2325, + "step": 10753 + }, + { + "epoch": 3.2030380312366202, + "grad_norm": 0.23774844408035278, + "learning_rate": 1.5891110574628664e-05, + "loss": 1.2313, + "step": 10754 + }, + { + "epoch": 3.203335877436289, + "grad_norm": 0.2551165223121643, + "learning_rate": 1.589033112736775e-05, + "loss": 1.2399, + "step": 10755 + }, + { + "epoch": 3.2036337236359573, + "grad_norm": 0.24792854487895966, + "learning_rate": 1.5889551625303883e-05, + "loss": 1.2388, + "step": 10756 + }, + { + "epoch": 3.203931569835626, + "grad_norm": 0.3705975413322449, + "learning_rate": 1.588877206844432e-05, + "loss": 1.227, + "step": 10757 + }, + { + "epoch": 3.204229416035295, + "grad_norm": 0.29245322942733765, + "learning_rate": 1.5887992456796313e-05, + "loss": 1.2415, + "step": 10758 + }, + { + "epoch": 3.204527262234963, + "grad_norm": 0.3218289315700531, + "learning_rate": 1.5887212790367113e-05, + "loss": 1.2421, + "step": 10759 + }, + { + "epoch": 3.204825108434632, + "grad_norm": 0.4285733699798584, + "learning_rate": 1.5886433069163975e-05, + "loss": 1.249, + "step": 10760 + }, + { + "epoch": 3.2051229546343007, + "grad_norm": 0.25649911165237427, + "learning_rate": 1.5885653293194157e-05, + "loss": 1.226, + "step": 10761 + }, + { + "epoch": 3.2054208008339695, + "grad_norm": 0.26349279284477234, + "learning_rate": 1.588487346246491e-05, + "loss": 1.2504, + "step": 10762 + }, + { + "epoch": 3.205718647033638, + "grad_norm": 0.23419275879859924, + "learning_rate": 1.588409357698349e-05, + "loss": 1.2246, + "step": 10763 + }, + { + "epoch": 3.2060164932333066, + "grad_norm": 0.26687389612197876, + "learning_rate": 1.588331363675715e-05, + "loss": 1.2408, + "step": 10764 + }, + { + "epoch": 3.2063143394329754, + "grad_norm": 0.24406246840953827, + "learning_rate": 1.5882533641793154e-05, + "loss": 1.2235, + "step": 10765 + }, + { + "epoch": 3.206612185632644, + "grad_norm": 0.28717195987701416, + "learning_rate": 1.5881753592098753e-05, + "loss": 1.2321, + "step": 10766 + }, + { + "epoch": 3.2069100318323125, + "grad_norm": 0.23939520120620728, + "learning_rate": 1.5880973487681207e-05, + "loss": 1.2357, + "step": 10767 + }, + { + "epoch": 3.2072078780319813, + "grad_norm": 0.2434358447790146, + "learning_rate": 1.588019332854777e-05, + "loss": 1.2345, + "step": 10768 + }, + { + "epoch": 3.20750572423165, + "grad_norm": 0.23970021307468414, + "learning_rate": 1.587941311470571e-05, + "loss": 1.2256, + "step": 10769 + }, + { + "epoch": 3.2078035704313184, + "grad_norm": 0.25342997908592224, + "learning_rate": 1.5878632846162277e-05, + "loss": 1.2514, + "step": 10770 + }, + { + "epoch": 3.208101416630987, + "grad_norm": 0.2785789966583252, + "learning_rate": 1.5877852522924733e-05, + "loss": 1.245, + "step": 10771 + }, + { + "epoch": 3.208399262830656, + "grad_norm": 0.22904382646083832, + "learning_rate": 1.587707214500034e-05, + "loss": 1.2308, + "step": 10772 + }, + { + "epoch": 3.2086971090303242, + "grad_norm": 0.25558245182037354, + "learning_rate": 1.5876291712396353e-05, + "loss": 1.2583, + "step": 10773 + }, + { + "epoch": 3.208994955229993, + "grad_norm": 0.2670227587223053, + "learning_rate": 1.587551122512004e-05, + "loss": 1.221, + "step": 10774 + }, + { + "epoch": 3.209292801429662, + "grad_norm": 0.24412156641483307, + "learning_rate": 1.587473068317866e-05, + "loss": 1.2382, + "step": 10775 + }, + { + "epoch": 3.2095906476293306, + "grad_norm": 0.24204950034618378, + "learning_rate": 1.5873950086579473e-05, + "loss": 1.2446, + "step": 10776 + }, + { + "epoch": 3.209888493828999, + "grad_norm": 0.2510155737400055, + "learning_rate": 1.5873169435329745e-05, + "loss": 1.2272, + "step": 10777 + }, + { + "epoch": 3.2101863400286677, + "grad_norm": 0.2400822639465332, + "learning_rate": 1.5872388729436735e-05, + "loss": 1.2434, + "step": 10778 + }, + { + "epoch": 3.2104841862283364, + "grad_norm": 0.2394513189792633, + "learning_rate": 1.5871607968907712e-05, + "loss": 1.2273, + "step": 10779 + }, + { + "epoch": 3.210782032428005, + "grad_norm": 0.2357054054737091, + "learning_rate": 1.5870827153749932e-05, + "loss": 1.226, + "step": 10780 + }, + { + "epoch": 3.2110798786276735, + "grad_norm": 0.23843006789684296, + "learning_rate": 1.5870046283970667e-05, + "loss": 1.2487, + "step": 10781 + }, + { + "epoch": 3.2113777248273423, + "grad_norm": 0.23922370374202728, + "learning_rate": 1.5869265359577184e-05, + "loss": 1.225, + "step": 10782 + }, + { + "epoch": 3.211675571027011, + "grad_norm": 0.23820941150188446, + "learning_rate": 1.586848438057674e-05, + "loss": 1.2506, + "step": 10783 + }, + { + "epoch": 3.2119734172266794, + "grad_norm": 0.23231257498264313, + "learning_rate": 1.5867703346976607e-05, + "loss": 1.2477, + "step": 10784 + }, + { + "epoch": 3.212271263426348, + "grad_norm": 0.2385302484035492, + "learning_rate": 1.5866922258784048e-05, + "loss": 1.2381, + "step": 10785 + }, + { + "epoch": 3.212569109626017, + "grad_norm": 0.28267809748649597, + "learning_rate": 1.586614111600633e-05, + "loss": 1.2455, + "step": 10786 + }, + { + "epoch": 3.2128669558256857, + "grad_norm": 0.30639517307281494, + "learning_rate": 1.5865359918650728e-05, + "loss": 1.2284, + "step": 10787 + }, + { + "epoch": 3.213164802025354, + "grad_norm": 0.2467852681875229, + "learning_rate": 1.5864578666724505e-05, + "loss": 1.2389, + "step": 10788 + }, + { + "epoch": 3.213462648225023, + "grad_norm": 0.2759745121002197, + "learning_rate": 1.586379736023493e-05, + "loss": 1.2245, + "step": 10789 + }, + { + "epoch": 3.2137604944246916, + "grad_norm": 0.2421119511127472, + "learning_rate": 1.586301599918927e-05, + "loss": 1.2296, + "step": 10790 + }, + { + "epoch": 3.21405834062436, + "grad_norm": 0.24279387295246124, + "learning_rate": 1.58622345835948e-05, + "loss": 1.2397, + "step": 10791 + }, + { + "epoch": 3.2143561868240287, + "grad_norm": 0.2437242716550827, + "learning_rate": 1.586145311345878e-05, + "loss": 1.241, + "step": 10792 + }, + { + "epoch": 3.2146540330236975, + "grad_norm": 0.35955995321273804, + "learning_rate": 1.586067158878849e-05, + "loss": 1.2486, + "step": 10793 + }, + { + "epoch": 3.2149518792233662, + "grad_norm": 0.35823380947113037, + "learning_rate": 1.5859890009591204e-05, + "loss": 1.2346, + "step": 10794 + }, + { + "epoch": 3.2152497254230346, + "grad_norm": 0.2452254593372345, + "learning_rate": 1.5859108375874184e-05, + "loss": 1.2518, + "step": 10795 + }, + { + "epoch": 3.2155475716227033, + "grad_norm": 0.533737301826477, + "learning_rate": 1.585832668764471e-05, + "loss": 1.2364, + "step": 10796 + }, + { + "epoch": 3.215845417822372, + "grad_norm": 0.36993083357810974, + "learning_rate": 1.585754494491005e-05, + "loss": 1.2328, + "step": 10797 + }, + { + "epoch": 3.2161432640220404, + "grad_norm": 0.3465012311935425, + "learning_rate": 1.5856763147677476e-05, + "loss": 1.2261, + "step": 10798 + }, + { + "epoch": 3.216441110221709, + "grad_norm": 0.24372775852680206, + "learning_rate": 1.5855981295954267e-05, + "loss": 1.2389, + "step": 10799 + }, + { + "epoch": 3.216738956421378, + "grad_norm": 0.49915915727615356, + "learning_rate": 1.5855199389747693e-05, + "loss": 1.2295, + "step": 10800 + }, + { + "epoch": 3.2170368026210467, + "grad_norm": 0.272409588098526, + "learning_rate": 1.5854417429065032e-05, + "loss": 1.2399, + "step": 10801 + }, + { + "epoch": 3.217334648820715, + "grad_norm": 0.27181148529052734, + "learning_rate": 1.585363541391356e-05, + "loss": 1.2167, + "step": 10802 + }, + { + "epoch": 3.217632495020384, + "grad_norm": 0.2770851254463196, + "learning_rate": 1.5852853344300546e-05, + "loss": 1.2477, + "step": 10803 + }, + { + "epoch": 3.2179303412200526, + "grad_norm": 0.23387804627418518, + "learning_rate": 1.585207122023327e-05, + "loss": 1.2409, + "step": 10804 + }, + { + "epoch": 3.218228187419721, + "grad_norm": 0.37050560116767883, + "learning_rate": 1.5851289041719017e-05, + "loss": 1.2347, + "step": 10805 + }, + { + "epoch": 3.2185260336193897, + "grad_norm": 0.24400877952575684, + "learning_rate": 1.585050680876505e-05, + "loss": 1.2295, + "step": 10806 + }, + { + "epoch": 3.2188238798190585, + "grad_norm": 0.2682911455631256, + "learning_rate": 1.5849724521378656e-05, + "loss": 1.2426, + "step": 10807 + }, + { + "epoch": 3.2191217260187273, + "grad_norm": 0.26314225792884827, + "learning_rate": 1.5848942179567114e-05, + "loss": 1.2317, + "step": 10808 + }, + { + "epoch": 3.2194195722183956, + "grad_norm": 0.24134303629398346, + "learning_rate": 1.5848159783337692e-05, + "loss": 1.2355, + "step": 10809 + }, + { + "epoch": 3.2197174184180644, + "grad_norm": 0.2847636044025421, + "learning_rate": 1.5847377332697683e-05, + "loss": 1.2257, + "step": 10810 + }, + { + "epoch": 3.220015264617733, + "grad_norm": 0.24767592549324036, + "learning_rate": 1.584659482765436e-05, + "loss": 1.2428, + "step": 10811 + }, + { + "epoch": 3.2203131108174015, + "grad_norm": 0.2538882791996002, + "learning_rate": 1.5845812268215005e-05, + "loss": 1.239, + "step": 10812 + }, + { + "epoch": 3.2206109570170702, + "grad_norm": 0.2436237782239914, + "learning_rate": 1.5845029654386895e-05, + "loss": 1.2261, + "step": 10813 + }, + { + "epoch": 3.220908803216739, + "grad_norm": 0.24263125658035278, + "learning_rate": 1.5844246986177322e-05, + "loss": 1.2439, + "step": 10814 + }, + { + "epoch": 3.2212066494164078, + "grad_norm": 0.2770474851131439, + "learning_rate": 1.5843464263593553e-05, + "loss": 1.2418, + "step": 10815 + }, + { + "epoch": 3.221504495616076, + "grad_norm": 0.258004367351532, + "learning_rate": 1.584268148664288e-05, + "loss": 1.2383, + "step": 10816 + }, + { + "epoch": 3.221802341815745, + "grad_norm": 0.23440416157245636, + "learning_rate": 1.5841898655332582e-05, + "loss": 1.2377, + "step": 10817 + }, + { + "epoch": 3.2221001880154136, + "grad_norm": 0.24034483730793, + "learning_rate": 1.584111576966995e-05, + "loss": 1.2303, + "step": 10818 + }, + { + "epoch": 3.2223980342150824, + "grad_norm": 0.24852849543094635, + "learning_rate": 1.5840332829662255e-05, + "loss": 1.2275, + "step": 10819 + }, + { + "epoch": 3.2226958804147507, + "grad_norm": 0.2526701092720032, + "learning_rate": 1.583954983531679e-05, + "loss": 1.2381, + "step": 10820 + }, + { + "epoch": 3.2229937266144195, + "grad_norm": 0.25460922718048096, + "learning_rate": 1.5838766786640842e-05, + "loss": 1.2287, + "step": 10821 + }, + { + "epoch": 3.2232915728140883, + "grad_norm": 0.25999483466148376, + "learning_rate": 1.5837983683641688e-05, + "loss": 1.223, + "step": 10822 + }, + { + "epoch": 3.2235894190137566, + "grad_norm": 0.23224976658821106, + "learning_rate": 1.583720052632662e-05, + "loss": 1.2314, + "step": 10823 + }, + { + "epoch": 3.2238872652134254, + "grad_norm": 0.270042359828949, + "learning_rate": 1.583641731470292e-05, + "loss": 1.2612, + "step": 10824 + }, + { + "epoch": 3.224185111413094, + "grad_norm": 0.23013587296009064, + "learning_rate": 1.583563404877788e-05, + "loss": 1.2248, + "step": 10825 + }, + { + "epoch": 3.2244829576127625, + "grad_norm": 0.26332661509513855, + "learning_rate": 1.5834850728558787e-05, + "loss": 1.2432, + "step": 10826 + }, + { + "epoch": 3.2247808038124313, + "grad_norm": 0.28763964772224426, + "learning_rate": 1.5834067354052926e-05, + "loss": 1.243, + "step": 10827 + }, + { + "epoch": 3.2250786500121, + "grad_norm": 0.23542463779449463, + "learning_rate": 1.5833283925267587e-05, + "loss": 1.2295, + "step": 10828 + }, + { + "epoch": 3.225376496211769, + "grad_norm": 0.3327104151248932, + "learning_rate": 1.583250044221006e-05, + "loss": 1.2452, + "step": 10829 + }, + { + "epoch": 3.225674342411437, + "grad_norm": 0.3312518000602722, + "learning_rate": 1.5831716904887625e-05, + "loss": 1.2219, + "step": 10830 + }, + { + "epoch": 3.225972188611106, + "grad_norm": 0.24583038687705994, + "learning_rate": 1.5830933313307585e-05, + "loss": 1.2252, + "step": 10831 + }, + { + "epoch": 3.2262700348107747, + "grad_norm": 0.37174612283706665, + "learning_rate": 1.583014966747723e-05, + "loss": 1.2595, + "step": 10832 + }, + { + "epoch": 3.2265678810104435, + "grad_norm": 0.2938191592693329, + "learning_rate": 1.582936596740384e-05, + "loss": 1.2381, + "step": 10833 + }, + { + "epoch": 3.226865727210112, + "grad_norm": 0.257517546415329, + "learning_rate": 1.5828582213094713e-05, + "loss": 1.2322, + "step": 10834 + }, + { + "epoch": 3.2271635734097806, + "grad_norm": 0.25996196269989014, + "learning_rate": 1.582779840455714e-05, + "loss": 1.2467, + "step": 10835 + }, + { + "epoch": 3.2274614196094493, + "grad_norm": 0.23953726887702942, + "learning_rate": 1.5827014541798415e-05, + "loss": 1.2305, + "step": 10836 + }, + { + "epoch": 3.2277592658091177, + "grad_norm": 0.23734961450099945, + "learning_rate": 1.582623062482583e-05, + "loss": 1.2248, + "step": 10837 + }, + { + "epoch": 3.2280571120087864, + "grad_norm": 0.26539120078086853, + "learning_rate": 1.582544665364668e-05, + "loss": 1.2496, + "step": 10838 + }, + { + "epoch": 3.228354958208455, + "grad_norm": 0.2751491367816925, + "learning_rate": 1.5824662628268258e-05, + "loss": 1.247, + "step": 10839 + }, + { + "epoch": 3.2286528044081235, + "grad_norm": 0.24858997762203217, + "learning_rate": 1.5823878548697856e-05, + "loss": 1.2419, + "step": 10840 + }, + { + "epoch": 3.2289506506077923, + "grad_norm": 0.28213387727737427, + "learning_rate": 1.5823094414942774e-05, + "loss": 1.2231, + "step": 10841 + }, + { + "epoch": 3.229248496807461, + "grad_norm": 0.23853297531604767, + "learning_rate": 1.58223102270103e-05, + "loss": 1.2393, + "step": 10842 + }, + { + "epoch": 3.22954634300713, + "grad_norm": 0.25614315271377563, + "learning_rate": 1.582152598490774e-05, + "loss": 1.2393, + "step": 10843 + }, + { + "epoch": 3.229844189206798, + "grad_norm": 0.2433617264032364, + "learning_rate": 1.582074168864238e-05, + "loss": 1.2434, + "step": 10844 + }, + { + "epoch": 3.230142035406467, + "grad_norm": 0.2417619228363037, + "learning_rate": 1.581995733822152e-05, + "loss": 1.2409, + "step": 10845 + }, + { + "epoch": 3.2304398816061357, + "grad_norm": 0.2536410689353943, + "learning_rate": 1.5819172933652464e-05, + "loss": 1.2197, + "step": 10846 + }, + { + "epoch": 3.2307377278058045, + "grad_norm": 0.23557448387145996, + "learning_rate": 1.58183884749425e-05, + "loss": 1.2417, + "step": 10847 + }, + { + "epoch": 3.231035574005473, + "grad_norm": 0.2389982044696808, + "learning_rate": 1.5817603962098938e-05, + "loss": 1.2517, + "step": 10848 + }, + { + "epoch": 3.2313334202051416, + "grad_norm": 0.26373356580734253, + "learning_rate": 1.5816819395129072e-05, + "loss": 1.2488, + "step": 10849 + }, + { + "epoch": 3.2316312664048104, + "grad_norm": 0.2483980804681778, + "learning_rate": 1.5816034774040193e-05, + "loss": 1.2431, + "step": 10850 + }, + { + "epoch": 3.2319291126044787, + "grad_norm": 0.237098827958107, + "learning_rate": 1.5815250098839615e-05, + "loss": 1.2162, + "step": 10851 + }, + { + "epoch": 3.2322269588041475, + "grad_norm": 0.2552669048309326, + "learning_rate": 1.581446536953463e-05, + "loss": 1.2449, + "step": 10852 + }, + { + "epoch": 3.2325248050038162, + "grad_norm": 0.24945805966854095, + "learning_rate": 1.581368058613254e-05, + "loss": 1.2326, + "step": 10853 + }, + { + "epoch": 3.232822651203485, + "grad_norm": 0.2533094882965088, + "learning_rate": 1.581289574864065e-05, + "loss": 1.2341, + "step": 10854 + }, + { + "epoch": 3.2331204974031533, + "grad_norm": 0.2475655972957611, + "learning_rate": 1.5812110857066257e-05, + "loss": 1.241, + "step": 10855 + }, + { + "epoch": 3.233418343602822, + "grad_norm": 0.25987479090690613, + "learning_rate": 1.5811325911416668e-05, + "loss": 1.2436, + "step": 10856 + }, + { + "epoch": 3.233716189802491, + "grad_norm": 0.3800262212753296, + "learning_rate": 1.5810540911699183e-05, + "loss": 1.2387, + "step": 10857 + }, + { + "epoch": 3.234014036002159, + "grad_norm": 0.34478265047073364, + "learning_rate": 1.5809755857921104e-05, + "loss": 1.245, + "step": 10858 + }, + { + "epoch": 3.234311882201828, + "grad_norm": 0.2621350884437561, + "learning_rate": 1.5808970750089744e-05, + "loss": 1.2449, + "step": 10859 + }, + { + "epoch": 3.2346097284014967, + "grad_norm": 0.7660012245178223, + "learning_rate": 1.5808185588212396e-05, + "loss": 1.2498, + "step": 10860 + }, + { + "epoch": 3.2349075746011655, + "grad_norm": 0.26017045974731445, + "learning_rate": 1.580740037229637e-05, + "loss": 1.2325, + "step": 10861 + }, + { + "epoch": 3.235205420800834, + "grad_norm": 0.2521427869796753, + "learning_rate": 1.5806615102348976e-05, + "loss": 1.2429, + "step": 10862 + }, + { + "epoch": 3.2355032670005026, + "grad_norm": 0.2565779983997345, + "learning_rate": 1.5805829778377518e-05, + "loss": 1.2556, + "step": 10863 + }, + { + "epoch": 3.2358011132001714, + "grad_norm": 0.23941807448863983, + "learning_rate": 1.5805044400389295e-05, + "loss": 1.2361, + "step": 10864 + }, + { + "epoch": 3.2360989593998397, + "grad_norm": 0.24612638354301453, + "learning_rate": 1.5804258968391622e-05, + "loss": 1.2244, + "step": 10865 + }, + { + "epoch": 3.2363968055995085, + "grad_norm": 0.24015018343925476, + "learning_rate": 1.5803473482391804e-05, + "loss": 1.2153, + "step": 10866 + }, + { + "epoch": 3.2366946517991773, + "grad_norm": 0.2549809515476227, + "learning_rate": 1.580268794239715e-05, + "loss": 1.2317, + "step": 10867 + }, + { + "epoch": 3.236992497998846, + "grad_norm": 0.24323932826519012, + "learning_rate": 1.5801902348414966e-05, + "loss": 1.2456, + "step": 10868 + }, + { + "epoch": 3.2372903441985144, + "grad_norm": 0.23269499838352203, + "learning_rate": 1.5801116700452565e-05, + "loss": 1.2372, + "step": 10869 + }, + { + "epoch": 3.237588190398183, + "grad_norm": 0.24246612191200256, + "learning_rate": 1.5800330998517252e-05, + "loss": 1.2365, + "step": 10870 + }, + { + "epoch": 3.237886036597852, + "grad_norm": 0.24057693779468536, + "learning_rate": 1.579954524261634e-05, + "loss": 1.234, + "step": 10871 + }, + { + "epoch": 3.2381838827975202, + "grad_norm": 0.23094598948955536, + "learning_rate": 1.579875943275714e-05, + "loss": 1.2459, + "step": 10872 + }, + { + "epoch": 3.238481728997189, + "grad_norm": 0.23706576228141785, + "learning_rate": 1.5797973568946965e-05, + "loss": 1.2446, + "step": 10873 + }, + { + "epoch": 3.2387795751968578, + "grad_norm": 0.23647253215312958, + "learning_rate": 1.579718765119312e-05, + "loss": 1.2426, + "step": 10874 + }, + { + "epoch": 3.2390774213965265, + "grad_norm": 0.23313447833061218, + "learning_rate": 1.579640167950292e-05, + "loss": 1.2452, + "step": 10875 + }, + { + "epoch": 3.239375267596195, + "grad_norm": 0.23108598589897156, + "learning_rate": 1.579561565388368e-05, + "loss": 1.2196, + "step": 10876 + }, + { + "epoch": 3.2396731137958636, + "grad_norm": 0.23014722764492035, + "learning_rate": 1.5794829574342717e-05, + "loss": 1.2129, + "step": 10877 + }, + { + "epoch": 3.2399709599955324, + "grad_norm": 0.23379579186439514, + "learning_rate": 1.5794043440887333e-05, + "loss": 1.2343, + "step": 10878 + }, + { + "epoch": 3.2402688061952007, + "grad_norm": 0.24715496599674225, + "learning_rate": 1.5793257253524848e-05, + "loss": 1.2487, + "step": 10879 + }, + { + "epoch": 3.2405666523948695, + "grad_norm": 0.24083739519119263, + "learning_rate": 1.579247101226258e-05, + "loss": 1.2565, + "step": 10880 + }, + { + "epoch": 3.2408644985945383, + "grad_norm": 0.23865579068660736, + "learning_rate": 1.579168471710784e-05, + "loss": 1.2424, + "step": 10881 + }, + { + "epoch": 3.241162344794207, + "grad_norm": 0.24451987445354462, + "learning_rate": 1.5790898368067945e-05, + "loss": 1.2374, + "step": 10882 + }, + { + "epoch": 3.2414601909938754, + "grad_norm": 0.24056999385356903, + "learning_rate": 1.5790111965150208e-05, + "loss": 1.2217, + "step": 10883 + }, + { + "epoch": 3.241758037193544, + "grad_norm": 0.2411208599805832, + "learning_rate": 1.578932550836195e-05, + "loss": 1.2213, + "step": 10884 + }, + { + "epoch": 3.242055883393213, + "grad_norm": 0.24081043899059296, + "learning_rate": 1.5788538997710487e-05, + "loss": 1.2354, + "step": 10885 + }, + { + "epoch": 3.2423537295928817, + "grad_norm": 0.238377645611763, + "learning_rate": 1.5787752433203136e-05, + "loss": 1.2285, + "step": 10886 + }, + { + "epoch": 3.24265157579255, + "grad_norm": 0.23264189064502716, + "learning_rate": 1.5786965814847214e-05, + "loss": 1.2424, + "step": 10887 + }, + { + "epoch": 3.242949421992219, + "grad_norm": 0.23121654987335205, + "learning_rate": 1.578617914265004e-05, + "loss": 1.2161, + "step": 10888 + }, + { + "epoch": 3.2432472681918876, + "grad_norm": 0.2341800332069397, + "learning_rate": 1.5785392416618933e-05, + "loss": 1.2286, + "step": 10889 + }, + { + "epoch": 3.243545114391556, + "grad_norm": 0.2328624576330185, + "learning_rate": 1.5784605636761218e-05, + "loss": 1.2207, + "step": 10890 + }, + { + "epoch": 3.2438429605912247, + "grad_norm": 0.24926938116550446, + "learning_rate": 1.5783818803084208e-05, + "loss": 1.2162, + "step": 10891 + }, + { + "epoch": 3.2441408067908934, + "grad_norm": 0.23768393695354462, + "learning_rate": 1.5783031915595222e-05, + "loss": 1.2464, + "step": 10892 + }, + { + "epoch": 3.2444386529905618, + "grad_norm": 0.2344312220811844, + "learning_rate": 1.578224497430159e-05, + "loss": 1.2441, + "step": 10893 + }, + { + "epoch": 3.2447364991902305, + "grad_norm": 0.2264043241739273, + "learning_rate": 1.578145797921063e-05, + "loss": 1.2413, + "step": 10894 + }, + { + "epoch": 3.2450343453898993, + "grad_norm": 0.23375320434570312, + "learning_rate": 1.5780670930329656e-05, + "loss": 1.2417, + "step": 10895 + }, + { + "epoch": 3.245332191589568, + "grad_norm": 0.23985905945301056, + "learning_rate": 1.5779883827666004e-05, + "loss": 1.24, + "step": 10896 + }, + { + "epoch": 3.2456300377892364, + "grad_norm": 0.23048816621303558, + "learning_rate": 1.577909667122699e-05, + "loss": 1.23, + "step": 10897 + }, + { + "epoch": 3.245927883988905, + "grad_norm": 0.23868009448051453, + "learning_rate": 1.5778309461019937e-05, + "loss": 1.2265, + "step": 10898 + }, + { + "epoch": 3.246225730188574, + "grad_norm": 0.2428135722875595, + "learning_rate": 1.5777522197052172e-05, + "loss": 1.2438, + "step": 10899 + }, + { + "epoch": 3.2465235763882427, + "grad_norm": 0.24055194854736328, + "learning_rate": 1.5776734879331015e-05, + "loss": 1.2302, + "step": 10900 + }, + { + "epoch": 3.246821422587911, + "grad_norm": 0.2537178099155426, + "learning_rate": 1.5775947507863795e-05, + "loss": 1.2404, + "step": 10901 + }, + { + "epoch": 3.24711926878758, + "grad_norm": 0.2449008971452713, + "learning_rate": 1.577516008265784e-05, + "loss": 1.235, + "step": 10902 + }, + { + "epoch": 3.2474171149872486, + "grad_norm": 0.23761901259422302, + "learning_rate": 1.577437260372047e-05, + "loss": 1.2363, + "step": 10903 + }, + { + "epoch": 3.247714961186917, + "grad_norm": 0.24395373463630676, + "learning_rate": 1.5773585071059013e-05, + "loss": 1.239, + "step": 10904 + }, + { + "epoch": 3.2480128073865857, + "grad_norm": 0.24250420928001404, + "learning_rate": 1.5772797484680798e-05, + "loss": 1.2315, + "step": 10905 + }, + { + "epoch": 3.2483106535862545, + "grad_norm": 0.23782804608345032, + "learning_rate": 1.5772009844593156e-05, + "loss": 1.2471, + "step": 10906 + }, + { + "epoch": 3.248608499785923, + "grad_norm": 0.2457427680492401, + "learning_rate": 1.5771222150803407e-05, + "loss": 1.2249, + "step": 10907 + }, + { + "epoch": 3.2489063459855916, + "grad_norm": 0.23378737270832062, + "learning_rate": 1.5770434403318885e-05, + "loss": 1.2415, + "step": 10908 + }, + { + "epoch": 3.2492041921852604, + "grad_norm": 0.23457074165344238, + "learning_rate": 1.5769646602146918e-05, + "loss": 1.2458, + "step": 10909 + }, + { + "epoch": 3.249502038384929, + "grad_norm": 0.24822695553302765, + "learning_rate": 1.5768858747294837e-05, + "loss": 1.2439, + "step": 10910 + }, + { + "epoch": 3.2497998845845975, + "grad_norm": 0.23484498262405396, + "learning_rate": 1.5768070838769972e-05, + "loss": 1.2398, + "step": 10911 + }, + { + "epoch": 3.2500977307842662, + "grad_norm": 0.24210688471794128, + "learning_rate": 1.5767282876579647e-05, + "loss": 1.2429, + "step": 10912 + }, + { + "epoch": 3.250395576983935, + "grad_norm": 0.22929832339286804, + "learning_rate": 1.57664948607312e-05, + "loss": 1.2198, + "step": 10913 + }, + { + "epoch": 3.2506934231836038, + "grad_norm": 0.24054791033267975, + "learning_rate": 1.5765706791231965e-05, + "loss": 1.2377, + "step": 10914 + }, + { + "epoch": 3.250991269383272, + "grad_norm": 0.23274177312850952, + "learning_rate": 1.5764918668089266e-05, + "loss": 1.2368, + "step": 10915 + }, + { + "epoch": 3.251289115582941, + "grad_norm": 0.24070748686790466, + "learning_rate": 1.5764130491310442e-05, + "loss": 1.2292, + "step": 10916 + }, + { + "epoch": 3.2515869617826096, + "grad_norm": 0.23932212591171265, + "learning_rate": 1.5763342260902824e-05, + "loss": 1.2513, + "step": 10917 + }, + { + "epoch": 3.251884807982278, + "grad_norm": 0.24381308257579803, + "learning_rate": 1.5762553976873745e-05, + "loss": 1.2236, + "step": 10918 + }, + { + "epoch": 3.2521826541819467, + "grad_norm": 0.23351305723190308, + "learning_rate": 1.5761765639230537e-05, + "loss": 1.2491, + "step": 10919 + }, + { + "epoch": 3.2524805003816155, + "grad_norm": 0.25579774379730225, + "learning_rate": 1.576097724798054e-05, + "loss": 1.2563, + "step": 10920 + }, + { + "epoch": 3.252778346581284, + "grad_norm": 0.23283495008945465, + "learning_rate": 1.5760188803131086e-05, + "loss": 1.226, + "step": 10921 + }, + { + "epoch": 3.2530761927809526, + "grad_norm": 0.23861253261566162, + "learning_rate": 1.575940030468951e-05, + "loss": 1.2386, + "step": 10922 + }, + { + "epoch": 3.2533740389806214, + "grad_norm": 0.25366294384002686, + "learning_rate": 1.575861175266315e-05, + "loss": 1.244, + "step": 10923 + }, + { + "epoch": 3.25367188518029, + "grad_norm": 0.2527261972427368, + "learning_rate": 1.5757823147059343e-05, + "loss": 1.2304, + "step": 10924 + }, + { + "epoch": 3.2539697313799585, + "grad_norm": 0.2507016360759735, + "learning_rate": 1.575703448788542e-05, + "loss": 1.2283, + "step": 10925 + }, + { + "epoch": 3.2542675775796273, + "grad_norm": 0.2321532815694809, + "learning_rate": 1.5756245775148723e-05, + "loss": 1.2341, + "step": 10926 + }, + { + "epoch": 3.254565423779296, + "grad_norm": 0.24228094518184662, + "learning_rate": 1.5755457008856598e-05, + "loss": 1.2465, + "step": 10927 + }, + { + "epoch": 3.254863269978965, + "grad_norm": 0.24019403755664825, + "learning_rate": 1.575466818901637e-05, + "loss": 1.248, + "step": 10928 + }, + { + "epoch": 3.255161116178633, + "grad_norm": 0.2458336353302002, + "learning_rate": 1.5753879315635384e-05, + "loss": 1.254, + "step": 10929 + }, + { + "epoch": 3.255458962378302, + "grad_norm": 0.2647082507610321, + "learning_rate": 1.575309038872098e-05, + "loss": 1.2229, + "step": 10930 + }, + { + "epoch": 3.2557568085779707, + "grad_norm": 0.24335896968841553, + "learning_rate": 1.57523014082805e-05, + "loss": 1.2437, + "step": 10931 + }, + { + "epoch": 3.256054654777639, + "grad_norm": 0.24166379868984222, + "learning_rate": 1.5751512374321277e-05, + "loss": 1.2387, + "step": 10932 + }, + { + "epoch": 3.2563525009773078, + "grad_norm": 0.27155637741088867, + "learning_rate": 1.5750723286850663e-05, + "loss": 1.2316, + "step": 10933 + }, + { + "epoch": 3.2566503471769765, + "grad_norm": 0.24128098785877228, + "learning_rate": 1.574993414587599e-05, + "loss": 1.2489, + "step": 10934 + }, + { + "epoch": 3.2569481933766453, + "grad_norm": 0.31059321761131287, + "learning_rate": 1.5749144951404606e-05, + "loss": 1.2562, + "step": 10935 + }, + { + "epoch": 3.2572460395763136, + "grad_norm": 0.24159196019172668, + "learning_rate": 1.574835570344385e-05, + "loss": 1.2447, + "step": 10936 + }, + { + "epoch": 3.2575438857759824, + "grad_norm": 0.25864771008491516, + "learning_rate": 1.574756640200107e-05, + "loss": 1.2353, + "step": 10937 + }, + { + "epoch": 3.257841731975651, + "grad_norm": 0.2590962052345276, + "learning_rate": 1.5746777047083607e-05, + "loss": 1.229, + "step": 10938 + }, + { + "epoch": 3.25813957817532, + "grad_norm": 0.23139087855815887, + "learning_rate": 1.57459876386988e-05, + "loss": 1.234, + "step": 10939 + }, + { + "epoch": 3.2584374243749883, + "grad_norm": 0.23297728598117828, + "learning_rate": 1.5745198176854e-05, + "loss": 1.2444, + "step": 10940 + }, + { + "epoch": 3.258735270574657, + "grad_norm": 0.2693626880645752, + "learning_rate": 1.5744408661556547e-05, + "loss": 1.2328, + "step": 10941 + }, + { + "epoch": 3.259033116774326, + "grad_norm": 0.25418147444725037, + "learning_rate": 1.5743619092813793e-05, + "loss": 1.2418, + "step": 10942 + }, + { + "epoch": 3.259330962973994, + "grad_norm": 0.2374105155467987, + "learning_rate": 1.5742829470633075e-05, + "loss": 1.2263, + "step": 10943 + }, + { + "epoch": 3.259628809173663, + "grad_norm": 0.24964316189289093, + "learning_rate": 1.5742039795021752e-05, + "loss": 1.2382, + "step": 10944 + }, + { + "epoch": 3.2599266553733317, + "grad_norm": 0.24822159111499786, + "learning_rate": 1.5741250065987158e-05, + "loss": 1.2376, + "step": 10945 + }, + { + "epoch": 3.260224501573, + "grad_norm": 0.2457820326089859, + "learning_rate": 1.5740460283536652e-05, + "loss": 1.2223, + "step": 10946 + }, + { + "epoch": 3.260522347772669, + "grad_norm": 0.24911081790924072, + "learning_rate": 1.5739670447677572e-05, + "loss": 1.245, + "step": 10947 + }, + { + "epoch": 3.2608201939723376, + "grad_norm": 0.23777316510677338, + "learning_rate": 1.5738880558417277e-05, + "loss": 1.2512, + "step": 10948 + }, + { + "epoch": 3.2611180401720063, + "grad_norm": 0.24039356410503387, + "learning_rate": 1.5738090615763107e-05, + "loss": 1.2284, + "step": 10949 + }, + { + "epoch": 3.2614158863716747, + "grad_norm": 0.2394830584526062, + "learning_rate": 1.5737300619722412e-05, + "loss": 1.2471, + "step": 10950 + }, + { + "epoch": 3.2617137325713434, + "grad_norm": 0.2711692452430725, + "learning_rate": 1.573651057030255e-05, + "loss": 1.234, + "step": 10951 + }, + { + "epoch": 3.262011578771012, + "grad_norm": 0.23467116057872772, + "learning_rate": 1.573572046751086e-05, + "loss": 1.2339, + "step": 10952 + }, + { + "epoch": 3.262309424970681, + "grad_norm": 0.2356289029121399, + "learning_rate": 1.5734930311354705e-05, + "loss": 1.2448, + "step": 10953 + }, + { + "epoch": 3.2626072711703493, + "grad_norm": 0.2496243566274643, + "learning_rate": 1.573414010184143e-05, + "loss": 1.2604, + "step": 10954 + }, + { + "epoch": 3.262905117370018, + "grad_norm": 0.23672722280025482, + "learning_rate": 1.573334983897839e-05, + "loss": 1.2503, + "step": 10955 + }, + { + "epoch": 3.263202963569687, + "grad_norm": 0.2826528251171112, + "learning_rate": 1.5732559522772926e-05, + "loss": 1.2452, + "step": 10956 + }, + { + "epoch": 3.263500809769355, + "grad_norm": 0.23860949277877808, + "learning_rate": 1.573176915323241e-05, + "loss": 1.2421, + "step": 10957 + }, + { + "epoch": 3.263798655969024, + "grad_norm": 0.2497885525226593, + "learning_rate": 1.5730978730364183e-05, + "loss": 1.2378, + "step": 10958 + }, + { + "epoch": 3.2640965021686927, + "grad_norm": 0.24569043517112732, + "learning_rate": 1.57301882541756e-05, + "loss": 1.2436, + "step": 10959 + }, + { + "epoch": 3.264394348368361, + "grad_norm": 0.23629222810268402, + "learning_rate": 1.572939772467402e-05, + "loss": 1.2269, + "step": 10960 + }, + { + "epoch": 3.26469219456803, + "grad_norm": 0.31353241205215454, + "learning_rate": 1.5728607141866797e-05, + "loss": 1.238, + "step": 10961 + }, + { + "epoch": 3.2649900407676986, + "grad_norm": 0.2657424211502075, + "learning_rate": 1.572781650576128e-05, + "loss": 1.2364, + "step": 10962 + }, + { + "epoch": 3.2652878869673674, + "grad_norm": 0.2878069281578064, + "learning_rate": 1.5727025816364834e-05, + "loss": 1.2491, + "step": 10963 + }, + { + "epoch": 3.2655857331670357, + "grad_norm": 0.24171844124794006, + "learning_rate": 1.5726235073684807e-05, + "loss": 1.2184, + "step": 10964 + }, + { + "epoch": 3.2658835793667045, + "grad_norm": 0.28105929493904114, + "learning_rate": 1.5725444277728565e-05, + "loss": 1.2454, + "step": 10965 + }, + { + "epoch": 3.2661814255663733, + "grad_norm": 0.2551528215408325, + "learning_rate": 1.572465342850346e-05, + "loss": 1.2454, + "step": 10966 + }, + { + "epoch": 3.266479271766042, + "grad_norm": 0.24841690063476562, + "learning_rate": 1.572386252601685e-05, + "loss": 1.2285, + "step": 10967 + }, + { + "epoch": 3.2667771179657104, + "grad_norm": 0.269426554441452, + "learning_rate": 1.5723071570276095e-05, + "loss": 1.2326, + "step": 10968 + }, + { + "epoch": 3.267074964165379, + "grad_norm": 0.23246163129806519, + "learning_rate": 1.572228056128855e-05, + "loss": 1.2284, + "step": 10969 + }, + { + "epoch": 3.267372810365048, + "grad_norm": 0.32379814982414246, + "learning_rate": 1.5721489499061582e-05, + "loss": 1.2327, + "step": 10970 + }, + { + "epoch": 3.2676706565647162, + "grad_norm": 0.25725942850112915, + "learning_rate": 1.572069838360254e-05, + "loss": 1.2391, + "step": 10971 + }, + { + "epoch": 3.267968502764385, + "grad_norm": 0.25456273555755615, + "learning_rate": 1.5719907214918802e-05, + "loss": 1.2305, + "step": 10972 + }, + { + "epoch": 3.2682663489640538, + "grad_norm": 0.24083547294139862, + "learning_rate": 1.571911599301771e-05, + "loss": 1.2487, + "step": 10973 + }, + { + "epoch": 3.268564195163722, + "grad_norm": 0.244639053940773, + "learning_rate": 1.571832471790663e-05, + "loss": 1.2312, + "step": 10974 + }, + { + "epoch": 3.268862041363391, + "grad_norm": 0.23552767932415009, + "learning_rate": 1.571753338959294e-05, + "loss": 1.2385, + "step": 10975 + }, + { + "epoch": 3.2691598875630596, + "grad_norm": 0.23945331573486328, + "learning_rate": 1.571674200808398e-05, + "loss": 1.237, + "step": 10976 + }, + { + "epoch": 3.2694577337627284, + "grad_norm": 0.23800155520439148, + "learning_rate": 1.5715950573387126e-05, + "loss": 1.2353, + "step": 10977 + }, + { + "epoch": 3.2697555799623967, + "grad_norm": 0.2597010135650635, + "learning_rate": 1.5715159085509734e-05, + "loss": 1.2584, + "step": 10978 + }, + { + "epoch": 3.2700534261620655, + "grad_norm": 0.24727323651313782, + "learning_rate": 1.5714367544459178e-05, + "loss": 1.2429, + "step": 10979 + }, + { + "epoch": 3.2703512723617343, + "grad_norm": 0.2544100880622864, + "learning_rate": 1.5713575950242814e-05, + "loss": 1.2546, + "step": 10980 + }, + { + "epoch": 3.270649118561403, + "grad_norm": 0.2520626485347748, + "learning_rate": 1.571278430286801e-05, + "loss": 1.2395, + "step": 10981 + }, + { + "epoch": 3.2709469647610714, + "grad_norm": 0.3480736017227173, + "learning_rate": 1.571199260234213e-05, + "loss": 1.2472, + "step": 10982 + }, + { + "epoch": 3.27124481096074, + "grad_norm": 0.266797810792923, + "learning_rate": 1.571120084867254e-05, + "loss": 1.2414, + "step": 10983 + }, + { + "epoch": 3.271542657160409, + "grad_norm": 0.26604586839675903, + "learning_rate": 1.571040904186661e-05, + "loss": 1.2224, + "step": 10984 + }, + { + "epoch": 3.2718405033600773, + "grad_norm": 0.26607516407966614, + "learning_rate": 1.57096171819317e-05, + "loss": 1.238, + "step": 10985 + }, + { + "epoch": 3.272138349559746, + "grad_norm": 0.236188605427742, + "learning_rate": 1.5708825268875182e-05, + "loss": 1.2227, + "step": 10986 + }, + { + "epoch": 3.272436195759415, + "grad_norm": 0.2337832897901535, + "learning_rate": 1.5708033302704425e-05, + "loss": 1.2269, + "step": 10987 + }, + { + "epoch": 3.272734041959083, + "grad_norm": 0.23533278703689575, + "learning_rate": 1.5707241283426792e-05, + "loss": 1.2442, + "step": 10988 + }, + { + "epoch": 3.273031888158752, + "grad_norm": 0.23811085522174835, + "learning_rate": 1.570644921104966e-05, + "loss": 1.2373, + "step": 10989 + }, + { + "epoch": 3.2733297343584207, + "grad_norm": 0.2450307458639145, + "learning_rate": 1.570565708558039e-05, + "loss": 1.2455, + "step": 10990 + }, + { + "epoch": 3.2736275805580894, + "grad_norm": 0.26522406935691833, + "learning_rate": 1.5704864907026357e-05, + "loss": 1.2318, + "step": 10991 + }, + { + "epoch": 3.2739254267577578, + "grad_norm": 0.25072145462036133, + "learning_rate": 1.5704072675394932e-05, + "loss": 1.2399, + "step": 10992 + }, + { + "epoch": 3.2742232729574265, + "grad_norm": 0.25735607743263245, + "learning_rate": 1.570328039069348e-05, + "loss": 1.2441, + "step": 10993 + }, + { + "epoch": 3.2745211191570953, + "grad_norm": 0.30827564001083374, + "learning_rate": 1.5702488052929376e-05, + "loss": 1.2461, + "step": 10994 + }, + { + "epoch": 3.274818965356764, + "grad_norm": 0.2914036214351654, + "learning_rate": 1.5701695662109994e-05, + "loss": 1.2289, + "step": 10995 + }, + { + "epoch": 3.2751168115564324, + "grad_norm": 0.2513253092765808, + "learning_rate": 1.5700903218242703e-05, + "loss": 1.2206, + "step": 10996 + }, + { + "epoch": 3.275414657756101, + "grad_norm": 0.42634081840515137, + "learning_rate": 1.5700110721334877e-05, + "loss": 1.2496, + "step": 10997 + }, + { + "epoch": 3.27571250395577, + "grad_norm": 0.31619492173194885, + "learning_rate": 1.569931817139389e-05, + "loss": 1.2364, + "step": 10998 + }, + { + "epoch": 3.2760103501554383, + "grad_norm": 0.2968463897705078, + "learning_rate": 1.5698525568427118e-05, + "loss": 1.233, + "step": 10999 + }, + { + "epoch": 3.276308196355107, + "grad_norm": 0.24828587472438812, + "learning_rate": 1.569773291244193e-05, + "loss": 1.2265, + "step": 11000 + }, + { + "epoch": 3.276308196355107, + "eval_loss": 1.3360927104949951, + "eval_runtime": 20.0927, + "eval_samples_per_second": 86.3, + "eval_steps_per_second": 5.425, + "step": 11000 + }, + { + "epoch": 3.276606042554776, + "grad_norm": 0.3992263674736023, + "learning_rate": 1.5696940203445704e-05, + "loss": 1.2192, + "step": 11001 + }, + { + "epoch": 3.2769038887544446, + "grad_norm": 0.2677016258239746, + "learning_rate": 1.5696147441445812e-05, + "loss": 1.2429, + "step": 11002 + }, + { + "epoch": 3.277201734954113, + "grad_norm": 0.28629764914512634, + "learning_rate": 1.5695354626449633e-05, + "loss": 1.2413, + "step": 11003 + }, + { + "epoch": 3.2774995811537817, + "grad_norm": 0.25016868114471436, + "learning_rate": 1.569456175846454e-05, + "loss": 1.2173, + "step": 11004 + }, + { + "epoch": 3.2777974273534505, + "grad_norm": 0.3337778151035309, + "learning_rate": 1.569376883749792e-05, + "loss": 1.2294, + "step": 11005 + }, + { + "epoch": 3.2780952735531192, + "grad_norm": 0.24379444122314453, + "learning_rate": 1.5692975863557136e-05, + "loss": 1.2319, + "step": 11006 + }, + { + "epoch": 3.2783931197527876, + "grad_norm": 0.24937567114830017, + "learning_rate": 1.5692182836649573e-05, + "loss": 1.2347, + "step": 11007 + }, + { + "epoch": 3.2786909659524563, + "grad_norm": 0.2599264979362488, + "learning_rate": 1.5691389756782607e-05, + "loss": 1.2317, + "step": 11008 + }, + { + "epoch": 3.278988812152125, + "grad_norm": 0.24208100140094757, + "learning_rate": 1.569059662396362e-05, + "loss": 1.2197, + "step": 11009 + }, + { + "epoch": 3.2792866583517934, + "grad_norm": 0.3027574419975281, + "learning_rate": 1.568980343819999e-05, + "loss": 1.245, + "step": 11010 + }, + { + "epoch": 3.279584504551462, + "grad_norm": 0.24216900765895844, + "learning_rate": 1.5689010199499094e-05, + "loss": 1.2268, + "step": 11011 + }, + { + "epoch": 3.279882350751131, + "grad_norm": 0.27850988507270813, + "learning_rate": 1.5688216907868318e-05, + "loss": 1.2227, + "step": 11012 + }, + { + "epoch": 3.2801801969507993, + "grad_norm": 0.24350112676620483, + "learning_rate": 1.5687423563315034e-05, + "loss": 1.233, + "step": 11013 + }, + { + "epoch": 3.280478043150468, + "grad_norm": 0.2829402983188629, + "learning_rate": 1.568663016584663e-05, + "loss": 1.2286, + "step": 11014 + }, + { + "epoch": 3.280775889350137, + "grad_norm": 0.2563793957233429, + "learning_rate": 1.5685836715470485e-05, + "loss": 1.2421, + "step": 11015 + }, + { + "epoch": 3.2810737355498056, + "grad_norm": 0.24953509867191315, + "learning_rate": 1.5685043212193985e-05, + "loss": 1.2567, + "step": 11016 + }, + { + "epoch": 3.281371581749474, + "grad_norm": 0.2540299892425537, + "learning_rate": 1.568424965602451e-05, + "loss": 1.2401, + "step": 11017 + }, + { + "epoch": 3.2816694279491427, + "grad_norm": 0.24344053864479065, + "learning_rate": 1.568345604696944e-05, + "loss": 1.2303, + "step": 11018 + }, + { + "epoch": 3.2819672741488115, + "grad_norm": 0.35500892996788025, + "learning_rate": 1.568266238503616e-05, + "loss": 1.2503, + "step": 11019 + }, + { + "epoch": 3.2822651203484803, + "grad_norm": 0.23837926983833313, + "learning_rate": 1.568186867023206e-05, + "loss": 1.2318, + "step": 11020 + }, + { + "epoch": 3.2825629665481486, + "grad_norm": 0.25181010365486145, + "learning_rate": 1.5681074902564516e-05, + "loss": 1.2267, + "step": 11021 + }, + { + "epoch": 3.2828608127478174, + "grad_norm": 0.25234049558639526, + "learning_rate": 1.568028108204092e-05, + "loss": 1.2371, + "step": 11022 + }, + { + "epoch": 3.283158658947486, + "grad_norm": 0.26462599635124207, + "learning_rate": 1.5679487208668653e-05, + "loss": 1.2413, + "step": 11023 + }, + { + "epoch": 3.2834565051471545, + "grad_norm": 0.286300390958786, + "learning_rate": 1.5678693282455103e-05, + "loss": 1.2372, + "step": 11024 + }, + { + "epoch": 3.2837543513468233, + "grad_norm": 0.282059907913208, + "learning_rate": 1.5677899303407657e-05, + "loss": 1.241, + "step": 11025 + }, + { + "epoch": 3.284052197546492, + "grad_norm": 0.274752140045166, + "learning_rate": 1.5677105271533704e-05, + "loss": 1.2253, + "step": 11026 + }, + { + "epoch": 3.2843500437461604, + "grad_norm": 0.27365416288375854, + "learning_rate": 1.5676311186840626e-05, + "loss": 1.2475, + "step": 11027 + }, + { + "epoch": 3.284647889945829, + "grad_norm": 0.2936675548553467, + "learning_rate": 1.567551704933582e-05, + "loss": 1.2123, + "step": 11028 + }, + { + "epoch": 3.284945736145498, + "grad_norm": 0.28637778759002686, + "learning_rate": 1.5674722859026664e-05, + "loss": 1.2391, + "step": 11029 + }, + { + "epoch": 3.2852435823451667, + "grad_norm": 0.2901567816734314, + "learning_rate": 1.5673928615920552e-05, + "loss": 1.2343, + "step": 11030 + }, + { + "epoch": 3.285541428544835, + "grad_norm": 0.26433366537094116, + "learning_rate": 1.5673134320024874e-05, + "loss": 1.2494, + "step": 11031 + }, + { + "epoch": 3.2858392747445038, + "grad_norm": 0.2561134099960327, + "learning_rate": 1.567233997134702e-05, + "loss": 1.2368, + "step": 11032 + }, + { + "epoch": 3.2861371209441725, + "grad_norm": 0.25221318006515503, + "learning_rate": 1.5671545569894382e-05, + "loss": 1.2269, + "step": 11033 + }, + { + "epoch": 3.2864349671438413, + "grad_norm": 0.2477511763572693, + "learning_rate": 1.5670751115674345e-05, + "loss": 1.2328, + "step": 11034 + }, + { + "epoch": 3.2867328133435096, + "grad_norm": 0.2927154004573822, + "learning_rate": 1.566995660869431e-05, + "loss": 1.2292, + "step": 11035 + }, + { + "epoch": 3.2870306595431784, + "grad_norm": 0.24017727375030518, + "learning_rate": 1.5669162048961654e-05, + "loss": 1.2336, + "step": 11036 + }, + { + "epoch": 3.287328505742847, + "grad_norm": 0.24952299892902374, + "learning_rate": 1.566836743648379e-05, + "loss": 1.2286, + "step": 11037 + }, + { + "epoch": 3.2876263519425155, + "grad_norm": 0.24183036386966705, + "learning_rate": 1.5667572771268092e-05, + "loss": 1.2458, + "step": 11038 + }, + { + "epoch": 3.2879241981421843, + "grad_norm": 0.25494590401649475, + "learning_rate": 1.5666778053321964e-05, + "loss": 1.2482, + "step": 11039 + }, + { + "epoch": 3.288222044341853, + "grad_norm": 0.2713030278682709, + "learning_rate": 1.5665983282652803e-05, + "loss": 1.2168, + "step": 11040 + }, + { + "epoch": 3.2885198905415214, + "grad_norm": 0.24221836030483246, + "learning_rate": 1.5665188459267994e-05, + "loss": 1.2398, + "step": 11041 + }, + { + "epoch": 3.28881773674119, + "grad_norm": 0.27836811542510986, + "learning_rate": 1.5664393583174933e-05, + "loss": 1.2482, + "step": 11042 + }, + { + "epoch": 3.289115582940859, + "grad_norm": 0.2528473436832428, + "learning_rate": 1.5663598654381024e-05, + "loss": 1.2275, + "step": 11043 + }, + { + "epoch": 3.2894134291405277, + "grad_norm": 0.2529313564300537, + "learning_rate": 1.5662803672893653e-05, + "loss": 1.2234, + "step": 11044 + }, + { + "epoch": 3.289711275340196, + "grad_norm": 0.23660457134246826, + "learning_rate": 1.566200863872022e-05, + "loss": 1.2442, + "step": 11045 + }, + { + "epoch": 3.290009121539865, + "grad_norm": 0.3444722890853882, + "learning_rate": 1.5661213551868126e-05, + "loss": 1.2257, + "step": 11046 + }, + { + "epoch": 3.2903069677395336, + "grad_norm": 0.2877964675426483, + "learning_rate": 1.5660418412344762e-05, + "loss": 1.2298, + "step": 11047 + }, + { + "epoch": 3.2906048139392023, + "grad_norm": 0.3301161527633667, + "learning_rate": 1.565962322015753e-05, + "loss": 1.2434, + "step": 11048 + }, + { + "epoch": 3.2909026601388707, + "grad_norm": 0.2551535665988922, + "learning_rate": 1.5658827975313828e-05, + "loss": 1.2393, + "step": 11049 + }, + { + "epoch": 3.2912005063385394, + "grad_norm": 0.426665335893631, + "learning_rate": 1.5658032677821052e-05, + "loss": 1.2372, + "step": 11050 + }, + { + "epoch": 3.291498352538208, + "grad_norm": 0.28886961936950684, + "learning_rate": 1.5657237327686606e-05, + "loss": 1.233, + "step": 11051 + }, + { + "epoch": 3.2917961987378765, + "grad_norm": 0.2779581546783447, + "learning_rate": 1.5656441924917888e-05, + "loss": 1.2304, + "step": 11052 + }, + { + "epoch": 3.2920940449375453, + "grad_norm": 0.24516701698303223, + "learning_rate": 1.5655646469522294e-05, + "loss": 1.2297, + "step": 11053 + }, + { + "epoch": 3.292391891137214, + "grad_norm": 0.2536821663379669, + "learning_rate": 1.565485096150723e-05, + "loss": 1.2441, + "step": 11054 + }, + { + "epoch": 3.292689737336883, + "grad_norm": 0.2568303644657135, + "learning_rate": 1.5654055400880097e-05, + "loss": 1.227, + "step": 11055 + }, + { + "epoch": 3.292987583536551, + "grad_norm": 0.24783992767333984, + "learning_rate": 1.5653259787648293e-05, + "loss": 1.2229, + "step": 11056 + }, + { + "epoch": 3.29328542973622, + "grad_norm": 0.23471763730049133, + "learning_rate": 1.5652464121819226e-05, + "loss": 1.2214, + "step": 11057 + }, + { + "epoch": 3.2935832759358887, + "grad_norm": 0.24588479101657867, + "learning_rate": 1.5651668403400292e-05, + "loss": 1.2408, + "step": 11058 + }, + { + "epoch": 3.293881122135557, + "grad_norm": 0.2398025542497635, + "learning_rate": 1.56508726323989e-05, + "loss": 1.2487, + "step": 11059 + }, + { + "epoch": 3.294178968335226, + "grad_norm": 0.25474634766578674, + "learning_rate": 1.5650076808822453e-05, + "loss": 1.2292, + "step": 11060 + }, + { + "epoch": 3.2944768145348946, + "grad_norm": 0.2357475310564041, + "learning_rate": 1.5649280932678354e-05, + "loss": 1.242, + "step": 11061 + }, + { + "epoch": 3.2947746607345634, + "grad_norm": 0.2470824271440506, + "learning_rate": 1.5648485003974004e-05, + "loss": 1.2502, + "step": 11062 + }, + { + "epoch": 3.2950725069342317, + "grad_norm": 0.25632765889167786, + "learning_rate": 1.5647689022716813e-05, + "loss": 1.2338, + "step": 11063 + }, + { + "epoch": 3.2953703531339005, + "grad_norm": 0.24734733998775482, + "learning_rate": 1.5646892988914187e-05, + "loss": 1.2301, + "step": 11064 + }, + { + "epoch": 3.2956681993335692, + "grad_norm": 0.24102622270584106, + "learning_rate": 1.5646096902573533e-05, + "loss": 1.2313, + "step": 11065 + }, + { + "epoch": 3.2959660455332376, + "grad_norm": 0.23595453798770905, + "learning_rate": 1.5645300763702253e-05, + "loss": 1.246, + "step": 11066 + }, + { + "epoch": 3.2962638917329063, + "grad_norm": 0.24554964900016785, + "learning_rate": 1.564450457230776e-05, + "loss": 1.2552, + "step": 11067 + }, + { + "epoch": 3.296561737932575, + "grad_norm": 0.23637695610523224, + "learning_rate": 1.5643708328397455e-05, + "loss": 1.253, + "step": 11068 + }, + { + "epoch": 3.296859584132244, + "grad_norm": 0.2340025156736374, + "learning_rate": 1.5642912031978748e-05, + "loss": 1.228, + "step": 11069 + }, + { + "epoch": 3.297157430331912, + "grad_norm": 0.2444021850824356, + "learning_rate": 1.564211568305905e-05, + "loss": 1.2328, + "step": 11070 + }, + { + "epoch": 3.297455276531581, + "grad_norm": 0.24834030866622925, + "learning_rate": 1.564131928164577e-05, + "loss": 1.2504, + "step": 11071 + }, + { + "epoch": 3.2977531227312498, + "grad_norm": 0.2617681920528412, + "learning_rate": 1.564052282774632e-05, + "loss": 1.2386, + "step": 11072 + }, + { + "epoch": 3.2980509689309185, + "grad_norm": 0.24876558780670166, + "learning_rate": 1.56397263213681e-05, + "loss": 1.2429, + "step": 11073 + }, + { + "epoch": 3.298348815130587, + "grad_norm": 0.2376701533794403, + "learning_rate": 1.5638929762518537e-05, + "loss": 1.2425, + "step": 11074 + }, + { + "epoch": 3.2986466613302556, + "grad_norm": 0.2451256811618805, + "learning_rate": 1.5638133151205026e-05, + "loss": 1.2413, + "step": 11075 + }, + { + "epoch": 3.2989445075299244, + "grad_norm": 0.3038402497768402, + "learning_rate": 1.5637336487434988e-05, + "loss": 1.2606, + "step": 11076 + }, + { + "epoch": 3.2992423537295927, + "grad_norm": 0.3035128712654114, + "learning_rate": 1.563653977121583e-05, + "loss": 1.212, + "step": 11077 + }, + { + "epoch": 3.2995401999292615, + "grad_norm": 0.243771031498909, + "learning_rate": 1.5635743002554968e-05, + "loss": 1.2518, + "step": 11078 + }, + { + "epoch": 3.2998380461289303, + "grad_norm": 0.3867659568786621, + "learning_rate": 1.5634946181459815e-05, + "loss": 1.2285, + "step": 11079 + }, + { + "epoch": 3.3001358923285986, + "grad_norm": 0.3225020468235016, + "learning_rate": 1.5634149307937782e-05, + "loss": 1.24, + "step": 11080 + }, + { + "epoch": 3.3004337385282674, + "grad_norm": 0.253110408782959, + "learning_rate": 1.5633352381996284e-05, + "loss": 1.2529, + "step": 11081 + }, + { + "epoch": 3.300731584727936, + "grad_norm": 0.3336648941040039, + "learning_rate": 1.5632555403642736e-05, + "loss": 1.2554, + "step": 11082 + }, + { + "epoch": 3.301029430927605, + "grad_norm": 0.2612948715686798, + "learning_rate": 1.5631758372884558e-05, + "loss": 1.253, + "step": 11083 + }, + { + "epoch": 3.3013272771272733, + "grad_norm": 0.26156461238861084, + "learning_rate": 1.5630961289729158e-05, + "loss": 1.2434, + "step": 11084 + }, + { + "epoch": 3.301625123326942, + "grad_norm": 0.27161288261413574, + "learning_rate": 1.563016415418395e-05, + "loss": 1.2323, + "step": 11085 + }, + { + "epoch": 3.301922969526611, + "grad_norm": 0.2873471975326538, + "learning_rate": 1.5629366966256362e-05, + "loss": 1.2315, + "step": 11086 + }, + { + "epoch": 3.3022208157262796, + "grad_norm": 0.2846321165561676, + "learning_rate": 1.56285697259538e-05, + "loss": 1.224, + "step": 11087 + }, + { + "epoch": 3.302518661925948, + "grad_norm": 0.265316367149353, + "learning_rate": 1.5627772433283685e-05, + "loss": 1.2408, + "step": 11088 + }, + { + "epoch": 3.3028165081256167, + "grad_norm": 0.280942440032959, + "learning_rate": 1.5626975088253436e-05, + "loss": 1.2403, + "step": 11089 + }, + { + "epoch": 3.3031143543252854, + "grad_norm": 0.25459396839141846, + "learning_rate": 1.562617769087047e-05, + "loss": 1.2249, + "step": 11090 + }, + { + "epoch": 3.3034122005249538, + "grad_norm": 0.33199477195739746, + "learning_rate": 1.5625380241142206e-05, + "loss": 1.2198, + "step": 11091 + }, + { + "epoch": 3.3037100467246225, + "grad_norm": 0.2421032041311264, + "learning_rate": 1.5624582739076067e-05, + "loss": 1.2216, + "step": 11092 + }, + { + "epoch": 3.3040078929242913, + "grad_norm": 0.5085775852203369, + "learning_rate": 1.5623785184679468e-05, + "loss": 1.2399, + "step": 11093 + }, + { + "epoch": 3.3043057391239596, + "grad_norm": 0.32602718472480774, + "learning_rate": 1.5622987577959827e-05, + "loss": 1.2397, + "step": 11094 + }, + { + "epoch": 3.3046035853236284, + "grad_norm": 0.3089933395385742, + "learning_rate": 1.5622189918924575e-05, + "loss": 1.2216, + "step": 11095 + }, + { + "epoch": 3.304901431523297, + "grad_norm": 0.2592551112174988, + "learning_rate": 1.5621392207581125e-05, + "loss": 1.2513, + "step": 11096 + }, + { + "epoch": 3.305199277722966, + "grad_norm": 0.3399773836135864, + "learning_rate": 1.56205944439369e-05, + "loss": 1.2415, + "step": 11097 + }, + { + "epoch": 3.3054971239226343, + "grad_norm": 0.2785807251930237, + "learning_rate": 1.5619796627999326e-05, + "loss": 1.2077, + "step": 11098 + }, + { + "epoch": 3.305794970122303, + "grad_norm": 0.2429707944393158, + "learning_rate": 1.561899875977582e-05, + "loss": 1.2334, + "step": 11099 + }, + { + "epoch": 3.306092816321972, + "grad_norm": 0.25270265340805054, + "learning_rate": 1.5618200839273813e-05, + "loss": 1.2291, + "step": 11100 + }, + { + "epoch": 3.3063906625216406, + "grad_norm": 0.2738122045993805, + "learning_rate": 1.561740286650072e-05, + "loss": 1.2364, + "step": 11101 + }, + { + "epoch": 3.306688508721309, + "grad_norm": 0.23685142397880554, + "learning_rate": 1.5616604841463973e-05, + "loss": 1.2413, + "step": 11102 + }, + { + "epoch": 3.3069863549209777, + "grad_norm": 0.3383182883262634, + "learning_rate": 1.5615806764170987e-05, + "loss": 1.2378, + "step": 11103 + }, + { + "epoch": 3.3072842011206465, + "grad_norm": 0.26384952664375305, + "learning_rate": 1.5615008634629197e-05, + "loss": 1.2336, + "step": 11104 + }, + { + "epoch": 3.307582047320315, + "grad_norm": 0.2673135995864868, + "learning_rate": 1.5614210452846027e-05, + "loss": 1.2495, + "step": 11105 + }, + { + "epoch": 3.3078798935199836, + "grad_norm": 0.2639603614807129, + "learning_rate": 1.56134122188289e-05, + "loss": 1.2412, + "step": 11106 + }, + { + "epoch": 3.3081777397196523, + "grad_norm": 0.2544959485530853, + "learning_rate": 1.5612613932585243e-05, + "loss": 1.233, + "step": 11107 + }, + { + "epoch": 3.3084755859193207, + "grad_norm": 0.24230916798114777, + "learning_rate": 1.5611815594122485e-05, + "loss": 1.2295, + "step": 11108 + }, + { + "epoch": 3.3087734321189894, + "grad_norm": 0.24680808186531067, + "learning_rate": 1.5611017203448054e-05, + "loss": 1.2267, + "step": 11109 + }, + { + "epoch": 3.309071278318658, + "grad_norm": 0.2686762511730194, + "learning_rate": 1.5610218760569377e-05, + "loss": 1.2474, + "step": 11110 + }, + { + "epoch": 3.309369124518327, + "grad_norm": 0.25730791687965393, + "learning_rate": 1.560942026549388e-05, + "loss": 1.2473, + "step": 11111 + }, + { + "epoch": 3.3096669707179953, + "grad_norm": 0.2545897960662842, + "learning_rate": 1.5608621718228996e-05, + "loss": 1.2294, + "step": 11112 + }, + { + "epoch": 3.309964816917664, + "grad_norm": 0.24847790598869324, + "learning_rate": 1.560782311878215e-05, + "loss": 1.2383, + "step": 11113 + }, + { + "epoch": 3.310262663117333, + "grad_norm": 0.24648483097553253, + "learning_rate": 1.5607024467160782e-05, + "loss": 1.2471, + "step": 11114 + }, + { + "epoch": 3.3105605093170016, + "grad_norm": 0.25031670928001404, + "learning_rate": 1.560622576337231e-05, + "loss": 1.2207, + "step": 11115 + }, + { + "epoch": 3.31085835551667, + "grad_norm": 0.2567838430404663, + "learning_rate": 1.5605427007424175e-05, + "loss": 1.2409, + "step": 11116 + }, + { + "epoch": 3.3111562017163387, + "grad_norm": 0.24459747970104218, + "learning_rate": 1.5604628199323803e-05, + "loss": 1.2244, + "step": 11117 + }, + { + "epoch": 3.3114540479160075, + "grad_norm": 0.260470986366272, + "learning_rate": 1.5603829339078626e-05, + "loss": 1.2329, + "step": 11118 + }, + { + "epoch": 3.311751894115676, + "grad_norm": 0.24614861607551575, + "learning_rate": 1.5603030426696078e-05, + "loss": 1.2427, + "step": 11119 + }, + { + "epoch": 3.3120497403153446, + "grad_norm": 0.24856600165367126, + "learning_rate": 1.5602231462183595e-05, + "loss": 1.2446, + "step": 11120 + }, + { + "epoch": 3.3123475865150134, + "grad_norm": 0.2849280536174774, + "learning_rate": 1.5601432445548604e-05, + "loss": 1.2428, + "step": 11121 + }, + { + "epoch": 3.312645432714682, + "grad_norm": 0.25962498784065247, + "learning_rate": 1.5600633376798546e-05, + "loss": 1.2337, + "step": 11122 + }, + { + "epoch": 3.3129432789143505, + "grad_norm": 0.2747041583061218, + "learning_rate": 1.559983425594085e-05, + "loss": 1.2329, + "step": 11123 + }, + { + "epoch": 3.3132411251140192, + "grad_norm": 0.2400052845478058, + "learning_rate": 1.559903508298295e-05, + "loss": 1.2384, + "step": 11124 + }, + { + "epoch": 3.313538971313688, + "grad_norm": 0.2882014513015747, + "learning_rate": 1.5598235857932288e-05, + "loss": 1.2385, + "step": 11125 + }, + { + "epoch": 3.3138368175133563, + "grad_norm": 0.23737503588199615, + "learning_rate": 1.559743658079629e-05, + "loss": 1.2276, + "step": 11126 + }, + { + "epoch": 3.314134663713025, + "grad_norm": 0.281827837228775, + "learning_rate": 1.5596637251582406e-05, + "loss": 1.2281, + "step": 11127 + }, + { + "epoch": 3.314432509912694, + "grad_norm": 0.23551490902900696, + "learning_rate": 1.5595837870298064e-05, + "loss": 1.2313, + "step": 11128 + }, + { + "epoch": 3.3147303561123627, + "grad_norm": 0.2924799621105194, + "learning_rate": 1.5595038436950697e-05, + "loss": 1.2305, + "step": 11129 + }, + { + "epoch": 3.315028202312031, + "grad_norm": 0.24237075448036194, + "learning_rate": 1.5594238951547754e-05, + "loss": 1.224, + "step": 11130 + }, + { + "epoch": 3.3153260485116998, + "grad_norm": 0.3355448544025421, + "learning_rate": 1.5593439414096666e-05, + "loss": 1.2228, + "step": 11131 + }, + { + "epoch": 3.3156238947113685, + "grad_norm": 0.2854815125465393, + "learning_rate": 1.5592639824604874e-05, + "loss": 1.2352, + "step": 11132 + }, + { + "epoch": 3.315921740911037, + "grad_norm": 0.24154211580753326, + "learning_rate": 1.5591840183079817e-05, + "loss": 1.2457, + "step": 11133 + }, + { + "epoch": 3.3162195871107056, + "grad_norm": 0.24469321966171265, + "learning_rate": 1.5591040489528936e-05, + "loss": 1.2235, + "step": 11134 + }, + { + "epoch": 3.3165174333103744, + "grad_norm": 0.24999083578586578, + "learning_rate": 1.5590240743959667e-05, + "loss": 1.2596, + "step": 11135 + }, + { + "epoch": 3.316815279510043, + "grad_norm": 0.24260610342025757, + "learning_rate": 1.5589440946379456e-05, + "loss": 1.234, + "step": 11136 + }, + { + "epoch": 3.3171131257097115, + "grad_norm": 0.2558267116546631, + "learning_rate": 1.558864109679574e-05, + "loss": 1.241, + "step": 11137 + }, + { + "epoch": 3.3174109719093803, + "grad_norm": 0.27146750688552856, + "learning_rate": 1.5587841195215963e-05, + "loss": 1.2618, + "step": 11138 + }, + { + "epoch": 3.317708818109049, + "grad_norm": 0.2366420477628708, + "learning_rate": 1.558704124164757e-05, + "loss": 1.2395, + "step": 11139 + }, + { + "epoch": 3.318006664308718, + "grad_norm": 0.25245681405067444, + "learning_rate": 1.5586241236097995e-05, + "loss": 1.2289, + "step": 11140 + }, + { + "epoch": 3.318304510508386, + "grad_norm": 0.2500971555709839, + "learning_rate": 1.5585441178574688e-05, + "loss": 1.2464, + "step": 11141 + }, + { + "epoch": 3.318602356708055, + "grad_norm": 0.2692393958568573, + "learning_rate": 1.5584641069085097e-05, + "loss": 1.2469, + "step": 11142 + }, + { + "epoch": 3.3189002029077237, + "grad_norm": 0.41690102219581604, + "learning_rate": 1.5583840907636655e-05, + "loss": 1.2207, + "step": 11143 + }, + { + "epoch": 3.319198049107392, + "grad_norm": 0.2596820592880249, + "learning_rate": 1.5583040694236812e-05, + "loss": 1.2374, + "step": 11144 + }, + { + "epoch": 3.319495895307061, + "grad_norm": 0.2902927100658417, + "learning_rate": 1.5582240428893013e-05, + "loss": 1.2273, + "step": 11145 + }, + { + "epoch": 3.3197937415067296, + "grad_norm": 0.2572629153728485, + "learning_rate": 1.5581440111612707e-05, + "loss": 1.2404, + "step": 11146 + }, + { + "epoch": 3.320091587706398, + "grad_norm": 0.33107370138168335, + "learning_rate": 1.5580639742403332e-05, + "loss": 1.2392, + "step": 11147 + }, + { + "epoch": 3.3203894339060667, + "grad_norm": 0.27607280015945435, + "learning_rate": 1.5579839321272342e-05, + "loss": 1.2241, + "step": 11148 + }, + { + "epoch": 3.3206872801057354, + "grad_norm": 0.28077617287635803, + "learning_rate": 1.5579038848227184e-05, + "loss": 1.2272, + "step": 11149 + }, + { + "epoch": 3.320985126305404, + "grad_norm": 0.2819717824459076, + "learning_rate": 1.5578238323275298e-05, + "loss": 1.2274, + "step": 11150 + }, + { + "epoch": 3.3212829725050725, + "grad_norm": 0.24944089353084564, + "learning_rate": 1.5577437746424138e-05, + "loss": 1.2516, + "step": 11151 + }, + { + "epoch": 3.3215808187047413, + "grad_norm": 0.3945283591747284, + "learning_rate": 1.557663711768115e-05, + "loss": 1.2347, + "step": 11152 + }, + { + "epoch": 3.32187866490441, + "grad_norm": 0.36648494005203247, + "learning_rate": 1.5575836437053787e-05, + "loss": 1.2292, + "step": 11153 + }, + { + "epoch": 3.322176511104079, + "grad_norm": 0.30352574586868286, + "learning_rate": 1.5575035704549498e-05, + "loss": 1.2408, + "step": 11154 + }, + { + "epoch": 3.322474357303747, + "grad_norm": 0.30668723583221436, + "learning_rate": 1.5574234920175727e-05, + "loss": 1.2316, + "step": 11155 + }, + { + "epoch": 3.322772203503416, + "grad_norm": 0.33290109038352966, + "learning_rate": 1.5573434083939927e-05, + "loss": 1.2292, + "step": 11156 + }, + { + "epoch": 3.3230700497030847, + "grad_norm": 0.2717600166797638, + "learning_rate": 1.557263319584955e-05, + "loss": 1.2307, + "step": 11157 + }, + { + "epoch": 3.323367895902753, + "grad_norm": 0.2959630489349365, + "learning_rate": 1.5571832255912048e-05, + "loss": 1.2367, + "step": 11158 + }, + { + "epoch": 3.323665742102422, + "grad_norm": 0.25313127040863037, + "learning_rate": 1.5571031264134873e-05, + "loss": 1.2327, + "step": 11159 + }, + { + "epoch": 3.3239635883020906, + "grad_norm": 0.2804490029811859, + "learning_rate": 1.5570230220525476e-05, + "loss": 1.2655, + "step": 11160 + }, + { + "epoch": 3.324261434501759, + "grad_norm": 0.334187388420105, + "learning_rate": 1.556942912509131e-05, + "loss": 1.237, + "step": 11161 + }, + { + "epoch": 3.3245592807014277, + "grad_norm": 0.24870112538337708, + "learning_rate": 1.5568627977839828e-05, + "loss": 1.2391, + "step": 11162 + }, + { + "epoch": 3.3248571269010965, + "grad_norm": 0.27060747146606445, + "learning_rate": 1.5567826778778485e-05, + "loss": 1.2543, + "step": 11163 + }, + { + "epoch": 3.3251549731007652, + "grad_norm": 0.2730172872543335, + "learning_rate": 1.5567025527914735e-05, + "loss": 1.2662, + "step": 11164 + }, + { + "epoch": 3.3254528193004336, + "grad_norm": 0.2469976395368576, + "learning_rate": 1.556622422525603e-05, + "loss": 1.242, + "step": 11165 + }, + { + "epoch": 3.3257506655001023, + "grad_norm": 0.336348295211792, + "learning_rate": 1.556542287080983e-05, + "loss": 1.2396, + "step": 11166 + }, + { + "epoch": 3.326048511699771, + "grad_norm": 0.24766014516353607, + "learning_rate": 1.5564621464583586e-05, + "loss": 1.2395, + "step": 11167 + }, + { + "epoch": 3.32634635789944, + "grad_norm": 0.26880988478660583, + "learning_rate": 1.556382000658476e-05, + "loss": 1.2395, + "step": 11168 + }, + { + "epoch": 3.326644204099108, + "grad_norm": 0.2785128355026245, + "learning_rate": 1.5563018496820805e-05, + "loss": 1.2221, + "step": 11169 + }, + { + "epoch": 3.326942050298777, + "grad_norm": 0.2513678967952728, + "learning_rate": 1.5562216935299176e-05, + "loss": 1.2359, + "step": 11170 + }, + { + "epoch": 3.3272398964984458, + "grad_norm": 0.39331939816474915, + "learning_rate": 1.556141532202733e-05, + "loss": 1.2293, + "step": 11171 + }, + { + "epoch": 3.327537742698114, + "grad_norm": 0.2812885046005249, + "learning_rate": 1.556061365701273e-05, + "loss": 1.2153, + "step": 11172 + }, + { + "epoch": 3.327835588897783, + "grad_norm": 0.30336180329322815, + "learning_rate": 1.5559811940262838e-05, + "loss": 1.2361, + "step": 11173 + }, + { + "epoch": 3.3281334350974516, + "grad_norm": 0.23909083008766174, + "learning_rate": 1.5559010171785104e-05, + "loss": 1.2316, + "step": 11174 + }, + { + "epoch": 3.32843128129712, + "grad_norm": 0.28223928809165955, + "learning_rate": 1.5558208351586986e-05, + "loss": 1.2449, + "step": 11175 + }, + { + "epoch": 3.3287291274967887, + "grad_norm": 0.24867713451385498, + "learning_rate": 1.555740647967596e-05, + "loss": 1.238, + "step": 11176 + }, + { + "epoch": 3.3290269736964575, + "grad_norm": 0.2665850818157196, + "learning_rate": 1.5556604556059465e-05, + "loss": 1.2258, + "step": 11177 + }, + { + "epoch": 3.3293248198961263, + "grad_norm": 0.28365078568458557, + "learning_rate": 1.555580258074498e-05, + "loss": 1.2372, + "step": 11178 + }, + { + "epoch": 3.3296226660957946, + "grad_norm": 0.37532925605773926, + "learning_rate": 1.5555000553739955e-05, + "loss": 1.2222, + "step": 11179 + }, + { + "epoch": 3.3299205122954634, + "grad_norm": 0.22920817136764526, + "learning_rate": 1.5554198475051858e-05, + "loss": 1.2338, + "step": 11180 + }, + { + "epoch": 3.330218358495132, + "grad_norm": 0.28767210245132446, + "learning_rate": 1.555339634468815e-05, + "loss": 1.2309, + "step": 11181 + }, + { + "epoch": 3.330516204694801, + "grad_norm": 0.24251440167427063, + "learning_rate": 1.5552594162656294e-05, + "loss": 1.241, + "step": 11182 + }, + { + "epoch": 3.3308140508944692, + "grad_norm": 0.35734620690345764, + "learning_rate": 1.5551791928963752e-05, + "loss": 1.2458, + "step": 11183 + }, + { + "epoch": 3.331111897094138, + "grad_norm": 0.2381419837474823, + "learning_rate": 1.5550989643617992e-05, + "loss": 1.2279, + "step": 11184 + }, + { + "epoch": 3.331409743293807, + "grad_norm": 0.27754923701286316, + "learning_rate": 1.555018730662647e-05, + "loss": 1.2333, + "step": 11185 + }, + { + "epoch": 3.331707589493475, + "grad_norm": 0.24203741550445557, + "learning_rate": 1.554938491799666e-05, + "loss": 1.2378, + "step": 11186 + }, + { + "epoch": 3.332005435693144, + "grad_norm": 0.2660702168941498, + "learning_rate": 1.554858247773602e-05, + "loss": 1.2315, + "step": 11187 + }, + { + "epoch": 3.3323032818928127, + "grad_norm": 0.2345641702413559, + "learning_rate": 1.554777998585202e-05, + "loss": 1.2462, + "step": 11188 + }, + { + "epoch": 3.3326011280924814, + "grad_norm": 0.2346842736005783, + "learning_rate": 1.554697744235213e-05, + "loss": 1.2465, + "step": 11189 + }, + { + "epoch": 3.3328989742921498, + "grad_norm": 0.22912979125976562, + "learning_rate": 1.554617484724381e-05, + "loss": 1.2554, + "step": 11190 + }, + { + "epoch": 3.3331968204918185, + "grad_norm": 0.2334700971841812, + "learning_rate": 1.554537220053453e-05, + "loss": 1.2369, + "step": 11191 + }, + { + "epoch": 3.3334946666914873, + "grad_norm": 0.22997841238975525, + "learning_rate": 1.5544569502231756e-05, + "loss": 1.2362, + "step": 11192 + }, + { + "epoch": 3.333792512891156, + "grad_norm": 0.25977134704589844, + "learning_rate": 1.5543766752342957e-05, + "loss": 1.2335, + "step": 11193 + }, + { + "epoch": 3.3340903590908244, + "grad_norm": 0.2689334452152252, + "learning_rate": 1.5542963950875604e-05, + "loss": 1.2376, + "step": 11194 + }, + { + "epoch": 3.334388205290493, + "grad_norm": 0.2830061912536621, + "learning_rate": 1.5542161097837162e-05, + "loss": 1.2258, + "step": 11195 + }, + { + "epoch": 3.334686051490162, + "grad_norm": 0.2689288258552551, + "learning_rate": 1.5541358193235105e-05, + "loss": 1.2205, + "step": 11196 + }, + { + "epoch": 3.3349838976898303, + "grad_norm": 0.23667997121810913, + "learning_rate": 1.55405552370769e-05, + "loss": 1.2447, + "step": 11197 + }, + { + "epoch": 3.335281743889499, + "grad_norm": 0.2667357623577118, + "learning_rate": 1.5539752229370022e-05, + "loss": 1.2398, + "step": 11198 + }, + { + "epoch": 3.335579590089168, + "grad_norm": 0.2455081343650818, + "learning_rate": 1.5538949170121938e-05, + "loss": 1.2282, + "step": 11199 + }, + { + "epoch": 3.335877436288836, + "grad_norm": 0.29290878772735596, + "learning_rate": 1.553814605934012e-05, + "loss": 1.229, + "step": 11200 + }, + { + "epoch": 3.336175282488505, + "grad_norm": 0.2486369013786316, + "learning_rate": 1.5537342897032038e-05, + "loss": 1.2478, + "step": 11201 + }, + { + "epoch": 3.3364731286881737, + "grad_norm": 0.25021225214004517, + "learning_rate": 1.553653968320517e-05, + "loss": 1.2316, + "step": 11202 + }, + { + "epoch": 3.3367709748878425, + "grad_norm": 0.24372710287570953, + "learning_rate": 1.5535736417866984e-05, + "loss": 1.2361, + "step": 11203 + }, + { + "epoch": 3.337068821087511, + "grad_norm": 0.24520167708396912, + "learning_rate": 1.5534933101024955e-05, + "loss": 1.2297, + "step": 11204 + }, + { + "epoch": 3.3373666672871796, + "grad_norm": 0.25759097933769226, + "learning_rate": 1.553412973268656e-05, + "loss": 1.2516, + "step": 11205 + }, + { + "epoch": 3.3376645134868483, + "grad_norm": 0.2549898624420166, + "learning_rate": 1.5533326312859266e-05, + "loss": 1.244, + "step": 11206 + }, + { + "epoch": 3.337962359686517, + "grad_norm": 0.23916058242321014, + "learning_rate": 1.5532522841550558e-05, + "loss": 1.2476, + "step": 11207 + }, + { + "epoch": 3.3382602058861854, + "grad_norm": 0.23708297312259674, + "learning_rate": 1.5531719318767905e-05, + "loss": 1.2468, + "step": 11208 + }, + { + "epoch": 3.338558052085854, + "grad_norm": 0.26817384362220764, + "learning_rate": 1.5530915744518784e-05, + "loss": 1.2376, + "step": 11209 + }, + { + "epoch": 3.338855898285523, + "grad_norm": 0.2532375454902649, + "learning_rate": 1.553011211881067e-05, + "loss": 1.2511, + "step": 11210 + }, + { + "epoch": 3.3391537444851913, + "grad_norm": 0.2590024769306183, + "learning_rate": 1.5529308441651045e-05, + "loss": 1.2445, + "step": 11211 + }, + { + "epoch": 3.33945159068486, + "grad_norm": 0.3769739270210266, + "learning_rate": 1.5528504713047378e-05, + "loss": 1.2351, + "step": 11212 + }, + { + "epoch": 3.339749436884529, + "grad_norm": 0.3174033761024475, + "learning_rate": 1.5527700933007154e-05, + "loss": 1.221, + "step": 11213 + }, + { + "epoch": 3.340047283084197, + "grad_norm": 0.2384985238313675, + "learning_rate": 1.552689710153785e-05, + "loss": 1.2173, + "step": 11214 + }, + { + "epoch": 3.340345129283866, + "grad_norm": 0.29194796085357666, + "learning_rate": 1.552609321864694e-05, + "loss": 1.2261, + "step": 11215 + }, + { + "epoch": 3.3406429754835347, + "grad_norm": 0.2940313220024109, + "learning_rate": 1.552528928434191e-05, + "loss": 1.2108, + "step": 11216 + }, + { + "epoch": 3.3409408216832035, + "grad_norm": 0.30959200859069824, + "learning_rate": 1.5524485298630237e-05, + "loss": 1.2483, + "step": 11217 + }, + { + "epoch": 3.341238667882872, + "grad_norm": 0.28486740589141846, + "learning_rate": 1.5523681261519396e-05, + "loss": 1.2348, + "step": 11218 + }, + { + "epoch": 3.3415365140825406, + "grad_norm": 0.2476719468832016, + "learning_rate": 1.5522877173016878e-05, + "loss": 1.2333, + "step": 11219 + }, + { + "epoch": 3.3418343602822094, + "grad_norm": 0.2547983229160309, + "learning_rate": 1.5522073033130153e-05, + "loss": 1.2469, + "step": 11220 + }, + { + "epoch": 3.342132206481878, + "grad_norm": 0.2380474954843521, + "learning_rate": 1.552126884186671e-05, + "loss": 1.2356, + "step": 11221 + }, + { + "epoch": 3.3424300526815465, + "grad_norm": 0.2959021031856537, + "learning_rate": 1.552046459923403e-05, + "loss": 1.2423, + "step": 11222 + }, + { + "epoch": 3.3427278988812152, + "grad_norm": 0.32568472623825073, + "learning_rate": 1.5519660305239595e-05, + "loss": 1.2313, + "step": 11223 + }, + { + "epoch": 3.343025745080884, + "grad_norm": 0.2493116706609726, + "learning_rate": 1.5518855959890887e-05, + "loss": 1.2325, + "step": 11224 + }, + { + "epoch": 3.3433235912805523, + "grad_norm": 0.2777004837989807, + "learning_rate": 1.551805156319539e-05, + "loss": 1.24, + "step": 11225 + }, + { + "epoch": 3.343621437480221, + "grad_norm": 0.2908431589603424, + "learning_rate": 1.551724711516059e-05, + "loss": 1.2271, + "step": 11226 + }, + { + "epoch": 3.34391928367989, + "grad_norm": 0.23702867329120636, + "learning_rate": 1.5516442615793967e-05, + "loss": 1.2398, + "step": 11227 + }, + { + "epoch": 3.344217129879558, + "grad_norm": 0.2778693735599518, + "learning_rate": 1.551563806510301e-05, + "loss": 1.2381, + "step": 11228 + }, + { + "epoch": 3.344514976079227, + "grad_norm": 0.23637819290161133, + "learning_rate": 1.5514833463095206e-05, + "loss": 1.2358, + "step": 11229 + }, + { + "epoch": 3.3448128222788958, + "grad_norm": 0.2453201860189438, + "learning_rate": 1.5514028809778033e-05, + "loss": 1.2341, + "step": 11230 + }, + { + "epoch": 3.3451106684785645, + "grad_norm": 0.24085794389247894, + "learning_rate": 1.551322410515899e-05, + "loss": 1.2464, + "step": 11231 + }, + { + "epoch": 3.345408514678233, + "grad_norm": 0.2635227143764496, + "learning_rate": 1.5512419349245548e-05, + "loss": 1.2532, + "step": 11232 + }, + { + "epoch": 3.3457063608779016, + "grad_norm": 0.23879170417785645, + "learning_rate": 1.5511614542045206e-05, + "loss": 1.2441, + "step": 11233 + }, + { + "epoch": 3.3460042070775704, + "grad_norm": 0.2861533463001251, + "learning_rate": 1.551080968356545e-05, + "loss": 1.2367, + "step": 11234 + }, + { + "epoch": 3.346302053277239, + "grad_norm": 0.3119814395904541, + "learning_rate": 1.5510004773813766e-05, + "loss": 1.2341, + "step": 11235 + }, + { + "epoch": 3.3465998994769075, + "grad_norm": 0.2288614809513092, + "learning_rate": 1.5509199812797645e-05, + "loss": 1.2325, + "step": 11236 + }, + { + "epoch": 3.3468977456765763, + "grad_norm": 0.3121853470802307, + "learning_rate": 1.5508394800524573e-05, + "loss": 1.2375, + "step": 11237 + }, + { + "epoch": 3.347195591876245, + "grad_norm": 0.2740115821361542, + "learning_rate": 1.5507589737002043e-05, + "loss": 1.2252, + "step": 11238 + }, + { + "epoch": 3.3474934380759134, + "grad_norm": 0.2465236335992813, + "learning_rate": 1.5506784622237543e-05, + "loss": 1.2334, + "step": 11239 + }, + { + "epoch": 3.347791284275582, + "grad_norm": 0.27302634716033936, + "learning_rate": 1.5505979456238565e-05, + "loss": 1.2426, + "step": 11240 + }, + { + "epoch": 3.348089130475251, + "grad_norm": 0.23760247230529785, + "learning_rate": 1.55051742390126e-05, + "loss": 1.2418, + "step": 11241 + }, + { + "epoch": 3.3483869766749192, + "grad_norm": 0.24832670390605927, + "learning_rate": 1.5504368970567142e-05, + "loss": 1.2388, + "step": 11242 + }, + { + "epoch": 3.348684822874588, + "grad_norm": 0.2424217015504837, + "learning_rate": 1.5503563650909675e-05, + "loss": 1.2448, + "step": 11243 + }, + { + "epoch": 3.348982669074257, + "grad_norm": 0.24038386344909668, + "learning_rate": 1.5502758280047702e-05, + "loss": 1.218, + "step": 11244 + }, + { + "epoch": 3.3492805152739256, + "grad_norm": 0.23946547508239746, + "learning_rate": 1.550195285798871e-05, + "loss": 1.2427, + "step": 11245 + }, + { + "epoch": 3.349578361473594, + "grad_norm": 0.26257964968681335, + "learning_rate": 1.550114738474019e-05, + "loss": 1.2502, + "step": 11246 + }, + { + "epoch": 3.3498762076732627, + "grad_norm": 0.26204416155815125, + "learning_rate": 1.5500341860309643e-05, + "loss": 1.2304, + "step": 11247 + }, + { + "epoch": 3.3501740538729314, + "grad_norm": 0.2664135694503784, + "learning_rate": 1.5499536284704563e-05, + "loss": 1.2185, + "step": 11248 + }, + { + "epoch": 3.3504719000726, + "grad_norm": 0.2504350244998932, + "learning_rate": 1.5498730657932442e-05, + "loss": 1.2452, + "step": 11249 + }, + { + "epoch": 3.3507697462722685, + "grad_norm": 0.2880156934261322, + "learning_rate": 1.549792498000077e-05, + "loss": 1.2469, + "step": 11250 + }, + { + "epoch": 3.3510675924719373, + "grad_norm": 0.2322516292333603, + "learning_rate": 1.549711925091705e-05, + "loss": 1.2407, + "step": 11251 + }, + { + "epoch": 3.351365438671606, + "grad_norm": 0.3223528265953064, + "learning_rate": 1.5496313470688784e-05, + "loss": 1.2483, + "step": 11252 + }, + { + "epoch": 3.3516632848712744, + "grad_norm": 0.28237348794937134, + "learning_rate": 1.5495507639323453e-05, + "loss": 1.2348, + "step": 11253 + }, + { + "epoch": 3.351961131070943, + "grad_norm": 0.3122625946998596, + "learning_rate": 1.549470175682857e-05, + "loss": 1.2409, + "step": 11254 + }, + { + "epoch": 3.352258977270612, + "grad_norm": 0.49274566769599915, + "learning_rate": 1.5493895823211623e-05, + "loss": 1.2217, + "step": 11255 + }, + { + "epoch": 3.3525568234702807, + "grad_norm": 0.35787808895111084, + "learning_rate": 1.5493089838480116e-05, + "loss": 1.2284, + "step": 11256 + }, + { + "epoch": 3.352854669669949, + "grad_norm": 0.3012523949146271, + "learning_rate": 1.5492283802641544e-05, + "loss": 1.2401, + "step": 11257 + }, + { + "epoch": 3.353152515869618, + "grad_norm": 0.4172825813293457, + "learning_rate": 1.5491477715703405e-05, + "loss": 1.2134, + "step": 11258 + }, + { + "epoch": 3.3534503620692866, + "grad_norm": 0.2507089376449585, + "learning_rate": 1.5490671577673205e-05, + "loss": 1.2302, + "step": 11259 + }, + { + "epoch": 3.3537482082689554, + "grad_norm": 0.25309303402900696, + "learning_rate": 1.5489865388558438e-05, + "loss": 1.2361, + "step": 11260 + }, + { + "epoch": 3.3540460544686237, + "grad_norm": 0.23591768741607666, + "learning_rate": 1.5489059148366608e-05, + "loss": 1.2225, + "step": 11261 + }, + { + "epoch": 3.3543439006682925, + "grad_norm": 0.3089366555213928, + "learning_rate": 1.5488252857105217e-05, + "loss": 1.2405, + "step": 11262 + }, + { + "epoch": 3.3546417468679612, + "grad_norm": 0.258203387260437, + "learning_rate": 1.5487446514781762e-05, + "loss": 1.2332, + "step": 11263 + }, + { + "epoch": 3.3549395930676296, + "grad_norm": 0.28117161989212036, + "learning_rate": 1.548664012140375e-05, + "loss": 1.2348, + "step": 11264 + }, + { + "epoch": 3.3552374392672983, + "grad_norm": 0.30442148447036743, + "learning_rate": 1.5485833676978683e-05, + "loss": 1.2227, + "step": 11265 + }, + { + "epoch": 3.355535285466967, + "grad_norm": 0.2500152289867401, + "learning_rate": 1.548502718151406e-05, + "loss": 1.2292, + "step": 11266 + }, + { + "epoch": 3.3558331316666354, + "grad_norm": 0.26201489567756653, + "learning_rate": 1.548422063501739e-05, + "loss": 1.2374, + "step": 11267 + }, + { + "epoch": 3.356130977866304, + "grad_norm": 0.2322075515985489, + "learning_rate": 1.5483414037496173e-05, + "loss": 1.2387, + "step": 11268 + }, + { + "epoch": 3.356428824065973, + "grad_norm": 0.2669526934623718, + "learning_rate": 1.5482607388957915e-05, + "loss": 1.2364, + "step": 11269 + }, + { + "epoch": 3.3567266702656418, + "grad_norm": 0.32109588384628296, + "learning_rate": 1.548180068941012e-05, + "loss": 1.2123, + "step": 11270 + }, + { + "epoch": 3.35702451646531, + "grad_norm": 0.2350834608078003, + "learning_rate": 1.5480993938860294e-05, + "loss": 1.2312, + "step": 11271 + }, + { + "epoch": 3.357322362664979, + "grad_norm": 0.25055015087127686, + "learning_rate": 1.5480187137315942e-05, + "loss": 1.2467, + "step": 11272 + }, + { + "epoch": 3.3576202088646476, + "grad_norm": 0.23985609412193298, + "learning_rate": 1.5479380284784574e-05, + "loss": 1.2331, + "step": 11273 + }, + { + "epoch": 3.3579180550643164, + "grad_norm": 0.2938431203365326, + "learning_rate": 1.5478573381273694e-05, + "loss": 1.2447, + "step": 11274 + }, + { + "epoch": 3.3582159012639847, + "grad_norm": 0.30418601632118225, + "learning_rate": 1.547776642679081e-05, + "loss": 1.2456, + "step": 11275 + }, + { + "epoch": 3.3585137474636535, + "grad_norm": 0.2474595159292221, + "learning_rate": 1.5476959421343426e-05, + "loss": 1.2425, + "step": 11276 + }, + { + "epoch": 3.3588115936633223, + "grad_norm": 0.2823903560638428, + "learning_rate": 1.5476152364939058e-05, + "loss": 1.2244, + "step": 11277 + }, + { + "epoch": 3.3591094398629906, + "grad_norm": 0.30574876070022583, + "learning_rate": 1.547534525758521e-05, + "loss": 1.2133, + "step": 11278 + }, + { + "epoch": 3.3594072860626594, + "grad_norm": 0.24360063672065735, + "learning_rate": 1.547453809928939e-05, + "loss": 1.2337, + "step": 11279 + }, + { + "epoch": 3.359705132262328, + "grad_norm": 0.2866424322128296, + "learning_rate": 1.547373089005911e-05, + "loss": 1.2511, + "step": 11280 + }, + { + "epoch": 3.3600029784619965, + "grad_norm": 0.2539839744567871, + "learning_rate": 1.547292362990188e-05, + "loss": 1.239, + "step": 11281 + }, + { + "epoch": 3.3603008246616652, + "grad_norm": 0.2697642743587494, + "learning_rate": 1.547211631882521e-05, + "loss": 1.2164, + "step": 11282 + }, + { + "epoch": 3.360598670861334, + "grad_norm": 0.30866938829421997, + "learning_rate": 1.5471308956836614e-05, + "loss": 1.2295, + "step": 11283 + }, + { + "epoch": 3.360896517061003, + "grad_norm": 0.24037256836891174, + "learning_rate": 1.54705015439436e-05, + "loss": 1.234, + "step": 11284 + }, + { + "epoch": 3.361194363260671, + "grad_norm": 0.27809154987335205, + "learning_rate": 1.546969408015368e-05, + "loss": 1.2264, + "step": 11285 + }, + { + "epoch": 3.36149220946034, + "grad_norm": 0.2547796666622162, + "learning_rate": 1.546888656547437e-05, + "loss": 1.253, + "step": 11286 + }, + { + "epoch": 3.3617900556600087, + "grad_norm": 0.29331886768341064, + "learning_rate": 1.5468078999913177e-05, + "loss": 1.2242, + "step": 11287 + }, + { + "epoch": 3.3620879018596774, + "grad_norm": 0.308010995388031, + "learning_rate": 1.5467271383477617e-05, + "loss": 1.2261, + "step": 11288 + }, + { + "epoch": 3.3623857480593458, + "grad_norm": 0.24176310002803802, + "learning_rate": 1.546646371617521e-05, + "loss": 1.2268, + "step": 11289 + }, + { + "epoch": 3.3626835942590145, + "grad_norm": 0.30735498666763306, + "learning_rate": 1.5465655998013463e-05, + "loss": 1.247, + "step": 11290 + }, + { + "epoch": 3.3629814404586833, + "grad_norm": 0.2666889429092407, + "learning_rate": 1.5464848228999893e-05, + "loss": 1.2399, + "step": 11291 + }, + { + "epoch": 3.3632792866583516, + "grad_norm": 0.3406980037689209, + "learning_rate": 1.546404040914202e-05, + "loss": 1.2472, + "step": 11292 + }, + { + "epoch": 3.3635771328580204, + "grad_norm": 0.3658931255340576, + "learning_rate": 1.546323253844735e-05, + "loss": 1.2063, + "step": 11293 + }, + { + "epoch": 3.363874979057689, + "grad_norm": 0.28469905257225037, + "learning_rate": 1.5462424616923408e-05, + "loss": 1.2346, + "step": 11294 + }, + { + "epoch": 3.3641728252573575, + "grad_norm": 0.3957582712173462, + "learning_rate": 1.546161664457771e-05, + "loss": 1.2271, + "step": 11295 + }, + { + "epoch": 3.3644706714570263, + "grad_norm": 0.24683406949043274, + "learning_rate": 1.5460808621417768e-05, + "loss": 1.2269, + "step": 11296 + }, + { + "epoch": 3.364768517656695, + "grad_norm": 0.2529073655605316, + "learning_rate": 1.5460000547451103e-05, + "loss": 1.2387, + "step": 11297 + }, + { + "epoch": 3.365066363856364, + "grad_norm": 0.2449730634689331, + "learning_rate": 1.5459192422685234e-05, + "loss": 1.2303, + "step": 11298 + }, + { + "epoch": 3.365364210056032, + "grad_norm": 0.3057553768157959, + "learning_rate": 1.545838424712768e-05, + "loss": 1.2147, + "step": 11299 + }, + { + "epoch": 3.365662056255701, + "grad_norm": 0.2699509859085083, + "learning_rate": 1.545757602078596e-05, + "loss": 1.2329, + "step": 11300 + }, + { + "epoch": 3.3659599024553697, + "grad_norm": 0.22986018657684326, + "learning_rate": 1.545676774366759e-05, + "loss": 1.219, + "step": 11301 + }, + { + "epoch": 3.3662577486550385, + "grad_norm": 0.24623237550258636, + "learning_rate": 1.545595941578009e-05, + "loss": 1.2344, + "step": 11302 + }, + { + "epoch": 3.366555594854707, + "grad_norm": 0.2394254505634308, + "learning_rate": 1.545515103713099e-05, + "loss": 1.2522, + "step": 11303 + }, + { + "epoch": 3.3668534410543756, + "grad_norm": 0.24272018671035767, + "learning_rate": 1.54543426077278e-05, + "loss": 1.2167, + "step": 11304 + }, + { + "epoch": 3.3671512872540443, + "grad_norm": 0.23567035794258118, + "learning_rate": 1.545353412757805e-05, + "loss": 1.2314, + "step": 11305 + }, + { + "epoch": 3.3674491334537127, + "grad_norm": 0.26490622758865356, + "learning_rate": 1.5452725596689253e-05, + "loss": 1.2422, + "step": 11306 + }, + { + "epoch": 3.3677469796533814, + "grad_norm": 0.26032236218452454, + "learning_rate": 1.545191701506894e-05, + "loss": 1.2271, + "step": 11307 + }, + { + "epoch": 3.36804482585305, + "grad_norm": 0.2538129687309265, + "learning_rate": 1.5451108382724628e-05, + "loss": 1.2273, + "step": 11308 + }, + { + "epoch": 3.3683426720527185, + "grad_norm": 0.30988457798957825, + "learning_rate": 1.5450299699663842e-05, + "loss": 1.217, + "step": 11309 + }, + { + "epoch": 3.3686405182523873, + "grad_norm": 0.30218836665153503, + "learning_rate": 1.544949096589411e-05, + "loss": 1.2261, + "step": 11310 + }, + { + "epoch": 3.368938364452056, + "grad_norm": 0.2780192792415619, + "learning_rate": 1.5448682181422955e-05, + "loss": 1.223, + "step": 11311 + }, + { + "epoch": 3.369236210651725, + "grad_norm": 0.23283803462982178, + "learning_rate": 1.5447873346257895e-05, + "loss": 1.2239, + "step": 11312 + }, + { + "epoch": 3.369534056851393, + "grad_norm": 0.2457301914691925, + "learning_rate": 1.5447064460406464e-05, + "loss": 1.2263, + "step": 11313 + }, + { + "epoch": 3.369831903051062, + "grad_norm": 0.2556935250759125, + "learning_rate": 1.5446255523876178e-05, + "loss": 1.2382, + "step": 11314 + }, + { + "epoch": 3.3701297492507307, + "grad_norm": 0.24787387251853943, + "learning_rate": 1.5445446536674575e-05, + "loss": 1.2383, + "step": 11315 + }, + { + "epoch": 3.3704275954503995, + "grad_norm": 0.23617060482501984, + "learning_rate": 1.5444637498809177e-05, + "loss": 1.25, + "step": 11316 + }, + { + "epoch": 3.370725441650068, + "grad_norm": 0.32861047983169556, + "learning_rate": 1.5443828410287506e-05, + "loss": 1.2309, + "step": 11317 + }, + { + "epoch": 3.3710232878497366, + "grad_norm": 0.34847843647003174, + "learning_rate": 1.5443019271117096e-05, + "loss": 1.2357, + "step": 11318 + }, + { + "epoch": 3.3713211340494054, + "grad_norm": 0.25416529178619385, + "learning_rate": 1.544221008130547e-05, + "loss": 1.2434, + "step": 11319 + }, + { + "epoch": 3.3716189802490737, + "grad_norm": 0.8670878410339355, + "learning_rate": 1.5441400840860165e-05, + "loss": 1.249, + "step": 11320 + }, + { + "epoch": 3.3719168264487425, + "grad_norm": 0.26365143060684204, + "learning_rate": 1.54405915497887e-05, + "loss": 1.2437, + "step": 11321 + }, + { + "epoch": 3.3722146726484112, + "grad_norm": 0.24683503806591034, + "learning_rate": 1.543978220809861e-05, + "loss": 1.2421, + "step": 11322 + }, + { + "epoch": 3.37251251884808, + "grad_norm": 0.24612995982170105, + "learning_rate": 1.5438972815797427e-05, + "loss": 1.2395, + "step": 11323 + }, + { + "epoch": 3.3728103650477483, + "grad_norm": 0.24689802527427673, + "learning_rate": 1.5438163372892675e-05, + "loss": 1.2375, + "step": 11324 + }, + { + "epoch": 3.373108211247417, + "grad_norm": 0.22813276946544647, + "learning_rate": 1.5437353879391893e-05, + "loss": 1.2193, + "step": 11325 + }, + { + "epoch": 3.373406057447086, + "grad_norm": 0.23651349544525146, + "learning_rate": 1.5436544335302604e-05, + "loss": 1.2258, + "step": 11326 + }, + { + "epoch": 3.3737039036467547, + "grad_norm": 0.246408149600029, + "learning_rate": 1.5435734740632343e-05, + "loss": 1.2265, + "step": 11327 + }, + { + "epoch": 3.374001749846423, + "grad_norm": 0.24524538218975067, + "learning_rate": 1.5434925095388648e-05, + "loss": 1.2395, + "step": 11328 + }, + { + "epoch": 3.3742995960460918, + "grad_norm": 0.23395466804504395, + "learning_rate": 1.5434115399579048e-05, + "loss": 1.2354, + "step": 11329 + }, + { + "epoch": 3.3745974422457605, + "grad_norm": 0.22607681155204773, + "learning_rate": 1.543330565321107e-05, + "loss": 1.24, + "step": 11330 + }, + { + "epoch": 3.374895288445429, + "grad_norm": 0.23405821621418, + "learning_rate": 1.5432495856292255e-05, + "loss": 1.2485, + "step": 11331 + }, + { + "epoch": 3.3751931346450976, + "grad_norm": 0.2404523640871048, + "learning_rate": 1.5431686008830137e-05, + "loss": 1.2362, + "step": 11332 + }, + { + "epoch": 3.3754909808447664, + "grad_norm": 0.23444102704524994, + "learning_rate": 1.543087611083225e-05, + "loss": 1.243, + "step": 11333 + }, + { + "epoch": 3.3757888270444347, + "grad_norm": 0.23104414343833923, + "learning_rate": 1.543006616230613e-05, + "loss": 1.2444, + "step": 11334 + }, + { + "epoch": 3.3760866732441035, + "grad_norm": 0.23779796063899994, + "learning_rate": 1.5429256163259307e-05, + "loss": 1.2441, + "step": 11335 + }, + { + "epoch": 3.3763845194437723, + "grad_norm": 0.24722295999526978, + "learning_rate": 1.542844611369932e-05, + "loss": 1.2246, + "step": 11336 + }, + { + "epoch": 3.376682365643441, + "grad_norm": 0.24286231398582458, + "learning_rate": 1.5427636013633717e-05, + "loss": 1.222, + "step": 11337 + }, + { + "epoch": 3.3769802118431094, + "grad_norm": 0.24265117943286896, + "learning_rate": 1.5426825863070013e-05, + "loss": 1.2288, + "step": 11338 + }, + { + "epoch": 3.377278058042778, + "grad_norm": 0.2378610223531723, + "learning_rate": 1.5426015662015763e-05, + "loss": 1.2452, + "step": 11339 + }, + { + "epoch": 3.377575904242447, + "grad_norm": 0.2299157977104187, + "learning_rate": 1.5425205410478498e-05, + "loss": 1.2269, + "step": 11340 + }, + { + "epoch": 3.3778737504421157, + "grad_norm": 0.23765970766544342, + "learning_rate": 1.542439510846576e-05, + "loss": 1.2476, + "step": 11341 + }, + { + "epoch": 3.378171596641784, + "grad_norm": 0.24288013577461243, + "learning_rate": 1.5423584755985086e-05, + "loss": 1.2212, + "step": 11342 + }, + { + "epoch": 3.378469442841453, + "grad_norm": 0.23006445169448853, + "learning_rate": 1.5422774353044013e-05, + "loss": 1.2203, + "step": 11343 + }, + { + "epoch": 3.3787672890411216, + "grad_norm": 0.23436981439590454, + "learning_rate": 1.5421963899650086e-05, + "loss": 1.219, + "step": 11344 + }, + { + "epoch": 3.37906513524079, + "grad_norm": 0.2244638353586197, + "learning_rate": 1.542115339581084e-05, + "loss": 1.212, + "step": 11345 + }, + { + "epoch": 3.3793629814404587, + "grad_norm": 0.24347656965255737, + "learning_rate": 1.5420342841533823e-05, + "loss": 1.2491, + "step": 11346 + }, + { + "epoch": 3.3796608276401274, + "grad_norm": 0.23737263679504395, + "learning_rate": 1.5419532236826568e-05, + "loss": 1.2413, + "step": 11347 + }, + { + "epoch": 3.3799586738397958, + "grad_norm": 0.2330455631017685, + "learning_rate": 1.5418721581696623e-05, + "loss": 1.2216, + "step": 11348 + }, + { + "epoch": 3.3802565200394645, + "grad_norm": 0.24291853606700897, + "learning_rate": 1.5417910876151525e-05, + "loss": 1.2412, + "step": 11349 + }, + { + "epoch": 3.3805543662391333, + "grad_norm": 0.23848694562911987, + "learning_rate": 1.5417100120198822e-05, + "loss": 1.2299, + "step": 11350 + }, + { + "epoch": 3.380852212438802, + "grad_norm": 0.242288738489151, + "learning_rate": 1.5416289313846053e-05, + "loss": 1.2331, + "step": 11351 + }, + { + "epoch": 3.3811500586384704, + "grad_norm": 0.23414702713489532, + "learning_rate": 1.5415478457100766e-05, + "loss": 1.2484, + "step": 11352 + }, + { + "epoch": 3.381447904838139, + "grad_norm": 0.23552188277244568, + "learning_rate": 1.54146675499705e-05, + "loss": 1.2364, + "step": 11353 + }, + { + "epoch": 3.381745751037808, + "grad_norm": 0.2976562976837158, + "learning_rate": 1.5413856592462804e-05, + "loss": 1.2362, + "step": 11354 + }, + { + "epoch": 3.3820435972374767, + "grad_norm": 0.22300995886325836, + "learning_rate": 1.541304558458522e-05, + "loss": 1.2343, + "step": 11355 + }, + { + "epoch": 3.382341443437145, + "grad_norm": 0.2366066724061966, + "learning_rate": 1.5412234526345292e-05, + "loss": 1.248, + "step": 11356 + }, + { + "epoch": 3.382639289636814, + "grad_norm": 0.2326957881450653, + "learning_rate": 1.5411423417750574e-05, + "loss": 1.219, + "step": 11357 + }, + { + "epoch": 3.3829371358364826, + "grad_norm": 0.23947864770889282, + "learning_rate": 1.54106122588086e-05, + "loss": 1.2503, + "step": 11358 + }, + { + "epoch": 3.383234982036151, + "grad_norm": 0.24725420773029327, + "learning_rate": 1.540980104952693e-05, + "loss": 1.2324, + "step": 11359 + }, + { + "epoch": 3.3835328282358197, + "grad_norm": 0.24381734430789948, + "learning_rate": 1.5408989789913102e-05, + "loss": 1.2052, + "step": 11360 + }, + { + "epoch": 3.3838306744354885, + "grad_norm": 0.24043962359428406, + "learning_rate": 1.540817847997467e-05, + "loss": 1.2338, + "step": 11361 + }, + { + "epoch": 3.384128520635157, + "grad_norm": 0.23623792827129364, + "learning_rate": 1.540736711971918e-05, + "loss": 1.2289, + "step": 11362 + }, + { + "epoch": 3.3844263668348256, + "grad_norm": 0.23731105029582977, + "learning_rate": 1.5406555709154177e-05, + "loss": 1.2413, + "step": 11363 + }, + { + "epoch": 3.3847242130344943, + "grad_norm": 0.23703479766845703, + "learning_rate": 1.5405744248287215e-05, + "loss": 1.2469, + "step": 11364 + }, + { + "epoch": 3.385022059234163, + "grad_norm": 0.23037002980709076, + "learning_rate": 1.5404932737125845e-05, + "loss": 1.2363, + "step": 11365 + }, + { + "epoch": 3.3853199054338314, + "grad_norm": 0.2389373779296875, + "learning_rate": 1.5404121175677613e-05, + "loss": 1.2546, + "step": 11366 + }, + { + "epoch": 3.3856177516335, + "grad_norm": 0.23546260595321655, + "learning_rate": 1.5403309563950067e-05, + "loss": 1.2158, + "step": 11367 + }, + { + "epoch": 3.385915597833169, + "grad_norm": 0.23392999172210693, + "learning_rate": 1.5402497901950768e-05, + "loss": 1.2238, + "step": 11368 + }, + { + "epoch": 3.3862134440328377, + "grad_norm": 0.23209136724472046, + "learning_rate": 1.5401686189687262e-05, + "loss": 1.2254, + "step": 11369 + }, + { + "epoch": 3.386511290232506, + "grad_norm": 0.23876014351844788, + "learning_rate": 1.54008744271671e-05, + "loss": 1.2698, + "step": 11370 + }, + { + "epoch": 3.386809136432175, + "grad_norm": 0.2305627465248108, + "learning_rate": 1.5400062614397836e-05, + "loss": 1.2351, + "step": 11371 + }, + { + "epoch": 3.3871069826318436, + "grad_norm": 0.23279821872711182, + "learning_rate": 1.5399250751387024e-05, + "loss": 1.2307, + "step": 11372 + }, + { + "epoch": 3.387404828831512, + "grad_norm": 0.2321329116821289, + "learning_rate": 1.539843883814221e-05, + "loss": 1.2479, + "step": 11373 + }, + { + "epoch": 3.3877026750311807, + "grad_norm": 0.23694854974746704, + "learning_rate": 1.539762687467096e-05, + "loss": 1.2331, + "step": 11374 + }, + { + "epoch": 3.3880005212308495, + "grad_norm": 0.23521921038627625, + "learning_rate": 1.5396814860980818e-05, + "loss": 1.2391, + "step": 11375 + }, + { + "epoch": 3.388298367430518, + "grad_norm": 0.23892724514007568, + "learning_rate": 1.5396002797079347e-05, + "loss": 1.2316, + "step": 11376 + }, + { + "epoch": 3.3885962136301866, + "grad_norm": 0.22810769081115723, + "learning_rate": 1.53951906829741e-05, + "loss": 1.2344, + "step": 11377 + }, + { + "epoch": 3.3888940598298554, + "grad_norm": 0.22269442677497864, + "learning_rate": 1.539437851867263e-05, + "loss": 1.2383, + "step": 11378 + }, + { + "epoch": 3.389191906029524, + "grad_norm": 0.22351641952991486, + "learning_rate": 1.5393566304182496e-05, + "loss": 1.2262, + "step": 11379 + }, + { + "epoch": 3.3894897522291925, + "grad_norm": 0.2398180514574051, + "learning_rate": 1.5392754039511248e-05, + "loss": 1.2449, + "step": 11380 + }, + { + "epoch": 3.3897875984288612, + "grad_norm": 0.244129478931427, + "learning_rate": 1.5391941724666454e-05, + "loss": 1.2418, + "step": 11381 + }, + { + "epoch": 3.39008544462853, + "grad_norm": 0.2408350110054016, + "learning_rate": 1.5391129359655663e-05, + "loss": 1.2466, + "step": 11382 + }, + { + "epoch": 3.390383290828199, + "grad_norm": 0.24110910296440125, + "learning_rate": 1.539031694448644e-05, + "loss": 1.2369, + "step": 11383 + }, + { + "epoch": 3.390681137027867, + "grad_norm": 0.2331976592540741, + "learning_rate": 1.5389504479166338e-05, + "loss": 1.2155, + "step": 11384 + }, + { + "epoch": 3.390978983227536, + "grad_norm": 0.2415946125984192, + "learning_rate": 1.5388691963702922e-05, + "loss": 1.2357, + "step": 11385 + }, + { + "epoch": 3.3912768294272047, + "grad_norm": 0.22565360367298126, + "learning_rate": 1.5387879398103742e-05, + "loss": 1.2278, + "step": 11386 + }, + { + "epoch": 3.391574675626873, + "grad_norm": 0.23874175548553467, + "learning_rate": 1.538706678237637e-05, + "loss": 1.2379, + "step": 11387 + }, + { + "epoch": 3.3918725218265418, + "grad_norm": 0.22943367063999176, + "learning_rate": 1.5386254116528355e-05, + "loss": 1.2483, + "step": 11388 + }, + { + "epoch": 3.3921703680262105, + "grad_norm": 0.23683691024780273, + "learning_rate": 1.5385441400567267e-05, + "loss": 1.2391, + "step": 11389 + }, + { + "epoch": 3.3924682142258793, + "grad_norm": 0.23681248724460602, + "learning_rate": 1.5384628634500663e-05, + "loss": 1.2433, + "step": 11390 + }, + { + "epoch": 3.3927660604255476, + "grad_norm": 0.2290094941854477, + "learning_rate": 1.5383815818336106e-05, + "loss": 1.2506, + "step": 11391 + }, + { + "epoch": 3.3930639066252164, + "grad_norm": 0.2265474796295166, + "learning_rate": 1.5383002952081154e-05, + "loss": 1.2464, + "step": 11392 + }, + { + "epoch": 3.393361752824885, + "grad_norm": 0.22690005600452423, + "learning_rate": 1.5382190035743377e-05, + "loss": 1.2203, + "step": 11393 + }, + { + "epoch": 3.393659599024554, + "grad_norm": 0.23712964355945587, + "learning_rate": 1.5381377069330333e-05, + "loss": 1.2277, + "step": 11394 + }, + { + "epoch": 3.3939574452242223, + "grad_norm": 0.23558643460273743, + "learning_rate": 1.5380564052849592e-05, + "loss": 1.2146, + "step": 11395 + }, + { + "epoch": 3.394255291423891, + "grad_norm": 0.24280938506126404, + "learning_rate": 1.5379750986308716e-05, + "loss": 1.2386, + "step": 11396 + }, + { + "epoch": 3.39455313762356, + "grad_norm": 0.24029718339443207, + "learning_rate": 1.537893786971526e-05, + "loss": 1.2279, + "step": 11397 + }, + { + "epoch": 3.394850983823228, + "grad_norm": 0.24037472903728485, + "learning_rate": 1.53781247030768e-05, + "loss": 1.2399, + "step": 11398 + }, + { + "epoch": 3.395148830022897, + "grad_norm": 0.2389010637998581, + "learning_rate": 1.53773114864009e-05, + "loss": 1.2228, + "step": 11399 + }, + { + "epoch": 3.3954466762225657, + "grad_norm": 0.25520673394203186, + "learning_rate": 1.537649821969512e-05, + "loss": 1.2429, + "step": 11400 + }, + { + "epoch": 3.395744522422234, + "grad_norm": 0.23565858602523804, + "learning_rate": 1.5375684902967038e-05, + "loss": 1.2234, + "step": 11401 + }, + { + "epoch": 3.396042368621903, + "grad_norm": 0.24352875351905823, + "learning_rate": 1.537487153622421e-05, + "loss": 1.2457, + "step": 11402 + }, + { + "epoch": 3.3963402148215716, + "grad_norm": 0.23773491382598877, + "learning_rate": 1.5374058119474208e-05, + "loss": 1.2342, + "step": 11403 + }, + { + "epoch": 3.3966380610212403, + "grad_norm": 0.23633608222007751, + "learning_rate": 1.5373244652724596e-05, + "loss": 1.2565, + "step": 11404 + }, + { + "epoch": 3.3969359072209087, + "grad_norm": 0.23755548894405365, + "learning_rate": 1.537243113598295e-05, + "loss": 1.2298, + "step": 11405 + }, + { + "epoch": 3.3972337534205774, + "grad_norm": 0.23748286068439484, + "learning_rate": 1.5371617569256834e-05, + "loss": 1.2288, + "step": 11406 + }, + { + "epoch": 3.397531599620246, + "grad_norm": 0.23521433770656586, + "learning_rate": 1.537080395255382e-05, + "loss": 1.2534, + "step": 11407 + }, + { + "epoch": 3.397829445819915, + "grad_norm": 0.24321109056472778, + "learning_rate": 1.5369990285881473e-05, + "loss": 1.2291, + "step": 11408 + }, + { + "epoch": 3.3981272920195833, + "grad_norm": 0.23622368276119232, + "learning_rate": 1.5369176569247367e-05, + "loss": 1.2367, + "step": 11409 + }, + { + "epoch": 3.398425138219252, + "grad_norm": 0.2309383749961853, + "learning_rate": 1.536836280265907e-05, + "loss": 1.2377, + "step": 11410 + }, + { + "epoch": 3.398722984418921, + "grad_norm": 0.24700772762298584, + "learning_rate": 1.5367548986124156e-05, + "loss": 1.2167, + "step": 11411 + }, + { + "epoch": 3.399020830618589, + "grad_norm": 0.2414867877960205, + "learning_rate": 1.53667351196502e-05, + "loss": 1.2547, + "step": 11412 + }, + { + "epoch": 3.399318676818258, + "grad_norm": 0.23084893822669983, + "learning_rate": 1.5365921203244765e-05, + "loss": 1.2276, + "step": 11413 + }, + { + "epoch": 3.3996165230179267, + "grad_norm": 0.23509570956230164, + "learning_rate": 1.536510723691543e-05, + "loss": 1.2423, + "step": 11414 + }, + { + "epoch": 3.399914369217595, + "grad_norm": 0.24819272756576538, + "learning_rate": 1.5364293220669764e-05, + "loss": 1.2291, + "step": 11415 + }, + { + "epoch": 3.400212215417264, + "grad_norm": 0.23980839550495148, + "learning_rate": 1.5363479154515342e-05, + "loss": 1.2319, + "step": 11416 + }, + { + "epoch": 3.4005100616169326, + "grad_norm": 0.2304224967956543, + "learning_rate": 1.5362665038459743e-05, + "loss": 1.2384, + "step": 11417 + }, + { + "epoch": 3.4008079078166014, + "grad_norm": 0.23365645110607147, + "learning_rate": 1.536185087251054e-05, + "loss": 1.2207, + "step": 11418 + }, + { + "epoch": 3.4011057540162697, + "grad_norm": 0.24747350811958313, + "learning_rate": 1.53610366566753e-05, + "loss": 1.2359, + "step": 11419 + }, + { + "epoch": 3.4014036002159385, + "grad_norm": 0.23782464861869812, + "learning_rate": 1.5360222390961602e-05, + "loss": 1.2469, + "step": 11420 + }, + { + "epoch": 3.4017014464156072, + "grad_norm": 0.2449568659067154, + "learning_rate": 1.5359408075377028e-05, + "loss": 1.2256, + "step": 11421 + }, + { + "epoch": 3.401999292615276, + "grad_norm": 0.23566211760044098, + "learning_rate": 1.5358593709929148e-05, + "loss": 1.229, + "step": 11422 + }, + { + "epoch": 3.4022971388149443, + "grad_norm": 0.23816730082035065, + "learning_rate": 1.535777929462554e-05, + "loss": 1.2198, + "step": 11423 + }, + { + "epoch": 3.402594985014613, + "grad_norm": 0.23726990818977356, + "learning_rate": 1.5356964829473785e-05, + "loss": 1.2452, + "step": 11424 + }, + { + "epoch": 3.402892831214282, + "grad_norm": 0.24003499746322632, + "learning_rate": 1.535615031448145e-05, + "loss": 1.2421, + "step": 11425 + }, + { + "epoch": 3.40319067741395, + "grad_norm": 0.24958519637584686, + "learning_rate": 1.535533574965613e-05, + "loss": 1.235, + "step": 11426 + }, + { + "epoch": 3.403488523613619, + "grad_norm": 0.23630942404270172, + "learning_rate": 1.5354521135005387e-05, + "loss": 1.2159, + "step": 11427 + }, + { + "epoch": 3.4037863698132877, + "grad_norm": 0.23265543580055237, + "learning_rate": 1.535370647053681e-05, + "loss": 1.2638, + "step": 11428 + }, + { + "epoch": 3.404084216012956, + "grad_norm": 0.23566044867038727, + "learning_rate": 1.5352891756257977e-05, + "loss": 1.2366, + "step": 11429 + }, + { + "epoch": 3.404382062212625, + "grad_norm": 0.23651911318302155, + "learning_rate": 1.5352076992176464e-05, + "loss": 1.233, + "step": 11430 + }, + { + "epoch": 3.4046799084122936, + "grad_norm": 0.24107149243354797, + "learning_rate": 1.5351262178299855e-05, + "loss": 1.2396, + "step": 11431 + }, + { + "epoch": 3.4049777546119624, + "grad_norm": 0.237289160490036, + "learning_rate": 1.535044731463573e-05, + "loss": 1.2406, + "step": 11432 + }, + { + "epoch": 3.4052756008116307, + "grad_norm": 0.244338721036911, + "learning_rate": 1.534963240119167e-05, + "loss": 1.2387, + "step": 11433 + }, + { + "epoch": 3.4055734470112995, + "grad_norm": 0.2455170452594757, + "learning_rate": 1.534881743797526e-05, + "loss": 1.2468, + "step": 11434 + }, + { + "epoch": 3.4058712932109683, + "grad_norm": 0.23031951487064362, + "learning_rate": 1.534800242499408e-05, + "loss": 1.2308, + "step": 11435 + }, + { + "epoch": 3.406169139410637, + "grad_norm": 0.24803990125656128, + "learning_rate": 1.5347187362255712e-05, + "loss": 1.2142, + "step": 11436 + }, + { + "epoch": 3.4064669856103054, + "grad_norm": 0.24458499252796173, + "learning_rate": 1.534637224976774e-05, + "loss": 1.242, + "step": 11437 + }, + { + "epoch": 3.406764831809974, + "grad_norm": 0.24179518222808838, + "learning_rate": 1.5345557087537745e-05, + "loss": 1.2345, + "step": 11438 + }, + { + "epoch": 3.407062678009643, + "grad_norm": 0.24180927872657776, + "learning_rate": 1.5344741875573314e-05, + "loss": 1.2373, + "step": 11439 + }, + { + "epoch": 3.4073605242093112, + "grad_norm": 0.2570483088493347, + "learning_rate": 1.5343926613882035e-05, + "loss": 1.2379, + "step": 11440 + }, + { + "epoch": 3.40765837040898, + "grad_norm": 0.24013374745845795, + "learning_rate": 1.5343111302471487e-05, + "loss": 1.252, + "step": 11441 + }, + { + "epoch": 3.407956216608649, + "grad_norm": 0.25222188234329224, + "learning_rate": 1.5342295941349256e-05, + "loss": 1.2317, + "step": 11442 + }, + { + "epoch": 3.408254062808317, + "grad_norm": 0.2308976799249649, + "learning_rate": 1.5341480530522933e-05, + "loss": 1.2043, + "step": 11443 + }, + { + "epoch": 3.408551909007986, + "grad_norm": 0.23158913850784302, + "learning_rate": 1.5340665070000102e-05, + "loss": 1.2322, + "step": 11444 + }, + { + "epoch": 3.4088497552076547, + "grad_norm": 0.23615525662899017, + "learning_rate": 1.5339849559788346e-05, + "loss": 1.2337, + "step": 11445 + }, + { + "epoch": 3.4091476014073234, + "grad_norm": 0.23380263149738312, + "learning_rate": 1.5339033999895262e-05, + "loss": 1.2208, + "step": 11446 + }, + { + "epoch": 3.4094454476069918, + "grad_norm": 0.2484705001115799, + "learning_rate": 1.5338218390328426e-05, + "loss": 1.2289, + "step": 11447 + }, + { + "epoch": 3.4097432938066605, + "grad_norm": 0.2381022572517395, + "learning_rate": 1.5337402731095433e-05, + "loss": 1.2303, + "step": 11448 + }, + { + "epoch": 3.4100411400063293, + "grad_norm": 0.2324039340019226, + "learning_rate": 1.5336587022203874e-05, + "loss": 1.2167, + "step": 11449 + }, + { + "epoch": 3.410338986205998, + "grad_norm": 0.23961563408374786, + "learning_rate": 1.533577126366133e-05, + "loss": 1.2483, + "step": 11450 + }, + { + "epoch": 3.4106368324056664, + "grad_norm": 0.2381473183631897, + "learning_rate": 1.5334955455475398e-05, + "loss": 1.2287, + "step": 11451 + }, + { + "epoch": 3.410934678605335, + "grad_norm": 0.24186542630195618, + "learning_rate": 1.5334139597653667e-05, + "loss": 1.2295, + "step": 11452 + }, + { + "epoch": 3.411232524805004, + "grad_norm": 0.23231279850006104, + "learning_rate": 1.5333323690203727e-05, + "loss": 1.2287, + "step": 11453 + }, + { + "epoch": 3.4115303710046723, + "grad_norm": 0.23343810439109802, + "learning_rate": 1.5332507733133167e-05, + "loss": 1.2377, + "step": 11454 + }, + { + "epoch": 3.411828217204341, + "grad_norm": 0.24753369390964508, + "learning_rate": 1.5331691726449584e-05, + "loss": 1.2308, + "step": 11455 + }, + { + "epoch": 3.41212606340401, + "grad_norm": 0.24245429039001465, + "learning_rate": 1.5330875670160563e-05, + "loss": 1.2388, + "step": 11456 + }, + { + "epoch": 3.4124239096036786, + "grad_norm": 0.2326730191707611, + "learning_rate": 1.53300595642737e-05, + "loss": 1.2193, + "step": 11457 + }, + { + "epoch": 3.412721755803347, + "grad_norm": 0.23411613702774048, + "learning_rate": 1.532924340879659e-05, + "loss": 1.2267, + "step": 11458 + }, + { + "epoch": 3.4130196020030157, + "grad_norm": 0.2385409027338028, + "learning_rate": 1.5328427203736822e-05, + "loss": 1.24, + "step": 11459 + }, + { + "epoch": 3.4133174482026845, + "grad_norm": 0.24209468066692352, + "learning_rate": 1.5327610949101996e-05, + "loss": 1.2404, + "step": 11460 + }, + { + "epoch": 3.4136152944023532, + "grad_norm": 0.23912429809570312, + "learning_rate": 1.53267946448997e-05, + "loss": 1.2348, + "step": 11461 + }, + { + "epoch": 3.4139131406020216, + "grad_norm": 0.24100053310394287, + "learning_rate": 1.5325978291137528e-05, + "loss": 1.2275, + "step": 11462 + }, + { + "epoch": 3.4142109868016903, + "grad_norm": 0.2500961124897003, + "learning_rate": 1.5325161887823083e-05, + "loss": 1.2382, + "step": 11463 + }, + { + "epoch": 3.414508833001359, + "grad_norm": 0.2547939717769623, + "learning_rate": 1.5324345434963953e-05, + "loss": 1.2172, + "step": 11464 + }, + { + "epoch": 3.4148066792010274, + "grad_norm": 0.23142468929290771, + "learning_rate": 1.532352893256774e-05, + "loss": 1.2428, + "step": 11465 + }, + { + "epoch": 3.415104525400696, + "grad_norm": 0.234122633934021, + "learning_rate": 1.532271238064204e-05, + "loss": 1.2307, + "step": 11466 + }, + { + "epoch": 3.415402371600365, + "grad_norm": 0.22871531546115875, + "learning_rate": 1.5321895779194444e-05, + "loss": 1.2294, + "step": 11467 + }, + { + "epoch": 3.4157002178000333, + "grad_norm": 0.231284499168396, + "learning_rate": 1.5321079128232556e-05, + "loss": 1.2294, + "step": 11468 + }, + { + "epoch": 3.415998063999702, + "grad_norm": 0.2404448390007019, + "learning_rate": 1.5320262427763967e-05, + "loss": 1.2443, + "step": 11469 + }, + { + "epoch": 3.416295910199371, + "grad_norm": 0.22571872174739838, + "learning_rate": 1.5319445677796287e-05, + "loss": 1.23, + "step": 11470 + }, + { + "epoch": 3.4165937563990396, + "grad_norm": 0.23221392929553986, + "learning_rate": 1.5318628878337106e-05, + "loss": 1.2288, + "step": 11471 + }, + { + "epoch": 3.416891602598708, + "grad_norm": 0.24114064872264862, + "learning_rate": 1.5317812029394022e-05, + "loss": 1.2246, + "step": 11472 + }, + { + "epoch": 3.4171894487983767, + "grad_norm": 0.2522605359554291, + "learning_rate": 1.5316995130974647e-05, + "loss": 1.2435, + "step": 11473 + }, + { + "epoch": 3.4174872949980455, + "grad_norm": 0.23176567256450653, + "learning_rate": 1.5316178183086562e-05, + "loss": 1.2514, + "step": 11474 + }, + { + "epoch": 3.4177851411977143, + "grad_norm": 0.23099017143249512, + "learning_rate": 1.5315361185737384e-05, + "loss": 1.241, + "step": 11475 + }, + { + "epoch": 3.4180829873973826, + "grad_norm": 0.2418859750032425, + "learning_rate": 1.531454413893471e-05, + "loss": 1.2458, + "step": 11476 + }, + { + "epoch": 3.4183808335970514, + "grad_norm": 0.23647300899028778, + "learning_rate": 1.531372704268614e-05, + "loss": 1.247, + "step": 11477 + }, + { + "epoch": 3.41867867979672, + "grad_norm": 0.23838262259960175, + "learning_rate": 1.5312909896999277e-05, + "loss": 1.2409, + "step": 11478 + }, + { + "epoch": 3.4189765259963885, + "grad_norm": 0.23677779734134674, + "learning_rate": 1.531209270188172e-05, + "loss": 1.2419, + "step": 11479 + }, + { + "epoch": 3.4192743721960572, + "grad_norm": 0.23330330848693848, + "learning_rate": 1.531127545734108e-05, + "loss": 1.2541, + "step": 11480 + }, + { + "epoch": 3.419572218395726, + "grad_norm": 0.24141250550746918, + "learning_rate": 1.5310458163384955e-05, + "loss": 1.2394, + "step": 11481 + }, + { + "epoch": 3.4198700645953943, + "grad_norm": 0.23776668310165405, + "learning_rate": 1.5309640820020947e-05, + "loss": 1.2474, + "step": 11482 + }, + { + "epoch": 3.420167910795063, + "grad_norm": 0.2449045479297638, + "learning_rate": 1.5308823427256664e-05, + "loss": 1.2289, + "step": 11483 + }, + { + "epoch": 3.420465756994732, + "grad_norm": 0.24275699257850647, + "learning_rate": 1.530800598509971e-05, + "loss": 1.2282, + "step": 11484 + }, + { + "epoch": 3.4207636031944006, + "grad_norm": 0.24288229644298553, + "learning_rate": 1.5307188493557698e-05, + "loss": 1.2323, + "step": 11485 + }, + { + "epoch": 3.421061449394069, + "grad_norm": 0.23896589875221252, + "learning_rate": 1.5306370952638217e-05, + "loss": 1.2299, + "step": 11486 + }, + { + "epoch": 3.4213592955937377, + "grad_norm": 0.24213965237140656, + "learning_rate": 1.5305553362348887e-05, + "loss": 1.2364, + "step": 11487 + }, + { + "epoch": 3.4216571417934065, + "grad_norm": 0.2553258240222931, + "learning_rate": 1.530473572269731e-05, + "loss": 1.2463, + "step": 11488 + }, + { + "epoch": 3.4219549879930753, + "grad_norm": 0.2358098179101944, + "learning_rate": 1.5303918033691095e-05, + "loss": 1.2205, + "step": 11489 + }, + { + "epoch": 3.4222528341927436, + "grad_norm": 0.24891728162765503, + "learning_rate": 1.530310029533785e-05, + "loss": 1.2447, + "step": 11490 + }, + { + "epoch": 3.4225506803924124, + "grad_norm": 0.25674664974212646, + "learning_rate": 1.530228250764518e-05, + "loss": 1.2315, + "step": 11491 + }, + { + "epoch": 3.422848526592081, + "grad_norm": 0.2444644570350647, + "learning_rate": 1.5301464670620695e-05, + "loss": 1.2335, + "step": 11492 + }, + { + "epoch": 3.4231463727917495, + "grad_norm": 0.24277029931545258, + "learning_rate": 1.5300646784272003e-05, + "loss": 1.24, + "step": 11493 + }, + { + "epoch": 3.4234442189914183, + "grad_norm": 0.23423664271831512, + "learning_rate": 1.5299828848606716e-05, + "loss": 1.245, + "step": 11494 + }, + { + "epoch": 3.423742065191087, + "grad_norm": 0.2517566382884979, + "learning_rate": 1.5299010863632443e-05, + "loss": 1.2186, + "step": 11495 + }, + { + "epoch": 3.4240399113907554, + "grad_norm": 0.24025209248065948, + "learning_rate": 1.5298192829356796e-05, + "loss": 1.2335, + "step": 11496 + }, + { + "epoch": 3.424337757590424, + "grad_norm": 0.2340984046459198, + "learning_rate": 1.5297374745787383e-05, + "loss": 1.2333, + "step": 11497 + }, + { + "epoch": 3.424635603790093, + "grad_norm": 0.25047650933265686, + "learning_rate": 1.5296556612931816e-05, + "loss": 1.2257, + "step": 11498 + }, + { + "epoch": 3.4249334499897617, + "grad_norm": 0.23872359097003937, + "learning_rate": 1.5295738430797705e-05, + "loss": 1.23, + "step": 11499 + }, + { + "epoch": 3.42523129618943, + "grad_norm": 0.23748847842216492, + "learning_rate": 1.5294920199392667e-05, + "loss": 1.2255, + "step": 11500 + }, + { + "epoch": 3.42523129618943, + "eval_loss": 1.3356280326843262, + "eval_runtime": 20.2227, + "eval_samples_per_second": 85.745, + "eval_steps_per_second": 5.39, + "step": 11500 + }, + { + "epoch": 3.425529142389099, + "grad_norm": 0.2422969937324524, + "learning_rate": 1.5294101918724314e-05, + "loss": 1.2451, + "step": 11501 + }, + { + "epoch": 3.4258269885887676, + "grad_norm": 0.23358918726444244, + "learning_rate": 1.5293283588800257e-05, + "loss": 1.2222, + "step": 11502 + }, + { + "epoch": 3.4261248347884363, + "grad_norm": 0.22912141680717468, + "learning_rate": 1.529246520962811e-05, + "loss": 1.2167, + "step": 11503 + }, + { + "epoch": 3.4264226809881047, + "grad_norm": 0.22926951944828033, + "learning_rate": 1.5291646781215486e-05, + "loss": 1.239, + "step": 11504 + }, + { + "epoch": 3.4267205271877734, + "grad_norm": 0.23395386338233948, + "learning_rate": 1.529082830357e-05, + "loss": 1.2256, + "step": 11505 + }, + { + "epoch": 3.427018373387442, + "grad_norm": 0.22575154900550842, + "learning_rate": 1.529000977669927e-05, + "loss": 1.2189, + "step": 11506 + }, + { + "epoch": 3.4273162195871105, + "grad_norm": 0.24313803017139435, + "learning_rate": 1.5289191200610912e-05, + "loss": 1.2381, + "step": 11507 + }, + { + "epoch": 3.4276140657867793, + "grad_norm": 0.23815204203128815, + "learning_rate": 1.5288372575312534e-05, + "loss": 1.2288, + "step": 11508 + }, + { + "epoch": 3.427911911986448, + "grad_norm": 0.23456953465938568, + "learning_rate": 1.528755390081176e-05, + "loss": 1.2363, + "step": 11509 + }, + { + "epoch": 3.4282097581861164, + "grad_norm": 0.23735493421554565, + "learning_rate": 1.52867351771162e-05, + "loss": 1.2338, + "step": 11510 + }, + { + "epoch": 3.428507604385785, + "grad_norm": 0.24212393164634705, + "learning_rate": 1.5285916404233487e-05, + "loss": 1.2308, + "step": 11511 + }, + { + "epoch": 3.428805450585454, + "grad_norm": 0.25003373622894287, + "learning_rate": 1.528509758217122e-05, + "loss": 1.2551, + "step": 11512 + }, + { + "epoch": 3.4291032967851227, + "grad_norm": 0.24068664014339447, + "learning_rate": 1.5284278710937025e-05, + "loss": 1.225, + "step": 11513 + }, + { + "epoch": 3.429401142984791, + "grad_norm": 0.24042995274066925, + "learning_rate": 1.528345979053852e-05, + "loss": 1.2416, + "step": 11514 + }, + { + "epoch": 3.42969898918446, + "grad_norm": 0.2419779896736145, + "learning_rate": 1.5282640820983328e-05, + "loss": 1.2367, + "step": 11515 + }, + { + "epoch": 3.4299968353841286, + "grad_norm": 0.23886997997760773, + "learning_rate": 1.528182180227906e-05, + "loss": 1.2427, + "step": 11516 + }, + { + "epoch": 3.4302946815837974, + "grad_norm": 0.23846594989299774, + "learning_rate": 1.5281002734433344e-05, + "loss": 1.2256, + "step": 11517 + }, + { + "epoch": 3.4305925277834657, + "grad_norm": 0.23304830491542816, + "learning_rate": 1.5280183617453805e-05, + "loss": 1.238, + "step": 11518 + }, + { + "epoch": 3.4308903739831345, + "grad_norm": 0.24864642322063446, + "learning_rate": 1.5279364451348048e-05, + "loss": 1.2303, + "step": 11519 + }, + { + "epoch": 3.4311882201828032, + "grad_norm": 0.2348104864358902, + "learning_rate": 1.5278545236123705e-05, + "loss": 1.2352, + "step": 11520 + }, + { + "epoch": 3.4314860663824716, + "grad_norm": 0.24465292692184448, + "learning_rate": 1.5277725971788398e-05, + "loss": 1.2217, + "step": 11521 + }, + { + "epoch": 3.4317839125821403, + "grad_norm": 0.2340582311153412, + "learning_rate": 1.5276906658349747e-05, + "loss": 1.2267, + "step": 11522 + }, + { + "epoch": 3.432081758781809, + "grad_norm": 0.23585832118988037, + "learning_rate": 1.5276087295815373e-05, + "loss": 1.2317, + "step": 11523 + }, + { + "epoch": 3.432379604981478, + "grad_norm": 0.24549613893032074, + "learning_rate": 1.5275267884192905e-05, + "loss": 1.2412, + "step": 11524 + }, + { + "epoch": 3.432677451181146, + "grad_norm": 0.23276914656162262, + "learning_rate": 1.527444842348996e-05, + "loss": 1.2521, + "step": 11525 + }, + { + "epoch": 3.432975297380815, + "grad_norm": 0.2454558163881302, + "learning_rate": 1.5273628913714165e-05, + "loss": 1.2326, + "step": 11526 + }, + { + "epoch": 3.4332731435804837, + "grad_norm": 0.257112979888916, + "learning_rate": 1.5272809354873146e-05, + "loss": 1.2418, + "step": 11527 + }, + { + "epoch": 3.4335709897801525, + "grad_norm": 0.23777510225772858, + "learning_rate": 1.527198974697453e-05, + "loss": 1.2475, + "step": 11528 + }, + { + "epoch": 3.433868835979821, + "grad_norm": 0.23546954989433289, + "learning_rate": 1.5271170090025936e-05, + "loss": 1.2408, + "step": 11529 + }, + { + "epoch": 3.4341666821794896, + "grad_norm": 0.23517552018165588, + "learning_rate": 1.5270350384034993e-05, + "loss": 1.2193, + "step": 11530 + }, + { + "epoch": 3.4344645283791584, + "grad_norm": 0.23490594327449799, + "learning_rate": 1.5269530629009332e-05, + "loss": 1.2424, + "step": 11531 + }, + { + "epoch": 3.4347623745788267, + "grad_norm": 0.24757042527198792, + "learning_rate": 1.5268710824956574e-05, + "loss": 1.217, + "step": 11532 + }, + { + "epoch": 3.4350602207784955, + "grad_norm": 0.23664523661136627, + "learning_rate": 1.5267890971884346e-05, + "loss": 1.2264, + "step": 11533 + }, + { + "epoch": 3.4353580669781643, + "grad_norm": 0.24112482368946075, + "learning_rate": 1.526707106980028e-05, + "loss": 1.2415, + "step": 11534 + }, + { + "epoch": 3.4356559131778326, + "grad_norm": 0.251738578081131, + "learning_rate": 1.5266251118712005e-05, + "loss": 1.2343, + "step": 11535 + }, + { + "epoch": 3.4359537593775014, + "grad_norm": 0.2540231943130493, + "learning_rate": 1.5265431118627147e-05, + "loss": 1.2307, + "step": 11536 + }, + { + "epoch": 3.43625160557717, + "grad_norm": 0.24106860160827637, + "learning_rate": 1.526461106955333e-05, + "loss": 1.2267, + "step": 11537 + }, + { + "epoch": 3.436549451776839, + "grad_norm": 0.24626649916172028, + "learning_rate": 1.5263790971498195e-05, + "loss": 1.2342, + "step": 11538 + }, + { + "epoch": 3.4368472979765072, + "grad_norm": 0.24112309515476227, + "learning_rate": 1.5262970824469362e-05, + "loss": 1.2251, + "step": 11539 + }, + { + "epoch": 3.437145144176176, + "grad_norm": 0.23126092553138733, + "learning_rate": 1.5262150628474467e-05, + "loss": 1.2328, + "step": 11540 + }, + { + "epoch": 3.4374429903758448, + "grad_norm": 0.25916430354118347, + "learning_rate": 1.5261330383521138e-05, + "loss": 1.235, + "step": 11541 + }, + { + "epoch": 3.4377408365755135, + "grad_norm": 0.23983551561832428, + "learning_rate": 1.5260510089617012e-05, + "loss": 1.2208, + "step": 11542 + }, + { + "epoch": 3.438038682775182, + "grad_norm": 0.24710910022258759, + "learning_rate": 1.5259689746769714e-05, + "loss": 1.2284, + "step": 11543 + }, + { + "epoch": 3.4383365289748506, + "grad_norm": 0.2789791524410248, + "learning_rate": 1.525886935498688e-05, + "loss": 1.2285, + "step": 11544 + }, + { + "epoch": 3.4386343751745194, + "grad_norm": 0.25198033452033997, + "learning_rate": 1.525804891427614e-05, + "loss": 1.2438, + "step": 11545 + }, + { + "epoch": 3.4389322213741877, + "grad_norm": 0.24183997511863708, + "learning_rate": 1.5257228424645132e-05, + "loss": 1.2136, + "step": 11546 + }, + { + "epoch": 3.4392300675738565, + "grad_norm": 0.2604411542415619, + "learning_rate": 1.5256407886101486e-05, + "loss": 1.248, + "step": 11547 + }, + { + "epoch": 3.4395279137735253, + "grad_norm": 0.24355940520763397, + "learning_rate": 1.5255587298652841e-05, + "loss": 1.2382, + "step": 11548 + }, + { + "epoch": 3.4398257599731936, + "grad_norm": 0.23050110042095184, + "learning_rate": 1.5254766662306825e-05, + "loss": 1.2311, + "step": 11549 + }, + { + "epoch": 3.4401236061728624, + "grad_norm": 0.2449619323015213, + "learning_rate": 1.5253945977071076e-05, + "loss": 1.232, + "step": 11550 + }, + { + "epoch": 3.440421452372531, + "grad_norm": 0.2661532759666443, + "learning_rate": 1.5253125242953228e-05, + "loss": 1.2517, + "step": 11551 + }, + { + "epoch": 3.4407192985722, + "grad_norm": 0.24256959557533264, + "learning_rate": 1.5252304459960922e-05, + "loss": 1.2168, + "step": 11552 + }, + { + "epoch": 3.4410171447718683, + "grad_norm": 0.2672572433948517, + "learning_rate": 1.5251483628101791e-05, + "loss": 1.2303, + "step": 11553 + }, + { + "epoch": 3.441314990971537, + "grad_norm": 0.27686724066734314, + "learning_rate": 1.525066274738347e-05, + "loss": 1.2264, + "step": 11554 + }, + { + "epoch": 3.441612837171206, + "grad_norm": 0.2409094274044037, + "learning_rate": 1.5249841817813602e-05, + "loss": 1.228, + "step": 11555 + }, + { + "epoch": 3.4419106833708746, + "grad_norm": 0.2512090802192688, + "learning_rate": 1.5249020839399819e-05, + "loss": 1.2488, + "step": 11556 + }, + { + "epoch": 3.442208529570543, + "grad_norm": 0.23591990768909454, + "learning_rate": 1.524819981214976e-05, + "loss": 1.2239, + "step": 11557 + }, + { + "epoch": 3.4425063757702117, + "grad_norm": 0.26415589451789856, + "learning_rate": 1.5247378736071068e-05, + "loss": 1.2398, + "step": 11558 + }, + { + "epoch": 3.4428042219698805, + "grad_norm": 0.27485665678977966, + "learning_rate": 1.524655761117138e-05, + "loss": 1.2146, + "step": 11559 + }, + { + "epoch": 3.443102068169549, + "grad_norm": 0.2404964715242386, + "learning_rate": 1.5245736437458333e-05, + "loss": 1.2233, + "step": 11560 + }, + { + "epoch": 3.4433999143692176, + "grad_norm": 0.36868318915367126, + "learning_rate": 1.5244915214939574e-05, + "loss": 1.2349, + "step": 11561 + }, + { + "epoch": 3.4436977605688863, + "grad_norm": 0.371759831905365, + "learning_rate": 1.5244093943622733e-05, + "loss": 1.2466, + "step": 11562 + }, + { + "epoch": 3.4439956067685547, + "grad_norm": 0.23560090363025665, + "learning_rate": 1.5243272623515459e-05, + "loss": 1.2324, + "step": 11563 + }, + { + "epoch": 3.4442934529682234, + "grad_norm": 0.38808372616767883, + "learning_rate": 1.5242451254625393e-05, + "loss": 1.2547, + "step": 11564 + }, + { + "epoch": 3.444591299167892, + "grad_norm": 0.24690575897693634, + "learning_rate": 1.5241629836960173e-05, + "loss": 1.2413, + "step": 11565 + }, + { + "epoch": 3.444889145367561, + "grad_norm": 0.2622869908809662, + "learning_rate": 1.5240808370527444e-05, + "loss": 1.2251, + "step": 11566 + }, + { + "epoch": 3.4451869915672293, + "grad_norm": 0.2426728904247284, + "learning_rate": 1.5239986855334849e-05, + "loss": 1.2129, + "step": 11567 + }, + { + "epoch": 3.445484837766898, + "grad_norm": 0.23665250837802887, + "learning_rate": 1.5239165291390033e-05, + "loss": 1.2536, + "step": 11568 + }, + { + "epoch": 3.445782683966567, + "grad_norm": 0.2624794840812683, + "learning_rate": 1.5238343678700633e-05, + "loss": 1.2273, + "step": 11569 + }, + { + "epoch": 3.4460805301662356, + "grad_norm": 0.2386084645986557, + "learning_rate": 1.52375220172743e-05, + "loss": 1.2418, + "step": 11570 + }, + { + "epoch": 3.446378376365904, + "grad_norm": 0.23174983263015747, + "learning_rate": 1.5236700307118674e-05, + "loss": 1.241, + "step": 11571 + }, + { + "epoch": 3.4466762225655727, + "grad_norm": 0.2384009212255478, + "learning_rate": 1.523587854824141e-05, + "loss": 1.23, + "step": 11572 + }, + { + "epoch": 3.4469740687652415, + "grad_norm": 0.23641443252563477, + "learning_rate": 1.5235056740650138e-05, + "loss": 1.2256, + "step": 11573 + }, + { + "epoch": 3.44727191496491, + "grad_norm": 0.23755528032779694, + "learning_rate": 1.5234234884352513e-05, + "loss": 1.2521, + "step": 11574 + }, + { + "epoch": 3.4475697611645786, + "grad_norm": 0.23581965267658234, + "learning_rate": 1.5233412979356184e-05, + "loss": 1.2449, + "step": 11575 + }, + { + "epoch": 3.4478676073642474, + "grad_norm": 0.23634174466133118, + "learning_rate": 1.5232591025668792e-05, + "loss": 1.2538, + "step": 11576 + }, + { + "epoch": 3.4481654535639157, + "grad_norm": 0.2442152500152588, + "learning_rate": 1.5231769023297989e-05, + "loss": 1.2541, + "step": 11577 + }, + { + "epoch": 3.4484632997635845, + "grad_norm": 0.23592181503772736, + "learning_rate": 1.5230946972251416e-05, + "loss": 1.2374, + "step": 11578 + }, + { + "epoch": 3.4487611459632532, + "grad_norm": 0.23963795602321625, + "learning_rate": 1.523012487253673e-05, + "loss": 1.2251, + "step": 11579 + }, + { + "epoch": 3.449058992162922, + "grad_norm": 0.26141953468322754, + "learning_rate": 1.5229302724161574e-05, + "loss": 1.2459, + "step": 11580 + }, + { + "epoch": 3.4493568383625903, + "grad_norm": 0.2843151390552521, + "learning_rate": 1.5228480527133598e-05, + "loss": 1.2247, + "step": 11581 + }, + { + "epoch": 3.449654684562259, + "grad_norm": 0.25682297348976135, + "learning_rate": 1.5227658281460453e-05, + "loss": 1.2454, + "step": 11582 + }, + { + "epoch": 3.449952530761928, + "grad_norm": 0.4024375081062317, + "learning_rate": 1.5226835987149788e-05, + "loss": 1.2345, + "step": 11583 + }, + { + "epoch": 3.4502503769615966, + "grad_norm": 0.29868048429489136, + "learning_rate": 1.5226013644209255e-05, + "loss": 1.2346, + "step": 11584 + }, + { + "epoch": 3.450548223161265, + "grad_norm": 0.28260061144828796, + "learning_rate": 1.5225191252646503e-05, + "loss": 1.244, + "step": 11585 + }, + { + "epoch": 3.4508460693609337, + "grad_norm": 0.3465733230113983, + "learning_rate": 1.5224368812469185e-05, + "loss": 1.2389, + "step": 11586 + }, + { + "epoch": 3.4511439155606025, + "grad_norm": 0.2428768426179886, + "learning_rate": 1.522354632368495e-05, + "loss": 1.2565, + "step": 11587 + }, + { + "epoch": 3.451441761760271, + "grad_norm": 0.26560989022254944, + "learning_rate": 1.5222723786301454e-05, + "loss": 1.2241, + "step": 11588 + }, + { + "epoch": 3.4517396079599396, + "grad_norm": 0.2622128129005432, + "learning_rate": 1.5221901200326352e-05, + "loss": 1.2381, + "step": 11589 + }, + { + "epoch": 3.4520374541596084, + "grad_norm": 0.24717475473880768, + "learning_rate": 1.5221078565767289e-05, + "loss": 1.2509, + "step": 11590 + }, + { + "epoch": 3.452335300359277, + "grad_norm": 0.2863736152648926, + "learning_rate": 1.5220255882631922e-05, + "loss": 1.2513, + "step": 11591 + }, + { + "epoch": 3.4526331465589455, + "grad_norm": 0.23726223409175873, + "learning_rate": 1.5219433150927909e-05, + "loss": 1.2436, + "step": 11592 + }, + { + "epoch": 3.4529309927586143, + "grad_norm": 0.2420269399881363, + "learning_rate": 1.5218610370662903e-05, + "loss": 1.242, + "step": 11593 + }, + { + "epoch": 3.453228838958283, + "grad_norm": 0.242299884557724, + "learning_rate": 1.5217787541844557e-05, + "loss": 1.2291, + "step": 11594 + }, + { + "epoch": 3.453526685157952, + "grad_norm": 0.23246005177497864, + "learning_rate": 1.5216964664480527e-05, + "loss": 1.2306, + "step": 11595 + }, + { + "epoch": 3.45382453135762, + "grad_norm": 0.26450425386428833, + "learning_rate": 1.5216141738578471e-05, + "loss": 1.2305, + "step": 11596 + }, + { + "epoch": 3.454122377557289, + "grad_norm": 0.2860598564147949, + "learning_rate": 1.5215318764146046e-05, + "loss": 1.232, + "step": 11597 + }, + { + "epoch": 3.4544202237569577, + "grad_norm": 0.2404812127351761, + "learning_rate": 1.5214495741190898e-05, + "loss": 1.2405, + "step": 11598 + }, + { + "epoch": 3.454718069956626, + "grad_norm": 0.2578551769256592, + "learning_rate": 1.5213672669720701e-05, + "loss": 1.232, + "step": 11599 + }, + { + "epoch": 3.4550159161562948, + "grad_norm": 0.24088731408119202, + "learning_rate": 1.5212849549743106e-05, + "loss": 1.224, + "step": 11600 + }, + { + "epoch": 3.4553137623559635, + "grad_norm": 0.24067500233650208, + "learning_rate": 1.5212026381265763e-05, + "loss": 1.2374, + "step": 11601 + }, + { + "epoch": 3.455611608555632, + "grad_norm": 0.23492807149887085, + "learning_rate": 1.5211203164296342e-05, + "loss": 1.2335, + "step": 11602 + }, + { + "epoch": 3.4559094547553006, + "grad_norm": 0.22992326319217682, + "learning_rate": 1.5210379898842499e-05, + "loss": 1.2359, + "step": 11603 + }, + { + "epoch": 3.4562073009549694, + "grad_norm": 0.25816991925239563, + "learning_rate": 1.5209556584911889e-05, + "loss": 1.2474, + "step": 11604 + }, + { + "epoch": 3.456505147154638, + "grad_norm": 0.2460036426782608, + "learning_rate": 1.5208733222512178e-05, + "loss": 1.2224, + "step": 11605 + }, + { + "epoch": 3.4568029933543065, + "grad_norm": 0.23283728957176208, + "learning_rate": 1.5207909811651024e-05, + "loss": 1.238, + "step": 11606 + }, + { + "epoch": 3.4571008395539753, + "grad_norm": 0.23499785363674164, + "learning_rate": 1.5207086352336088e-05, + "loss": 1.223, + "step": 11607 + }, + { + "epoch": 3.457398685753644, + "grad_norm": 0.24944666028022766, + "learning_rate": 1.520626284457503e-05, + "loss": 1.2271, + "step": 11608 + }, + { + "epoch": 3.457696531953313, + "grad_norm": 0.25068676471710205, + "learning_rate": 1.5205439288375511e-05, + "loss": 1.2504, + "step": 11609 + }, + { + "epoch": 3.457994378152981, + "grad_norm": 0.2367619276046753, + "learning_rate": 1.5204615683745198e-05, + "loss": 1.2555, + "step": 11610 + }, + { + "epoch": 3.45829222435265, + "grad_norm": 0.2698817849159241, + "learning_rate": 1.520379203069175e-05, + "loss": 1.2342, + "step": 11611 + }, + { + "epoch": 3.4585900705523187, + "grad_norm": 0.2815074920654297, + "learning_rate": 1.5202968329222832e-05, + "loss": 1.2323, + "step": 11612 + }, + { + "epoch": 3.458887916751987, + "grad_norm": 0.24128301441669464, + "learning_rate": 1.5202144579346108e-05, + "loss": 1.261, + "step": 11613 + }, + { + "epoch": 3.459185762951656, + "grad_norm": 0.3304024338722229, + "learning_rate": 1.520132078106924e-05, + "loss": 1.2242, + "step": 11614 + }, + { + "epoch": 3.4594836091513246, + "grad_norm": 0.2661862075328827, + "learning_rate": 1.5200496934399893e-05, + "loss": 1.2535, + "step": 11615 + }, + { + "epoch": 3.459781455350993, + "grad_norm": 0.3135344684123993, + "learning_rate": 1.5199673039345728e-05, + "loss": 1.2326, + "step": 11616 + }, + { + "epoch": 3.4600793015506617, + "grad_norm": 0.30401116609573364, + "learning_rate": 1.5198849095914421e-05, + "loss": 1.2376, + "step": 11617 + }, + { + "epoch": 3.4603771477503305, + "grad_norm": 0.25784486532211304, + "learning_rate": 1.519802510411363e-05, + "loss": 1.2329, + "step": 11618 + }, + { + "epoch": 3.4606749939499992, + "grad_norm": 0.28522834181785583, + "learning_rate": 1.5197201063951022e-05, + "loss": 1.2152, + "step": 11619 + }, + { + "epoch": 3.4609728401496676, + "grad_norm": 0.24161286652088165, + "learning_rate": 1.5196376975434265e-05, + "loss": 1.2241, + "step": 11620 + }, + { + "epoch": 3.4612706863493363, + "grad_norm": 0.2395504266023636, + "learning_rate": 1.5195552838571026e-05, + "loss": 1.2233, + "step": 11621 + }, + { + "epoch": 3.461568532549005, + "grad_norm": 0.2447492629289627, + "learning_rate": 1.5194728653368973e-05, + "loss": 1.2439, + "step": 11622 + }, + { + "epoch": 3.461866378748674, + "grad_norm": 0.23938801884651184, + "learning_rate": 1.5193904419835775e-05, + "loss": 1.2441, + "step": 11623 + }, + { + "epoch": 3.462164224948342, + "grad_norm": 0.24809573590755463, + "learning_rate": 1.5193080137979098e-05, + "loss": 1.2217, + "step": 11624 + }, + { + "epoch": 3.462462071148011, + "grad_norm": 0.24161048233509064, + "learning_rate": 1.5192255807806611e-05, + "loss": 1.2405, + "step": 11625 + }, + { + "epoch": 3.4627599173476797, + "grad_norm": 0.23102059960365295, + "learning_rate": 1.5191431429325986e-05, + "loss": 1.2351, + "step": 11626 + }, + { + "epoch": 3.463057763547348, + "grad_norm": 0.2383132129907608, + "learning_rate": 1.5190607002544893e-05, + "loss": 1.2361, + "step": 11627 + }, + { + "epoch": 3.463355609747017, + "grad_norm": 0.23085835576057434, + "learning_rate": 1.5189782527471e-05, + "loss": 1.2264, + "step": 11628 + }, + { + "epoch": 3.4636534559466856, + "grad_norm": 0.23786355555057526, + "learning_rate": 1.5188958004111977e-05, + "loss": 1.2265, + "step": 11629 + }, + { + "epoch": 3.463951302146354, + "grad_norm": 0.24842850863933563, + "learning_rate": 1.5188133432475502e-05, + "loss": 1.2315, + "step": 11630 + }, + { + "epoch": 3.4642491483460227, + "grad_norm": 0.3102658987045288, + "learning_rate": 1.518730881256924e-05, + "loss": 1.2201, + "step": 11631 + }, + { + "epoch": 3.4645469945456915, + "grad_norm": 0.26763206720352173, + "learning_rate": 1.5186484144400861e-05, + "loss": 1.2232, + "step": 11632 + }, + { + "epoch": 3.4648448407453603, + "grad_norm": 0.2530723810195923, + "learning_rate": 1.5185659427978046e-05, + "loss": 1.2433, + "step": 11633 + }, + { + "epoch": 3.4651426869450286, + "grad_norm": 0.3082621097564697, + "learning_rate": 1.5184834663308464e-05, + "loss": 1.2397, + "step": 11634 + }, + { + "epoch": 3.4654405331446974, + "grad_norm": 0.23491385579109192, + "learning_rate": 1.5184009850399788e-05, + "loss": 1.2241, + "step": 11635 + }, + { + "epoch": 3.465738379344366, + "grad_norm": 0.2701856791973114, + "learning_rate": 1.518318498925969e-05, + "loss": 1.2237, + "step": 11636 + }, + { + "epoch": 3.466036225544035, + "grad_norm": 0.3279270529747009, + "learning_rate": 1.518236007989585e-05, + "loss": 1.2419, + "step": 11637 + }, + { + "epoch": 3.4663340717437032, + "grad_norm": 0.2563593089580536, + "learning_rate": 1.5181535122315936e-05, + "loss": 1.2361, + "step": 11638 + }, + { + "epoch": 3.466631917943372, + "grad_norm": 0.25081443786621094, + "learning_rate": 1.5180710116527628e-05, + "loss": 1.2381, + "step": 11639 + }, + { + "epoch": 3.4669297641430408, + "grad_norm": 0.2737971246242523, + "learning_rate": 1.51798850625386e-05, + "loss": 1.229, + "step": 11640 + }, + { + "epoch": 3.467227610342709, + "grad_norm": 0.255681574344635, + "learning_rate": 1.5179059960356531e-05, + "loss": 1.2343, + "step": 11641 + }, + { + "epoch": 3.467525456542378, + "grad_norm": 0.2361997365951538, + "learning_rate": 1.5178234809989094e-05, + "loss": 1.2458, + "step": 11642 + }, + { + "epoch": 3.4678233027420466, + "grad_norm": 0.2407403141260147, + "learning_rate": 1.5177409611443968e-05, + "loss": 1.2343, + "step": 11643 + }, + { + "epoch": 3.4681211489417154, + "grad_norm": 0.2502894997596741, + "learning_rate": 1.5176584364728829e-05, + "loss": 1.2303, + "step": 11644 + }, + { + "epoch": 3.4684189951413837, + "grad_norm": 0.3361647427082062, + "learning_rate": 1.5175759069851357e-05, + "loss": 1.2341, + "step": 11645 + }, + { + "epoch": 3.4687168413410525, + "grad_norm": 0.35799121856689453, + "learning_rate": 1.5174933726819229e-05, + "loss": 1.226, + "step": 11646 + }, + { + "epoch": 3.4690146875407213, + "grad_norm": 0.24811288714408875, + "learning_rate": 1.5174108335640127e-05, + "loss": 1.242, + "step": 11647 + }, + { + "epoch": 3.4693125337403896, + "grad_norm": 0.27803266048431396, + "learning_rate": 1.5173282896321727e-05, + "loss": 1.2269, + "step": 11648 + }, + { + "epoch": 3.4696103799400584, + "grad_norm": 0.29134753346443176, + "learning_rate": 1.5172457408871708e-05, + "loss": 1.2242, + "step": 11649 + }, + { + "epoch": 3.469908226139727, + "grad_norm": 0.24610275030136108, + "learning_rate": 1.517163187329775e-05, + "loss": 1.2519, + "step": 11650 + }, + { + "epoch": 3.470206072339396, + "grad_norm": 0.2515765130519867, + "learning_rate": 1.5170806289607538e-05, + "loss": 1.2518, + "step": 11651 + }, + { + "epoch": 3.4705039185390643, + "grad_norm": 0.2578960359096527, + "learning_rate": 1.5169980657808752e-05, + "loss": 1.2435, + "step": 11652 + }, + { + "epoch": 3.470801764738733, + "grad_norm": 0.2373909205198288, + "learning_rate": 1.5169154977909068e-05, + "loss": 1.215, + "step": 11653 + }, + { + "epoch": 3.471099610938402, + "grad_norm": 0.2524226903915405, + "learning_rate": 1.5168329249916176e-05, + "loss": 1.2442, + "step": 11654 + }, + { + "epoch": 3.47139745713807, + "grad_norm": 0.25795549154281616, + "learning_rate": 1.5167503473837754e-05, + "loss": 1.2246, + "step": 11655 + }, + { + "epoch": 3.471695303337739, + "grad_norm": 0.2714008390903473, + "learning_rate": 1.5166677649681484e-05, + "loss": 1.2288, + "step": 11656 + }, + { + "epoch": 3.4719931495374077, + "grad_norm": 0.3128831088542938, + "learning_rate": 1.5165851777455051e-05, + "loss": 1.2381, + "step": 11657 + }, + { + "epoch": 3.4722909957370764, + "grad_norm": 0.23760241270065308, + "learning_rate": 1.5165025857166143e-05, + "loss": 1.2522, + "step": 11658 + }, + { + "epoch": 3.4725888419367448, + "grad_norm": 0.3532728850841522, + "learning_rate": 1.5164199888822438e-05, + "loss": 1.2178, + "step": 11659 + }, + { + "epoch": 3.4728866881364135, + "grad_norm": 0.2692665457725525, + "learning_rate": 1.5163373872431622e-05, + "loss": 1.2357, + "step": 11660 + }, + { + "epoch": 3.4731845343360823, + "grad_norm": 0.2848125696182251, + "learning_rate": 1.516254780800138e-05, + "loss": 1.2348, + "step": 11661 + }, + { + "epoch": 3.473482380535751, + "grad_norm": 0.27238327264785767, + "learning_rate": 1.51617216955394e-05, + "loss": 1.2196, + "step": 11662 + }, + { + "epoch": 3.4737802267354194, + "grad_norm": 0.25026488304138184, + "learning_rate": 1.5160895535053364e-05, + "loss": 1.2267, + "step": 11663 + }, + { + "epoch": 3.474078072935088, + "grad_norm": 0.2846026122570038, + "learning_rate": 1.5160069326550965e-05, + "loss": 1.2197, + "step": 11664 + }, + { + "epoch": 3.474375919134757, + "grad_norm": 0.23811158537864685, + "learning_rate": 1.5159243070039887e-05, + "loss": 1.227, + "step": 11665 + }, + { + "epoch": 3.4746737653344253, + "grad_norm": 0.29034659266471863, + "learning_rate": 1.5158416765527811e-05, + "loss": 1.2392, + "step": 11666 + }, + { + "epoch": 3.474971611534094, + "grad_norm": 0.24458420276641846, + "learning_rate": 1.5157590413022433e-05, + "loss": 1.2241, + "step": 11667 + }, + { + "epoch": 3.475269457733763, + "grad_norm": 0.2439926713705063, + "learning_rate": 1.5156764012531438e-05, + "loss": 1.2295, + "step": 11668 + }, + { + "epoch": 3.475567303933431, + "grad_norm": 0.25224968791007996, + "learning_rate": 1.5155937564062517e-05, + "loss": 1.2599, + "step": 11669 + }, + { + "epoch": 3.4758651501331, + "grad_norm": 0.23347537219524384, + "learning_rate": 1.5155111067623357e-05, + "loss": 1.2452, + "step": 11670 + }, + { + "epoch": 3.4761629963327687, + "grad_norm": 0.2441583126783371, + "learning_rate": 1.5154284523221648e-05, + "loss": 1.2431, + "step": 11671 + }, + { + "epoch": 3.4764608425324375, + "grad_norm": 0.2398219257593155, + "learning_rate": 1.5153457930865081e-05, + "loss": 1.233, + "step": 11672 + }, + { + "epoch": 3.476758688732106, + "grad_norm": 0.3126453161239624, + "learning_rate": 1.5152631290561343e-05, + "loss": 1.2401, + "step": 11673 + }, + { + "epoch": 3.4770565349317746, + "grad_norm": 0.3580472469329834, + "learning_rate": 1.5151804602318133e-05, + "loss": 1.2424, + "step": 11674 + }, + { + "epoch": 3.4773543811314434, + "grad_norm": 0.23871783912181854, + "learning_rate": 1.515097786614313e-05, + "loss": 1.2557, + "step": 11675 + }, + { + "epoch": 3.477652227331112, + "grad_norm": 0.527107834815979, + "learning_rate": 1.5150151082044039e-05, + "loss": 1.2302, + "step": 11676 + }, + { + "epoch": 3.4779500735307805, + "grad_norm": 0.34208452701568604, + "learning_rate": 1.5149324250028544e-05, + "loss": 1.2393, + "step": 11677 + }, + { + "epoch": 3.4782479197304492, + "grad_norm": 0.311907559633255, + "learning_rate": 1.514849737010434e-05, + "loss": 1.2422, + "step": 11678 + }, + { + "epoch": 3.478545765930118, + "grad_norm": 0.23271065950393677, + "learning_rate": 1.5147670442279121e-05, + "loss": 1.2326, + "step": 11679 + }, + { + "epoch": 3.4788436121297863, + "grad_norm": 0.26043426990509033, + "learning_rate": 1.5146843466560577e-05, + "loss": 1.2335, + "step": 11680 + }, + { + "epoch": 3.479141458329455, + "grad_norm": 0.2789304554462433, + "learning_rate": 1.5146016442956409e-05, + "loss": 1.2601, + "step": 11681 + }, + { + "epoch": 3.479439304529124, + "grad_norm": 0.2576315999031067, + "learning_rate": 1.5145189371474302e-05, + "loss": 1.2326, + "step": 11682 + }, + { + "epoch": 3.479737150728792, + "grad_norm": 0.24909977614879608, + "learning_rate": 1.514436225212196e-05, + "loss": 1.2432, + "step": 11683 + }, + { + "epoch": 3.480034996928461, + "grad_norm": 0.25181618332862854, + "learning_rate": 1.5143535084907075e-05, + "loss": 1.2398, + "step": 11684 + }, + { + "epoch": 3.4803328431281297, + "grad_norm": 0.25857406854629517, + "learning_rate": 1.5142707869837342e-05, + "loss": 1.2421, + "step": 11685 + }, + { + "epoch": 3.4806306893277985, + "grad_norm": 0.2688182592391968, + "learning_rate": 1.5141880606920459e-05, + "loss": 1.2292, + "step": 11686 + }, + { + "epoch": 3.480928535527467, + "grad_norm": 0.23363617062568665, + "learning_rate": 1.514105329616412e-05, + "loss": 1.2202, + "step": 11687 + }, + { + "epoch": 3.4812263817271356, + "grad_norm": 0.40400955080986023, + "learning_rate": 1.5140225937576027e-05, + "loss": 1.2373, + "step": 11688 + }, + { + "epoch": 3.4815242279268044, + "grad_norm": 0.26861774921417236, + "learning_rate": 1.5139398531163872e-05, + "loss": 1.2375, + "step": 11689 + }, + { + "epoch": 3.481822074126473, + "grad_norm": 0.28978976607322693, + "learning_rate": 1.5138571076935357e-05, + "loss": 1.2426, + "step": 11690 + }, + { + "epoch": 3.4821199203261415, + "grad_norm": 0.2498270720243454, + "learning_rate": 1.5137743574898178e-05, + "loss": 1.2452, + "step": 11691 + }, + { + "epoch": 3.4824177665258103, + "grad_norm": 0.23749759793281555, + "learning_rate": 1.5136916025060035e-05, + "loss": 1.2412, + "step": 11692 + }, + { + "epoch": 3.482715612725479, + "grad_norm": 0.30182361602783203, + "learning_rate": 1.513608842742863e-05, + "loss": 1.2483, + "step": 11693 + }, + { + "epoch": 3.4830134589251474, + "grad_norm": 0.25271162390708923, + "learning_rate": 1.5135260782011659e-05, + "loss": 1.2293, + "step": 11694 + }, + { + "epoch": 3.483311305124816, + "grad_norm": 0.24954035878181458, + "learning_rate": 1.5134433088816826e-05, + "loss": 1.2422, + "step": 11695 + }, + { + "epoch": 3.483609151324485, + "grad_norm": 0.2597573697566986, + "learning_rate": 1.5133605347851827e-05, + "loss": 1.2339, + "step": 11696 + }, + { + "epoch": 3.4839069975241532, + "grad_norm": 0.2629152238368988, + "learning_rate": 1.5132777559124367e-05, + "loss": 1.2402, + "step": 11697 + }, + { + "epoch": 3.484204843723822, + "grad_norm": 0.2538357675075531, + "learning_rate": 1.5131949722642145e-05, + "loss": 1.2282, + "step": 11698 + }, + { + "epoch": 3.4845026899234908, + "grad_norm": 0.3156806230545044, + "learning_rate": 1.5131121838412868e-05, + "loss": 1.2467, + "step": 11699 + }, + { + "epoch": 3.4848005361231595, + "grad_norm": 0.26059478521347046, + "learning_rate": 1.5130293906444232e-05, + "loss": 1.2275, + "step": 11700 + }, + { + "epoch": 3.485098382322828, + "grad_norm": 0.26958325505256653, + "learning_rate": 1.5129465926743944e-05, + "loss": 1.2389, + "step": 11701 + }, + { + "epoch": 3.4853962285224966, + "grad_norm": 0.24079011380672455, + "learning_rate": 1.5128637899319709e-05, + "loss": 1.238, + "step": 11702 + }, + { + "epoch": 3.4856940747221654, + "grad_norm": 0.2870723009109497, + "learning_rate": 1.5127809824179223e-05, + "loss": 1.2369, + "step": 11703 + }, + { + "epoch": 3.485991920921834, + "grad_norm": 0.266473650932312, + "learning_rate": 1.5126981701330196e-05, + "loss": 1.2322, + "step": 11704 + }, + { + "epoch": 3.4862897671215025, + "grad_norm": 0.2880406677722931, + "learning_rate": 1.512615353078034e-05, + "loss": 1.2347, + "step": 11705 + }, + { + "epoch": 3.4865876133211713, + "grad_norm": 0.26380014419555664, + "learning_rate": 1.5125325312537347e-05, + "loss": 1.2561, + "step": 11706 + }, + { + "epoch": 3.48688545952084, + "grad_norm": 0.24268145859241486, + "learning_rate": 1.5124497046608927e-05, + "loss": 1.229, + "step": 11707 + }, + { + "epoch": 3.4871833057205084, + "grad_norm": 0.2717461585998535, + "learning_rate": 1.512366873300279e-05, + "loss": 1.2397, + "step": 11708 + }, + { + "epoch": 3.487481151920177, + "grad_norm": 0.2414597123861313, + "learning_rate": 1.512284037172664e-05, + "loss": 1.2317, + "step": 11709 + }, + { + "epoch": 3.487778998119846, + "grad_norm": 0.2472236007452011, + "learning_rate": 1.5122011962788184e-05, + "loss": 1.2339, + "step": 11710 + }, + { + "epoch": 3.4880768443195147, + "grad_norm": 0.24366678297519684, + "learning_rate": 1.5121183506195128e-05, + "loss": 1.2265, + "step": 11711 + }, + { + "epoch": 3.488374690519183, + "grad_norm": 0.24428726732730865, + "learning_rate": 1.5120355001955184e-05, + "loss": 1.2233, + "step": 11712 + }, + { + "epoch": 3.488672536718852, + "grad_norm": 0.24292029440402985, + "learning_rate": 1.5119526450076055e-05, + "loss": 1.244, + "step": 11713 + }, + { + "epoch": 3.4889703829185206, + "grad_norm": 0.23800034821033478, + "learning_rate": 1.511869785056545e-05, + "loss": 1.2374, + "step": 11714 + }, + { + "epoch": 3.489268229118189, + "grad_norm": 0.23138365149497986, + "learning_rate": 1.5117869203431083e-05, + "loss": 1.2376, + "step": 11715 + }, + { + "epoch": 3.4895660753178577, + "grad_norm": 0.25169605016708374, + "learning_rate": 1.5117040508680663e-05, + "loss": 1.2436, + "step": 11716 + }, + { + "epoch": 3.4898639215175264, + "grad_norm": 0.23849761486053467, + "learning_rate": 1.5116211766321896e-05, + "loss": 1.2522, + "step": 11717 + }, + { + "epoch": 3.490161767717195, + "grad_norm": 0.24596084654331207, + "learning_rate": 1.5115382976362493e-05, + "loss": 1.2316, + "step": 11718 + }, + { + "epoch": 3.4904596139168635, + "grad_norm": 0.23545923829078674, + "learning_rate": 1.5114554138810174e-05, + "loss": 1.2379, + "step": 11719 + }, + { + "epoch": 3.4907574601165323, + "grad_norm": 0.24625824391841888, + "learning_rate": 1.5113725253672635e-05, + "loss": 1.2308, + "step": 11720 + }, + { + "epoch": 3.491055306316201, + "grad_norm": 0.2673510015010834, + "learning_rate": 1.5112896320957597e-05, + "loss": 1.2244, + "step": 11721 + }, + { + "epoch": 3.4913531525158694, + "grad_norm": 0.2335682362318039, + "learning_rate": 1.5112067340672774e-05, + "loss": 1.2224, + "step": 11722 + }, + { + "epoch": 3.491650998715538, + "grad_norm": 0.285336434841156, + "learning_rate": 1.5111238312825875e-05, + "loss": 1.2361, + "step": 11723 + }, + { + "epoch": 3.491948844915207, + "grad_norm": 0.2684919834136963, + "learning_rate": 1.5110409237424612e-05, + "loss": 1.2173, + "step": 11724 + }, + { + "epoch": 3.4922466911148757, + "grad_norm": 0.23368461430072784, + "learning_rate": 1.5109580114476705e-05, + "loss": 1.2332, + "step": 11725 + }, + { + "epoch": 3.492544537314544, + "grad_norm": 0.2434931993484497, + "learning_rate": 1.5108750943989863e-05, + "loss": 1.227, + "step": 11726 + }, + { + "epoch": 3.492842383514213, + "grad_norm": 0.252808541059494, + "learning_rate": 1.5107921725971797e-05, + "loss": 1.218, + "step": 11727 + }, + { + "epoch": 3.4931402297138816, + "grad_norm": 0.2509145736694336, + "learning_rate": 1.5107092460430228e-05, + "loss": 1.2318, + "step": 11728 + }, + { + "epoch": 3.4934380759135504, + "grad_norm": 0.30690038204193115, + "learning_rate": 1.5106263147372872e-05, + "loss": 1.2296, + "step": 11729 + }, + { + "epoch": 3.4937359221132187, + "grad_norm": 0.2524191439151764, + "learning_rate": 1.5105433786807442e-05, + "loss": 1.2347, + "step": 11730 + }, + { + "epoch": 3.4940337683128875, + "grad_norm": 0.2589205503463745, + "learning_rate": 1.5104604378741653e-05, + "loss": 1.2335, + "step": 11731 + }, + { + "epoch": 3.4943316145125562, + "grad_norm": 0.2606979012489319, + "learning_rate": 1.5103774923183223e-05, + "loss": 1.2253, + "step": 11732 + }, + { + "epoch": 3.4946294607122246, + "grad_norm": 0.22792330384254456, + "learning_rate": 1.5102945420139872e-05, + "loss": 1.226, + "step": 11733 + }, + { + "epoch": 3.4949273069118933, + "grad_norm": 0.2698785364627838, + "learning_rate": 1.5102115869619315e-05, + "loss": 1.2297, + "step": 11734 + }, + { + "epoch": 3.495225153111562, + "grad_norm": 0.24608469009399414, + "learning_rate": 1.5101286271629269e-05, + "loss": 1.2346, + "step": 11735 + }, + { + "epoch": 3.4955229993112304, + "grad_norm": 0.32868117094039917, + "learning_rate": 1.5100456626177455e-05, + "loss": 1.2437, + "step": 11736 + }, + { + "epoch": 3.495820845510899, + "grad_norm": 0.28755104541778564, + "learning_rate": 1.5099626933271589e-05, + "loss": 1.2284, + "step": 11737 + }, + { + "epoch": 3.496118691710568, + "grad_norm": 0.26792627573013306, + "learning_rate": 1.5098797192919393e-05, + "loss": 1.2304, + "step": 11738 + }, + { + "epoch": 3.4964165379102368, + "grad_norm": 0.3042823374271393, + "learning_rate": 1.5097967405128584e-05, + "loss": 1.2291, + "step": 11739 + }, + { + "epoch": 3.496714384109905, + "grad_norm": 0.2533160448074341, + "learning_rate": 1.5097137569906885e-05, + "loss": 1.2305, + "step": 11740 + }, + { + "epoch": 3.497012230309574, + "grad_norm": 0.25064122676849365, + "learning_rate": 1.5096307687262019e-05, + "loss": 1.2438, + "step": 11741 + }, + { + "epoch": 3.4973100765092426, + "grad_norm": 0.2505095601081848, + "learning_rate": 1.50954777572017e-05, + "loss": 1.2337, + "step": 11742 + }, + { + "epoch": 3.4976079227089114, + "grad_norm": 0.23554089665412903, + "learning_rate": 1.5094647779733659e-05, + "loss": 1.236, + "step": 11743 + }, + { + "epoch": 3.4979057689085797, + "grad_norm": 0.2355387657880783, + "learning_rate": 1.5093817754865608e-05, + "loss": 1.2295, + "step": 11744 + }, + { + "epoch": 3.4982036151082485, + "grad_norm": 0.23906509578227997, + "learning_rate": 1.509298768260527e-05, + "loss": 1.2422, + "step": 11745 + }, + { + "epoch": 3.4985014613079173, + "grad_norm": 0.2412572056055069, + "learning_rate": 1.5092157562960377e-05, + "loss": 1.2483, + "step": 11746 + }, + { + "epoch": 3.4987993075075856, + "grad_norm": 0.2579009234905243, + "learning_rate": 1.5091327395938646e-05, + "loss": 1.2283, + "step": 11747 + }, + { + "epoch": 3.4990971537072544, + "grad_norm": 0.2576175928115845, + "learning_rate": 1.5090497181547803e-05, + "loss": 1.2408, + "step": 11748 + }, + { + "epoch": 3.499394999906923, + "grad_norm": 0.24004359543323517, + "learning_rate": 1.5089666919795569e-05, + "loss": 1.2273, + "step": 11749 + }, + { + "epoch": 3.4996928461065915, + "grad_norm": 0.2498658001422882, + "learning_rate": 1.5088836610689673e-05, + "loss": 1.2422, + "step": 11750 + }, + { + "epoch": 3.4999906923062603, + "grad_norm": 0.24315737187862396, + "learning_rate": 1.5088006254237837e-05, + "loss": 1.24, + "step": 11751 + }, + { + "epoch": 3.500288538505929, + "grad_norm": 0.24262316524982452, + "learning_rate": 1.5087175850447788e-05, + "loss": 1.2516, + "step": 11752 + }, + { + "epoch": 3.500586384705598, + "grad_norm": 0.2583785653114319, + "learning_rate": 1.5086345399327252e-05, + "loss": 1.2269, + "step": 11753 + }, + { + "epoch": 3.5008842309052666, + "grad_norm": 0.2627702057361603, + "learning_rate": 1.5085514900883953e-05, + "loss": 1.2366, + "step": 11754 + }, + { + "epoch": 3.501182077104935, + "grad_norm": 0.251803457736969, + "learning_rate": 1.508468435512562e-05, + "loss": 1.2325, + "step": 11755 + }, + { + "epoch": 3.5014799233046037, + "grad_norm": 0.36371490359306335, + "learning_rate": 1.508385376205998e-05, + "loss": 1.2297, + "step": 11756 + }, + { + "epoch": 3.5017777695042724, + "grad_norm": 0.3003302812576294, + "learning_rate": 1.5083023121694762e-05, + "loss": 1.2261, + "step": 11757 + }, + { + "epoch": 3.5020756157039408, + "grad_norm": 0.2752167880535126, + "learning_rate": 1.5082192434037693e-05, + "loss": 1.2243, + "step": 11758 + }, + { + "epoch": 3.5023734619036095, + "grad_norm": 0.3372878134250641, + "learning_rate": 1.50813616990965e-05, + "loss": 1.2072, + "step": 11759 + }, + { + "epoch": 3.5026713081032783, + "grad_norm": 0.2392565906047821, + "learning_rate": 1.5080530916878914e-05, + "loss": 1.2414, + "step": 11760 + }, + { + "epoch": 3.5029691543029466, + "grad_norm": 0.28664082288742065, + "learning_rate": 1.5079700087392666e-05, + "loss": 1.2423, + "step": 11761 + }, + { + "epoch": 3.5032670005026154, + "grad_norm": 0.2399989664554596, + "learning_rate": 1.5078869210645484e-05, + "loss": 1.243, + "step": 11762 + }, + { + "epoch": 3.503564846702284, + "grad_norm": 0.43447959423065186, + "learning_rate": 1.5078038286645097e-05, + "loss": 1.2458, + "step": 11763 + }, + { + "epoch": 3.5038626929019525, + "grad_norm": 0.35170891880989075, + "learning_rate": 1.5077207315399236e-05, + "loss": 1.2263, + "step": 11764 + }, + { + "epoch": 3.5041605391016213, + "grad_norm": 0.2994493544101715, + "learning_rate": 1.5076376296915634e-05, + "loss": 1.2373, + "step": 11765 + }, + { + "epoch": 3.50445838530129, + "grad_norm": 0.24334172904491425, + "learning_rate": 1.5075545231202022e-05, + "loss": 1.2362, + "step": 11766 + }, + { + "epoch": 3.504756231500959, + "grad_norm": 0.3043341040611267, + "learning_rate": 1.5074714118266137e-05, + "loss": 1.2485, + "step": 11767 + }, + { + "epoch": 3.5050540777006276, + "grad_norm": 0.2556568384170532, + "learning_rate": 1.5073882958115701e-05, + "loss": 1.2582, + "step": 11768 + }, + { + "epoch": 3.505351923900296, + "grad_norm": 0.25257766246795654, + "learning_rate": 1.5073051750758456e-05, + "loss": 1.2367, + "step": 11769 + }, + { + "epoch": 3.5056497700999647, + "grad_norm": 0.23973365128040314, + "learning_rate": 1.5072220496202132e-05, + "loss": 1.2611, + "step": 11770 + }, + { + "epoch": 3.5059476162996335, + "grad_norm": 0.2681906819343567, + "learning_rate": 1.5071389194454464e-05, + "loss": 1.2361, + "step": 11771 + }, + { + "epoch": 3.506245462499302, + "grad_norm": 0.24991819262504578, + "learning_rate": 1.5070557845523184e-05, + "loss": 1.229, + "step": 11772 + }, + { + "epoch": 3.5065433086989706, + "grad_norm": 0.25751763582229614, + "learning_rate": 1.5069726449416029e-05, + "loss": 1.2138, + "step": 11773 + }, + { + "epoch": 3.5068411548986393, + "grad_norm": 0.25750166177749634, + "learning_rate": 1.5068895006140733e-05, + "loss": 1.2452, + "step": 11774 + }, + { + "epoch": 3.5071390010983077, + "grad_norm": 0.24973426759243011, + "learning_rate": 1.5068063515705033e-05, + "loss": 1.2355, + "step": 11775 + }, + { + "epoch": 3.5074368472979764, + "grad_norm": 0.25527223944664, + "learning_rate": 1.5067231978116663e-05, + "loss": 1.223, + "step": 11776 + }, + { + "epoch": 3.507734693497645, + "grad_norm": 0.23963607847690582, + "learning_rate": 1.5066400393383367e-05, + "loss": 1.2361, + "step": 11777 + }, + { + "epoch": 3.5080325396973135, + "grad_norm": 0.2547277510166168, + "learning_rate": 1.506556876151287e-05, + "loss": 1.2407, + "step": 11778 + }, + { + "epoch": 3.5083303858969823, + "grad_norm": 0.266491174697876, + "learning_rate": 1.5064737082512916e-05, + "loss": 1.2274, + "step": 11779 + }, + { + "epoch": 3.508628232096651, + "grad_norm": 0.28425759077072144, + "learning_rate": 1.5063905356391242e-05, + "loss": 1.2477, + "step": 11780 + }, + { + "epoch": 3.50892607829632, + "grad_norm": 0.2597352862358093, + "learning_rate": 1.5063073583155587e-05, + "loss": 1.2271, + "step": 11781 + }, + { + "epoch": 3.5092239244959886, + "grad_norm": 0.38104113936424255, + "learning_rate": 1.5062241762813688e-05, + "loss": 1.2261, + "step": 11782 + }, + { + "epoch": 3.509521770695657, + "grad_norm": 0.3198084235191345, + "learning_rate": 1.5061409895373284e-05, + "loss": 1.2273, + "step": 11783 + }, + { + "epoch": 3.5098196168953257, + "grad_norm": 0.2601906359195709, + "learning_rate": 1.5060577980842121e-05, + "loss": 1.2441, + "step": 11784 + }, + { + "epoch": 3.5101174630949945, + "grad_norm": 0.504921555519104, + "learning_rate": 1.5059746019227932e-05, + "loss": 1.2329, + "step": 11785 + }, + { + "epoch": 3.510415309294663, + "grad_norm": 0.33166366815567017, + "learning_rate": 1.5058914010538454e-05, + "loss": 1.2349, + "step": 11786 + }, + { + "epoch": 3.5107131554943316, + "grad_norm": 0.28050699830055237, + "learning_rate": 1.505808195478144e-05, + "loss": 1.2389, + "step": 11787 + }, + { + "epoch": 3.5110110016940004, + "grad_norm": 0.2675134539604187, + "learning_rate": 1.5057249851964623e-05, + "loss": 1.2242, + "step": 11788 + }, + { + "epoch": 3.5113088478936687, + "grad_norm": 0.31375494599342346, + "learning_rate": 1.5056417702095747e-05, + "loss": 1.2162, + "step": 11789 + }, + { + "epoch": 3.5116066940933375, + "grad_norm": 0.25974899530410767, + "learning_rate": 1.5055585505182551e-05, + "loss": 1.2269, + "step": 11790 + }, + { + "epoch": 3.5119045402930062, + "grad_norm": 0.23940208554267883, + "learning_rate": 1.5054753261232782e-05, + "loss": 1.2511, + "step": 11791 + }, + { + "epoch": 3.5122023864926746, + "grad_norm": 0.26865193247795105, + "learning_rate": 1.505392097025418e-05, + "loss": 1.2364, + "step": 11792 + }, + { + "epoch": 3.5125002326923433, + "grad_norm": 0.2572293281555176, + "learning_rate": 1.5053088632254491e-05, + "loss": 1.2236, + "step": 11793 + }, + { + "epoch": 3.512798078892012, + "grad_norm": 0.28694552183151245, + "learning_rate": 1.505225624724146e-05, + "loss": 1.2473, + "step": 11794 + }, + { + "epoch": 3.513095925091681, + "grad_norm": 0.24849431216716766, + "learning_rate": 1.5051423815222826e-05, + "loss": 1.2368, + "step": 11795 + }, + { + "epoch": 3.5133937712913497, + "grad_norm": 0.24320310354232788, + "learning_rate": 1.505059133620634e-05, + "loss": 1.2246, + "step": 11796 + }, + { + "epoch": 3.513691617491018, + "grad_norm": 0.24401208758354187, + "learning_rate": 1.5049758810199741e-05, + "loss": 1.2251, + "step": 11797 + }, + { + "epoch": 3.5139894636906868, + "grad_norm": 0.24516740441322327, + "learning_rate": 1.504892623721078e-05, + "loss": 1.2434, + "step": 11798 + }, + { + "epoch": 3.5142873098903555, + "grad_norm": 0.23222769796848297, + "learning_rate": 1.5048093617247201e-05, + "loss": 1.2367, + "step": 11799 + }, + { + "epoch": 3.514585156090024, + "grad_norm": 0.23739634454250336, + "learning_rate": 1.5047260950316753e-05, + "loss": 1.2285, + "step": 11800 + }, + { + "epoch": 3.5148830022896926, + "grad_norm": 0.24186591804027557, + "learning_rate": 1.5046428236427179e-05, + "loss": 1.2326, + "step": 11801 + }, + { + "epoch": 3.5151808484893614, + "grad_norm": 0.23322808742523193, + "learning_rate": 1.504559547558623e-05, + "loss": 1.2325, + "step": 11802 + }, + { + "epoch": 3.5154786946890297, + "grad_norm": 0.2473326027393341, + "learning_rate": 1.5044762667801651e-05, + "loss": 1.2401, + "step": 11803 + }, + { + "epoch": 3.5157765408886985, + "grad_norm": 0.25037267804145813, + "learning_rate": 1.5043929813081191e-05, + "loss": 1.2384, + "step": 11804 + }, + { + "epoch": 3.5160743870883673, + "grad_norm": 0.24375185370445251, + "learning_rate": 1.5043096911432602e-05, + "loss": 1.2349, + "step": 11805 + }, + { + "epoch": 3.516372233288036, + "grad_norm": 0.24941161274909973, + "learning_rate": 1.5042263962863627e-05, + "loss": 1.2385, + "step": 11806 + }, + { + "epoch": 3.5166700794877044, + "grad_norm": 0.2311730533838272, + "learning_rate": 1.5041430967382021e-05, + "loss": 1.2287, + "step": 11807 + }, + { + "epoch": 3.516967925687373, + "grad_norm": 0.23631060123443604, + "learning_rate": 1.5040597924995535e-05, + "loss": 1.2294, + "step": 11808 + }, + { + "epoch": 3.517265771887042, + "grad_norm": 0.23170427978038788, + "learning_rate": 1.5039764835711914e-05, + "loss": 1.2253, + "step": 11809 + }, + { + "epoch": 3.5175636180867107, + "grad_norm": 0.26978152990341187, + "learning_rate": 1.5038931699538913e-05, + "loss": 1.2459, + "step": 11810 + }, + { + "epoch": 3.517861464286379, + "grad_norm": 0.23406825959682465, + "learning_rate": 1.5038098516484283e-05, + "loss": 1.234, + "step": 11811 + }, + { + "epoch": 3.518159310486048, + "grad_norm": 0.2873719334602356, + "learning_rate": 1.5037265286555776e-05, + "loss": 1.2322, + "step": 11812 + }, + { + "epoch": 3.5184571566857166, + "grad_norm": 0.25312504172325134, + "learning_rate": 1.503643200976114e-05, + "loss": 1.2379, + "step": 11813 + }, + { + "epoch": 3.518755002885385, + "grad_norm": 0.2473198026418686, + "learning_rate": 1.5035598686108132e-05, + "loss": 1.2384, + "step": 11814 + }, + { + "epoch": 3.5190528490850537, + "grad_norm": 0.27882787585258484, + "learning_rate": 1.5034765315604506e-05, + "loss": 1.2321, + "step": 11815 + }, + { + "epoch": 3.5193506952847224, + "grad_norm": 0.25915205478668213, + "learning_rate": 1.5033931898258013e-05, + "loss": 1.2215, + "step": 11816 + }, + { + "epoch": 3.5196485414843908, + "grad_norm": 0.2492315173149109, + "learning_rate": 1.503309843407641e-05, + "loss": 1.237, + "step": 11817 + }, + { + "epoch": 3.5199463876840595, + "grad_norm": 0.24386049807071686, + "learning_rate": 1.5032264923067448e-05, + "loss": 1.2538, + "step": 11818 + }, + { + "epoch": 3.5202442338837283, + "grad_norm": 0.2439318299293518, + "learning_rate": 1.5031431365238884e-05, + "loss": 1.2513, + "step": 11819 + }, + { + "epoch": 3.520542080083397, + "grad_norm": 0.3076123893260956, + "learning_rate": 1.5030597760598471e-05, + "loss": 1.2317, + "step": 11820 + }, + { + "epoch": 3.520839926283066, + "grad_norm": 0.275299996137619, + "learning_rate": 1.5029764109153966e-05, + "loss": 1.2382, + "step": 11821 + }, + { + "epoch": 3.521137772482734, + "grad_norm": 0.27564939856529236, + "learning_rate": 1.5028930410913125e-05, + "loss": 1.2636, + "step": 11822 + }, + { + "epoch": 3.521435618682403, + "grad_norm": 0.48066186904907227, + "learning_rate": 1.502809666588371e-05, + "loss": 1.2322, + "step": 11823 + }, + { + "epoch": 3.5217334648820717, + "grad_norm": 0.3236466646194458, + "learning_rate": 1.5027262874073466e-05, + "loss": 1.2269, + "step": 11824 + }, + { + "epoch": 3.52203131108174, + "grad_norm": 0.33280837535858154, + "learning_rate": 1.5026429035490164e-05, + "loss": 1.2329, + "step": 11825 + }, + { + "epoch": 3.522329157281409, + "grad_norm": 0.24413232505321503, + "learning_rate": 1.5025595150141554e-05, + "loss": 1.2316, + "step": 11826 + }, + { + "epoch": 3.5226270034810776, + "grad_norm": 0.5738617181777954, + "learning_rate": 1.5024761218035394e-05, + "loss": 1.2373, + "step": 11827 + }, + { + "epoch": 3.522924849680746, + "grad_norm": 0.2744631767272949, + "learning_rate": 1.5023927239179447e-05, + "loss": 1.2423, + "step": 11828 + }, + { + "epoch": 3.5232226958804147, + "grad_norm": 0.25785768032073975, + "learning_rate": 1.502309321358147e-05, + "loss": 1.2298, + "step": 11829 + }, + { + "epoch": 3.5235205420800835, + "grad_norm": 0.24500633776187897, + "learning_rate": 1.5022259141249222e-05, + "loss": 1.2337, + "step": 11830 + }, + { + "epoch": 3.523818388279752, + "grad_norm": 0.23993581533432007, + "learning_rate": 1.5021425022190464e-05, + "loss": 1.2275, + "step": 11831 + }, + { + "epoch": 3.5241162344794206, + "grad_norm": 0.24274177849292755, + "learning_rate": 1.5020590856412955e-05, + "loss": 1.2303, + "step": 11832 + }, + { + "epoch": 3.5244140806790893, + "grad_norm": 0.26262083649635315, + "learning_rate": 1.501975664392446e-05, + "loss": 1.2352, + "step": 11833 + }, + { + "epoch": 3.524711926878758, + "grad_norm": 0.24239331483840942, + "learning_rate": 1.5018922384732735e-05, + "loss": 1.2191, + "step": 11834 + }, + { + "epoch": 3.525009773078427, + "grad_norm": 0.23399677872657776, + "learning_rate": 1.501808807884555e-05, + "loss": 1.2316, + "step": 11835 + }, + { + "epoch": 3.525307619278095, + "grad_norm": 0.23567579686641693, + "learning_rate": 1.5017253726270658e-05, + "loss": 1.2249, + "step": 11836 + }, + { + "epoch": 3.525605465477764, + "grad_norm": 0.24142025411128998, + "learning_rate": 1.5016419327015825e-05, + "loss": 1.2236, + "step": 11837 + }, + { + "epoch": 3.5259033116774328, + "grad_norm": 0.24926820397377014, + "learning_rate": 1.5015584881088817e-05, + "loss": 1.2257, + "step": 11838 + }, + { + "epoch": 3.526201157877101, + "grad_norm": 0.23994338512420654, + "learning_rate": 1.5014750388497393e-05, + "loss": 1.2411, + "step": 11839 + }, + { + "epoch": 3.52649900407677, + "grad_norm": 0.23732413351535797, + "learning_rate": 1.501391584924932e-05, + "loss": 1.2199, + "step": 11840 + }, + { + "epoch": 3.5267968502764386, + "grad_norm": 0.23851914703845978, + "learning_rate": 1.5013081263352362e-05, + "loss": 1.2279, + "step": 11841 + }, + { + "epoch": 3.527094696476107, + "grad_norm": 0.2557980716228485, + "learning_rate": 1.5012246630814287e-05, + "loss": 1.2335, + "step": 11842 + }, + { + "epoch": 3.5273925426757757, + "grad_norm": 0.2455354630947113, + "learning_rate": 1.5011411951642853e-05, + "loss": 1.2399, + "step": 11843 + }, + { + "epoch": 3.5276903888754445, + "grad_norm": 0.23361371457576752, + "learning_rate": 1.5010577225845833e-05, + "loss": 1.2157, + "step": 11844 + }, + { + "epoch": 3.527988235075113, + "grad_norm": 0.23348510265350342, + "learning_rate": 1.5009742453430987e-05, + "loss": 1.206, + "step": 11845 + }, + { + "epoch": 3.5282860812747816, + "grad_norm": 0.24507753551006317, + "learning_rate": 1.5008907634406087e-05, + "loss": 1.2339, + "step": 11846 + }, + { + "epoch": 3.5285839274744504, + "grad_norm": 0.24981550872325897, + "learning_rate": 1.5008072768778897e-05, + "loss": 1.2443, + "step": 11847 + }, + { + "epoch": 3.528881773674119, + "grad_norm": 0.24339215457439423, + "learning_rate": 1.5007237856557184e-05, + "loss": 1.2148, + "step": 11848 + }, + { + "epoch": 3.529179619873788, + "grad_norm": 0.24039603769779205, + "learning_rate": 1.500640289774872e-05, + "loss": 1.2368, + "step": 11849 + }, + { + "epoch": 3.5294774660734562, + "grad_norm": 0.24635085463523865, + "learning_rate": 1.5005567892361269e-05, + "loss": 1.2332, + "step": 11850 + }, + { + "epoch": 3.529775312273125, + "grad_norm": 0.24283139407634735, + "learning_rate": 1.5004732840402598e-05, + "loss": 1.2405, + "step": 11851 + }, + { + "epoch": 3.530073158472794, + "grad_norm": 0.23929741978645325, + "learning_rate": 1.5003897741880484e-05, + "loss": 1.233, + "step": 11852 + }, + { + "epoch": 3.530371004672462, + "grad_norm": 0.23753710091114044, + "learning_rate": 1.500306259680269e-05, + "loss": 1.2248, + "step": 11853 + }, + { + "epoch": 3.530668850872131, + "grad_norm": 0.24215765297412872, + "learning_rate": 1.500222740517699e-05, + "loss": 1.2295, + "step": 11854 + }, + { + "epoch": 3.5309666970717997, + "grad_norm": 0.23550446331501007, + "learning_rate": 1.5001392167011153e-05, + "loss": 1.2213, + "step": 11855 + }, + { + "epoch": 3.531264543271468, + "grad_norm": 0.23802022635936737, + "learning_rate": 1.5000556882312948e-05, + "loss": 1.2452, + "step": 11856 + }, + { + "epoch": 3.5315623894711368, + "grad_norm": 0.2376105934381485, + "learning_rate": 1.4999721551090148e-05, + "loss": 1.2464, + "step": 11857 + }, + { + "epoch": 3.5318602356708055, + "grad_norm": 0.23863351345062256, + "learning_rate": 1.4998886173350527e-05, + "loss": 1.2199, + "step": 11858 + }, + { + "epoch": 3.532158081870474, + "grad_norm": 0.2474067658185959, + "learning_rate": 1.4998050749101855e-05, + "loss": 1.2257, + "step": 11859 + }, + { + "epoch": 3.5324559280701426, + "grad_norm": 0.2507241666316986, + "learning_rate": 1.4997215278351905e-05, + "loss": 1.2277, + "step": 11860 + }, + { + "epoch": 3.5327537742698114, + "grad_norm": 0.24643084406852722, + "learning_rate": 1.4996379761108446e-05, + "loss": 1.2503, + "step": 11861 + }, + { + "epoch": 3.53305162046948, + "grad_norm": 0.24573387205600739, + "learning_rate": 1.4995544197379259e-05, + "loss": 1.2302, + "step": 11862 + }, + { + "epoch": 3.533349466669149, + "grad_norm": 0.24578134715557098, + "learning_rate": 1.4994708587172115e-05, + "loss": 1.2589, + "step": 11863 + }, + { + "epoch": 3.5336473128688173, + "grad_norm": 0.2325790375471115, + "learning_rate": 1.4993872930494785e-05, + "loss": 1.2398, + "step": 11864 + }, + { + "epoch": 3.533945159068486, + "grad_norm": 0.2433117777109146, + "learning_rate": 1.4993037227355052e-05, + "loss": 1.2145, + "step": 11865 + }, + { + "epoch": 3.534243005268155, + "grad_norm": 0.23357227444648743, + "learning_rate": 1.4992201477760683e-05, + "loss": 1.2572, + "step": 11866 + }, + { + "epoch": 3.534540851467823, + "grad_norm": 0.23796075582504272, + "learning_rate": 1.499136568171946e-05, + "loss": 1.2398, + "step": 11867 + }, + { + "epoch": 3.534838697667492, + "grad_norm": 0.23160098493099213, + "learning_rate": 1.4990529839239149e-05, + "loss": 1.2191, + "step": 11868 + }, + { + "epoch": 3.5351365438671607, + "grad_norm": 0.23105956614017487, + "learning_rate": 1.498969395032754e-05, + "loss": 1.2316, + "step": 11869 + }, + { + "epoch": 3.535434390066829, + "grad_norm": 0.2299727499485016, + "learning_rate": 1.49888580149924e-05, + "loss": 1.2294, + "step": 11870 + }, + { + "epoch": 3.535732236266498, + "grad_norm": 0.24049155414104462, + "learning_rate": 1.4988022033241511e-05, + "loss": 1.2391, + "step": 11871 + }, + { + "epoch": 3.5360300824661666, + "grad_norm": 0.24359889328479767, + "learning_rate": 1.4987186005082653e-05, + "loss": 1.2339, + "step": 11872 + }, + { + "epoch": 3.5363279286658353, + "grad_norm": 0.24411973357200623, + "learning_rate": 1.4986349930523599e-05, + "loss": 1.2423, + "step": 11873 + }, + { + "epoch": 3.5366257748655037, + "grad_norm": 0.2380755990743637, + "learning_rate": 1.4985513809572127e-05, + "loss": 1.2271, + "step": 11874 + }, + { + "epoch": 3.5369236210651724, + "grad_norm": 0.24348989129066467, + "learning_rate": 1.4984677642236021e-05, + "loss": 1.2445, + "step": 11875 + }, + { + "epoch": 3.537221467264841, + "grad_norm": 0.23694832623004913, + "learning_rate": 1.4983841428523063e-05, + "loss": 1.2291, + "step": 11876 + }, + { + "epoch": 3.53751931346451, + "grad_norm": 0.25284677743911743, + "learning_rate": 1.4983005168441023e-05, + "loss": 1.2235, + "step": 11877 + }, + { + "epoch": 3.5378171596641783, + "grad_norm": 0.23373016715049744, + "learning_rate": 1.498216886199769e-05, + "loss": 1.2136, + "step": 11878 + }, + { + "epoch": 3.538115005863847, + "grad_norm": 0.2290879487991333, + "learning_rate": 1.4981332509200842e-05, + "loss": 1.225, + "step": 11879 + }, + { + "epoch": 3.538412852063516, + "grad_norm": 0.23518285155296326, + "learning_rate": 1.498049611005826e-05, + "loss": 1.2389, + "step": 11880 + }, + { + "epoch": 3.538710698263184, + "grad_norm": 0.24591416120529175, + "learning_rate": 1.4979659664577727e-05, + "loss": 1.2266, + "step": 11881 + }, + { + "epoch": 3.539008544462853, + "grad_norm": 0.24626922607421875, + "learning_rate": 1.497882317276702e-05, + "loss": 1.244, + "step": 11882 + }, + { + "epoch": 3.5393063906625217, + "grad_norm": 0.24175675213336945, + "learning_rate": 1.4977986634633935e-05, + "loss": 1.2439, + "step": 11883 + }, + { + "epoch": 3.53960423686219, + "grad_norm": 0.24137884378433228, + "learning_rate": 1.497715005018624e-05, + "loss": 1.2444, + "step": 11884 + }, + { + "epoch": 3.539902083061859, + "grad_norm": 0.2463780641555786, + "learning_rate": 1.4976313419431725e-05, + "loss": 1.2387, + "step": 11885 + }, + { + "epoch": 3.5401999292615276, + "grad_norm": 0.24791429936885834, + "learning_rate": 1.4975476742378173e-05, + "loss": 1.2475, + "step": 11886 + }, + { + "epoch": 3.5404977754611964, + "grad_norm": 0.24221408367156982, + "learning_rate": 1.497464001903337e-05, + "loss": 1.2574, + "step": 11887 + }, + { + "epoch": 3.540795621660865, + "grad_norm": 0.24171023070812225, + "learning_rate": 1.49738032494051e-05, + "loss": 1.2451, + "step": 11888 + }, + { + "epoch": 3.5410934678605335, + "grad_norm": 0.2393396496772766, + "learning_rate": 1.4972966433501146e-05, + "loss": 1.2319, + "step": 11889 + }, + { + "epoch": 3.5413913140602022, + "grad_norm": 0.23181261122226715, + "learning_rate": 1.49721295713293e-05, + "loss": 1.233, + "step": 11890 + }, + { + "epoch": 3.541689160259871, + "grad_norm": 0.23957259953022003, + "learning_rate": 1.4971292662897339e-05, + "loss": 1.2541, + "step": 11891 + }, + { + "epoch": 3.5419870064595393, + "grad_norm": 0.2328466922044754, + "learning_rate": 1.4970455708213055e-05, + "loss": 1.2273, + "step": 11892 + }, + { + "epoch": 3.542284852659208, + "grad_norm": 0.2393094003200531, + "learning_rate": 1.4969618707284235e-05, + "loss": 1.2389, + "step": 11893 + }, + { + "epoch": 3.542582698858877, + "grad_norm": 0.2378576546907425, + "learning_rate": 1.4968781660118662e-05, + "loss": 1.2226, + "step": 11894 + }, + { + "epoch": 3.542880545058545, + "grad_norm": 0.2518276870250702, + "learning_rate": 1.496794456672413e-05, + "loss": 1.2331, + "step": 11895 + }, + { + "epoch": 3.543178391258214, + "grad_norm": 0.2509482800960541, + "learning_rate": 1.4967107427108427e-05, + "loss": 1.2249, + "step": 11896 + }, + { + "epoch": 3.5434762374578828, + "grad_norm": 0.252467542886734, + "learning_rate": 1.4966270241279333e-05, + "loss": 1.2326, + "step": 11897 + }, + { + "epoch": 3.543774083657551, + "grad_norm": 0.25453031063079834, + "learning_rate": 1.4965433009244646e-05, + "loss": 1.2448, + "step": 11898 + }, + { + "epoch": 3.54407192985722, + "grad_norm": 0.23521898686885834, + "learning_rate": 1.4964595731012153e-05, + "loss": 1.2322, + "step": 11899 + }, + { + "epoch": 3.5443697760568886, + "grad_norm": 0.25464823842048645, + "learning_rate": 1.4963758406589644e-05, + "loss": 1.2248, + "step": 11900 + }, + { + "epoch": 3.5446676222565574, + "grad_norm": 0.23565281927585602, + "learning_rate": 1.4962921035984907e-05, + "loss": 1.2396, + "step": 11901 + }, + { + "epoch": 3.544965468456226, + "grad_norm": 0.26107919216156006, + "learning_rate": 1.4962083619205737e-05, + "loss": 1.2437, + "step": 11902 + }, + { + "epoch": 3.5452633146558945, + "grad_norm": 0.2406272143125534, + "learning_rate": 1.496124615625992e-05, + "loss": 1.2454, + "step": 11903 + }, + { + "epoch": 3.5455611608555633, + "grad_norm": 0.24669715762138367, + "learning_rate": 1.4960408647155254e-05, + "loss": 1.2166, + "step": 11904 + }, + { + "epoch": 3.545859007055232, + "grad_norm": 0.23928460478782654, + "learning_rate": 1.4959571091899525e-05, + "loss": 1.2254, + "step": 11905 + }, + { + "epoch": 3.5461568532549004, + "grad_norm": 0.2638623118400574, + "learning_rate": 1.4958733490500528e-05, + "loss": 1.2453, + "step": 11906 + }, + { + "epoch": 3.546454699454569, + "grad_norm": 0.2426176816225052, + "learning_rate": 1.495789584296606e-05, + "loss": 1.2443, + "step": 11907 + }, + { + "epoch": 3.546752545654238, + "grad_norm": 0.2569550573825836, + "learning_rate": 1.4957058149303905e-05, + "loss": 1.2404, + "step": 11908 + }, + { + "epoch": 3.5470503918539062, + "grad_norm": 0.2393750548362732, + "learning_rate": 1.4956220409521862e-05, + "loss": 1.2319, + "step": 11909 + }, + { + "epoch": 3.547348238053575, + "grad_norm": 0.3033221960067749, + "learning_rate": 1.4955382623627733e-05, + "loss": 1.2341, + "step": 11910 + }, + { + "epoch": 3.547646084253244, + "grad_norm": 0.26827922463417053, + "learning_rate": 1.4954544791629299e-05, + "loss": 1.2331, + "step": 11911 + }, + { + "epoch": 3.547943930452912, + "grad_norm": 0.3209385871887207, + "learning_rate": 1.4953706913534363e-05, + "loss": 1.2247, + "step": 11912 + }, + { + "epoch": 3.548241776652581, + "grad_norm": 0.32322075963020325, + "learning_rate": 1.495286898935072e-05, + "loss": 1.212, + "step": 11913 + }, + { + "epoch": 3.5485396228522497, + "grad_norm": 0.293900728225708, + "learning_rate": 1.4952031019086161e-05, + "loss": 1.2294, + "step": 11914 + }, + { + "epoch": 3.5488374690519184, + "grad_norm": 0.3369714319705963, + "learning_rate": 1.4951193002748487e-05, + "loss": 1.2514, + "step": 11915 + }, + { + "epoch": 3.549135315251587, + "grad_norm": 0.26007702946662903, + "learning_rate": 1.4950354940345494e-05, + "loss": 1.252, + "step": 11916 + }, + { + "epoch": 3.5494331614512555, + "grad_norm": 0.2527429461479187, + "learning_rate": 1.4949516831884983e-05, + "loss": 1.22, + "step": 11917 + }, + { + "epoch": 3.5497310076509243, + "grad_norm": 0.29641637206077576, + "learning_rate": 1.4948678677374746e-05, + "loss": 1.2363, + "step": 11918 + }, + { + "epoch": 3.550028853850593, + "grad_norm": 0.24928542971611023, + "learning_rate": 1.494784047682258e-05, + "loss": 1.2495, + "step": 11919 + }, + { + "epoch": 3.5503267000502614, + "grad_norm": 0.26066356897354126, + "learning_rate": 1.494700223023629e-05, + "loss": 1.2314, + "step": 11920 + }, + { + "epoch": 3.55062454624993, + "grad_norm": 0.22851091623306274, + "learning_rate": 1.4946163937623668e-05, + "loss": 1.2382, + "step": 11921 + }, + { + "epoch": 3.550922392449599, + "grad_norm": 0.2403339147567749, + "learning_rate": 1.4945325598992519e-05, + "loss": 1.2143, + "step": 11922 + }, + { + "epoch": 3.5512202386492673, + "grad_norm": 0.2572495937347412, + "learning_rate": 1.494448721435064e-05, + "loss": 1.2267, + "step": 11923 + }, + { + "epoch": 3.551518084848936, + "grad_norm": 0.2491593360900879, + "learning_rate": 1.4943648783705833e-05, + "loss": 1.2413, + "step": 11924 + }, + { + "epoch": 3.551815931048605, + "grad_norm": 0.27651259303092957, + "learning_rate": 1.4942810307065895e-05, + "loss": 1.2252, + "step": 11925 + }, + { + "epoch": 3.552113777248273, + "grad_norm": 0.23403340578079224, + "learning_rate": 1.4941971784438631e-05, + "loss": 1.2177, + "step": 11926 + }, + { + "epoch": 3.552411623447942, + "grad_norm": 0.25173115730285645, + "learning_rate": 1.4941133215831842e-05, + "loss": 1.2338, + "step": 11927 + }, + { + "epoch": 3.5527094696476107, + "grad_norm": 0.2646122872829437, + "learning_rate": 1.4940294601253327e-05, + "loss": 1.2233, + "step": 11928 + }, + { + "epoch": 3.5530073158472795, + "grad_norm": 0.23748041689395905, + "learning_rate": 1.493945594071089e-05, + "loss": 1.2511, + "step": 11929 + }, + { + "epoch": 3.5533051620469482, + "grad_norm": 0.2807159423828125, + "learning_rate": 1.4938617234212336e-05, + "loss": 1.2441, + "step": 11930 + }, + { + "epoch": 3.5536030082466166, + "grad_norm": 0.23702220618724823, + "learning_rate": 1.4937778481765465e-05, + "loss": 1.2379, + "step": 11931 + }, + { + "epoch": 3.5539008544462853, + "grad_norm": 0.27692580223083496, + "learning_rate": 1.4936939683378085e-05, + "loss": 1.2328, + "step": 11932 + }, + { + "epoch": 3.554198700645954, + "grad_norm": 0.24006317555904388, + "learning_rate": 1.4936100839057992e-05, + "loss": 1.2243, + "step": 11933 + }, + { + "epoch": 3.5544965468456224, + "grad_norm": 0.2403414100408554, + "learning_rate": 1.4935261948813e-05, + "loss": 1.2273, + "step": 11934 + }, + { + "epoch": 3.554794393045291, + "grad_norm": 0.2287728637456894, + "learning_rate": 1.4934423012650906e-05, + "loss": 1.2339, + "step": 11935 + }, + { + "epoch": 3.55509223924496, + "grad_norm": 0.24130764603614807, + "learning_rate": 1.4933584030579523e-05, + "loss": 1.2199, + "step": 11936 + }, + { + "epoch": 3.5553900854446283, + "grad_norm": 0.2393208146095276, + "learning_rate": 1.4932745002606652e-05, + "loss": 1.2486, + "step": 11937 + }, + { + "epoch": 3.555687931644297, + "grad_norm": 0.24160076677799225, + "learning_rate": 1.4931905928740097e-05, + "loss": 1.2314, + "step": 11938 + }, + { + "epoch": 3.555985777843966, + "grad_norm": 0.2669520378112793, + "learning_rate": 1.4931066808987668e-05, + "loss": 1.2264, + "step": 11939 + }, + { + "epoch": 3.5562836240436346, + "grad_norm": 0.24485118687152863, + "learning_rate": 1.4930227643357174e-05, + "loss": 1.254, + "step": 11940 + }, + { + "epoch": 3.556581470243303, + "grad_norm": 0.2909088134765625, + "learning_rate": 1.492938843185642e-05, + "loss": 1.237, + "step": 11941 + }, + { + "epoch": 3.5568793164429717, + "grad_norm": 0.2594226002693176, + "learning_rate": 1.4928549174493214e-05, + "loss": 1.2244, + "step": 11942 + }, + { + "epoch": 3.5571771626426405, + "grad_norm": 0.26096615195274353, + "learning_rate": 1.4927709871275361e-05, + "loss": 1.232, + "step": 11943 + }, + { + "epoch": 3.5574750088423093, + "grad_norm": 0.24414198100566864, + "learning_rate": 1.4926870522210676e-05, + "loss": 1.2366, + "step": 11944 + }, + { + "epoch": 3.5577728550419776, + "grad_norm": 0.30891233682632446, + "learning_rate": 1.4926031127306965e-05, + "loss": 1.2288, + "step": 11945 + }, + { + "epoch": 3.5580707012416464, + "grad_norm": 0.31807658076286316, + "learning_rate": 1.492519168657204e-05, + "loss": 1.2247, + "step": 11946 + }, + { + "epoch": 3.558368547441315, + "grad_norm": 0.24587668478488922, + "learning_rate": 1.4924352200013706e-05, + "loss": 1.2351, + "step": 11947 + }, + { + "epoch": 3.5586663936409835, + "grad_norm": 0.3405172526836395, + "learning_rate": 1.4923512667639778e-05, + "loss": 1.2371, + "step": 11948 + }, + { + "epoch": 3.5589642398406522, + "grad_norm": 0.280850350856781, + "learning_rate": 1.4922673089458066e-05, + "loss": 1.238, + "step": 11949 + }, + { + "epoch": 3.559262086040321, + "grad_norm": 0.2465774565935135, + "learning_rate": 1.492183346547638e-05, + "loss": 1.2234, + "step": 11950 + }, + { + "epoch": 3.5595599322399893, + "grad_norm": 0.23266535997390747, + "learning_rate": 1.4920993795702535e-05, + "loss": 1.2268, + "step": 11951 + }, + { + "epoch": 3.559857778439658, + "grad_norm": 0.27076929807662964, + "learning_rate": 1.4920154080144337e-05, + "loss": 1.2398, + "step": 11952 + }, + { + "epoch": 3.560155624639327, + "grad_norm": 0.25998640060424805, + "learning_rate": 1.4919314318809603e-05, + "loss": 1.2302, + "step": 11953 + }, + { + "epoch": 3.5604534708389957, + "grad_norm": 0.254976361989975, + "learning_rate": 1.491847451170615e-05, + "loss": 1.2367, + "step": 11954 + }, + { + "epoch": 3.5607513170386644, + "grad_norm": 0.28322747349739075, + "learning_rate": 1.4917634658841782e-05, + "loss": 1.236, + "step": 11955 + }, + { + "epoch": 3.5610491632383328, + "grad_norm": 0.24852986633777618, + "learning_rate": 1.4916794760224318e-05, + "loss": 1.2285, + "step": 11956 + }, + { + "epoch": 3.5613470094380015, + "grad_norm": 0.23009903728961945, + "learning_rate": 1.4915954815861572e-05, + "loss": 1.2479, + "step": 11957 + }, + { + "epoch": 3.5616448556376703, + "grad_norm": 0.2647802233695984, + "learning_rate": 1.491511482576136e-05, + "loss": 1.2466, + "step": 11958 + }, + { + "epoch": 3.5619427018373386, + "grad_norm": 0.23196657001972198, + "learning_rate": 1.4914274789931498e-05, + "loss": 1.2254, + "step": 11959 + }, + { + "epoch": 3.5622405480370074, + "grad_norm": 0.30095914006233215, + "learning_rate": 1.4913434708379796e-05, + "loss": 1.2454, + "step": 11960 + }, + { + "epoch": 3.562538394236676, + "grad_norm": 0.31365150213241577, + "learning_rate": 1.4912594581114077e-05, + "loss": 1.2291, + "step": 11961 + }, + { + "epoch": 3.5628362404363445, + "grad_norm": 0.24199581146240234, + "learning_rate": 1.4911754408142152e-05, + "loss": 1.2305, + "step": 11962 + }, + { + "epoch": 3.5631340866360133, + "grad_norm": 0.3605521023273468, + "learning_rate": 1.4910914189471841e-05, + "loss": 1.2339, + "step": 11963 + }, + { + "epoch": 3.563431932835682, + "grad_norm": 0.31536492705345154, + "learning_rate": 1.491007392511096e-05, + "loss": 1.2368, + "step": 11964 + }, + { + "epoch": 3.5637297790353504, + "grad_norm": 0.2886471450328827, + "learning_rate": 1.4909233615067326e-05, + "loss": 1.2269, + "step": 11965 + }, + { + "epoch": 3.564027625235019, + "grad_norm": 0.4995361268520355, + "learning_rate": 1.4908393259348761e-05, + "loss": 1.2336, + "step": 11966 + }, + { + "epoch": 3.564325471434688, + "grad_norm": 0.30417266488075256, + "learning_rate": 1.4907552857963077e-05, + "loss": 1.2468, + "step": 11967 + }, + { + "epoch": 3.5646233176343567, + "grad_norm": 0.3080073297023773, + "learning_rate": 1.49067124109181e-05, + "loss": 1.2303, + "step": 11968 + }, + { + "epoch": 3.5649211638340255, + "grad_norm": 0.22961090505123138, + "learning_rate": 1.4905871918221643e-05, + "loss": 1.2319, + "step": 11969 + }, + { + "epoch": 3.565219010033694, + "grad_norm": 0.3508792519569397, + "learning_rate": 1.490503137988153e-05, + "loss": 1.2466, + "step": 11970 + }, + { + "epoch": 3.5655168562333626, + "grad_norm": 0.24273687601089478, + "learning_rate": 1.4904190795905584e-05, + "loss": 1.2238, + "step": 11971 + }, + { + "epoch": 3.5658147024330313, + "grad_norm": 0.25304114818573, + "learning_rate": 1.4903350166301618e-05, + "loss": 1.217, + "step": 11972 + }, + { + "epoch": 3.5661125486326997, + "grad_norm": 0.24923963844776154, + "learning_rate": 1.4902509491077458e-05, + "loss": 1.2345, + "step": 11973 + }, + { + "epoch": 3.5664103948323684, + "grad_norm": 0.25437408685684204, + "learning_rate": 1.490166877024092e-05, + "loss": 1.2366, + "step": 11974 + }, + { + "epoch": 3.566708241032037, + "grad_norm": 0.2828586995601654, + "learning_rate": 1.4900828003799839e-05, + "loss": 1.2372, + "step": 11975 + }, + { + "epoch": 3.5670060872317055, + "grad_norm": 0.2456802874803543, + "learning_rate": 1.4899987191762022e-05, + "loss": 1.2166, + "step": 11976 + }, + { + "epoch": 3.5673039334313743, + "grad_norm": 0.27970948815345764, + "learning_rate": 1.48991463341353e-05, + "loss": 1.2434, + "step": 11977 + }, + { + "epoch": 3.567601779631043, + "grad_norm": 0.2370612472295761, + "learning_rate": 1.48983054309275e-05, + "loss": 1.2381, + "step": 11978 + }, + { + "epoch": 3.5678996258307114, + "grad_norm": 0.24851901829242706, + "learning_rate": 1.4897464482146435e-05, + "loss": 1.2598, + "step": 11979 + }, + { + "epoch": 3.56819747203038, + "grad_norm": 0.2595925033092499, + "learning_rate": 1.4896623487799937e-05, + "loss": 1.2323, + "step": 11980 + }, + { + "epoch": 3.568495318230049, + "grad_norm": 0.2781471610069275, + "learning_rate": 1.4895782447895828e-05, + "loss": 1.2503, + "step": 11981 + }, + { + "epoch": 3.5687931644297177, + "grad_norm": 0.2639451026916504, + "learning_rate": 1.4894941362441935e-05, + "loss": 1.243, + "step": 11982 + }, + { + "epoch": 3.5690910106293865, + "grad_norm": 0.2645019292831421, + "learning_rate": 1.489410023144608e-05, + "loss": 1.2386, + "step": 11983 + }, + { + "epoch": 3.569388856829055, + "grad_norm": 0.25453054904937744, + "learning_rate": 1.489325905491609e-05, + "loss": 1.2409, + "step": 11984 + }, + { + "epoch": 3.5696867030287236, + "grad_norm": 0.24270634353160858, + "learning_rate": 1.4892417832859792e-05, + "loss": 1.2437, + "step": 11985 + }, + { + "epoch": 3.5699845492283924, + "grad_norm": 0.26533761620521545, + "learning_rate": 1.489157656528501e-05, + "loss": 1.2184, + "step": 11986 + }, + { + "epoch": 3.5702823954280607, + "grad_norm": 0.254607230424881, + "learning_rate": 1.4890735252199578e-05, + "loss": 1.2329, + "step": 11987 + }, + { + "epoch": 3.5705802416277295, + "grad_norm": 0.27076947689056396, + "learning_rate": 1.4889893893611317e-05, + "loss": 1.2327, + "step": 11988 + }, + { + "epoch": 3.5708780878273982, + "grad_norm": 0.25077369809150696, + "learning_rate": 1.4889052489528055e-05, + "loss": 1.2387, + "step": 11989 + }, + { + "epoch": 3.5711759340270666, + "grad_norm": 0.2612989842891693, + "learning_rate": 1.4888211039957623e-05, + "loss": 1.2494, + "step": 11990 + }, + { + "epoch": 3.5714737802267353, + "grad_norm": 0.2401239573955536, + "learning_rate": 1.4887369544907848e-05, + "loss": 1.2238, + "step": 11991 + }, + { + "epoch": 3.571771626426404, + "grad_norm": 0.24555446207523346, + "learning_rate": 1.488652800438656e-05, + "loss": 1.2347, + "step": 11992 + }, + { + "epoch": 3.5720694726260724, + "grad_norm": 0.2627737522125244, + "learning_rate": 1.4885686418401588e-05, + "loss": 1.2241, + "step": 11993 + }, + { + "epoch": 3.572367318825741, + "grad_norm": 0.2304520308971405, + "learning_rate": 1.4884844786960763e-05, + "loss": 1.2111, + "step": 11994 + }, + { + "epoch": 3.57266516502541, + "grad_norm": 0.23122353851795197, + "learning_rate": 1.4884003110071918e-05, + "loss": 1.2239, + "step": 11995 + }, + { + "epoch": 3.5729630112250788, + "grad_norm": 0.23823879659175873, + "learning_rate": 1.4883161387742877e-05, + "loss": 1.2433, + "step": 11996 + }, + { + "epoch": 3.5732608574247475, + "grad_norm": 0.2689054012298584, + "learning_rate": 1.4882319619981473e-05, + "loss": 1.2238, + "step": 11997 + }, + { + "epoch": 3.573558703624416, + "grad_norm": 0.3108246326446533, + "learning_rate": 1.4881477806795543e-05, + "loss": 1.2272, + "step": 11998 + }, + { + "epoch": 3.5738565498240846, + "grad_norm": 0.2484017014503479, + "learning_rate": 1.4880635948192918e-05, + "loss": 1.1908, + "step": 11999 + }, + { + "epoch": 3.5741543960237534, + "grad_norm": 0.28578975796699524, + "learning_rate": 1.4879794044181426e-05, + "loss": 1.2269, + "step": 12000 + }, + { + "epoch": 3.5741543960237534, + "eval_loss": 1.331367015838623, + "eval_runtime": 20.7914, + "eval_samples_per_second": 83.4, + "eval_steps_per_second": 5.243, + "step": 12000 + }, + { + "epoch": 3.5744522422234217, + "grad_norm": 0.2474474161863327, + "learning_rate": 1.4878952094768904e-05, + "loss": 1.2152, + "step": 12001 + }, + { + "epoch": 3.5747500884230905, + "grad_norm": 0.24221831560134888, + "learning_rate": 1.4878110099963181e-05, + "loss": 1.223, + "step": 12002 + }, + { + "epoch": 3.5750479346227593, + "grad_norm": 0.24997448921203613, + "learning_rate": 1.4877268059772094e-05, + "loss": 1.2321, + "step": 12003 + }, + { + "epoch": 3.5753457808224276, + "grad_norm": 0.24379056692123413, + "learning_rate": 1.487642597420348e-05, + "loss": 1.2337, + "step": 12004 + }, + { + "epoch": 3.5756436270220964, + "grad_norm": 0.266418993473053, + "learning_rate": 1.4875583843265169e-05, + "loss": 1.2307, + "step": 12005 + }, + { + "epoch": 3.575941473221765, + "grad_norm": 0.304832398891449, + "learning_rate": 1.4874741666964999e-05, + "loss": 1.2186, + "step": 12006 + }, + { + "epoch": 3.576239319421434, + "grad_norm": 0.24056296050548553, + "learning_rate": 1.48738994453108e-05, + "loss": 1.2266, + "step": 12007 + }, + { + "epoch": 3.5765371656211022, + "grad_norm": 0.2905622720718384, + "learning_rate": 1.4873057178310414e-05, + "loss": 1.2372, + "step": 12008 + }, + { + "epoch": 3.576835011820771, + "grad_norm": 0.40158170461654663, + "learning_rate": 1.4872214865971677e-05, + "loss": 1.2298, + "step": 12009 + }, + { + "epoch": 3.57713285802044, + "grad_norm": 0.3111315071582794, + "learning_rate": 1.4871372508302422e-05, + "loss": 1.2395, + "step": 12010 + }, + { + "epoch": 3.5774307042201086, + "grad_norm": 0.26447662711143494, + "learning_rate": 1.487053010531049e-05, + "loss": 1.2399, + "step": 12011 + }, + { + "epoch": 3.577728550419777, + "grad_norm": 0.4011225700378418, + "learning_rate": 1.486968765700372e-05, + "loss": 1.2257, + "step": 12012 + }, + { + "epoch": 3.5780263966194457, + "grad_norm": 0.30219927430152893, + "learning_rate": 1.4868845163389942e-05, + "loss": 1.2365, + "step": 12013 + }, + { + "epoch": 3.5783242428191144, + "grad_norm": 0.2629011571407318, + "learning_rate": 1.4868002624477003e-05, + "loss": 1.2261, + "step": 12014 + }, + { + "epoch": 3.5786220890187828, + "grad_norm": 0.29616492986679077, + "learning_rate": 1.4867160040272732e-05, + "loss": 1.2259, + "step": 12015 + }, + { + "epoch": 3.5789199352184515, + "grad_norm": 0.2366276979446411, + "learning_rate": 1.4866317410784982e-05, + "loss": 1.2312, + "step": 12016 + }, + { + "epoch": 3.5792177814181203, + "grad_norm": 0.28599026799201965, + "learning_rate": 1.4865474736021584e-05, + "loss": 1.2182, + "step": 12017 + }, + { + "epoch": 3.5795156276177886, + "grad_norm": 0.26458537578582764, + "learning_rate": 1.4864632015990377e-05, + "loss": 1.2507, + "step": 12018 + }, + { + "epoch": 3.5798134738174574, + "grad_norm": 0.2551570534706116, + "learning_rate": 1.486378925069921e-05, + "loss": 1.2448, + "step": 12019 + }, + { + "epoch": 3.580111320017126, + "grad_norm": 0.2608723044395447, + "learning_rate": 1.486294644015591e-05, + "loss": 1.2223, + "step": 12020 + }, + { + "epoch": 3.580409166216795, + "grad_norm": 0.2454732060432434, + "learning_rate": 1.486210358436833e-05, + "loss": 1.2234, + "step": 12021 + }, + { + "epoch": 3.5807070124164637, + "grad_norm": 0.3151560425758362, + "learning_rate": 1.486126068334431e-05, + "loss": 1.2203, + "step": 12022 + }, + { + "epoch": 3.581004858616132, + "grad_norm": 0.25535303354263306, + "learning_rate": 1.486041773709169e-05, + "loss": 1.24, + "step": 12023 + }, + { + "epoch": 3.581302704815801, + "grad_norm": 0.2909095287322998, + "learning_rate": 1.4859574745618312e-05, + "loss": 1.2474, + "step": 12024 + }, + { + "epoch": 3.5816005510154696, + "grad_norm": 0.2343895584344864, + "learning_rate": 1.4858731708932022e-05, + "loss": 1.2393, + "step": 12025 + }, + { + "epoch": 3.581898397215138, + "grad_norm": 0.5344161987304688, + "learning_rate": 1.4857888627040662e-05, + "loss": 1.2285, + "step": 12026 + }, + { + "epoch": 3.5821962434148067, + "grad_norm": 0.31793347001075745, + "learning_rate": 1.4857045499952075e-05, + "loss": 1.2365, + "step": 12027 + }, + { + "epoch": 3.5824940896144755, + "grad_norm": 0.24410374462604523, + "learning_rate": 1.4856202327674107e-05, + "loss": 1.2378, + "step": 12028 + }, + { + "epoch": 3.582791935814144, + "grad_norm": 0.24520458281040192, + "learning_rate": 1.4855359110214602e-05, + "loss": 1.2227, + "step": 12029 + }, + { + "epoch": 3.5830897820138126, + "grad_norm": 0.24458833038806915, + "learning_rate": 1.4854515847581405e-05, + "loss": 1.245, + "step": 12030 + }, + { + "epoch": 3.5833876282134813, + "grad_norm": 0.24848031997680664, + "learning_rate": 1.485367253978236e-05, + "loss": 1.2436, + "step": 12031 + }, + { + "epoch": 3.5836854744131497, + "grad_norm": 0.24862825870513916, + "learning_rate": 1.4852829186825318e-05, + "loss": 1.2477, + "step": 12032 + }, + { + "epoch": 3.5839833206128184, + "grad_norm": 0.22774432599544525, + "learning_rate": 1.485198578871812e-05, + "loss": 1.227, + "step": 12033 + }, + { + "epoch": 3.584281166812487, + "grad_norm": 0.24232453107833862, + "learning_rate": 1.4851142345468614e-05, + "loss": 1.2433, + "step": 12034 + }, + { + "epoch": 3.584579013012156, + "grad_norm": 0.24190890789031982, + "learning_rate": 1.485029885708465e-05, + "loss": 1.246, + "step": 12035 + }, + { + "epoch": 3.5848768592118248, + "grad_norm": 0.2451910376548767, + "learning_rate": 1.4849455323574078e-05, + "loss": 1.2342, + "step": 12036 + }, + { + "epoch": 3.585174705411493, + "grad_norm": 0.243106871843338, + "learning_rate": 1.4848611744944739e-05, + "loss": 1.2261, + "step": 12037 + }, + { + "epoch": 3.585472551611162, + "grad_norm": 0.24494658410549164, + "learning_rate": 1.4847768121204482e-05, + "loss": 1.246, + "step": 12038 + }, + { + "epoch": 3.5857703978108306, + "grad_norm": 0.23604218661785126, + "learning_rate": 1.484692445236116e-05, + "loss": 1.2077, + "step": 12039 + }, + { + "epoch": 3.586068244010499, + "grad_norm": 0.23879358172416687, + "learning_rate": 1.4846080738422626e-05, + "loss": 1.2264, + "step": 12040 + }, + { + "epoch": 3.5863660902101677, + "grad_norm": 0.23756951093673706, + "learning_rate": 1.4845236979396722e-05, + "loss": 1.2054, + "step": 12041 + }, + { + "epoch": 3.5866639364098365, + "grad_norm": 0.25303810834884644, + "learning_rate": 1.48443931752913e-05, + "loss": 1.2401, + "step": 12042 + }, + { + "epoch": 3.586961782609505, + "grad_norm": 0.24577119946479797, + "learning_rate": 1.4843549326114217e-05, + "loss": 1.2492, + "step": 12043 + }, + { + "epoch": 3.5872596288091736, + "grad_norm": 0.24664612114429474, + "learning_rate": 1.4842705431873312e-05, + "loss": 1.2286, + "step": 12044 + }, + { + "epoch": 3.5875574750088424, + "grad_norm": 0.23486971855163574, + "learning_rate": 1.4841861492576448e-05, + "loss": 1.2246, + "step": 12045 + }, + { + "epoch": 3.5878553212085107, + "grad_norm": 0.24063166975975037, + "learning_rate": 1.484101750823147e-05, + "loss": 1.2258, + "step": 12046 + }, + { + "epoch": 3.5881531674081795, + "grad_norm": 0.25894486904144287, + "learning_rate": 1.4840173478846236e-05, + "loss": 1.2471, + "step": 12047 + }, + { + "epoch": 3.5884510136078482, + "grad_norm": 0.22805535793304443, + "learning_rate": 1.4839329404428593e-05, + "loss": 1.2395, + "step": 12048 + }, + { + "epoch": 3.588748859807517, + "grad_norm": 0.2343277782201767, + "learning_rate": 1.4838485284986398e-05, + "loss": 1.2377, + "step": 12049 + }, + { + "epoch": 3.589046706007186, + "grad_norm": 0.24033649265766144, + "learning_rate": 1.4837641120527502e-05, + "loss": 1.2191, + "step": 12050 + }, + { + "epoch": 3.589344552206854, + "grad_norm": 0.23253053426742554, + "learning_rate": 1.483679691105976e-05, + "loss": 1.2205, + "step": 12051 + }, + { + "epoch": 3.589642398406523, + "grad_norm": 0.23812629282474518, + "learning_rate": 1.4835952656591028e-05, + "loss": 1.2238, + "step": 12052 + }, + { + "epoch": 3.5899402446061917, + "grad_norm": 0.23149777948856354, + "learning_rate": 1.4835108357129162e-05, + "loss": 1.2257, + "step": 12053 + }, + { + "epoch": 3.59023809080586, + "grad_norm": 0.2525610327720642, + "learning_rate": 1.4834264012682012e-05, + "loss": 1.2357, + "step": 12054 + }, + { + "epoch": 3.5905359370055288, + "grad_norm": 0.24097470939159393, + "learning_rate": 1.4833419623257437e-05, + "loss": 1.2198, + "step": 12055 + }, + { + "epoch": 3.5908337832051975, + "grad_norm": 0.2267192006111145, + "learning_rate": 1.483257518886329e-05, + "loss": 1.2163, + "step": 12056 + }, + { + "epoch": 3.591131629404866, + "grad_norm": 0.24324989318847656, + "learning_rate": 1.4831730709507436e-05, + "loss": 1.2298, + "step": 12057 + }, + { + "epoch": 3.5914294756045346, + "grad_norm": 0.25552991032600403, + "learning_rate": 1.4830886185197719e-05, + "loss": 1.2338, + "step": 12058 + }, + { + "epoch": 3.5917273218042034, + "grad_norm": 0.24068230390548706, + "learning_rate": 1.4830041615942008e-05, + "loss": 1.2358, + "step": 12059 + }, + { + "epoch": 3.5920251680038717, + "grad_norm": 0.2373005896806717, + "learning_rate": 1.4829197001748156e-05, + "loss": 1.226, + "step": 12060 + }, + { + "epoch": 3.5923230142035405, + "grad_norm": 0.25316908955574036, + "learning_rate": 1.4828352342624021e-05, + "loss": 1.2327, + "step": 12061 + }, + { + "epoch": 3.5926208604032093, + "grad_norm": 0.2392289936542511, + "learning_rate": 1.4827507638577459e-05, + "loss": 1.2406, + "step": 12062 + }, + { + "epoch": 3.592918706602878, + "grad_norm": 0.23246882855892181, + "learning_rate": 1.4826662889616335e-05, + "loss": 1.2367, + "step": 12063 + }, + { + "epoch": 3.593216552802547, + "grad_norm": 0.2436981499195099, + "learning_rate": 1.4825818095748505e-05, + "loss": 1.2418, + "step": 12064 + }, + { + "epoch": 3.593514399002215, + "grad_norm": 0.23481963574886322, + "learning_rate": 1.4824973256981831e-05, + "loss": 1.2362, + "step": 12065 + }, + { + "epoch": 3.593812245201884, + "grad_norm": 0.23828721046447754, + "learning_rate": 1.4824128373324168e-05, + "loss": 1.2354, + "step": 12066 + }, + { + "epoch": 3.5941100914015527, + "grad_norm": 0.23842285573482513, + "learning_rate": 1.4823283444783383e-05, + "loss": 1.2503, + "step": 12067 + }, + { + "epoch": 3.594407937601221, + "grad_norm": 0.23747295141220093, + "learning_rate": 1.4822438471367333e-05, + "loss": 1.2382, + "step": 12068 + }, + { + "epoch": 3.59470578380089, + "grad_norm": 0.24500344693660736, + "learning_rate": 1.4821593453083883e-05, + "loss": 1.229, + "step": 12069 + }, + { + "epoch": 3.5950036300005586, + "grad_norm": 0.2548135817050934, + "learning_rate": 1.4820748389940894e-05, + "loss": 1.2261, + "step": 12070 + }, + { + "epoch": 3.595301476200227, + "grad_norm": 0.2509630024433136, + "learning_rate": 1.4819903281946224e-05, + "loss": 1.247, + "step": 12071 + }, + { + "epoch": 3.5955993223998957, + "grad_norm": 0.23769141733646393, + "learning_rate": 1.481905812910774e-05, + "loss": 1.2391, + "step": 12072 + }, + { + "epoch": 3.5958971685995644, + "grad_norm": 0.23118427395820618, + "learning_rate": 1.4818212931433305e-05, + "loss": 1.2214, + "step": 12073 + }, + { + "epoch": 3.596195014799233, + "grad_norm": 0.2310144454240799, + "learning_rate": 1.481736768893078e-05, + "loss": 1.2389, + "step": 12074 + }, + { + "epoch": 3.5964928609989015, + "grad_norm": 0.24245847761631012, + "learning_rate": 1.481652240160803e-05, + "loss": 1.2408, + "step": 12075 + }, + { + "epoch": 3.5967907071985703, + "grad_norm": 0.2524879574775696, + "learning_rate": 1.4815677069472924e-05, + "loss": 1.2236, + "step": 12076 + }, + { + "epoch": 3.597088553398239, + "grad_norm": 0.23940180242061615, + "learning_rate": 1.4814831692533322e-05, + "loss": 1.2249, + "step": 12077 + }, + { + "epoch": 3.597386399597908, + "grad_norm": 0.24941429495811462, + "learning_rate": 1.4813986270797091e-05, + "loss": 1.2277, + "step": 12078 + }, + { + "epoch": 3.597684245797576, + "grad_norm": 0.24452979862689972, + "learning_rate": 1.4813140804272097e-05, + "loss": 1.204, + "step": 12079 + }, + { + "epoch": 3.597982091997245, + "grad_norm": 0.2458629608154297, + "learning_rate": 1.4812295292966201e-05, + "loss": 1.2436, + "step": 12080 + }, + { + "epoch": 3.5982799381969137, + "grad_norm": 0.2509615421295166, + "learning_rate": 1.481144973688728e-05, + "loss": 1.2353, + "step": 12081 + }, + { + "epoch": 3.598577784396582, + "grad_norm": 0.26614630222320557, + "learning_rate": 1.481060413604319e-05, + "loss": 1.2352, + "step": 12082 + }, + { + "epoch": 3.598875630596251, + "grad_norm": 0.2650357782840729, + "learning_rate": 1.4809758490441806e-05, + "loss": 1.2298, + "step": 12083 + }, + { + "epoch": 3.5991734767959196, + "grad_norm": 0.2656654417514801, + "learning_rate": 1.4808912800090994e-05, + "loss": 1.2305, + "step": 12084 + }, + { + "epoch": 3.599471322995588, + "grad_norm": 0.2687116265296936, + "learning_rate": 1.4808067064998618e-05, + "loss": 1.2336, + "step": 12085 + }, + { + "epoch": 3.5997691691952567, + "grad_norm": 0.24685779213905334, + "learning_rate": 1.4807221285172548e-05, + "loss": 1.2444, + "step": 12086 + }, + { + "epoch": 3.6000670153949255, + "grad_norm": 0.2496700882911682, + "learning_rate": 1.4806375460620661e-05, + "loss": 1.2561, + "step": 12087 + }, + { + "epoch": 3.6003648615945942, + "grad_norm": 0.25268059968948364, + "learning_rate": 1.4805529591350816e-05, + "loss": 1.232, + "step": 12088 + }, + { + "epoch": 3.600662707794263, + "grad_norm": 0.23773519694805145, + "learning_rate": 1.480468367737089e-05, + "loss": 1.2568, + "step": 12089 + }, + { + "epoch": 3.6009605539939313, + "grad_norm": 0.2564716637134552, + "learning_rate": 1.4803837718688746e-05, + "loss": 1.2215, + "step": 12090 + }, + { + "epoch": 3.6012584001936, + "grad_norm": 0.2356421947479248, + "learning_rate": 1.4802991715312262e-05, + "loss": 1.2314, + "step": 12091 + }, + { + "epoch": 3.601556246393269, + "grad_norm": 0.2563536465167999, + "learning_rate": 1.4802145667249308e-05, + "loss": 1.2069, + "step": 12092 + }, + { + "epoch": 3.601854092592937, + "grad_norm": 0.25942927598953247, + "learning_rate": 1.4801299574507753e-05, + "loss": 1.207, + "step": 12093 + }, + { + "epoch": 3.602151938792606, + "grad_norm": 0.28995251655578613, + "learning_rate": 1.480045343709547e-05, + "loss": 1.2312, + "step": 12094 + }, + { + "epoch": 3.6024497849922748, + "grad_norm": 0.2602880299091339, + "learning_rate": 1.479960725502033e-05, + "loss": 1.2258, + "step": 12095 + }, + { + "epoch": 3.602747631191943, + "grad_norm": 0.23588363826274872, + "learning_rate": 1.4798761028290207e-05, + "loss": 1.2302, + "step": 12096 + }, + { + "epoch": 3.603045477391612, + "grad_norm": 0.270133376121521, + "learning_rate": 1.4797914756912974e-05, + "loss": 1.2342, + "step": 12097 + }, + { + "epoch": 3.6033433235912806, + "grad_norm": 0.25792115926742554, + "learning_rate": 1.4797068440896506e-05, + "loss": 1.2403, + "step": 12098 + }, + { + "epoch": 3.603641169790949, + "grad_norm": 0.24684999883174896, + "learning_rate": 1.4796222080248675e-05, + "loss": 1.2292, + "step": 12099 + }, + { + "epoch": 3.6039390159906177, + "grad_norm": 0.2691803276538849, + "learning_rate": 1.4795375674977354e-05, + "loss": 1.2436, + "step": 12100 + }, + { + "epoch": 3.6042368621902865, + "grad_norm": 0.2561197876930237, + "learning_rate": 1.4794529225090423e-05, + "loss": 1.228, + "step": 12101 + }, + { + "epoch": 3.6045347083899553, + "grad_norm": 0.2439146190881729, + "learning_rate": 1.4793682730595756e-05, + "loss": 1.243, + "step": 12102 + }, + { + "epoch": 3.604832554589624, + "grad_norm": 0.26815205812454224, + "learning_rate": 1.479283619150122e-05, + "loss": 1.2318, + "step": 12103 + }, + { + "epoch": 3.6051304007892924, + "grad_norm": 0.2403976321220398, + "learning_rate": 1.4791989607814703e-05, + "loss": 1.235, + "step": 12104 + }, + { + "epoch": 3.605428246988961, + "grad_norm": 0.26826152205467224, + "learning_rate": 1.4791142979544077e-05, + "loss": 1.2315, + "step": 12105 + }, + { + "epoch": 3.60572609318863, + "grad_norm": 0.3150516450405121, + "learning_rate": 1.4790296306697219e-05, + "loss": 1.2262, + "step": 12106 + }, + { + "epoch": 3.6060239393882982, + "grad_norm": 0.23319309949874878, + "learning_rate": 1.4789449589282004e-05, + "loss": 1.2212, + "step": 12107 + }, + { + "epoch": 3.606321785587967, + "grad_norm": 0.4288104176521301, + "learning_rate": 1.478860282730631e-05, + "loss": 1.2559, + "step": 12108 + }, + { + "epoch": 3.606619631787636, + "grad_norm": 0.34519460797309875, + "learning_rate": 1.4787756020778022e-05, + "loss": 1.2356, + "step": 12109 + }, + { + "epoch": 3.606917477987304, + "grad_norm": 0.3091725707054138, + "learning_rate": 1.4786909169705008e-05, + "loss": 1.221, + "step": 12110 + }, + { + "epoch": 3.607215324186973, + "grad_norm": 0.27726104855537415, + "learning_rate": 1.4786062274095159e-05, + "loss": 1.2286, + "step": 12111 + }, + { + "epoch": 3.6075131703866417, + "grad_norm": 0.33248814940452576, + "learning_rate": 1.4785215333956342e-05, + "loss": 1.2243, + "step": 12112 + }, + { + "epoch": 3.60781101658631, + "grad_norm": 0.2328971028327942, + "learning_rate": 1.4784368349296446e-05, + "loss": 1.2258, + "step": 12113 + }, + { + "epoch": 3.6081088627859788, + "grad_norm": 0.2641240358352661, + "learning_rate": 1.4783521320123347e-05, + "loss": 1.2343, + "step": 12114 + }, + { + "epoch": 3.6084067089856475, + "grad_norm": 0.29334887862205505, + "learning_rate": 1.4782674246444924e-05, + "loss": 1.2187, + "step": 12115 + }, + { + "epoch": 3.6087045551853163, + "grad_norm": 0.23718927800655365, + "learning_rate": 1.4781827128269062e-05, + "loss": 1.2179, + "step": 12116 + }, + { + "epoch": 3.609002401384985, + "grad_norm": 0.30184462666511536, + "learning_rate": 1.4780979965603642e-05, + "loss": 1.2395, + "step": 12117 + }, + { + "epoch": 3.6093002475846534, + "grad_norm": 0.25319600105285645, + "learning_rate": 1.4780132758456547e-05, + "loss": 1.2373, + "step": 12118 + }, + { + "epoch": 3.609598093784322, + "grad_norm": 0.25590604543685913, + "learning_rate": 1.4779285506835654e-05, + "loss": 1.2381, + "step": 12119 + }, + { + "epoch": 3.609895939983991, + "grad_norm": 0.27177754044532776, + "learning_rate": 1.4778438210748851e-05, + "loss": 1.2252, + "step": 12120 + }, + { + "epoch": 3.6101937861836593, + "grad_norm": 0.2445678561925888, + "learning_rate": 1.4777590870204015e-05, + "loss": 1.23, + "step": 12121 + }, + { + "epoch": 3.610491632383328, + "grad_norm": 0.3108128309249878, + "learning_rate": 1.4776743485209039e-05, + "loss": 1.2381, + "step": 12122 + }, + { + "epoch": 3.610789478582997, + "grad_norm": 0.24306008219718933, + "learning_rate": 1.4775896055771801e-05, + "loss": 1.248, + "step": 12123 + }, + { + "epoch": 3.611087324782665, + "grad_norm": 0.2657211124897003, + "learning_rate": 1.4775048581900184e-05, + "loss": 1.2384, + "step": 12124 + }, + { + "epoch": 3.611385170982334, + "grad_norm": 0.2527020275592804, + "learning_rate": 1.4774201063602079e-05, + "loss": 1.2506, + "step": 12125 + }, + { + "epoch": 3.6116830171820027, + "grad_norm": 0.2403356432914734, + "learning_rate": 1.477335350088536e-05, + "loss": 1.2186, + "step": 12126 + }, + { + "epoch": 3.611980863381671, + "grad_norm": 0.2830476760864258, + "learning_rate": 1.4772505893757924e-05, + "loss": 1.2354, + "step": 12127 + }, + { + "epoch": 3.61227870958134, + "grad_norm": 0.2860715091228485, + "learning_rate": 1.4771658242227655e-05, + "loss": 1.24, + "step": 12128 + }, + { + "epoch": 3.6125765557810086, + "grad_norm": 0.24847184121608734, + "learning_rate": 1.4770810546302436e-05, + "loss": 1.2118, + "step": 12129 + }, + { + "epoch": 3.6128744019806773, + "grad_norm": 0.2509472966194153, + "learning_rate": 1.4769962805990154e-05, + "loss": 1.2169, + "step": 12130 + }, + { + "epoch": 3.613172248180346, + "grad_norm": 0.27350685000419617, + "learning_rate": 1.4769115021298696e-05, + "loss": 1.2349, + "step": 12131 + }, + { + "epoch": 3.6134700943800144, + "grad_norm": 0.27454113960266113, + "learning_rate": 1.4768267192235954e-05, + "loss": 1.247, + "step": 12132 + }, + { + "epoch": 3.613767940579683, + "grad_norm": 0.2896553575992584, + "learning_rate": 1.4767419318809812e-05, + "loss": 1.2283, + "step": 12133 + }, + { + "epoch": 3.614065786779352, + "grad_norm": 0.2629541754722595, + "learning_rate": 1.476657140102816e-05, + "loss": 1.2161, + "step": 12134 + }, + { + "epoch": 3.6143636329790203, + "grad_norm": 0.2812981903553009, + "learning_rate": 1.4765723438898886e-05, + "loss": 1.2382, + "step": 12135 + }, + { + "epoch": 3.614661479178689, + "grad_norm": 0.25081151723861694, + "learning_rate": 1.476487543242988e-05, + "loss": 1.2276, + "step": 12136 + }, + { + "epoch": 3.614959325378358, + "grad_norm": 0.30545827746391296, + "learning_rate": 1.4764027381629034e-05, + "loss": 1.2286, + "step": 12137 + }, + { + "epoch": 3.615257171578026, + "grad_norm": 0.24036957323551178, + "learning_rate": 1.4763179286504234e-05, + "loss": 1.2148, + "step": 12138 + }, + { + "epoch": 3.615555017777695, + "grad_norm": 0.3674185276031494, + "learning_rate": 1.476233114706337e-05, + "loss": 1.2077, + "step": 12139 + }, + { + "epoch": 3.6158528639773637, + "grad_norm": 0.24986650049686432, + "learning_rate": 1.476148296331434e-05, + "loss": 1.2393, + "step": 12140 + }, + { + "epoch": 3.6161507101770325, + "grad_norm": 0.33543214201927185, + "learning_rate": 1.4760634735265029e-05, + "loss": 1.2292, + "step": 12141 + }, + { + "epoch": 3.616448556376701, + "grad_norm": 0.24175876379013062, + "learning_rate": 1.4759786462923332e-05, + "loss": 1.2195, + "step": 12142 + }, + { + "epoch": 3.6167464025763696, + "grad_norm": 0.34412795305252075, + "learning_rate": 1.4758938146297138e-05, + "loss": 1.2358, + "step": 12143 + }, + { + "epoch": 3.6170442487760384, + "grad_norm": 0.2774752676486969, + "learning_rate": 1.4758089785394338e-05, + "loss": 1.24, + "step": 12144 + }, + { + "epoch": 3.617342094975707, + "grad_norm": 0.2854416072368622, + "learning_rate": 1.4757241380222835e-05, + "loss": 1.2433, + "step": 12145 + }, + { + "epoch": 3.6176399411753755, + "grad_norm": 0.24738280475139618, + "learning_rate": 1.4756392930790516e-05, + "loss": 1.2334, + "step": 12146 + }, + { + "epoch": 3.6179377873750442, + "grad_norm": 0.4207460582256317, + "learning_rate": 1.4755544437105269e-05, + "loss": 1.2238, + "step": 12147 + }, + { + "epoch": 3.618235633574713, + "grad_norm": 0.32039952278137207, + "learning_rate": 1.4754695899174997e-05, + "loss": 1.2334, + "step": 12148 + }, + { + "epoch": 3.6185334797743813, + "grad_norm": 0.31696414947509766, + "learning_rate": 1.4753847317007594e-05, + "loss": 1.2517, + "step": 12149 + }, + { + "epoch": 3.61883132597405, + "grad_norm": 0.26194241642951965, + "learning_rate": 1.4752998690610951e-05, + "loss": 1.2318, + "step": 12150 + }, + { + "epoch": 3.619129172173719, + "grad_norm": 0.4184989035129547, + "learning_rate": 1.4752150019992968e-05, + "loss": 1.2323, + "step": 12151 + }, + { + "epoch": 3.619427018373387, + "grad_norm": 0.32291746139526367, + "learning_rate": 1.4751301305161536e-05, + "loss": 1.2329, + "step": 12152 + }, + { + "epoch": 3.619724864573056, + "grad_norm": 0.32019558548927307, + "learning_rate": 1.4750452546124556e-05, + "loss": 1.237, + "step": 12153 + }, + { + "epoch": 3.6200227107727247, + "grad_norm": 0.24402610957622528, + "learning_rate": 1.474960374288992e-05, + "loss": 1.2339, + "step": 12154 + }, + { + "epoch": 3.6203205569723935, + "grad_norm": 0.3688165247440338, + "learning_rate": 1.474875489546553e-05, + "loss": 1.2468, + "step": 12155 + }, + { + "epoch": 3.6206184031720623, + "grad_norm": 0.2501266896724701, + "learning_rate": 1.4747906003859278e-05, + "loss": 1.2277, + "step": 12156 + }, + { + "epoch": 3.6209162493717306, + "grad_norm": 0.25592708587646484, + "learning_rate": 1.4747057068079067e-05, + "loss": 1.2313, + "step": 12157 + }, + { + "epoch": 3.6212140955713994, + "grad_norm": 0.2680782377719879, + "learning_rate": 1.4746208088132794e-05, + "loss": 1.2284, + "step": 12158 + }, + { + "epoch": 3.621511941771068, + "grad_norm": 0.2572602927684784, + "learning_rate": 1.474535906402836e-05, + "loss": 1.2353, + "step": 12159 + }, + { + "epoch": 3.6218097879707365, + "grad_norm": 0.30644771456718445, + "learning_rate": 1.474450999577366e-05, + "loss": 1.2305, + "step": 12160 + }, + { + "epoch": 3.6221076341704053, + "grad_norm": 0.29997578263282776, + "learning_rate": 1.4743660883376593e-05, + "loss": 1.237, + "step": 12161 + }, + { + "epoch": 3.622405480370074, + "grad_norm": 0.25831934809684753, + "learning_rate": 1.4742811726845063e-05, + "loss": 1.2229, + "step": 12162 + }, + { + "epoch": 3.6227033265697424, + "grad_norm": 0.3088092803955078, + "learning_rate": 1.474196252618697e-05, + "loss": 1.2221, + "step": 12163 + }, + { + "epoch": 3.623001172769411, + "grad_norm": 0.2495061606168747, + "learning_rate": 1.4741113281410213e-05, + "loss": 1.23, + "step": 12164 + }, + { + "epoch": 3.62329901896908, + "grad_norm": 0.37403103709220886, + "learning_rate": 1.4740263992522695e-05, + "loss": 1.2415, + "step": 12165 + }, + { + "epoch": 3.6235968651687482, + "grad_norm": 0.24580949544906616, + "learning_rate": 1.4739414659532316e-05, + "loss": 1.2355, + "step": 12166 + }, + { + "epoch": 3.623894711368417, + "grad_norm": 0.32103511691093445, + "learning_rate": 1.4738565282446976e-05, + "loss": 1.2178, + "step": 12167 + }, + { + "epoch": 3.624192557568086, + "grad_norm": 0.23806986212730408, + "learning_rate": 1.4737715861274582e-05, + "loss": 1.2229, + "step": 12168 + }, + { + "epoch": 3.6244904037677546, + "grad_norm": 0.2906794250011444, + "learning_rate": 1.4736866396023038e-05, + "loss": 1.2429, + "step": 12169 + }, + { + "epoch": 3.6247882499674233, + "grad_norm": 0.26994964480400085, + "learning_rate": 1.4736016886700242e-05, + "loss": 1.2376, + "step": 12170 + }, + { + "epoch": 3.6250860961670917, + "grad_norm": 0.26804840564727783, + "learning_rate": 1.47351673333141e-05, + "loss": 1.2503, + "step": 12171 + }, + { + "epoch": 3.6253839423667604, + "grad_norm": 0.255969375371933, + "learning_rate": 1.4734317735872516e-05, + "loss": 1.209, + "step": 12172 + }, + { + "epoch": 3.625681788566429, + "grad_norm": 0.2911655008792877, + "learning_rate": 1.4733468094383397e-05, + "loss": 1.2396, + "step": 12173 + }, + { + "epoch": 3.6259796347660975, + "grad_norm": 0.26125261187553406, + "learning_rate": 1.4732618408854644e-05, + "loss": 1.2274, + "step": 12174 + }, + { + "epoch": 3.6262774809657663, + "grad_norm": 0.2519410252571106, + "learning_rate": 1.4731768679294165e-05, + "loss": 1.2434, + "step": 12175 + }, + { + "epoch": 3.626575327165435, + "grad_norm": 0.2514265179634094, + "learning_rate": 1.4730918905709868e-05, + "loss": 1.2303, + "step": 12176 + }, + { + "epoch": 3.6268731733651034, + "grad_norm": 0.2466183602809906, + "learning_rate": 1.4730069088109653e-05, + "loss": 1.2454, + "step": 12177 + }, + { + "epoch": 3.627171019564772, + "grad_norm": 0.2416197657585144, + "learning_rate": 1.472921922650143e-05, + "loss": 1.2298, + "step": 12178 + }, + { + "epoch": 3.627468865764441, + "grad_norm": 0.2630484104156494, + "learning_rate": 1.4728369320893106e-05, + "loss": 1.2205, + "step": 12179 + }, + { + "epoch": 3.6277667119641093, + "grad_norm": 0.24044835567474365, + "learning_rate": 1.4727519371292589e-05, + "loss": 1.2371, + "step": 12180 + }, + { + "epoch": 3.628064558163778, + "grad_norm": 0.2447517216205597, + "learning_rate": 1.4726669377707783e-05, + "loss": 1.2329, + "step": 12181 + }, + { + "epoch": 3.628362404363447, + "grad_norm": 0.24271756410598755, + "learning_rate": 1.4725819340146603e-05, + "loss": 1.2325, + "step": 12182 + }, + { + "epoch": 3.6286602505631156, + "grad_norm": 0.24534739553928375, + "learning_rate": 1.4724969258616954e-05, + "loss": 1.2358, + "step": 12183 + }, + { + "epoch": 3.6289580967627844, + "grad_norm": 0.24850770831108093, + "learning_rate": 1.4724119133126746e-05, + "loss": 1.2289, + "step": 12184 + }, + { + "epoch": 3.6292559429624527, + "grad_norm": 0.2543782591819763, + "learning_rate": 1.4723268963683883e-05, + "loss": 1.2249, + "step": 12185 + }, + { + "epoch": 3.6295537891621215, + "grad_norm": 0.25927695631980896, + "learning_rate": 1.4722418750296283e-05, + "loss": 1.232, + "step": 12186 + }, + { + "epoch": 3.6298516353617902, + "grad_norm": 0.266666054725647, + "learning_rate": 1.4721568492971851e-05, + "loss": 1.2118, + "step": 12187 + }, + { + "epoch": 3.6301494815614586, + "grad_norm": 0.23791494965553284, + "learning_rate": 1.4720718191718502e-05, + "loss": 1.2366, + "step": 12188 + }, + { + "epoch": 3.6304473277611273, + "grad_norm": 0.2627164125442505, + "learning_rate": 1.4719867846544142e-05, + "loss": 1.2314, + "step": 12189 + }, + { + "epoch": 3.630745173960796, + "grad_norm": 0.2487691044807434, + "learning_rate": 1.4719017457456688e-05, + "loss": 1.2216, + "step": 12190 + }, + { + "epoch": 3.6310430201604644, + "grad_norm": 0.25168392062187195, + "learning_rate": 1.4718167024464045e-05, + "loss": 1.2292, + "step": 12191 + }, + { + "epoch": 3.631340866360133, + "grad_norm": 0.2399231642484665, + "learning_rate": 1.471731654757413e-05, + "loss": 1.2234, + "step": 12192 + }, + { + "epoch": 3.631638712559802, + "grad_norm": 0.2516423463821411, + "learning_rate": 1.4716466026794858e-05, + "loss": 1.2278, + "step": 12193 + }, + { + "epoch": 3.6319365587594703, + "grad_norm": 0.2878287732601166, + "learning_rate": 1.4715615462134138e-05, + "loss": 1.2285, + "step": 12194 + }, + { + "epoch": 3.632234404959139, + "grad_norm": 0.26969072222709656, + "learning_rate": 1.4714764853599884e-05, + "loss": 1.2355, + "step": 12195 + }, + { + "epoch": 3.632532251158808, + "grad_norm": 0.2461068034172058, + "learning_rate": 1.471391420120001e-05, + "loss": 1.2396, + "step": 12196 + }, + { + "epoch": 3.6328300973584766, + "grad_norm": 0.2353159636259079, + "learning_rate": 1.4713063504942434e-05, + "loss": 1.2275, + "step": 12197 + }, + { + "epoch": 3.6331279435581454, + "grad_norm": 0.2578493356704712, + "learning_rate": 1.4712212764835063e-05, + "loss": 1.2178, + "step": 12198 + }, + { + "epoch": 3.6334257897578137, + "grad_norm": 0.29053571820259094, + "learning_rate": 1.4711361980885821e-05, + "loss": 1.229, + "step": 12199 + }, + { + "epoch": 3.6337236359574825, + "grad_norm": 0.24387332797050476, + "learning_rate": 1.4710511153102617e-05, + "loss": 1.2324, + "step": 12200 + }, + { + "epoch": 3.6340214821571513, + "grad_norm": 0.30657774209976196, + "learning_rate": 1.4709660281493371e-05, + "loss": 1.2402, + "step": 12201 + }, + { + "epoch": 3.6343193283568196, + "grad_norm": 0.32485294342041016, + "learning_rate": 1.4708809366066e-05, + "loss": 1.2217, + "step": 12202 + }, + { + "epoch": 3.6346171745564884, + "grad_norm": 0.2695823013782501, + "learning_rate": 1.4707958406828416e-05, + "loss": 1.2412, + "step": 12203 + }, + { + "epoch": 3.634915020756157, + "grad_norm": 0.25589483976364136, + "learning_rate": 1.4707107403788539e-05, + "loss": 1.2408, + "step": 12204 + }, + { + "epoch": 3.6352128669558255, + "grad_norm": 0.23745296895503998, + "learning_rate": 1.4706256356954287e-05, + "loss": 1.2102, + "step": 12205 + }, + { + "epoch": 3.6355107131554942, + "grad_norm": 0.3855360150337219, + "learning_rate": 1.470540526633358e-05, + "loss": 1.2274, + "step": 12206 + }, + { + "epoch": 3.635808559355163, + "grad_norm": 0.3667902946472168, + "learning_rate": 1.4704554131934332e-05, + "loss": 1.21, + "step": 12207 + }, + { + "epoch": 3.6361064055548318, + "grad_norm": 0.3271882236003876, + "learning_rate": 1.4703702953764465e-05, + "loss": 1.2383, + "step": 12208 + }, + { + "epoch": 3.6364042517545, + "grad_norm": 0.512535572052002, + "learning_rate": 1.4702851731831895e-05, + "loss": 1.2391, + "step": 12209 + }, + { + "epoch": 3.636702097954169, + "grad_norm": 0.2458118051290512, + "learning_rate": 1.4702000466144548e-05, + "loss": 1.2275, + "step": 12210 + }, + { + "epoch": 3.6369999441538376, + "grad_norm": 0.30778971314430237, + "learning_rate": 1.4701149156710338e-05, + "loss": 1.2326, + "step": 12211 + }, + { + "epoch": 3.6372977903535064, + "grad_norm": 0.2693280279636383, + "learning_rate": 1.4700297803537184e-05, + "loss": 1.2354, + "step": 12212 + }, + { + "epoch": 3.6375956365531747, + "grad_norm": 0.26226574182510376, + "learning_rate": 1.4699446406633016e-05, + "loss": 1.251, + "step": 12213 + }, + { + "epoch": 3.6378934827528435, + "grad_norm": 0.3327712416648865, + "learning_rate": 1.4698594966005745e-05, + "loss": 1.2227, + "step": 12214 + }, + { + "epoch": 3.6381913289525123, + "grad_norm": 0.285996675491333, + "learning_rate": 1.4697743481663303e-05, + "loss": 1.2366, + "step": 12215 + }, + { + "epoch": 3.6384891751521806, + "grad_norm": 0.26572054624557495, + "learning_rate": 1.46968919536136e-05, + "loss": 1.251, + "step": 12216 + }, + { + "epoch": 3.6387870213518494, + "grad_norm": 0.2764107584953308, + "learning_rate": 1.4696040381864571e-05, + "loss": 1.2466, + "step": 12217 + }, + { + "epoch": 3.639084867551518, + "grad_norm": 0.2600226104259491, + "learning_rate": 1.4695188766424132e-05, + "loss": 1.2549, + "step": 12218 + }, + { + "epoch": 3.6393827137511865, + "grad_norm": 0.2641436755657196, + "learning_rate": 1.4694337107300205e-05, + "loss": 1.2156, + "step": 12219 + }, + { + "epoch": 3.6396805599508553, + "grad_norm": 0.26132163405418396, + "learning_rate": 1.4693485404500716e-05, + "loss": 1.2334, + "step": 12220 + }, + { + "epoch": 3.639978406150524, + "grad_norm": 0.3215862512588501, + "learning_rate": 1.469263365803359e-05, + "loss": 1.2395, + "step": 12221 + }, + { + "epoch": 3.640276252350193, + "grad_norm": 0.26509419083595276, + "learning_rate": 1.469178186790675e-05, + "loss": 1.2168, + "step": 12222 + }, + { + "epoch": 3.6405740985498616, + "grad_norm": 0.2737056314945221, + "learning_rate": 1.4690930034128124e-05, + "loss": 1.2266, + "step": 12223 + }, + { + "epoch": 3.64087194474953, + "grad_norm": 0.24052157998085022, + "learning_rate": 1.4690078156705634e-05, + "loss": 1.2235, + "step": 12224 + }, + { + "epoch": 3.6411697909491987, + "grad_norm": 0.2925179898738861, + "learning_rate": 1.4689226235647205e-05, + "loss": 1.2265, + "step": 12225 + }, + { + "epoch": 3.6414676371488675, + "grad_norm": 0.2738029956817627, + "learning_rate": 1.4688374270960766e-05, + "loss": 1.2355, + "step": 12226 + }, + { + "epoch": 3.641765483348536, + "grad_norm": 0.30287572741508484, + "learning_rate": 1.4687522262654244e-05, + "loss": 1.2353, + "step": 12227 + }, + { + "epoch": 3.6420633295482046, + "grad_norm": 0.7332232594490051, + "learning_rate": 1.4686670210735562e-05, + "loss": 1.2331, + "step": 12228 + }, + { + "epoch": 3.6423611757478733, + "grad_norm": 0.3383345901966095, + "learning_rate": 1.468581811521265e-05, + "loss": 1.2258, + "step": 12229 + }, + { + "epoch": 3.6426590219475417, + "grad_norm": 0.2796458899974823, + "learning_rate": 1.468496597609344e-05, + "loss": 1.2537, + "step": 12230 + }, + { + "epoch": 3.6429568681472104, + "grad_norm": 0.24633510410785675, + "learning_rate": 1.4684113793385852e-05, + "loss": 1.2301, + "step": 12231 + }, + { + "epoch": 3.643254714346879, + "grad_norm": 0.24334703385829926, + "learning_rate": 1.468326156709782e-05, + "loss": 1.2377, + "step": 12232 + }, + { + "epoch": 3.6435525605465475, + "grad_norm": 0.2508890926837921, + "learning_rate": 1.468240929723727e-05, + "loss": 1.233, + "step": 12233 + }, + { + "epoch": 3.6438504067462163, + "grad_norm": 0.2544920742511749, + "learning_rate": 1.4681556983812136e-05, + "loss": 1.2488, + "step": 12234 + }, + { + "epoch": 3.644148252945885, + "grad_norm": 0.2417229562997818, + "learning_rate": 1.4680704626830342e-05, + "loss": 1.2223, + "step": 12235 + }, + { + "epoch": 3.644446099145554, + "grad_norm": 0.24827317893505096, + "learning_rate": 1.4679852226299823e-05, + "loss": 1.235, + "step": 12236 + }, + { + "epoch": 3.6447439453452226, + "grad_norm": 0.24662494659423828, + "learning_rate": 1.4678999782228505e-05, + "loss": 1.2244, + "step": 12237 + }, + { + "epoch": 3.645041791544891, + "grad_norm": 0.24618802964687347, + "learning_rate": 1.4678147294624323e-05, + "loss": 1.2447, + "step": 12238 + }, + { + "epoch": 3.6453396377445597, + "grad_norm": 0.2501998543739319, + "learning_rate": 1.4677294763495207e-05, + "loss": 1.2389, + "step": 12239 + }, + { + "epoch": 3.6456374839442285, + "grad_norm": 0.2450493425130844, + "learning_rate": 1.467644218884909e-05, + "loss": 1.2482, + "step": 12240 + }, + { + "epoch": 3.645935330143897, + "grad_norm": 0.23952670395374298, + "learning_rate": 1.4675589570693905e-05, + "loss": 1.2182, + "step": 12241 + }, + { + "epoch": 3.6462331763435656, + "grad_norm": 0.2516058087348938, + "learning_rate": 1.4674736909037579e-05, + "loss": 1.2384, + "step": 12242 + }, + { + "epoch": 3.6465310225432344, + "grad_norm": 0.25174760818481445, + "learning_rate": 1.4673884203888052e-05, + "loss": 1.2423, + "step": 12243 + }, + { + "epoch": 3.6468288687429027, + "grad_norm": 0.2346472293138504, + "learning_rate": 1.4673031455253253e-05, + "loss": 1.223, + "step": 12244 + }, + { + "epoch": 3.6471267149425715, + "grad_norm": 0.2519069015979767, + "learning_rate": 1.4672178663141117e-05, + "loss": 1.2423, + "step": 12245 + }, + { + "epoch": 3.6474245611422402, + "grad_norm": 0.23978659510612488, + "learning_rate": 1.4671325827559576e-05, + "loss": 1.2165, + "step": 12246 + }, + { + "epoch": 3.6477224073419086, + "grad_norm": 0.24777527153491974, + "learning_rate": 1.4670472948516572e-05, + "loss": 1.2421, + "step": 12247 + }, + { + "epoch": 3.6480202535415773, + "grad_norm": 0.23103217780590057, + "learning_rate": 1.4669620026020035e-05, + "loss": 1.2289, + "step": 12248 + }, + { + "epoch": 3.648318099741246, + "grad_norm": 0.24028585851192474, + "learning_rate": 1.46687670600779e-05, + "loss": 1.2273, + "step": 12249 + }, + { + "epoch": 3.648615945940915, + "grad_norm": 0.23895616829395294, + "learning_rate": 1.4667914050698102e-05, + "loss": 1.2342, + "step": 12250 + }, + { + "epoch": 3.6489137921405836, + "grad_norm": 0.23453964293003082, + "learning_rate": 1.466706099788858e-05, + "loss": 1.2413, + "step": 12251 + }, + { + "epoch": 3.649211638340252, + "grad_norm": 0.2327313870191574, + "learning_rate": 1.4666207901657273e-05, + "loss": 1.2379, + "step": 12252 + }, + { + "epoch": 3.6495094845399207, + "grad_norm": 0.23636144399642944, + "learning_rate": 1.466535476201211e-05, + "loss": 1.2336, + "step": 12253 + }, + { + "epoch": 3.6498073307395895, + "grad_norm": 0.23853585124015808, + "learning_rate": 1.4664501578961034e-05, + "loss": 1.2214, + "step": 12254 + }, + { + "epoch": 3.650105176939258, + "grad_norm": 0.24279792606830597, + "learning_rate": 1.4663648352511986e-05, + "loss": 1.2277, + "step": 12255 + }, + { + "epoch": 3.6504030231389266, + "grad_norm": 0.24204349517822266, + "learning_rate": 1.4662795082672898e-05, + "loss": 1.2235, + "step": 12256 + }, + { + "epoch": 3.6507008693385954, + "grad_norm": 0.2332702875137329, + "learning_rate": 1.466194176945171e-05, + "loss": 1.2192, + "step": 12257 + }, + { + "epoch": 3.6509987155382637, + "grad_norm": 0.24131035804748535, + "learning_rate": 1.4661088412856366e-05, + "loss": 1.2364, + "step": 12258 + }, + { + "epoch": 3.6512965617379325, + "grad_norm": 0.23367980122566223, + "learning_rate": 1.46602350128948e-05, + "loss": 1.2304, + "step": 12259 + }, + { + "epoch": 3.6515944079376013, + "grad_norm": 0.237742081284523, + "learning_rate": 1.4659381569574956e-05, + "loss": 1.2367, + "step": 12260 + }, + { + "epoch": 3.65189225413727, + "grad_norm": 0.2423309087753296, + "learning_rate": 1.4658528082904768e-05, + "loss": 1.2133, + "step": 12261 + }, + { + "epoch": 3.6521901003369384, + "grad_norm": 0.242317795753479, + "learning_rate": 1.4657674552892183e-05, + "loss": 1.215, + "step": 12262 + }, + { + "epoch": 3.652487946536607, + "grad_norm": 0.24148796498775482, + "learning_rate": 1.4656820979545141e-05, + "loss": 1.2201, + "step": 12263 + }, + { + "epoch": 3.652785792736276, + "grad_norm": 0.2296200841665268, + "learning_rate": 1.465596736287158e-05, + "loss": 1.2349, + "step": 12264 + }, + { + "epoch": 3.6530836389359447, + "grad_norm": 0.23521791398525238, + "learning_rate": 1.4655113702879451e-05, + "loss": 1.2336, + "step": 12265 + }, + { + "epoch": 3.653381485135613, + "grad_norm": 0.23943842947483063, + "learning_rate": 1.4654259999576685e-05, + "loss": 1.2423, + "step": 12266 + }, + { + "epoch": 3.6536793313352818, + "grad_norm": 0.232687309384346, + "learning_rate": 1.4653406252971229e-05, + "loss": 1.2401, + "step": 12267 + }, + { + "epoch": 3.6539771775349505, + "grad_norm": 0.23358887434005737, + "learning_rate": 1.4652552463071028e-05, + "loss": 1.2362, + "step": 12268 + }, + { + "epoch": 3.654275023734619, + "grad_norm": 0.25296834111213684, + "learning_rate": 1.4651698629884025e-05, + "loss": 1.2351, + "step": 12269 + }, + { + "epoch": 3.6545728699342876, + "grad_norm": 0.24187542498111725, + "learning_rate": 1.4650844753418164e-05, + "loss": 1.2437, + "step": 12270 + }, + { + "epoch": 3.6548707161339564, + "grad_norm": 0.2396983951330185, + "learning_rate": 1.4649990833681388e-05, + "loss": 1.2362, + "step": 12271 + }, + { + "epoch": 3.6551685623336247, + "grad_norm": 0.2348756194114685, + "learning_rate": 1.4649136870681643e-05, + "loss": 1.2469, + "step": 12272 + }, + { + "epoch": 3.6554664085332935, + "grad_norm": 0.2409428358078003, + "learning_rate": 1.4648282864426873e-05, + "loss": 1.2216, + "step": 12273 + }, + { + "epoch": 3.6557642547329623, + "grad_norm": 0.23733539879322052, + "learning_rate": 1.4647428814925025e-05, + "loss": 1.2382, + "step": 12274 + }, + { + "epoch": 3.656062100932631, + "grad_norm": 0.23807461559772491, + "learning_rate": 1.4646574722184043e-05, + "loss": 1.2114, + "step": 12275 + }, + { + "epoch": 3.6563599471322994, + "grad_norm": 0.24783183634281158, + "learning_rate": 1.4645720586211878e-05, + "loss": 1.2137, + "step": 12276 + }, + { + "epoch": 3.656657793331968, + "grad_norm": 0.25191429257392883, + "learning_rate": 1.464486640701647e-05, + "loss": 1.236, + "step": 12277 + }, + { + "epoch": 3.656955639531637, + "grad_norm": 0.23169903457164764, + "learning_rate": 1.4644012184605771e-05, + "loss": 1.2279, + "step": 12278 + }, + { + "epoch": 3.6572534857313057, + "grad_norm": 0.2517816424369812, + "learning_rate": 1.4643157918987727e-05, + "loss": 1.2198, + "step": 12279 + }, + { + "epoch": 3.657551331930974, + "grad_norm": 0.2378106415271759, + "learning_rate": 1.4642303610170286e-05, + "loss": 1.2342, + "step": 12280 + }, + { + "epoch": 3.657849178130643, + "grad_norm": 0.24237866699695587, + "learning_rate": 1.4641449258161396e-05, + "loss": 1.2263, + "step": 12281 + }, + { + "epoch": 3.6581470243303116, + "grad_norm": 0.24208959937095642, + "learning_rate": 1.4640594862969009e-05, + "loss": 1.2322, + "step": 12282 + }, + { + "epoch": 3.65844487052998, + "grad_norm": 0.24164338409900665, + "learning_rate": 1.463974042460107e-05, + "loss": 1.2359, + "step": 12283 + }, + { + "epoch": 3.6587427167296487, + "grad_norm": 0.23954281210899353, + "learning_rate": 1.4638885943065528e-05, + "loss": 1.2104, + "step": 12284 + }, + { + "epoch": 3.6590405629293175, + "grad_norm": 0.24737069010734558, + "learning_rate": 1.4638031418370338e-05, + "loss": 1.2475, + "step": 12285 + }, + { + "epoch": 3.659338409128986, + "grad_norm": 0.26112329959869385, + "learning_rate": 1.4637176850523447e-05, + "loss": 1.2436, + "step": 12286 + }, + { + "epoch": 3.6596362553286546, + "grad_norm": 0.236568421125412, + "learning_rate": 1.4636322239532806e-05, + "loss": 1.2364, + "step": 12287 + }, + { + "epoch": 3.6599341015283233, + "grad_norm": 0.2685500681400299, + "learning_rate": 1.4635467585406366e-05, + "loss": 1.2433, + "step": 12288 + }, + { + "epoch": 3.660231947727992, + "grad_norm": 0.2374984622001648, + "learning_rate": 1.4634612888152082e-05, + "loss": 1.2268, + "step": 12289 + }, + { + "epoch": 3.660529793927661, + "grad_norm": 0.2977931797504425, + "learning_rate": 1.4633758147777902e-05, + "loss": 1.2335, + "step": 12290 + }, + { + "epoch": 3.660827640127329, + "grad_norm": 0.2528064250946045, + "learning_rate": 1.4632903364291776e-05, + "loss": 1.2326, + "step": 12291 + }, + { + "epoch": 3.661125486326998, + "grad_norm": 0.24254608154296875, + "learning_rate": 1.4632048537701664e-05, + "loss": 1.2328, + "step": 12292 + }, + { + "epoch": 3.6614233325266667, + "grad_norm": 0.2421228289604187, + "learning_rate": 1.4631193668015513e-05, + "loss": 1.2288, + "step": 12293 + }, + { + "epoch": 3.661721178726335, + "grad_norm": 0.2589697241783142, + "learning_rate": 1.4630338755241283e-05, + "loss": 1.228, + "step": 12294 + }, + { + "epoch": 3.662019024926004, + "grad_norm": 0.2550235986709595, + "learning_rate": 1.4629483799386922e-05, + "loss": 1.2228, + "step": 12295 + }, + { + "epoch": 3.6623168711256726, + "grad_norm": 0.2578994035720825, + "learning_rate": 1.4628628800460385e-05, + "loss": 1.2402, + "step": 12296 + }, + { + "epoch": 3.662614717325341, + "grad_norm": 0.2588058412075043, + "learning_rate": 1.462777375846963e-05, + "loss": 1.2401, + "step": 12297 + }, + { + "epoch": 3.6629125635250097, + "grad_norm": 0.2608630657196045, + "learning_rate": 1.462691867342261e-05, + "loss": 1.2467, + "step": 12298 + }, + { + "epoch": 3.6632104097246785, + "grad_norm": 0.33025142550468445, + "learning_rate": 1.4626063545327283e-05, + "loss": 1.2279, + "step": 12299 + }, + { + "epoch": 3.663508255924347, + "grad_norm": 0.24234481155872345, + "learning_rate": 1.4625208374191603e-05, + "loss": 1.2388, + "step": 12300 + }, + { + "epoch": 3.6638061021240156, + "grad_norm": 0.3066018223762512, + "learning_rate": 1.4624353160023526e-05, + "loss": 1.2165, + "step": 12301 + }, + { + "epoch": 3.6641039483236844, + "grad_norm": 0.24784480035305023, + "learning_rate": 1.4623497902831007e-05, + "loss": 1.231, + "step": 12302 + }, + { + "epoch": 3.664401794523353, + "grad_norm": 0.28738903999328613, + "learning_rate": 1.4622642602622008e-05, + "loss": 1.2083, + "step": 12303 + }, + { + "epoch": 3.664699640723022, + "grad_norm": 0.2508058249950409, + "learning_rate": 1.4621787259404484e-05, + "loss": 1.2304, + "step": 12304 + }, + { + "epoch": 3.6649974869226902, + "grad_norm": 0.28743064403533936, + "learning_rate": 1.4620931873186393e-05, + "loss": 1.2446, + "step": 12305 + }, + { + "epoch": 3.665295333122359, + "grad_norm": 0.2851032316684723, + "learning_rate": 1.4620076443975697e-05, + "loss": 1.2289, + "step": 12306 + }, + { + "epoch": 3.6655931793220278, + "grad_norm": 0.3177313804626465, + "learning_rate": 1.4619220971780348e-05, + "loss": 1.24, + "step": 12307 + }, + { + "epoch": 3.665891025521696, + "grad_norm": 0.26557326316833496, + "learning_rate": 1.4618365456608309e-05, + "loss": 1.2241, + "step": 12308 + }, + { + "epoch": 3.666188871721365, + "grad_norm": 0.3448164463043213, + "learning_rate": 1.4617509898467539e-05, + "loss": 1.2331, + "step": 12309 + }, + { + "epoch": 3.6664867179210336, + "grad_norm": 0.2601182162761688, + "learning_rate": 1.4616654297366e-05, + "loss": 1.2301, + "step": 12310 + }, + { + "epoch": 3.666784564120702, + "grad_norm": 0.24896277487277985, + "learning_rate": 1.461579865331165e-05, + "loss": 1.2358, + "step": 12311 + }, + { + "epoch": 3.6670824103203707, + "grad_norm": 0.25055694580078125, + "learning_rate": 1.4614942966312452e-05, + "loss": 1.2367, + "step": 12312 + }, + { + "epoch": 3.6673802565200395, + "grad_norm": 0.24847956001758575, + "learning_rate": 1.4614087236376364e-05, + "loss": 1.2206, + "step": 12313 + }, + { + "epoch": 3.667678102719708, + "grad_norm": 0.2494431734085083, + "learning_rate": 1.4613231463511347e-05, + "loss": 1.232, + "step": 12314 + }, + { + "epoch": 3.6679759489193766, + "grad_norm": 0.2703046202659607, + "learning_rate": 1.461237564772537e-05, + "loss": 1.2367, + "step": 12315 + }, + { + "epoch": 3.6682737951190454, + "grad_norm": 0.2518532872200012, + "learning_rate": 1.4611519789026392e-05, + "loss": 1.2474, + "step": 12316 + }, + { + "epoch": 3.668571641318714, + "grad_norm": 0.2615206837654114, + "learning_rate": 1.461066388742237e-05, + "loss": 1.2279, + "step": 12317 + }, + { + "epoch": 3.668869487518383, + "grad_norm": 0.27220064401626587, + "learning_rate": 1.4609807942921274e-05, + "loss": 1.2241, + "step": 12318 + }, + { + "epoch": 3.6691673337180513, + "grad_norm": 0.27002599835395813, + "learning_rate": 1.4608951955531066e-05, + "loss": 1.2359, + "step": 12319 + }, + { + "epoch": 3.66946517991772, + "grad_norm": 0.38039320707321167, + "learning_rate": 1.4608095925259707e-05, + "loss": 1.2285, + "step": 12320 + }, + { + "epoch": 3.669763026117389, + "grad_norm": 0.24123649299144745, + "learning_rate": 1.4607239852115165e-05, + "loss": 1.2248, + "step": 12321 + }, + { + "epoch": 3.670060872317057, + "grad_norm": 0.2824949622154236, + "learning_rate": 1.4606383736105406e-05, + "loss": 1.2427, + "step": 12322 + }, + { + "epoch": 3.670358718516726, + "grad_norm": 0.2610337436199188, + "learning_rate": 1.4605527577238391e-05, + "loss": 1.2311, + "step": 12323 + }, + { + "epoch": 3.6706565647163947, + "grad_norm": 0.2990787923336029, + "learning_rate": 1.4604671375522088e-05, + "loss": 1.2419, + "step": 12324 + }, + { + "epoch": 3.670954410916063, + "grad_norm": 0.2511928975582123, + "learning_rate": 1.4603815130964463e-05, + "loss": 1.2374, + "step": 12325 + }, + { + "epoch": 3.6712522571157318, + "grad_norm": 0.2628847360610962, + "learning_rate": 1.4602958843573477e-05, + "loss": 1.2354, + "step": 12326 + }, + { + "epoch": 3.6715501033154005, + "grad_norm": 0.24535098671913147, + "learning_rate": 1.4602102513357109e-05, + "loss": 1.2267, + "step": 12327 + }, + { + "epoch": 3.6718479495150693, + "grad_norm": 0.3978574872016907, + "learning_rate": 1.4601246140323316e-05, + "loss": 1.2394, + "step": 12328 + }, + { + "epoch": 3.6721457957147376, + "grad_norm": 0.27503323554992676, + "learning_rate": 1.4600389724480067e-05, + "loss": 1.2449, + "step": 12329 + }, + { + "epoch": 3.6724436419144064, + "grad_norm": 0.2998397648334503, + "learning_rate": 1.4599533265835332e-05, + "loss": 1.2265, + "step": 12330 + }, + { + "epoch": 3.672741488114075, + "grad_norm": 0.2379540652036667, + "learning_rate": 1.459867676439708e-05, + "loss": 1.2417, + "step": 12331 + }, + { + "epoch": 3.673039334313744, + "grad_norm": 0.447098970413208, + "learning_rate": 1.4597820220173278e-05, + "loss": 1.2468, + "step": 12332 + }, + { + "epoch": 3.6733371805134123, + "grad_norm": 0.349273681640625, + "learning_rate": 1.4596963633171897e-05, + "loss": 1.2266, + "step": 12333 + }, + { + "epoch": 3.673635026713081, + "grad_norm": 0.3068247437477112, + "learning_rate": 1.4596107003400903e-05, + "loss": 1.2445, + "step": 12334 + }, + { + "epoch": 3.67393287291275, + "grad_norm": 0.3136936128139496, + "learning_rate": 1.459525033086827e-05, + "loss": 1.234, + "step": 12335 + }, + { + "epoch": 3.674230719112418, + "grad_norm": 0.28681203722953796, + "learning_rate": 1.4594393615581965e-05, + "loss": 1.2232, + "step": 12336 + }, + { + "epoch": 3.674528565312087, + "grad_norm": 0.2418070137500763, + "learning_rate": 1.4593536857549964e-05, + "loss": 1.2275, + "step": 12337 + }, + { + "epoch": 3.6748264115117557, + "grad_norm": 0.27810901403427124, + "learning_rate": 1.4592680056780232e-05, + "loss": 1.2253, + "step": 12338 + }, + { + "epoch": 3.675124257711424, + "grad_norm": 0.23245923221111298, + "learning_rate": 1.4591823213280742e-05, + "loss": 1.2371, + "step": 12339 + }, + { + "epoch": 3.675422103911093, + "grad_norm": 0.277251660823822, + "learning_rate": 1.4590966327059472e-05, + "loss": 1.2284, + "step": 12340 + }, + { + "epoch": 3.6757199501107616, + "grad_norm": 0.2440660148859024, + "learning_rate": 1.4590109398124388e-05, + "loss": 1.2366, + "step": 12341 + }, + { + "epoch": 3.6760177963104304, + "grad_norm": 0.2404196560382843, + "learning_rate": 1.4589252426483462e-05, + "loss": 1.2589, + "step": 12342 + }, + { + "epoch": 3.676315642510099, + "grad_norm": 0.2652944326400757, + "learning_rate": 1.4588395412144672e-05, + "loss": 1.2372, + "step": 12343 + }, + { + "epoch": 3.6766134887097675, + "grad_norm": 0.25153660774230957, + "learning_rate": 1.458753835511599e-05, + "loss": 1.2519, + "step": 12344 + }, + { + "epoch": 3.6769113349094362, + "grad_norm": 0.28372517228126526, + "learning_rate": 1.4586681255405384e-05, + "loss": 1.2395, + "step": 12345 + }, + { + "epoch": 3.677209181109105, + "grad_norm": 0.24544914066791534, + "learning_rate": 1.4585824113020839e-05, + "loss": 1.2404, + "step": 12346 + }, + { + "epoch": 3.6775070273087733, + "grad_norm": 0.2560138702392578, + "learning_rate": 1.4584966927970324e-05, + "loss": 1.2235, + "step": 12347 + }, + { + "epoch": 3.677804873508442, + "grad_norm": 0.28169959783554077, + "learning_rate": 1.458410970026181e-05, + "loss": 1.2284, + "step": 12348 + }, + { + "epoch": 3.678102719708111, + "grad_norm": 0.23807823657989502, + "learning_rate": 1.4583252429903278e-05, + "loss": 1.2555, + "step": 12349 + }, + { + "epoch": 3.678400565907779, + "grad_norm": 0.2647009789943695, + "learning_rate": 1.4582395116902706e-05, + "loss": 1.2236, + "step": 12350 + }, + { + "epoch": 3.678698412107448, + "grad_norm": 0.26797088980674744, + "learning_rate": 1.4581537761268065e-05, + "loss": 1.2495, + "step": 12351 + }, + { + "epoch": 3.6789962583071167, + "grad_norm": 0.27212825417518616, + "learning_rate": 1.4580680363007333e-05, + "loss": 1.2262, + "step": 12352 + }, + { + "epoch": 3.679294104506785, + "grad_norm": 0.24689751863479614, + "learning_rate": 1.457982292212849e-05, + "loss": 1.2144, + "step": 12353 + }, + { + "epoch": 3.679591950706454, + "grad_norm": 0.26500949263572693, + "learning_rate": 1.4578965438639511e-05, + "loss": 1.2518, + "step": 12354 + }, + { + "epoch": 3.6798897969061226, + "grad_norm": 0.2519420385360718, + "learning_rate": 1.4578107912548374e-05, + "loss": 1.2275, + "step": 12355 + }, + { + "epoch": 3.6801876431057914, + "grad_norm": 0.24441255629062653, + "learning_rate": 1.4577250343863059e-05, + "loss": 1.224, + "step": 12356 + }, + { + "epoch": 3.68048548930546, + "grad_norm": 0.2505742907524109, + "learning_rate": 1.4576392732591545e-05, + "loss": 1.2195, + "step": 12357 + }, + { + "epoch": 3.6807833355051285, + "grad_norm": 0.2475583851337433, + "learning_rate": 1.4575535078741804e-05, + "loss": 1.2189, + "step": 12358 + }, + { + "epoch": 3.6810811817047973, + "grad_norm": 0.2532002329826355, + "learning_rate": 1.4574677382321828e-05, + "loss": 1.2263, + "step": 12359 + }, + { + "epoch": 3.681379027904466, + "grad_norm": 0.2748001515865326, + "learning_rate": 1.4573819643339587e-05, + "loss": 1.2322, + "step": 12360 + }, + { + "epoch": 3.6816768741041344, + "grad_norm": 0.2338627725839615, + "learning_rate": 1.4572961861803061e-05, + "loss": 1.2068, + "step": 12361 + }, + { + "epoch": 3.681974720303803, + "grad_norm": 0.24474157392978668, + "learning_rate": 1.4572104037720239e-05, + "loss": 1.2205, + "step": 12362 + }, + { + "epoch": 3.682272566503472, + "grad_norm": 0.24145713448524475, + "learning_rate": 1.4571246171099095e-05, + "loss": 1.2293, + "step": 12363 + }, + { + "epoch": 3.6825704127031402, + "grad_norm": 0.25295016169548035, + "learning_rate": 1.4570388261947615e-05, + "loss": 1.2194, + "step": 12364 + }, + { + "epoch": 3.682868258902809, + "grad_norm": 0.24577906727790833, + "learning_rate": 1.4569530310273776e-05, + "loss": 1.2369, + "step": 12365 + }, + { + "epoch": 3.6831661051024778, + "grad_norm": 0.2546062767505646, + "learning_rate": 1.4568672316085564e-05, + "loss": 1.2501, + "step": 12366 + }, + { + "epoch": 3.683463951302146, + "grad_norm": 0.2619376480579376, + "learning_rate": 1.4567814279390958e-05, + "loss": 1.2387, + "step": 12367 + }, + { + "epoch": 3.683761797501815, + "grad_norm": 0.2567276358604431, + "learning_rate": 1.4566956200197946e-05, + "loss": 1.246, + "step": 12368 + }, + { + "epoch": 3.6840596437014836, + "grad_norm": 0.27727437019348145, + "learning_rate": 1.4566098078514508e-05, + "loss": 1.2566, + "step": 12369 + }, + { + "epoch": 3.6843574899011524, + "grad_norm": 0.24857331812381744, + "learning_rate": 1.456523991434863e-05, + "loss": 1.2351, + "step": 12370 + }, + { + "epoch": 3.684655336100821, + "grad_norm": 0.3601863980293274, + "learning_rate": 1.4564381707708295e-05, + "loss": 1.2246, + "step": 12371 + }, + { + "epoch": 3.6849531823004895, + "grad_norm": 0.28480979800224304, + "learning_rate": 1.456352345860149e-05, + "loss": 1.2165, + "step": 12372 + }, + { + "epoch": 3.6852510285001583, + "grad_norm": 0.29959580302238464, + "learning_rate": 1.4562665167036191e-05, + "loss": 1.238, + "step": 12373 + }, + { + "epoch": 3.685548874699827, + "grad_norm": 0.27144676446914673, + "learning_rate": 1.4561806833020398e-05, + "loss": 1.236, + "step": 12374 + }, + { + "epoch": 3.6858467208994954, + "grad_norm": 0.3229242265224457, + "learning_rate": 1.4560948456562085e-05, + "loss": 1.2331, + "step": 12375 + }, + { + "epoch": 3.686144567099164, + "grad_norm": 0.285470575094223, + "learning_rate": 1.4560090037669243e-05, + "loss": 1.2402, + "step": 12376 + }, + { + "epoch": 3.686442413298833, + "grad_norm": 0.28538915514945984, + "learning_rate": 1.455923157634986e-05, + "loss": 1.2298, + "step": 12377 + }, + { + "epoch": 3.6867402594985013, + "grad_norm": 0.24156533181667328, + "learning_rate": 1.455837307261192e-05, + "loss": 1.2424, + "step": 12378 + }, + { + "epoch": 3.68703810569817, + "grad_norm": 0.3212227523326874, + "learning_rate": 1.4557514526463407e-05, + "loss": 1.2251, + "step": 12379 + }, + { + "epoch": 3.687335951897839, + "grad_norm": 0.24562905728816986, + "learning_rate": 1.4556655937912318e-05, + "loss": 1.2378, + "step": 12380 + }, + { + "epoch": 3.687633798097507, + "grad_norm": 0.28291693329811096, + "learning_rate": 1.455579730696664e-05, + "loss": 1.2304, + "step": 12381 + }, + { + "epoch": 3.687931644297176, + "grad_norm": 0.2626647353172302, + "learning_rate": 1.4554938633634352e-05, + "loss": 1.2395, + "step": 12382 + }, + { + "epoch": 3.6882294904968447, + "grad_norm": 0.25677576661109924, + "learning_rate": 1.455407991792345e-05, + "loss": 1.2391, + "step": 12383 + }, + { + "epoch": 3.6885273366965134, + "grad_norm": 0.29853981733322144, + "learning_rate": 1.4553221159841925e-05, + "loss": 1.2333, + "step": 12384 + }, + { + "epoch": 3.688825182896182, + "grad_norm": 0.2698603570461273, + "learning_rate": 1.4552362359397761e-05, + "loss": 1.2282, + "step": 12385 + }, + { + "epoch": 3.6891230290958505, + "grad_norm": 0.3009848892688751, + "learning_rate": 1.4551503516598953e-05, + "loss": 1.2447, + "step": 12386 + }, + { + "epoch": 3.6894208752955193, + "grad_norm": 0.2589022219181061, + "learning_rate": 1.455064463145349e-05, + "loss": 1.2391, + "step": 12387 + }, + { + "epoch": 3.689718721495188, + "grad_norm": 0.3754878640174866, + "learning_rate": 1.4549785703969365e-05, + "loss": 1.2167, + "step": 12388 + }, + { + "epoch": 3.6900165676948564, + "grad_norm": 0.34311220049858093, + "learning_rate": 1.4548926734154565e-05, + "loss": 1.2172, + "step": 12389 + }, + { + "epoch": 3.690314413894525, + "grad_norm": 0.37802860140800476, + "learning_rate": 1.4548067722017085e-05, + "loss": 1.2382, + "step": 12390 + }, + { + "epoch": 3.690612260094194, + "grad_norm": 0.3811923861503601, + "learning_rate": 1.4547208667564916e-05, + "loss": 1.2268, + "step": 12391 + }, + { + "epoch": 3.6909101062938623, + "grad_norm": 0.3294679820537567, + "learning_rate": 1.4546349570806051e-05, + "loss": 1.2384, + "step": 12392 + }, + { + "epoch": 3.691207952493531, + "grad_norm": 0.3174379765987396, + "learning_rate": 1.4545490431748481e-05, + "loss": 1.2378, + "step": 12393 + }, + { + "epoch": 3.6915057986932, + "grad_norm": 0.31269073486328125, + "learning_rate": 1.4544631250400203e-05, + "loss": 1.2109, + "step": 12394 + }, + { + "epoch": 3.6918036448928686, + "grad_norm": 0.23892486095428467, + "learning_rate": 1.4543772026769209e-05, + "loss": 1.2331, + "step": 12395 + }, + { + "epoch": 3.692101491092537, + "grad_norm": 0.3780063986778259, + "learning_rate": 1.4542912760863493e-05, + "loss": 1.2436, + "step": 12396 + }, + { + "epoch": 3.6923993372922057, + "grad_norm": 0.25369495153427124, + "learning_rate": 1.4542053452691045e-05, + "loss": 1.2426, + "step": 12397 + }, + { + "epoch": 3.6926971834918745, + "grad_norm": 0.26020002365112305, + "learning_rate": 1.454119410225987e-05, + "loss": 1.225, + "step": 12398 + }, + { + "epoch": 3.6929950296915433, + "grad_norm": 0.2710517346858978, + "learning_rate": 1.4540334709577955e-05, + "loss": 1.226, + "step": 12399 + }, + { + "epoch": 3.6932928758912116, + "grad_norm": 0.2550024390220642, + "learning_rate": 1.4539475274653298e-05, + "loss": 1.2211, + "step": 12400 + }, + { + "epoch": 3.6935907220908804, + "grad_norm": 0.2555781900882721, + "learning_rate": 1.4538615797493898e-05, + "loss": 1.2244, + "step": 12401 + }, + { + "epoch": 3.693888568290549, + "grad_norm": 0.29365408420562744, + "learning_rate": 1.4537756278107743e-05, + "loss": 1.224, + "step": 12402 + }, + { + "epoch": 3.6941864144902175, + "grad_norm": 0.25374558568000793, + "learning_rate": 1.4536896716502841e-05, + "loss": 1.2223, + "step": 12403 + }, + { + "epoch": 3.6944842606898862, + "grad_norm": 0.3000558018684387, + "learning_rate": 1.4536037112687182e-05, + "loss": 1.2434, + "step": 12404 + }, + { + "epoch": 3.694782106889555, + "grad_norm": 0.2646952271461487, + "learning_rate": 1.4535177466668768e-05, + "loss": 1.2552, + "step": 12405 + }, + { + "epoch": 3.6950799530892233, + "grad_norm": 0.29860109090805054, + "learning_rate": 1.4534317778455593e-05, + "loss": 1.2244, + "step": 12406 + }, + { + "epoch": 3.695377799288892, + "grad_norm": 0.24162277579307556, + "learning_rate": 1.4533458048055658e-05, + "loss": 1.2348, + "step": 12407 + }, + { + "epoch": 3.695675645488561, + "grad_norm": 0.2900954782962799, + "learning_rate": 1.453259827547696e-05, + "loss": 1.215, + "step": 12408 + }, + { + "epoch": 3.6959734916882296, + "grad_norm": 0.24424606561660767, + "learning_rate": 1.4531738460727497e-05, + "loss": 1.2519, + "step": 12409 + }, + { + "epoch": 3.6962713378878984, + "grad_norm": 0.2941872179508209, + "learning_rate": 1.4530878603815273e-05, + "loss": 1.2308, + "step": 12410 + }, + { + "epoch": 3.6965691840875667, + "grad_norm": 0.2427120804786682, + "learning_rate": 1.4530018704748286e-05, + "loss": 1.2309, + "step": 12411 + }, + { + "epoch": 3.6968670302872355, + "grad_norm": 0.25930750370025635, + "learning_rate": 1.4529158763534536e-05, + "loss": 1.2231, + "step": 12412 + }, + { + "epoch": 3.6971648764869043, + "grad_norm": 0.24975840747356415, + "learning_rate": 1.4528298780182025e-05, + "loss": 1.2436, + "step": 12413 + }, + { + "epoch": 3.6974627226865726, + "grad_norm": 0.24957533180713654, + "learning_rate": 1.4527438754698751e-05, + "loss": 1.2301, + "step": 12414 + }, + { + "epoch": 3.6977605688862414, + "grad_norm": 0.2980298399925232, + "learning_rate": 1.4526578687092719e-05, + "loss": 1.2324, + "step": 12415 + }, + { + "epoch": 3.69805841508591, + "grad_norm": 0.26207733154296875, + "learning_rate": 1.4525718577371928e-05, + "loss": 1.2235, + "step": 12416 + }, + { + "epoch": 3.6983562612855785, + "grad_norm": 0.26013311743736267, + "learning_rate": 1.4524858425544383e-05, + "loss": 1.2224, + "step": 12417 + }, + { + "epoch": 3.6986541074852473, + "grad_norm": 0.2547445595264435, + "learning_rate": 1.4523998231618088e-05, + "loss": 1.2296, + "step": 12418 + }, + { + "epoch": 3.698951953684916, + "grad_norm": 0.3200279474258423, + "learning_rate": 1.4523137995601042e-05, + "loss": 1.2445, + "step": 12419 + }, + { + "epoch": 3.6992497998845844, + "grad_norm": 0.272141695022583, + "learning_rate": 1.4522277717501249e-05, + "loss": 1.2268, + "step": 12420 + }, + { + "epoch": 3.699547646084253, + "grad_norm": 0.31334221363067627, + "learning_rate": 1.4521417397326717e-05, + "loss": 1.2357, + "step": 12421 + }, + { + "epoch": 3.699845492283922, + "grad_norm": 0.29066202044487, + "learning_rate": 1.452055703508545e-05, + "loss": 1.2365, + "step": 12422 + }, + { + "epoch": 3.7001433384835907, + "grad_norm": 0.3803086578845978, + "learning_rate": 1.4519696630785448e-05, + "loss": 1.2171, + "step": 12423 + }, + { + "epoch": 3.7004411846832594, + "grad_norm": 0.3314111828804016, + "learning_rate": 1.4518836184434715e-05, + "loss": 1.2319, + "step": 12424 + }, + { + "epoch": 3.7007390308829278, + "grad_norm": 0.3040827810764313, + "learning_rate": 1.4517975696041263e-05, + "loss": 1.2384, + "step": 12425 + }, + { + "epoch": 3.7010368770825965, + "grad_norm": 0.268032044172287, + "learning_rate": 1.4517115165613094e-05, + "loss": 1.2363, + "step": 12426 + }, + { + "epoch": 3.7013347232822653, + "grad_norm": 0.3218487501144409, + "learning_rate": 1.4516254593158216e-05, + "loss": 1.2402, + "step": 12427 + }, + { + "epoch": 3.7016325694819336, + "grad_norm": 0.26561835408210754, + "learning_rate": 1.4515393978684635e-05, + "loss": 1.2289, + "step": 12428 + }, + { + "epoch": 3.7019304156816024, + "grad_norm": 0.30710723996162415, + "learning_rate": 1.4514533322200359e-05, + "loss": 1.2352, + "step": 12429 + }, + { + "epoch": 3.702228261881271, + "grad_norm": 0.2447279989719391, + "learning_rate": 1.4513672623713395e-05, + "loss": 1.2141, + "step": 12430 + }, + { + "epoch": 3.7025261080809395, + "grad_norm": 0.4446869194507599, + "learning_rate": 1.4512811883231748e-05, + "loss": 1.2366, + "step": 12431 + }, + { + "epoch": 3.7028239542806083, + "grad_norm": 0.3563823103904724, + "learning_rate": 1.4511951100763428e-05, + "loss": 1.2334, + "step": 12432 + }, + { + "epoch": 3.703121800480277, + "grad_norm": 0.3404117226600647, + "learning_rate": 1.4511090276316444e-05, + "loss": 1.2464, + "step": 12433 + }, + { + "epoch": 3.7034196466799454, + "grad_norm": 0.2750464379787445, + "learning_rate": 1.4510229409898806e-05, + "loss": 1.2399, + "step": 12434 + }, + { + "epoch": 3.703717492879614, + "grad_norm": 0.4659253656864166, + "learning_rate": 1.4509368501518526e-05, + "loss": 1.2372, + "step": 12435 + }, + { + "epoch": 3.704015339079283, + "grad_norm": 0.2898559868335724, + "learning_rate": 1.4508507551183606e-05, + "loss": 1.2187, + "step": 12436 + }, + { + "epoch": 3.7043131852789517, + "grad_norm": 0.3510069251060486, + "learning_rate": 1.4507646558902062e-05, + "loss": 1.2468, + "step": 12437 + }, + { + "epoch": 3.7046110314786205, + "grad_norm": 0.26242899894714355, + "learning_rate": 1.4506785524681898e-05, + "loss": 1.2136, + "step": 12438 + }, + { + "epoch": 3.704908877678289, + "grad_norm": 0.35440772771835327, + "learning_rate": 1.4505924448531137e-05, + "loss": 1.2418, + "step": 12439 + }, + { + "epoch": 3.7052067238779576, + "grad_norm": 0.25245949625968933, + "learning_rate": 1.450506333045778e-05, + "loss": 1.2328, + "step": 12440 + }, + { + "epoch": 3.7055045700776263, + "grad_norm": 0.27578699588775635, + "learning_rate": 1.4504202170469842e-05, + "loss": 1.2221, + "step": 12441 + }, + { + "epoch": 3.7058024162772947, + "grad_norm": 0.31687596440315247, + "learning_rate": 1.450334096857534e-05, + "loss": 1.2126, + "step": 12442 + }, + { + "epoch": 3.7061002624769634, + "grad_norm": 0.26394516229629517, + "learning_rate": 1.4502479724782272e-05, + "loss": 1.2167, + "step": 12443 + }, + { + "epoch": 3.706398108676632, + "grad_norm": 0.3684418201446533, + "learning_rate": 1.4501618439098666e-05, + "loss": 1.2383, + "step": 12444 + }, + { + "epoch": 3.7066959548763005, + "grad_norm": 0.2711924612522125, + "learning_rate": 1.450075711153253e-05, + "loss": 1.2403, + "step": 12445 + }, + { + "epoch": 3.7069938010759693, + "grad_norm": 0.3082464039325714, + "learning_rate": 1.449989574209188e-05, + "loss": 1.2235, + "step": 12446 + }, + { + "epoch": 3.707291647275638, + "grad_norm": 0.25182807445526123, + "learning_rate": 1.4499034330784725e-05, + "loss": 1.2322, + "step": 12447 + }, + { + "epoch": 3.7075894934753064, + "grad_norm": 0.24612084031105042, + "learning_rate": 1.4498172877619083e-05, + "loss": 1.2261, + "step": 12448 + }, + { + "epoch": 3.707887339674975, + "grad_norm": 0.44000059366226196, + "learning_rate": 1.4497311382602967e-05, + "loss": 1.25, + "step": 12449 + }, + { + "epoch": 3.708185185874644, + "grad_norm": 0.2966899275779724, + "learning_rate": 1.4496449845744393e-05, + "loss": 1.2317, + "step": 12450 + }, + { + "epoch": 3.7084830320743127, + "grad_norm": 0.3248160481452942, + "learning_rate": 1.449558826705138e-05, + "loss": 1.2416, + "step": 12451 + }, + { + "epoch": 3.7087808782739815, + "grad_norm": 0.2693547010421753, + "learning_rate": 1.4494726646531936e-05, + "loss": 1.2145, + "step": 12452 + }, + { + "epoch": 3.70907872447365, + "grad_norm": 0.6153912544250488, + "learning_rate": 1.4493864984194089e-05, + "loss": 1.2287, + "step": 12453 + }, + { + "epoch": 3.7093765706733186, + "grad_norm": 0.3109544813632965, + "learning_rate": 1.4493003280045843e-05, + "loss": 1.2349, + "step": 12454 + }, + { + "epoch": 3.7096744168729874, + "grad_norm": 0.26281970739364624, + "learning_rate": 1.4492141534095222e-05, + "loss": 1.2342, + "step": 12455 + }, + { + "epoch": 3.7099722630726557, + "grad_norm": 0.26653775572776794, + "learning_rate": 1.4491279746350246e-05, + "loss": 1.2257, + "step": 12456 + }, + { + "epoch": 3.7102701092723245, + "grad_norm": 0.23921839892864227, + "learning_rate": 1.4490417916818929e-05, + "loss": 1.2122, + "step": 12457 + }, + { + "epoch": 3.7105679554719933, + "grad_norm": 0.24738293886184692, + "learning_rate": 1.4489556045509287e-05, + "loss": 1.2429, + "step": 12458 + }, + { + "epoch": 3.7108658016716616, + "grad_norm": 0.24981005489826202, + "learning_rate": 1.4488694132429348e-05, + "loss": 1.2227, + "step": 12459 + }, + { + "epoch": 3.7111636478713304, + "grad_norm": 0.2284761369228363, + "learning_rate": 1.4487832177587121e-05, + "loss": 1.2197, + "step": 12460 + }, + { + "epoch": 3.711461494070999, + "grad_norm": 0.24517232179641724, + "learning_rate": 1.4486970180990629e-05, + "loss": 1.2362, + "step": 12461 + }, + { + "epoch": 3.711759340270668, + "grad_norm": 0.2507041096687317, + "learning_rate": 1.4486108142647894e-05, + "loss": 1.2249, + "step": 12462 + }, + { + "epoch": 3.7120571864703362, + "grad_norm": 0.2416991889476776, + "learning_rate": 1.4485246062566938e-05, + "loss": 1.231, + "step": 12463 + }, + { + "epoch": 3.712355032670005, + "grad_norm": 0.246464803814888, + "learning_rate": 1.4484383940755776e-05, + "loss": 1.2286, + "step": 12464 + }, + { + "epoch": 3.7126528788696738, + "grad_norm": 0.23346826434135437, + "learning_rate": 1.4483521777222428e-05, + "loss": 1.2312, + "step": 12465 + }, + { + "epoch": 3.7129507250693425, + "grad_norm": 0.23480015993118286, + "learning_rate": 1.4482659571974924e-05, + "loss": 1.2303, + "step": 12466 + }, + { + "epoch": 3.713248571269011, + "grad_norm": 0.23757559061050415, + "learning_rate": 1.4481797325021281e-05, + "loss": 1.219, + "step": 12467 + }, + { + "epoch": 3.7135464174686796, + "grad_norm": 0.25187939405441284, + "learning_rate": 1.4480935036369519e-05, + "loss": 1.2272, + "step": 12468 + }, + { + "epoch": 3.7138442636683484, + "grad_norm": 0.2528476417064667, + "learning_rate": 1.4480072706027661e-05, + "loss": 1.2181, + "step": 12469 + }, + { + "epoch": 3.7141421098680167, + "grad_norm": 0.25194936990737915, + "learning_rate": 1.4479210334003737e-05, + "loss": 1.2198, + "step": 12470 + }, + { + "epoch": 3.7144399560676855, + "grad_norm": 0.23710715770721436, + "learning_rate": 1.4478347920305761e-05, + "loss": 1.2315, + "step": 12471 + }, + { + "epoch": 3.7147378022673543, + "grad_norm": 0.2685161828994751, + "learning_rate": 1.4477485464941766e-05, + "loss": 1.2192, + "step": 12472 + }, + { + "epoch": 3.7150356484670226, + "grad_norm": 0.23555238544940948, + "learning_rate": 1.4476622967919766e-05, + "loss": 1.226, + "step": 12473 + }, + { + "epoch": 3.7153334946666914, + "grad_norm": 0.24198657274246216, + "learning_rate": 1.4475760429247793e-05, + "loss": 1.2354, + "step": 12474 + }, + { + "epoch": 3.71563134086636, + "grad_norm": 0.2961115539073944, + "learning_rate": 1.4474897848933872e-05, + "loss": 1.2279, + "step": 12475 + }, + { + "epoch": 3.715929187066029, + "grad_norm": 0.33702465891838074, + "learning_rate": 1.4474035226986025e-05, + "loss": 1.2342, + "step": 12476 + }, + { + "epoch": 3.7162270332656977, + "grad_norm": 0.23954392969608307, + "learning_rate": 1.4473172563412277e-05, + "loss": 1.2345, + "step": 12477 + }, + { + "epoch": 3.716524879465366, + "grad_norm": 0.28280097246170044, + "learning_rate": 1.4472309858220657e-05, + "loss": 1.2304, + "step": 12478 + }, + { + "epoch": 3.716822725665035, + "grad_norm": 0.24391360580921173, + "learning_rate": 1.4471447111419189e-05, + "loss": 1.2194, + "step": 12479 + }, + { + "epoch": 3.7171205718647036, + "grad_norm": 0.28991103172302246, + "learning_rate": 1.4470584323015904e-05, + "loss": 1.2343, + "step": 12480 + }, + { + "epoch": 3.717418418064372, + "grad_norm": 0.26414933800697327, + "learning_rate": 1.4469721493018827e-05, + "loss": 1.2309, + "step": 12481 + }, + { + "epoch": 3.7177162642640407, + "grad_norm": 0.33833932876586914, + "learning_rate": 1.4468858621435984e-05, + "loss": 1.2396, + "step": 12482 + }, + { + "epoch": 3.7180141104637094, + "grad_norm": 0.2547779381275177, + "learning_rate": 1.4467995708275404e-05, + "loss": 1.221, + "step": 12483 + }, + { + "epoch": 3.7183119566633778, + "grad_norm": 0.28853312134742737, + "learning_rate": 1.4467132753545114e-05, + "loss": 1.2413, + "step": 12484 + }, + { + "epoch": 3.7186098028630465, + "grad_norm": 0.2745019793510437, + "learning_rate": 1.4466269757253148e-05, + "loss": 1.2235, + "step": 12485 + }, + { + "epoch": 3.7189076490627153, + "grad_norm": 0.24493719637393951, + "learning_rate": 1.446540671940753e-05, + "loss": 1.2381, + "step": 12486 + }, + { + "epoch": 3.7192054952623836, + "grad_norm": 0.29772064089775085, + "learning_rate": 1.4464543640016295e-05, + "loss": 1.2106, + "step": 12487 + }, + { + "epoch": 3.7195033414620524, + "grad_norm": 0.2681344747543335, + "learning_rate": 1.4463680519087466e-05, + "loss": 1.2276, + "step": 12488 + }, + { + "epoch": 3.719801187661721, + "grad_norm": 0.3461523652076721, + "learning_rate": 1.446281735662908e-05, + "loss": 1.2286, + "step": 12489 + }, + { + "epoch": 3.72009903386139, + "grad_norm": 0.24688516557216644, + "learning_rate": 1.446195415264916e-05, + "loss": 1.2279, + "step": 12490 + }, + { + "epoch": 3.7203968800610587, + "grad_norm": 0.2979736626148224, + "learning_rate": 1.4461090907155746e-05, + "loss": 1.2172, + "step": 12491 + }, + { + "epoch": 3.720694726260727, + "grad_norm": 0.25735822319984436, + "learning_rate": 1.4460227620156864e-05, + "loss": 1.2184, + "step": 12492 + }, + { + "epoch": 3.720992572460396, + "grad_norm": 0.31744399666786194, + "learning_rate": 1.4459364291660548e-05, + "loss": 1.219, + "step": 12493 + }, + { + "epoch": 3.7212904186600646, + "grad_norm": 0.2509385943412781, + "learning_rate": 1.4458500921674828e-05, + "loss": 1.2401, + "step": 12494 + }, + { + "epoch": 3.721588264859733, + "grad_norm": 0.26265573501586914, + "learning_rate": 1.4457637510207738e-05, + "loss": 1.2403, + "step": 12495 + }, + { + "epoch": 3.7218861110594017, + "grad_norm": 0.2772224247455597, + "learning_rate": 1.445677405726731e-05, + "loss": 1.2316, + "step": 12496 + }, + { + "epoch": 3.7221839572590705, + "grad_norm": 0.26284000277519226, + "learning_rate": 1.4455910562861583e-05, + "loss": 1.2316, + "step": 12497 + }, + { + "epoch": 3.722481803458739, + "grad_norm": 0.25751689076423645, + "learning_rate": 1.4455047026998584e-05, + "loss": 1.2318, + "step": 12498 + }, + { + "epoch": 3.7227796496584076, + "grad_norm": 0.29437172412872314, + "learning_rate": 1.4454183449686349e-05, + "loss": 1.2412, + "step": 12499 + }, + { + "epoch": 3.7230774958580763, + "grad_norm": 0.2750532627105713, + "learning_rate": 1.4453319830932917e-05, + "loss": 1.2396, + "step": 12500 + }, + { + "epoch": 3.7230774958580763, + "eval_loss": 1.3298418521881104, + "eval_runtime": 23.7313, + "eval_samples_per_second": 73.068, + "eval_steps_per_second": 4.593, + "step": 12500 + }, + { + "epoch": 3.7233753420577447, + "grad_norm": 0.25021812319755554, + "learning_rate": 1.4452456170746318e-05, + "loss": 1.2211, + "step": 12501 + }, + { + "epoch": 3.7236731882574134, + "grad_norm": 0.25232037901878357, + "learning_rate": 1.4451592469134585e-05, + "loss": 1.2192, + "step": 12502 + }, + { + "epoch": 3.723971034457082, + "grad_norm": 0.25040093064308167, + "learning_rate": 1.4450728726105762e-05, + "loss": 1.2305, + "step": 12503 + }, + { + "epoch": 3.724268880656751, + "grad_norm": 0.2766180634498596, + "learning_rate": 1.4449864941667881e-05, + "loss": 1.2185, + "step": 12504 + }, + { + "epoch": 3.7245667268564198, + "grad_norm": 0.2391878068447113, + "learning_rate": 1.4449001115828978e-05, + "loss": 1.2395, + "step": 12505 + }, + { + "epoch": 3.724864573056088, + "grad_norm": 0.24679675698280334, + "learning_rate": 1.444813724859709e-05, + "loss": 1.2331, + "step": 12506 + }, + { + "epoch": 3.725162419255757, + "grad_norm": 0.2730211317539215, + "learning_rate": 1.4447273339980254e-05, + "loss": 1.2278, + "step": 12507 + }, + { + "epoch": 3.7254602654554256, + "grad_norm": 0.2508825957775116, + "learning_rate": 1.4446409389986506e-05, + "loss": 1.2191, + "step": 12508 + }, + { + "epoch": 3.725758111655094, + "grad_norm": 0.39348164200782776, + "learning_rate": 1.444554539862389e-05, + "loss": 1.2394, + "step": 12509 + }, + { + "epoch": 3.7260559578547627, + "grad_norm": 0.35554832220077515, + "learning_rate": 1.444468136590044e-05, + "loss": 1.2251, + "step": 12510 + }, + { + "epoch": 3.7263538040544315, + "grad_norm": 0.2719554305076599, + "learning_rate": 1.4443817291824198e-05, + "loss": 1.2342, + "step": 12511 + }, + { + "epoch": 3.7266516502541, + "grad_norm": 0.28637775778770447, + "learning_rate": 1.4442953176403199e-05, + "loss": 1.2306, + "step": 12512 + }, + { + "epoch": 3.7269494964537686, + "grad_norm": 0.2555995583534241, + "learning_rate": 1.4442089019645484e-05, + "loss": 1.2222, + "step": 12513 + }, + { + "epoch": 3.7272473426534374, + "grad_norm": 0.24985826015472412, + "learning_rate": 1.4441224821559093e-05, + "loss": 1.2251, + "step": 12514 + }, + { + "epoch": 3.7275451888531057, + "grad_norm": 0.26199600100517273, + "learning_rate": 1.4440360582152069e-05, + "loss": 1.223, + "step": 12515 + }, + { + "epoch": 3.7278430350527745, + "grad_norm": 0.24520473182201385, + "learning_rate": 1.443949630143245e-05, + "loss": 1.2128, + "step": 12516 + }, + { + "epoch": 3.7281408812524433, + "grad_norm": 0.28552722930908203, + "learning_rate": 1.443863197940828e-05, + "loss": 1.2402, + "step": 12517 + }, + { + "epoch": 3.728438727452112, + "grad_norm": 0.2536368668079376, + "learning_rate": 1.4437767616087595e-05, + "loss": 1.2406, + "step": 12518 + }, + { + "epoch": 3.728736573651781, + "grad_norm": 0.2644293010234833, + "learning_rate": 1.4436903211478442e-05, + "loss": 1.2252, + "step": 12519 + }, + { + "epoch": 3.729034419851449, + "grad_norm": 0.24547292292118073, + "learning_rate": 1.443603876558886e-05, + "loss": 1.2244, + "step": 12520 + }, + { + "epoch": 3.729332266051118, + "grad_norm": 0.27538245916366577, + "learning_rate": 1.4435174278426898e-05, + "loss": 1.2258, + "step": 12521 + }, + { + "epoch": 3.7296301122507867, + "grad_norm": 0.34969258308410645, + "learning_rate": 1.4434309750000593e-05, + "loss": 1.2248, + "step": 12522 + }, + { + "epoch": 3.729927958450455, + "grad_norm": 0.3113882839679718, + "learning_rate": 1.4433445180317986e-05, + "loss": 1.2179, + "step": 12523 + }, + { + "epoch": 3.7302258046501238, + "grad_norm": 0.26758047938346863, + "learning_rate": 1.4432580569387131e-05, + "loss": 1.2318, + "step": 12524 + }, + { + "epoch": 3.7305236508497925, + "grad_norm": 0.285544753074646, + "learning_rate": 1.4431715917216063e-05, + "loss": 1.2255, + "step": 12525 + }, + { + "epoch": 3.730821497049461, + "grad_norm": 0.32492971420288086, + "learning_rate": 1.4430851223812827e-05, + "loss": 1.2467, + "step": 12526 + }, + { + "epoch": 3.7311193432491296, + "grad_norm": 0.24724063277244568, + "learning_rate": 1.4429986489185475e-05, + "loss": 1.2159, + "step": 12527 + }, + { + "epoch": 3.7314171894487984, + "grad_norm": 0.2818030118942261, + "learning_rate": 1.4429121713342049e-05, + "loss": 1.2311, + "step": 12528 + }, + { + "epoch": 3.731715035648467, + "grad_norm": 0.24205012619495392, + "learning_rate": 1.4428256896290592e-05, + "loss": 1.21, + "step": 12529 + }, + { + "epoch": 3.7320128818481355, + "grad_norm": 0.3191501498222351, + "learning_rate": 1.4427392038039152e-05, + "loss": 1.2321, + "step": 12530 + }, + { + "epoch": 3.7323107280478043, + "grad_norm": 0.2558881342411041, + "learning_rate": 1.4426527138595773e-05, + "loss": 1.2289, + "step": 12531 + }, + { + "epoch": 3.732608574247473, + "grad_norm": 0.25629472732543945, + "learning_rate": 1.4425662197968509e-05, + "loss": 1.2254, + "step": 12532 + }, + { + "epoch": 3.732906420447142, + "grad_norm": 0.26400554180145264, + "learning_rate": 1.44247972161654e-05, + "loss": 1.2387, + "step": 12533 + }, + { + "epoch": 3.73320426664681, + "grad_norm": 0.2637544572353363, + "learning_rate": 1.44239321931945e-05, + "loss": 1.2402, + "step": 12534 + }, + { + "epoch": 3.733502112846479, + "grad_norm": 0.2510688602924347, + "learning_rate": 1.4423067129063852e-05, + "loss": 1.2342, + "step": 12535 + }, + { + "epoch": 3.7337999590461477, + "grad_norm": 0.2486250400543213, + "learning_rate": 1.4422202023781506e-05, + "loss": 1.2252, + "step": 12536 + }, + { + "epoch": 3.734097805245816, + "grad_norm": 0.2711244225502014, + "learning_rate": 1.442133687735551e-05, + "loss": 1.2361, + "step": 12537 + }, + { + "epoch": 3.734395651445485, + "grad_norm": 0.25015541911125183, + "learning_rate": 1.4420471689793914e-05, + "loss": 1.2135, + "step": 12538 + }, + { + "epoch": 3.7346934976451536, + "grad_norm": 0.2576945722103119, + "learning_rate": 1.4419606461104767e-05, + "loss": 1.2207, + "step": 12539 + }, + { + "epoch": 3.734991343844822, + "grad_norm": 0.27408546209335327, + "learning_rate": 1.4418741191296122e-05, + "loss": 1.2217, + "step": 12540 + }, + { + "epoch": 3.7352891900444907, + "grad_norm": 0.32182052731513977, + "learning_rate": 1.441787588037603e-05, + "loss": 1.2415, + "step": 12541 + }, + { + "epoch": 3.7355870362441594, + "grad_norm": 0.24851801991462708, + "learning_rate": 1.4417010528352534e-05, + "loss": 1.2456, + "step": 12542 + }, + { + "epoch": 3.735884882443828, + "grad_norm": 0.33332735300064087, + "learning_rate": 1.4416145135233688e-05, + "loss": 1.2236, + "step": 12543 + }, + { + "epoch": 3.736182728643497, + "grad_norm": 0.28403955698013306, + "learning_rate": 1.4415279701027549e-05, + "loss": 1.2269, + "step": 12544 + }, + { + "epoch": 3.7364805748431653, + "grad_norm": 0.2707551121711731, + "learning_rate": 1.4414414225742167e-05, + "loss": 1.2546, + "step": 12545 + }, + { + "epoch": 3.736778421042834, + "grad_norm": 0.29916614294052124, + "learning_rate": 1.441354870938559e-05, + "loss": 1.218, + "step": 12546 + }, + { + "epoch": 3.737076267242503, + "grad_norm": 0.24943292140960693, + "learning_rate": 1.4412683151965875e-05, + "loss": 1.2347, + "step": 12547 + }, + { + "epoch": 3.737374113442171, + "grad_norm": 0.35270532965660095, + "learning_rate": 1.4411817553491074e-05, + "loss": 1.2319, + "step": 12548 + }, + { + "epoch": 3.73767195964184, + "grad_norm": 0.26501792669296265, + "learning_rate": 1.4410951913969235e-05, + "loss": 1.2242, + "step": 12549 + }, + { + "epoch": 3.7379698058415087, + "grad_norm": 0.3242575526237488, + "learning_rate": 1.4410086233408422e-05, + "loss": 1.2172, + "step": 12550 + }, + { + "epoch": 3.738267652041177, + "grad_norm": 0.3066577911376953, + "learning_rate": 1.4409220511816684e-05, + "loss": 1.2276, + "step": 12551 + }, + { + "epoch": 3.738565498240846, + "grad_norm": 0.26140496134757996, + "learning_rate": 1.4408354749202072e-05, + "loss": 1.2383, + "step": 12552 + }, + { + "epoch": 3.7388633444405146, + "grad_norm": 0.23789289593696594, + "learning_rate": 1.4407488945572645e-05, + "loss": 1.2288, + "step": 12553 + }, + { + "epoch": 3.739161190640183, + "grad_norm": 0.2502937614917755, + "learning_rate": 1.4406623100936459e-05, + "loss": 1.2389, + "step": 12554 + }, + { + "epoch": 3.7394590368398517, + "grad_norm": 0.24868594110012054, + "learning_rate": 1.4405757215301568e-05, + "loss": 1.2307, + "step": 12555 + }, + { + "epoch": 3.7397568830395205, + "grad_norm": 0.2607230246067047, + "learning_rate": 1.4404891288676029e-05, + "loss": 1.222, + "step": 12556 + }, + { + "epoch": 3.7400547292391892, + "grad_norm": 0.30206650495529175, + "learning_rate": 1.4404025321067896e-05, + "loss": 1.2189, + "step": 12557 + }, + { + "epoch": 3.740352575438858, + "grad_norm": 0.2802787125110626, + "learning_rate": 1.440315931248523e-05, + "loss": 1.2539, + "step": 12558 + }, + { + "epoch": 3.7406504216385263, + "grad_norm": 0.26320210099220276, + "learning_rate": 1.4402293262936086e-05, + "loss": 1.2384, + "step": 12559 + }, + { + "epoch": 3.740948267838195, + "grad_norm": 0.24549047648906708, + "learning_rate": 1.4401427172428521e-05, + "loss": 1.225, + "step": 12560 + }, + { + "epoch": 3.741246114037864, + "grad_norm": 0.2524031698703766, + "learning_rate": 1.4400561040970591e-05, + "loss": 1.2515, + "step": 12561 + }, + { + "epoch": 3.741543960237532, + "grad_norm": 0.23625604808330536, + "learning_rate": 1.4399694868570361e-05, + "loss": 1.2227, + "step": 12562 + }, + { + "epoch": 3.741841806437201, + "grad_norm": 0.2702176570892334, + "learning_rate": 1.4398828655235886e-05, + "loss": 1.2344, + "step": 12563 + }, + { + "epoch": 3.7421396526368698, + "grad_norm": 0.27187982201576233, + "learning_rate": 1.4397962400975222e-05, + "loss": 1.237, + "step": 12564 + }, + { + "epoch": 3.742437498836538, + "grad_norm": 0.338758647441864, + "learning_rate": 1.4397096105796435e-05, + "loss": 1.2255, + "step": 12565 + }, + { + "epoch": 3.742735345036207, + "grad_norm": 0.29826152324676514, + "learning_rate": 1.4396229769707581e-05, + "loss": 1.227, + "step": 12566 + }, + { + "epoch": 3.7430331912358756, + "grad_norm": 0.29730749130249023, + "learning_rate": 1.4395363392716717e-05, + "loss": 1.222, + "step": 12567 + }, + { + "epoch": 3.743331037435544, + "grad_norm": 0.27890071272850037, + "learning_rate": 1.4394496974831911e-05, + "loss": 1.2301, + "step": 12568 + }, + { + "epoch": 3.7436288836352127, + "grad_norm": 0.2715962827205658, + "learning_rate": 1.439363051606122e-05, + "loss": 1.2369, + "step": 12569 + }, + { + "epoch": 3.7439267298348815, + "grad_norm": 0.3789243996143341, + "learning_rate": 1.4392764016412705e-05, + "loss": 1.2358, + "step": 12570 + }, + { + "epoch": 3.7442245760345503, + "grad_norm": 0.3870760500431061, + "learning_rate": 1.439189747589443e-05, + "loss": 1.2183, + "step": 12571 + }, + { + "epoch": 3.744522422234219, + "grad_norm": 0.31367406249046326, + "learning_rate": 1.4391030894514453e-05, + "loss": 1.2339, + "step": 12572 + }, + { + "epoch": 3.7448202684338874, + "grad_norm": 0.8676227927207947, + "learning_rate": 1.4390164272280841e-05, + "loss": 1.2247, + "step": 12573 + }, + { + "epoch": 3.745118114633556, + "grad_norm": 0.29327961802482605, + "learning_rate": 1.4389297609201655e-05, + "loss": 1.2265, + "step": 12574 + }, + { + "epoch": 3.745415960833225, + "grad_norm": 0.2783328890800476, + "learning_rate": 1.4388430905284963e-05, + "loss": 1.2395, + "step": 12575 + }, + { + "epoch": 3.7457138070328932, + "grad_norm": 0.2528507709503174, + "learning_rate": 1.4387564160538821e-05, + "loss": 1.2494, + "step": 12576 + }, + { + "epoch": 3.746011653232562, + "grad_norm": 0.24488899111747742, + "learning_rate": 1.4386697374971297e-05, + "loss": 1.2313, + "step": 12577 + }, + { + "epoch": 3.746309499432231, + "grad_norm": 0.24234485626220703, + "learning_rate": 1.4385830548590454e-05, + "loss": 1.2361, + "step": 12578 + }, + { + "epoch": 3.746607345631899, + "grad_norm": 0.24857281148433685, + "learning_rate": 1.4384963681404358e-05, + "loss": 1.2244, + "step": 12579 + }, + { + "epoch": 3.746905191831568, + "grad_norm": 0.2506444454193115, + "learning_rate": 1.4384096773421074e-05, + "loss": 1.222, + "step": 12580 + }, + { + "epoch": 3.7472030380312367, + "grad_norm": 0.24715283513069153, + "learning_rate": 1.4383229824648668e-05, + "loss": 1.2336, + "step": 12581 + }, + { + "epoch": 3.747500884230905, + "grad_norm": 0.24358782172203064, + "learning_rate": 1.4382362835095208e-05, + "loss": 1.215, + "step": 12582 + }, + { + "epoch": 3.7477987304305738, + "grad_norm": 0.24742436408996582, + "learning_rate": 1.4381495804768757e-05, + "loss": 1.2207, + "step": 12583 + }, + { + "epoch": 3.7480965766302425, + "grad_norm": 0.2504050135612488, + "learning_rate": 1.4380628733677382e-05, + "loss": 1.241, + "step": 12584 + }, + { + "epoch": 3.7483944228299113, + "grad_norm": 0.24546143412590027, + "learning_rate": 1.4379761621829147e-05, + "loss": 1.2237, + "step": 12585 + }, + { + "epoch": 3.74869226902958, + "grad_norm": 0.23761746287345886, + "learning_rate": 1.437889446923213e-05, + "loss": 1.2434, + "step": 12586 + }, + { + "epoch": 3.7489901152292484, + "grad_norm": 0.2457745224237442, + "learning_rate": 1.437802727589439e-05, + "loss": 1.2294, + "step": 12587 + }, + { + "epoch": 3.749287961428917, + "grad_norm": 0.23757010698318481, + "learning_rate": 1.4377160041823996e-05, + "loss": 1.2546, + "step": 12588 + }, + { + "epoch": 3.749585807628586, + "grad_norm": 0.2490760087966919, + "learning_rate": 1.4376292767029023e-05, + "loss": 1.2185, + "step": 12589 + }, + { + "epoch": 3.7498836538282543, + "grad_norm": 0.25601309537887573, + "learning_rate": 1.437542545151753e-05, + "loss": 1.2462, + "step": 12590 + }, + { + "epoch": 3.750181500027923, + "grad_norm": 0.23233304917812347, + "learning_rate": 1.4374558095297593e-05, + "loss": 1.2206, + "step": 12591 + }, + { + "epoch": 3.750479346227592, + "grad_norm": 0.24676714837551117, + "learning_rate": 1.4373690698377283e-05, + "loss": 1.2526, + "step": 12592 + }, + { + "epoch": 3.75077719242726, + "grad_norm": 0.2371000498533249, + "learning_rate": 1.4372823260764665e-05, + "loss": 1.2373, + "step": 12593 + }, + { + "epoch": 3.751075038626929, + "grad_norm": 0.24489223957061768, + "learning_rate": 1.4371955782467813e-05, + "loss": 1.2283, + "step": 12594 + }, + { + "epoch": 3.7513728848265977, + "grad_norm": 0.23973029851913452, + "learning_rate": 1.4371088263494797e-05, + "loss": 1.2308, + "step": 12595 + }, + { + "epoch": 3.7516707310262665, + "grad_norm": 0.24719129502773285, + "learning_rate": 1.4370220703853688e-05, + "loss": 1.2271, + "step": 12596 + }, + { + "epoch": 3.751968577225935, + "grad_norm": 0.23591454327106476, + "learning_rate": 1.436935310355256e-05, + "loss": 1.2325, + "step": 12597 + }, + { + "epoch": 3.7522664234256036, + "grad_norm": 0.23700574040412903, + "learning_rate": 1.4368485462599479e-05, + "loss": 1.2377, + "step": 12598 + }, + { + "epoch": 3.7525642696252723, + "grad_norm": 0.2341172993183136, + "learning_rate": 1.4367617781002525e-05, + "loss": 1.242, + "step": 12599 + }, + { + "epoch": 3.752862115824941, + "grad_norm": 0.24162684381008148, + "learning_rate": 1.4366750058769768e-05, + "loss": 1.2362, + "step": 12600 + }, + { + "epoch": 3.7531599620246094, + "grad_norm": 0.23593008518218994, + "learning_rate": 1.4365882295909278e-05, + "loss": 1.2418, + "step": 12601 + }, + { + "epoch": 3.753457808224278, + "grad_norm": 0.24879764020442963, + "learning_rate": 1.4365014492429132e-05, + "loss": 1.2363, + "step": 12602 + }, + { + "epoch": 3.753755654423947, + "grad_norm": 0.24039660394191742, + "learning_rate": 1.4364146648337403e-05, + "loss": 1.2264, + "step": 12603 + }, + { + "epoch": 3.7540535006236153, + "grad_norm": 0.24104157090187073, + "learning_rate": 1.4363278763642164e-05, + "loss": 1.2338, + "step": 12604 + }, + { + "epoch": 3.754351346823284, + "grad_norm": 0.251393586397171, + "learning_rate": 1.4362410838351492e-05, + "loss": 1.247, + "step": 12605 + }, + { + "epoch": 3.754649193022953, + "grad_norm": 0.23401515185832977, + "learning_rate": 1.4361542872473464e-05, + "loss": 1.2235, + "step": 12606 + }, + { + "epoch": 3.754947039222621, + "grad_norm": 0.23661254346370697, + "learning_rate": 1.4360674866016152e-05, + "loss": 1.236, + "step": 12607 + }, + { + "epoch": 3.75524488542229, + "grad_norm": 0.22960497438907623, + "learning_rate": 1.4359806818987628e-05, + "loss": 1.2149, + "step": 12608 + }, + { + "epoch": 3.7555427316219587, + "grad_norm": 0.23821663856506348, + "learning_rate": 1.4358938731395975e-05, + "loss": 1.2205, + "step": 12609 + }, + { + "epoch": 3.7558405778216275, + "grad_norm": 0.23379720747470856, + "learning_rate": 1.4358070603249267e-05, + "loss": 1.2346, + "step": 12610 + }, + { + "epoch": 3.7561384240212963, + "grad_norm": 0.23535868525505066, + "learning_rate": 1.4357202434555582e-05, + "loss": 1.22, + "step": 12611 + }, + { + "epoch": 3.7564362702209646, + "grad_norm": 0.2433944195508957, + "learning_rate": 1.4356334225322997e-05, + "loss": 1.2182, + "step": 12612 + }, + { + "epoch": 3.7567341164206334, + "grad_norm": 0.2404908686876297, + "learning_rate": 1.4355465975559586e-05, + "loss": 1.2302, + "step": 12613 + }, + { + "epoch": 3.757031962620302, + "grad_norm": 0.24126465618610382, + "learning_rate": 1.4354597685273432e-05, + "loss": 1.2345, + "step": 12614 + }, + { + "epoch": 3.7573298088199705, + "grad_norm": 0.23302330076694489, + "learning_rate": 1.4353729354472611e-05, + "loss": 1.2318, + "step": 12615 + }, + { + "epoch": 3.7576276550196392, + "grad_norm": 0.24231773614883423, + "learning_rate": 1.435286098316521e-05, + "loss": 1.2303, + "step": 12616 + }, + { + "epoch": 3.757925501219308, + "grad_norm": 0.2433927059173584, + "learning_rate": 1.4351992571359292e-05, + "loss": 1.2218, + "step": 12617 + }, + { + "epoch": 3.7582233474189763, + "grad_norm": 0.24965760111808777, + "learning_rate": 1.4351124119062949e-05, + "loss": 1.223, + "step": 12618 + }, + { + "epoch": 3.758521193618645, + "grad_norm": 0.23461966216564178, + "learning_rate": 1.4350255626284254e-05, + "loss": 1.2291, + "step": 12619 + }, + { + "epoch": 3.758819039818314, + "grad_norm": 0.23923666775226593, + "learning_rate": 1.4349387093031295e-05, + "loss": 1.2447, + "step": 12620 + }, + { + "epoch": 3.759116886017982, + "grad_norm": 0.24339085817337036, + "learning_rate": 1.4348518519312147e-05, + "loss": 1.2252, + "step": 12621 + }, + { + "epoch": 3.759414732217651, + "grad_norm": 0.2397315949201584, + "learning_rate": 1.4347649905134893e-05, + "loss": 1.2248, + "step": 12622 + }, + { + "epoch": 3.7597125784173198, + "grad_norm": 0.23497259616851807, + "learning_rate": 1.4346781250507615e-05, + "loss": 1.242, + "step": 12623 + }, + { + "epoch": 3.7600104246169885, + "grad_norm": 0.24975162744522095, + "learning_rate": 1.4345912555438393e-05, + "loss": 1.2323, + "step": 12624 + }, + { + "epoch": 3.7603082708166573, + "grad_norm": 0.23598866164684296, + "learning_rate": 1.434504381993531e-05, + "loss": 1.2245, + "step": 12625 + }, + { + "epoch": 3.7606061170163256, + "grad_norm": 0.23897448182106018, + "learning_rate": 1.4344175044006445e-05, + "loss": 1.2249, + "step": 12626 + }, + { + "epoch": 3.7609039632159944, + "grad_norm": 0.23851969838142395, + "learning_rate": 1.434330622765989e-05, + "loss": 1.2364, + "step": 12627 + }, + { + "epoch": 3.761201809415663, + "grad_norm": 0.23931705951690674, + "learning_rate": 1.4342437370903723e-05, + "loss": 1.2207, + "step": 12628 + }, + { + "epoch": 3.7614996556153315, + "grad_norm": 0.24956797063350677, + "learning_rate": 1.4341568473746026e-05, + "loss": 1.2324, + "step": 12629 + }, + { + "epoch": 3.7617975018150003, + "grad_norm": 0.2446548491716385, + "learning_rate": 1.4340699536194887e-05, + "loss": 1.2255, + "step": 12630 + }, + { + "epoch": 3.762095348014669, + "grad_norm": 0.23512233793735504, + "learning_rate": 1.4339830558258385e-05, + "loss": 1.2323, + "step": 12631 + }, + { + "epoch": 3.7623931942143374, + "grad_norm": 0.2424010932445526, + "learning_rate": 1.4338961539944611e-05, + "loss": 1.2293, + "step": 12632 + }, + { + "epoch": 3.762691040414006, + "grad_norm": 0.2611819803714752, + "learning_rate": 1.4338092481261648e-05, + "loss": 1.2458, + "step": 12633 + }, + { + "epoch": 3.762988886613675, + "grad_norm": 0.23570644855499268, + "learning_rate": 1.4337223382217582e-05, + "loss": 1.2046, + "step": 12634 + }, + { + "epoch": 3.7632867328133432, + "grad_norm": 0.24657092988491058, + "learning_rate": 1.4336354242820497e-05, + "loss": 1.2216, + "step": 12635 + }, + { + "epoch": 3.763584579013012, + "grad_norm": 0.2414436936378479, + "learning_rate": 1.4335485063078478e-05, + "loss": 1.2171, + "step": 12636 + }, + { + "epoch": 3.763882425212681, + "grad_norm": 0.26084816455841064, + "learning_rate": 1.4334615842999618e-05, + "loss": 1.2376, + "step": 12637 + }, + { + "epoch": 3.7641802714123496, + "grad_norm": 0.25345945358276367, + "learning_rate": 1.4333746582592e-05, + "loss": 1.2395, + "step": 12638 + }, + { + "epoch": 3.7644781176120183, + "grad_norm": 0.24163250625133514, + "learning_rate": 1.4332877281863708e-05, + "loss": 1.2292, + "step": 12639 + }, + { + "epoch": 3.7647759638116867, + "grad_norm": 0.24772830307483673, + "learning_rate": 1.433200794082284e-05, + "loss": 1.22, + "step": 12640 + }, + { + "epoch": 3.7650738100113554, + "grad_norm": 0.24569171667099, + "learning_rate": 1.4331138559477476e-05, + "loss": 1.2263, + "step": 12641 + }, + { + "epoch": 3.765371656211024, + "grad_norm": 0.25932878255844116, + "learning_rate": 1.4330269137835706e-05, + "loss": 1.2397, + "step": 12642 + }, + { + "epoch": 3.7656695024106925, + "grad_norm": 0.26829370856285095, + "learning_rate": 1.4329399675905618e-05, + "loss": 1.22, + "step": 12643 + }, + { + "epoch": 3.7659673486103613, + "grad_norm": 0.23489348590373993, + "learning_rate": 1.4328530173695306e-05, + "loss": 1.2226, + "step": 12644 + }, + { + "epoch": 3.76626519481003, + "grad_norm": 0.24736730754375458, + "learning_rate": 1.4327660631212856e-05, + "loss": 1.2418, + "step": 12645 + }, + { + "epoch": 3.7665630410096984, + "grad_norm": 0.23377391695976257, + "learning_rate": 1.4326791048466358e-05, + "loss": 1.2251, + "step": 12646 + }, + { + "epoch": 3.766860887209367, + "grad_norm": 0.2361942082643509, + "learning_rate": 1.4325921425463904e-05, + "loss": 1.221, + "step": 12647 + }, + { + "epoch": 3.767158733409036, + "grad_norm": 0.24728500843048096, + "learning_rate": 1.4325051762213586e-05, + "loss": 1.2502, + "step": 12648 + }, + { + "epoch": 3.7674565796087043, + "grad_norm": 0.25355592370033264, + "learning_rate": 1.432418205872349e-05, + "loss": 1.2395, + "step": 12649 + }, + { + "epoch": 3.767754425808373, + "grad_norm": 0.2558923363685608, + "learning_rate": 1.4323312315001714e-05, + "loss": 1.2358, + "step": 12650 + }, + { + "epoch": 3.768052272008042, + "grad_norm": 0.3136868178844452, + "learning_rate": 1.4322442531056346e-05, + "loss": 1.2226, + "step": 12651 + }, + { + "epoch": 3.7683501182077106, + "grad_norm": 0.26222771406173706, + "learning_rate": 1.4321572706895482e-05, + "loss": 1.2319, + "step": 12652 + }, + { + "epoch": 3.7686479644073794, + "grad_norm": 0.2818720042705536, + "learning_rate": 1.432070284252721e-05, + "loss": 1.2479, + "step": 12653 + }, + { + "epoch": 3.7689458106070477, + "grad_norm": 0.27556848526000977, + "learning_rate": 1.4319832937959626e-05, + "loss": 1.2295, + "step": 12654 + }, + { + "epoch": 3.7692436568067165, + "grad_norm": 0.26749783754348755, + "learning_rate": 1.4318962993200821e-05, + "loss": 1.2296, + "step": 12655 + }, + { + "epoch": 3.7695415030063852, + "grad_norm": 0.2664186656475067, + "learning_rate": 1.4318093008258892e-05, + "loss": 1.2293, + "step": 12656 + }, + { + "epoch": 3.7698393492060536, + "grad_norm": 0.2746717035770416, + "learning_rate": 1.4317222983141934e-05, + "loss": 1.2295, + "step": 12657 + }, + { + "epoch": 3.7701371954057223, + "grad_norm": 0.2837357521057129, + "learning_rate": 1.4316352917858038e-05, + "loss": 1.2262, + "step": 12658 + }, + { + "epoch": 3.770435041605391, + "grad_norm": 0.27733665704727173, + "learning_rate": 1.43154828124153e-05, + "loss": 1.2141, + "step": 12659 + }, + { + "epoch": 3.7707328878050594, + "grad_norm": 0.2695692777633667, + "learning_rate": 1.4314612666821817e-05, + "loss": 1.2299, + "step": 12660 + }, + { + "epoch": 3.771030734004728, + "grad_norm": 0.2996913492679596, + "learning_rate": 1.4313742481085684e-05, + "loss": 1.2332, + "step": 12661 + }, + { + "epoch": 3.771328580204397, + "grad_norm": 0.31992167234420776, + "learning_rate": 1.4312872255214996e-05, + "loss": 1.2174, + "step": 12662 + }, + { + "epoch": 3.7716264264040658, + "grad_norm": 0.2649223208427429, + "learning_rate": 1.431200198921785e-05, + "loss": 1.2314, + "step": 12663 + }, + { + "epoch": 3.771924272603734, + "grad_norm": 0.3016444146633148, + "learning_rate": 1.4311131683102347e-05, + "loss": 1.2264, + "step": 12664 + }, + { + "epoch": 3.772222118803403, + "grad_norm": 0.3113537132740021, + "learning_rate": 1.4310261336876576e-05, + "loss": 1.2258, + "step": 12665 + }, + { + "epoch": 3.7725199650030716, + "grad_norm": 0.3229823708534241, + "learning_rate": 1.4309390950548639e-05, + "loss": 1.2267, + "step": 12666 + }, + { + "epoch": 3.7728178112027404, + "grad_norm": 0.2772013247013092, + "learning_rate": 1.4308520524126634e-05, + "loss": 1.2367, + "step": 12667 + }, + { + "epoch": 3.7731156574024087, + "grad_norm": 0.2575990855693817, + "learning_rate": 1.430765005761866e-05, + "loss": 1.2111, + "step": 12668 + }, + { + "epoch": 3.7734135036020775, + "grad_norm": 0.2997469902038574, + "learning_rate": 1.4306779551032817e-05, + "loss": 1.2358, + "step": 12669 + }, + { + "epoch": 3.7737113498017463, + "grad_norm": 0.28576475381851196, + "learning_rate": 1.4305909004377198e-05, + "loss": 1.2455, + "step": 12670 + }, + { + "epoch": 3.7740091960014146, + "grad_norm": 0.2684483528137207, + "learning_rate": 1.4305038417659912e-05, + "loss": 1.2355, + "step": 12671 + }, + { + "epoch": 3.7743070422010834, + "grad_norm": 0.23956318199634552, + "learning_rate": 1.4304167790889048e-05, + "loss": 1.2363, + "step": 12672 + }, + { + "epoch": 3.774604888400752, + "grad_norm": 0.2986161708831787, + "learning_rate": 1.4303297124072713e-05, + "loss": 1.227, + "step": 12673 + }, + { + "epoch": 3.7749027346004205, + "grad_norm": 0.2462140917778015, + "learning_rate": 1.4302426417219009e-05, + "loss": 1.2394, + "step": 12674 + }, + { + "epoch": 3.7752005808000892, + "grad_norm": 0.2997017204761505, + "learning_rate": 1.430155567033603e-05, + "loss": 1.2514, + "step": 12675 + }, + { + "epoch": 3.775498426999758, + "grad_norm": 0.251280277967453, + "learning_rate": 1.4300684883431886e-05, + "loss": 1.2448, + "step": 12676 + }, + { + "epoch": 3.775796273199427, + "grad_norm": 0.27205878496170044, + "learning_rate": 1.4299814056514672e-05, + "loss": 1.2221, + "step": 12677 + }, + { + "epoch": 3.7760941193990956, + "grad_norm": 0.29926252365112305, + "learning_rate": 1.4298943189592491e-05, + "loss": 1.2315, + "step": 12678 + }, + { + "epoch": 3.776391965598764, + "grad_norm": 0.2541559636592865, + "learning_rate": 1.4298072282673448e-05, + "loss": 1.2283, + "step": 12679 + }, + { + "epoch": 3.7766898117984327, + "grad_norm": 0.28829267621040344, + "learning_rate": 1.4297201335765643e-05, + "loss": 1.2127, + "step": 12680 + }, + { + "epoch": 3.7769876579981014, + "grad_norm": 0.2551625967025757, + "learning_rate": 1.4296330348877183e-05, + "loss": 1.233, + "step": 12681 + }, + { + "epoch": 3.7772855041977698, + "grad_norm": 0.2720828652381897, + "learning_rate": 1.429545932201617e-05, + "loss": 1.2149, + "step": 12682 + }, + { + "epoch": 3.7775833503974385, + "grad_norm": 0.30832982063293457, + "learning_rate": 1.4294588255190704e-05, + "loss": 1.2323, + "step": 12683 + }, + { + "epoch": 3.7778811965971073, + "grad_norm": 0.26237520575523376, + "learning_rate": 1.4293717148408895e-05, + "loss": 1.2282, + "step": 12684 + }, + { + "epoch": 3.7781790427967756, + "grad_norm": 0.30137020349502563, + "learning_rate": 1.4292846001678842e-05, + "loss": 1.251, + "step": 12685 + }, + { + "epoch": 3.7784768889964444, + "grad_norm": 0.2443857342004776, + "learning_rate": 1.4291974815008656e-05, + "loss": 1.2236, + "step": 12686 + }, + { + "epoch": 3.778774735196113, + "grad_norm": 0.278854638338089, + "learning_rate": 1.4291103588406439e-05, + "loss": 1.2435, + "step": 12687 + }, + { + "epoch": 3.7790725813957815, + "grad_norm": 0.27049946784973145, + "learning_rate": 1.4290232321880297e-05, + "loss": 1.2415, + "step": 12688 + }, + { + "epoch": 3.7793704275954503, + "grad_norm": 0.28129956126213074, + "learning_rate": 1.4289361015438337e-05, + "loss": 1.235, + "step": 12689 + }, + { + "epoch": 3.779668273795119, + "grad_norm": 0.30038005113601685, + "learning_rate": 1.4288489669088663e-05, + "loss": 1.2378, + "step": 12690 + }, + { + "epoch": 3.779966119994788, + "grad_norm": 0.2376733124256134, + "learning_rate": 1.4287618282839388e-05, + "loss": 1.232, + "step": 12691 + }, + { + "epoch": 3.7802639661944566, + "grad_norm": 0.31067323684692383, + "learning_rate": 1.4286746856698614e-05, + "loss": 1.2307, + "step": 12692 + }, + { + "epoch": 3.780561812394125, + "grad_norm": 0.24436017870903015, + "learning_rate": 1.4285875390674448e-05, + "loss": 1.2438, + "step": 12693 + }, + { + "epoch": 3.7808596585937937, + "grad_norm": 0.2674926519393921, + "learning_rate": 1.4285003884775002e-05, + "loss": 1.2166, + "step": 12694 + }, + { + "epoch": 3.7811575047934625, + "grad_norm": 0.25261390209198, + "learning_rate": 1.4284132339008383e-05, + "loss": 1.2287, + "step": 12695 + }, + { + "epoch": 3.781455350993131, + "grad_norm": 0.2543044686317444, + "learning_rate": 1.4283260753382696e-05, + "loss": 1.2535, + "step": 12696 + }, + { + "epoch": 3.7817531971927996, + "grad_norm": 0.2796723246574402, + "learning_rate": 1.4282389127906053e-05, + "loss": 1.2226, + "step": 12697 + }, + { + "epoch": 3.7820510433924683, + "grad_norm": 0.2589174807071686, + "learning_rate": 1.4281517462586567e-05, + "loss": 1.2269, + "step": 12698 + }, + { + "epoch": 3.7823488895921367, + "grad_norm": 0.2836647033691406, + "learning_rate": 1.4280645757432343e-05, + "loss": 1.2506, + "step": 12699 + }, + { + "epoch": 3.7826467357918054, + "grad_norm": 0.26077181100845337, + "learning_rate": 1.4279774012451493e-05, + "loss": 1.2353, + "step": 12700 + }, + { + "epoch": 3.782944581991474, + "grad_norm": 0.2713318467140198, + "learning_rate": 1.4278902227652128e-05, + "loss": 1.2497, + "step": 12701 + }, + { + "epoch": 3.7832424281911425, + "grad_norm": 0.24418582022190094, + "learning_rate": 1.4278030403042357e-05, + "loss": 1.2176, + "step": 12702 + }, + { + "epoch": 3.7835402743908113, + "grad_norm": 0.28171828389167786, + "learning_rate": 1.4277158538630294e-05, + "loss": 1.2286, + "step": 12703 + }, + { + "epoch": 3.78383812059048, + "grad_norm": 0.26205557584762573, + "learning_rate": 1.4276286634424048e-05, + "loss": 1.2259, + "step": 12704 + }, + { + "epoch": 3.784135966790149, + "grad_norm": 0.26702257990837097, + "learning_rate": 1.4275414690431735e-05, + "loss": 1.2135, + "step": 12705 + }, + { + "epoch": 3.7844338129898176, + "grad_norm": 0.2754550874233246, + "learning_rate": 1.4274542706661465e-05, + "loss": 1.2265, + "step": 12706 + }, + { + "epoch": 3.784731659189486, + "grad_norm": 0.29837870597839355, + "learning_rate": 1.427367068312135e-05, + "loss": 1.2318, + "step": 12707 + }, + { + "epoch": 3.7850295053891547, + "grad_norm": 0.26493605971336365, + "learning_rate": 1.4272798619819503e-05, + "loss": 1.2214, + "step": 12708 + }, + { + "epoch": 3.7853273515888235, + "grad_norm": 0.3191721439361572, + "learning_rate": 1.4271926516764038e-05, + "loss": 1.2354, + "step": 12709 + }, + { + "epoch": 3.785625197788492, + "grad_norm": 0.2518579363822937, + "learning_rate": 1.4271054373963073e-05, + "loss": 1.2078, + "step": 12710 + }, + { + "epoch": 3.7859230439881606, + "grad_norm": 0.3005197048187256, + "learning_rate": 1.4270182191424718e-05, + "loss": 1.2429, + "step": 12711 + }, + { + "epoch": 3.7862208901878294, + "grad_norm": 0.25633421540260315, + "learning_rate": 1.4269309969157087e-05, + "loss": 1.2191, + "step": 12712 + }, + { + "epoch": 3.7865187363874977, + "grad_norm": 0.3384515941143036, + "learning_rate": 1.4268437707168299e-05, + "loss": 1.2168, + "step": 12713 + }, + { + "epoch": 3.7868165825871665, + "grad_norm": 0.2397773265838623, + "learning_rate": 1.4267565405466464e-05, + "loss": 1.2267, + "step": 12714 + }, + { + "epoch": 3.7871144287868352, + "grad_norm": 0.30711597204208374, + "learning_rate": 1.4266693064059705e-05, + "loss": 1.2221, + "step": 12715 + }, + { + "epoch": 3.7874122749865036, + "grad_norm": 0.25729888677597046, + "learning_rate": 1.4265820682956131e-05, + "loss": 1.2428, + "step": 12716 + }, + { + "epoch": 3.7877101211861723, + "grad_norm": 0.24500930309295654, + "learning_rate": 1.426494826216386e-05, + "loss": 1.2185, + "step": 12717 + }, + { + "epoch": 3.788007967385841, + "grad_norm": 0.279811829328537, + "learning_rate": 1.4264075801691014e-05, + "loss": 1.2491, + "step": 12718 + }, + { + "epoch": 3.78830581358551, + "grad_norm": 0.2677135467529297, + "learning_rate": 1.4263203301545705e-05, + "loss": 1.237, + "step": 12719 + }, + { + "epoch": 3.7886036597851787, + "grad_norm": 0.29217711091041565, + "learning_rate": 1.4262330761736051e-05, + "loss": 1.2313, + "step": 12720 + }, + { + "epoch": 3.788901505984847, + "grad_norm": 0.26018255949020386, + "learning_rate": 1.4261458182270173e-05, + "loss": 1.2316, + "step": 12721 + }, + { + "epoch": 3.7891993521845158, + "grad_norm": 0.349682480096817, + "learning_rate": 1.426058556315619e-05, + "loss": 1.2238, + "step": 12722 + }, + { + "epoch": 3.7894971983841845, + "grad_norm": 0.24746723473072052, + "learning_rate": 1.4259712904402219e-05, + "loss": 1.2287, + "step": 12723 + }, + { + "epoch": 3.789795044583853, + "grad_norm": 0.313815176486969, + "learning_rate": 1.4258840206016376e-05, + "loss": 1.2183, + "step": 12724 + }, + { + "epoch": 3.7900928907835216, + "grad_norm": 0.24808746576309204, + "learning_rate": 1.4257967468006782e-05, + "loss": 1.2433, + "step": 12725 + }, + { + "epoch": 3.7903907369831904, + "grad_norm": 0.2969103753566742, + "learning_rate": 1.4257094690381559e-05, + "loss": 1.2373, + "step": 12726 + }, + { + "epoch": 3.7906885831828587, + "grad_norm": 0.26070764660835266, + "learning_rate": 1.4256221873148826e-05, + "loss": 1.2149, + "step": 12727 + }, + { + "epoch": 3.7909864293825275, + "grad_norm": 0.3144305944442749, + "learning_rate": 1.4255349016316707e-05, + "loss": 1.2255, + "step": 12728 + }, + { + "epoch": 3.7912842755821963, + "grad_norm": 0.2931560277938843, + "learning_rate": 1.4254476119893317e-05, + "loss": 1.2357, + "step": 12729 + }, + { + "epoch": 3.791582121781865, + "grad_norm": 0.3370493948459625, + "learning_rate": 1.4253603183886779e-05, + "loss": 1.2322, + "step": 12730 + }, + { + "epoch": 3.7918799679815334, + "grad_norm": 0.23886103928089142, + "learning_rate": 1.425273020830522e-05, + "loss": 1.2387, + "step": 12731 + }, + { + "epoch": 3.792177814181202, + "grad_norm": 0.25827065110206604, + "learning_rate": 1.4251857193156754e-05, + "loss": 1.2214, + "step": 12732 + }, + { + "epoch": 3.792475660380871, + "grad_norm": 0.24766288697719574, + "learning_rate": 1.4250984138449506e-05, + "loss": 1.2268, + "step": 12733 + }, + { + "epoch": 3.7927735065805397, + "grad_norm": 0.23920349776744843, + "learning_rate": 1.4250111044191604e-05, + "loss": 1.2229, + "step": 12734 + }, + { + "epoch": 3.793071352780208, + "grad_norm": 0.26184290647506714, + "learning_rate": 1.4249237910391164e-05, + "loss": 1.2299, + "step": 12735 + }, + { + "epoch": 3.793369198979877, + "grad_norm": 0.24239376187324524, + "learning_rate": 1.4248364737056318e-05, + "loss": 1.224, + "step": 12736 + }, + { + "epoch": 3.7936670451795456, + "grad_norm": 0.24595192074775696, + "learning_rate": 1.4247491524195176e-05, + "loss": 1.246, + "step": 12737 + }, + { + "epoch": 3.793964891379214, + "grad_norm": 0.2419978678226471, + "learning_rate": 1.4246618271815877e-05, + "loss": 1.223, + "step": 12738 + }, + { + "epoch": 3.7942627375788827, + "grad_norm": 0.2508220970630646, + "learning_rate": 1.4245744979926538e-05, + "loss": 1.25, + "step": 12739 + }, + { + "epoch": 3.7945605837785514, + "grad_norm": 0.24817495048046112, + "learning_rate": 1.4244871648535288e-05, + "loss": 1.2404, + "step": 12740 + }, + { + "epoch": 3.7948584299782198, + "grad_norm": 0.2514076828956604, + "learning_rate": 1.4243998277650248e-05, + "loss": 1.2483, + "step": 12741 + }, + { + "epoch": 3.7951562761778885, + "grad_norm": 0.2461961805820465, + "learning_rate": 1.4243124867279545e-05, + "loss": 1.2093, + "step": 12742 + }, + { + "epoch": 3.7954541223775573, + "grad_norm": 0.2558443248271942, + "learning_rate": 1.4242251417431306e-05, + "loss": 1.2331, + "step": 12743 + }, + { + "epoch": 3.795751968577226, + "grad_norm": 0.28598901629447937, + "learning_rate": 1.4241377928113658e-05, + "loss": 1.2371, + "step": 12744 + }, + { + "epoch": 3.796049814776895, + "grad_norm": 0.2537623941898346, + "learning_rate": 1.4240504399334728e-05, + "loss": 1.2321, + "step": 12745 + }, + { + "epoch": 3.796347660976563, + "grad_norm": 0.2908998131752014, + "learning_rate": 1.4239630831102641e-05, + "loss": 1.2273, + "step": 12746 + }, + { + "epoch": 3.796645507176232, + "grad_norm": 0.3052311837673187, + "learning_rate": 1.4238757223425528e-05, + "loss": 1.2196, + "step": 12747 + }, + { + "epoch": 3.7969433533759007, + "grad_norm": 0.3534977436065674, + "learning_rate": 1.423788357631151e-05, + "loss": 1.2208, + "step": 12748 + }, + { + "epoch": 3.797241199575569, + "grad_norm": 0.27182501554489136, + "learning_rate": 1.4237009889768725e-05, + "loss": 1.2239, + "step": 12749 + }, + { + "epoch": 3.797539045775238, + "grad_norm": 0.33824798464775085, + "learning_rate": 1.4236136163805294e-05, + "loss": 1.2348, + "step": 12750 + }, + { + "epoch": 3.7978368919749066, + "grad_norm": 0.3123975694179535, + "learning_rate": 1.423526239842935e-05, + "loss": 1.2235, + "step": 12751 + }, + { + "epoch": 3.798134738174575, + "grad_norm": 0.2852601408958435, + "learning_rate": 1.423438859364902e-05, + "loss": 1.2294, + "step": 12752 + }, + { + "epoch": 3.7984325843742437, + "grad_norm": 0.33295968174934387, + "learning_rate": 1.423351474947244e-05, + "loss": 1.2377, + "step": 12753 + }, + { + "epoch": 3.7987304305739125, + "grad_norm": 0.2448887676000595, + "learning_rate": 1.4232640865907729e-05, + "loss": 1.2299, + "step": 12754 + }, + { + "epoch": 3.799028276773581, + "grad_norm": 0.299250990152359, + "learning_rate": 1.4231766942963025e-05, + "loss": 1.2408, + "step": 12755 + }, + { + "epoch": 3.7993261229732496, + "grad_norm": 0.25415459275245667, + "learning_rate": 1.423089298064646e-05, + "loss": 1.2254, + "step": 12756 + }, + { + "epoch": 3.7996239691729183, + "grad_norm": 0.2801014482975006, + "learning_rate": 1.4230018978966162e-05, + "loss": 1.2234, + "step": 12757 + }, + { + "epoch": 3.799921815372587, + "grad_norm": 0.2553093135356903, + "learning_rate": 1.4229144937930263e-05, + "loss": 1.2236, + "step": 12758 + }, + { + "epoch": 3.800219661572256, + "grad_norm": 0.2585800290107727, + "learning_rate": 1.4228270857546895e-05, + "loss": 1.2099, + "step": 12759 + }, + { + "epoch": 3.800517507771924, + "grad_norm": 0.29664096236228943, + "learning_rate": 1.4227396737824193e-05, + "loss": 1.2389, + "step": 12760 + }, + { + "epoch": 3.800815353971593, + "grad_norm": 0.2602430284023285, + "learning_rate": 1.4226522578770285e-05, + "loss": 1.2383, + "step": 12761 + }, + { + "epoch": 3.8011132001712618, + "grad_norm": 0.27732184529304504, + "learning_rate": 1.4225648380393306e-05, + "loss": 1.2235, + "step": 12762 + }, + { + "epoch": 3.80141104637093, + "grad_norm": 0.2764049470424652, + "learning_rate": 1.4224774142701394e-05, + "loss": 1.2356, + "step": 12763 + }, + { + "epoch": 3.801708892570599, + "grad_norm": 0.24653466045856476, + "learning_rate": 1.4223899865702677e-05, + "loss": 1.2279, + "step": 12764 + }, + { + "epoch": 3.8020067387702676, + "grad_norm": 0.25066083669662476, + "learning_rate": 1.422302554940529e-05, + "loss": 1.2325, + "step": 12765 + }, + { + "epoch": 3.802304584969936, + "grad_norm": 0.3771968483924866, + "learning_rate": 1.4222151193817368e-05, + "loss": 1.2464, + "step": 12766 + }, + { + "epoch": 3.8026024311696047, + "grad_norm": 0.3234902620315552, + "learning_rate": 1.4221276798947048e-05, + "loss": 1.2205, + "step": 12767 + }, + { + "epoch": 3.8029002773692735, + "grad_norm": 0.33073753118515015, + "learning_rate": 1.4220402364802461e-05, + "loss": 1.2168, + "step": 12768 + }, + { + "epoch": 3.803198123568942, + "grad_norm": 0.4305053651332855, + "learning_rate": 1.4219527891391746e-05, + "loss": 1.2184, + "step": 12769 + }, + { + "epoch": 3.8034959697686106, + "grad_norm": 0.25483497977256775, + "learning_rate": 1.4218653378723042e-05, + "loss": 1.2169, + "step": 12770 + }, + { + "epoch": 3.8037938159682794, + "grad_norm": 0.32836589217185974, + "learning_rate": 1.421777882680448e-05, + "loss": 1.2201, + "step": 12771 + }, + { + "epoch": 3.804091662167948, + "grad_norm": 0.2492336630821228, + "learning_rate": 1.4216904235644195e-05, + "loss": 1.2349, + "step": 12772 + }, + { + "epoch": 3.804389508367617, + "grad_norm": 0.3971332609653473, + "learning_rate": 1.421602960525033e-05, + "loss": 1.2135, + "step": 12773 + }, + { + "epoch": 3.8046873545672852, + "grad_norm": 0.32474571466445923, + "learning_rate": 1.4215154935631021e-05, + "loss": 1.2447, + "step": 12774 + }, + { + "epoch": 3.804985200766954, + "grad_norm": 0.310362309217453, + "learning_rate": 1.4214280226794404e-05, + "loss": 1.2294, + "step": 12775 + }, + { + "epoch": 3.805283046966623, + "grad_norm": 0.26335400342941284, + "learning_rate": 1.4213405478748616e-05, + "loss": 1.2481, + "step": 12776 + }, + { + "epoch": 3.805580893166291, + "grad_norm": 0.33689048886299133, + "learning_rate": 1.42125306915018e-05, + "loss": 1.2269, + "step": 12777 + }, + { + "epoch": 3.80587873936596, + "grad_norm": 0.2633482813835144, + "learning_rate": 1.4211655865062091e-05, + "loss": 1.2301, + "step": 12778 + }, + { + "epoch": 3.8061765855656287, + "grad_norm": 0.3629227876663208, + "learning_rate": 1.421078099943763e-05, + "loss": 1.2232, + "step": 12779 + }, + { + "epoch": 3.806474431765297, + "grad_norm": 0.24028271436691284, + "learning_rate": 1.420990609463656e-05, + "loss": 1.2363, + "step": 12780 + }, + { + "epoch": 3.8067722779649658, + "grad_norm": 0.4227234423160553, + "learning_rate": 1.4209031150667013e-05, + "loss": 1.2418, + "step": 12781 + }, + { + "epoch": 3.8070701241646345, + "grad_norm": 0.2809666097164154, + "learning_rate": 1.4208156167537132e-05, + "loss": 1.2268, + "step": 12782 + }, + { + "epoch": 3.807367970364303, + "grad_norm": 0.3538129925727844, + "learning_rate": 1.4207281145255063e-05, + "loss": 1.2254, + "step": 12783 + }, + { + "epoch": 3.8076658165639716, + "grad_norm": 0.24617306888103485, + "learning_rate": 1.4206406083828943e-05, + "loss": 1.2284, + "step": 12784 + }, + { + "epoch": 3.8079636627636404, + "grad_norm": 0.4199250638484955, + "learning_rate": 1.4205530983266913e-05, + "loss": 1.2411, + "step": 12785 + }, + { + "epoch": 3.808261508963309, + "grad_norm": 0.24131809175014496, + "learning_rate": 1.4204655843577117e-05, + "loss": 1.2143, + "step": 12786 + }, + { + "epoch": 3.808559355162978, + "grad_norm": 0.30456212162971497, + "learning_rate": 1.42037806647677e-05, + "loss": 1.2339, + "step": 12787 + }, + { + "epoch": 3.8088572013626463, + "grad_norm": 0.25974634289741516, + "learning_rate": 1.4202905446846793e-05, + "loss": 1.2316, + "step": 12788 + }, + { + "epoch": 3.809155047562315, + "grad_norm": 0.318865567445755, + "learning_rate": 1.4202030189822551e-05, + "loss": 1.2177, + "step": 12789 + }, + { + "epoch": 3.809452893761984, + "grad_norm": 0.2840079963207245, + "learning_rate": 1.420115489370311e-05, + "loss": 1.2415, + "step": 12790 + }, + { + "epoch": 3.809750739961652, + "grad_norm": 0.26356443762779236, + "learning_rate": 1.420027955849662e-05, + "loss": 1.2234, + "step": 12791 + }, + { + "epoch": 3.810048586161321, + "grad_norm": 0.27328574657440186, + "learning_rate": 1.419940418421122e-05, + "loss": 1.2234, + "step": 12792 + }, + { + "epoch": 3.8103464323609897, + "grad_norm": 0.25236785411834717, + "learning_rate": 1.4198528770855056e-05, + "loss": 1.2198, + "step": 12793 + }, + { + "epoch": 3.810644278560658, + "grad_norm": 0.2905680537223816, + "learning_rate": 1.4197653318436273e-05, + "loss": 1.2455, + "step": 12794 + }, + { + "epoch": 3.810942124760327, + "grad_norm": 0.25465255975723267, + "learning_rate": 1.4196777826963018e-05, + "loss": 1.2265, + "step": 12795 + }, + { + "epoch": 3.8112399709599956, + "grad_norm": 0.27620112895965576, + "learning_rate": 1.4195902296443427e-05, + "loss": 1.2478, + "step": 12796 + }, + { + "epoch": 3.8115378171596643, + "grad_norm": 0.2501170337200165, + "learning_rate": 1.419502672688566e-05, + "loss": 1.21, + "step": 12797 + }, + { + "epoch": 3.8118356633593327, + "grad_norm": 0.24600648880004883, + "learning_rate": 1.4194151118297854e-05, + "loss": 1.2288, + "step": 12798 + }, + { + "epoch": 3.8121335095590014, + "grad_norm": 0.28037095069885254, + "learning_rate": 1.4193275470688157e-05, + "loss": 1.2301, + "step": 12799 + }, + { + "epoch": 3.81243135575867, + "grad_norm": 0.2571197748184204, + "learning_rate": 1.4192399784064718e-05, + "loss": 1.2219, + "step": 12800 + }, + { + "epoch": 3.812729201958339, + "grad_norm": 0.29590463638305664, + "learning_rate": 1.4191524058435682e-05, + "loss": 1.2236, + "step": 12801 + }, + { + "epoch": 3.8130270481580073, + "grad_norm": 0.2541712522506714, + "learning_rate": 1.4190648293809198e-05, + "loss": 1.2225, + "step": 12802 + }, + { + "epoch": 3.813324894357676, + "grad_norm": 0.30674511194229126, + "learning_rate": 1.4189772490193414e-05, + "loss": 1.2229, + "step": 12803 + }, + { + "epoch": 3.813622740557345, + "grad_norm": 0.24881549179553986, + "learning_rate": 1.418889664759648e-05, + "loss": 1.2191, + "step": 12804 + }, + { + "epoch": 3.813920586757013, + "grad_norm": 0.2782246768474579, + "learning_rate": 1.418802076602654e-05, + "loss": 1.224, + "step": 12805 + }, + { + "epoch": 3.814218432956682, + "grad_norm": 0.2547534704208374, + "learning_rate": 1.4187144845491748e-05, + "loss": 1.215, + "step": 12806 + }, + { + "epoch": 3.8145162791563507, + "grad_norm": 0.2677958309650421, + "learning_rate": 1.418626888600025e-05, + "loss": 1.22, + "step": 12807 + }, + { + "epoch": 3.814814125356019, + "grad_norm": 0.25236955285072327, + "learning_rate": 1.4185392887560197e-05, + "loss": 1.2383, + "step": 12808 + }, + { + "epoch": 3.815111971555688, + "grad_norm": 0.24296407401561737, + "learning_rate": 1.4184516850179739e-05, + "loss": 1.2217, + "step": 12809 + }, + { + "epoch": 3.8154098177553566, + "grad_norm": 0.27396103739738464, + "learning_rate": 1.4183640773867026e-05, + "loss": 1.2237, + "step": 12810 + }, + { + "epoch": 3.8157076639550254, + "grad_norm": 0.2571660876274109, + "learning_rate": 1.4182764658630215e-05, + "loss": 1.2088, + "step": 12811 + }, + { + "epoch": 3.816005510154694, + "grad_norm": 0.26393911242485046, + "learning_rate": 1.4181888504477447e-05, + "loss": 1.2457, + "step": 12812 + }, + { + "epoch": 3.8163033563543625, + "grad_norm": 0.2524780035018921, + "learning_rate": 1.4181012311416882e-05, + "loss": 1.2366, + "step": 12813 + }, + { + "epoch": 3.8166012025540312, + "grad_norm": 0.3149135708808899, + "learning_rate": 1.4180136079456666e-05, + "loss": 1.2309, + "step": 12814 + }, + { + "epoch": 3.8168990487537, + "grad_norm": 0.26127392053604126, + "learning_rate": 1.4179259808604954e-05, + "loss": 1.2289, + "step": 12815 + }, + { + "epoch": 3.8171968949533683, + "grad_norm": 0.29379215836524963, + "learning_rate": 1.4178383498869902e-05, + "loss": 1.2355, + "step": 12816 + }, + { + "epoch": 3.817494741153037, + "grad_norm": 0.3243806064128876, + "learning_rate": 1.4177507150259656e-05, + "loss": 1.2401, + "step": 12817 + }, + { + "epoch": 3.817792587352706, + "grad_norm": 0.26709645986557007, + "learning_rate": 1.4176630762782378e-05, + "loss": 1.2387, + "step": 12818 + }, + { + "epoch": 3.818090433552374, + "grad_norm": 0.274420827627182, + "learning_rate": 1.417575433644621e-05, + "loss": 1.235, + "step": 12819 + }, + { + "epoch": 3.818388279752043, + "grad_norm": 0.3666256368160248, + "learning_rate": 1.4174877871259319e-05, + "loss": 1.2343, + "step": 12820 + }, + { + "epoch": 3.8186861259517118, + "grad_norm": 0.3476208448410034, + "learning_rate": 1.4174001367229853e-05, + "loss": 1.2284, + "step": 12821 + }, + { + "epoch": 3.81898397215138, + "grad_norm": 0.28282949328422546, + "learning_rate": 1.4173124824365968e-05, + "loss": 1.2367, + "step": 12822 + }, + { + "epoch": 3.819281818351049, + "grad_norm": 0.26556313037872314, + "learning_rate": 1.417224824267582e-05, + "loss": 1.2416, + "step": 12823 + }, + { + "epoch": 3.8195796645507176, + "grad_norm": 0.3180636167526245, + "learning_rate": 1.4171371622167562e-05, + "loss": 1.2339, + "step": 12824 + }, + { + "epoch": 3.8198775107503864, + "grad_norm": 0.24711021780967712, + "learning_rate": 1.4170494962849349e-05, + "loss": 1.2254, + "step": 12825 + }, + { + "epoch": 3.820175356950055, + "grad_norm": 0.2862757444381714, + "learning_rate": 1.4169618264729343e-05, + "loss": 1.2259, + "step": 12826 + }, + { + "epoch": 3.8204732031497235, + "grad_norm": 0.2634437382221222, + "learning_rate": 1.4168741527815697e-05, + "loss": 1.2319, + "step": 12827 + }, + { + "epoch": 3.8207710493493923, + "grad_norm": 0.31689709424972534, + "learning_rate": 1.416786475211657e-05, + "loss": 1.2281, + "step": 12828 + }, + { + "epoch": 3.821068895549061, + "grad_norm": 0.2715291380882263, + "learning_rate": 1.4166987937640117e-05, + "loss": 1.2323, + "step": 12829 + }, + { + "epoch": 3.8213667417487294, + "grad_norm": 0.3118676245212555, + "learning_rate": 1.4166111084394495e-05, + "loss": 1.2391, + "step": 12830 + }, + { + "epoch": 3.821664587948398, + "grad_norm": 0.2390519380569458, + "learning_rate": 1.4165234192387866e-05, + "loss": 1.206, + "step": 12831 + }, + { + "epoch": 3.821962434148067, + "grad_norm": 0.3161649703979492, + "learning_rate": 1.4164357261628388e-05, + "loss": 1.2205, + "step": 12832 + }, + { + "epoch": 3.8222602803477352, + "grad_norm": 0.23863878846168518, + "learning_rate": 1.4163480292124214e-05, + "loss": 1.2189, + "step": 12833 + }, + { + "epoch": 3.822558126547404, + "grad_norm": 0.2643231451511383, + "learning_rate": 1.4162603283883511e-05, + "loss": 1.2417, + "step": 12834 + }, + { + "epoch": 3.822855972747073, + "grad_norm": 0.23881596326828003, + "learning_rate": 1.4161726236914434e-05, + "loss": 1.2371, + "step": 12835 + }, + { + "epoch": 3.823153818946741, + "grad_norm": 0.28563517332077026, + "learning_rate": 1.4160849151225147e-05, + "loss": 1.2442, + "step": 12836 + }, + { + "epoch": 3.82345166514641, + "grad_norm": 0.24477483332157135, + "learning_rate": 1.4159972026823802e-05, + "loss": 1.2296, + "step": 12837 + }, + { + "epoch": 3.8237495113460787, + "grad_norm": 0.25879228115081787, + "learning_rate": 1.415909486371857e-05, + "loss": 1.2247, + "step": 12838 + }, + { + "epoch": 3.8240473575457474, + "grad_norm": 0.26841700077056885, + "learning_rate": 1.4158217661917604e-05, + "loss": 1.2219, + "step": 12839 + }, + { + "epoch": 3.824345203745416, + "grad_norm": 0.29669052362442017, + "learning_rate": 1.4157340421429071e-05, + "loss": 1.2453, + "step": 12840 + }, + { + "epoch": 3.8246430499450845, + "grad_norm": 0.24732880294322968, + "learning_rate": 1.415646314226113e-05, + "loss": 1.2406, + "step": 12841 + }, + { + "epoch": 3.8249408961447533, + "grad_norm": 0.2771657109260559, + "learning_rate": 1.4155585824421941e-05, + "loss": 1.2316, + "step": 12842 + }, + { + "epoch": 3.825238742344422, + "grad_norm": 0.2514537572860718, + "learning_rate": 1.4154708467919669e-05, + "loss": 1.233, + "step": 12843 + }, + { + "epoch": 3.8255365885440904, + "grad_norm": 0.2608799338340759, + "learning_rate": 1.4153831072762476e-05, + "loss": 1.2366, + "step": 12844 + }, + { + "epoch": 3.825834434743759, + "grad_norm": 0.23932795226573944, + "learning_rate": 1.4152953638958531e-05, + "loss": 1.2475, + "step": 12845 + }, + { + "epoch": 3.826132280943428, + "grad_norm": 0.24809974431991577, + "learning_rate": 1.4152076166515987e-05, + "loss": 1.2333, + "step": 12846 + }, + { + "epoch": 3.8264301271430963, + "grad_norm": 0.24367476999759674, + "learning_rate": 1.4151198655443014e-05, + "loss": 1.2292, + "step": 12847 + }, + { + "epoch": 3.826727973342765, + "grad_norm": 0.2372283935546875, + "learning_rate": 1.4150321105747777e-05, + "loss": 1.2335, + "step": 12848 + }, + { + "epoch": 3.827025819542434, + "grad_norm": 0.2502457797527313, + "learning_rate": 1.414944351743844e-05, + "loss": 1.2319, + "step": 12849 + }, + { + "epoch": 3.8273236657421026, + "grad_norm": 0.25848546624183655, + "learning_rate": 1.4148565890523162e-05, + "loss": 1.224, + "step": 12850 + }, + { + "epoch": 3.827621511941771, + "grad_norm": 0.24064145982265472, + "learning_rate": 1.4147688225010119e-05, + "loss": 1.2133, + "step": 12851 + }, + { + "epoch": 3.8279193581414397, + "grad_norm": 0.26705843210220337, + "learning_rate": 1.414681052090747e-05, + "loss": 1.2402, + "step": 12852 + }, + { + "epoch": 3.8282172043411085, + "grad_norm": 0.24247168004512787, + "learning_rate": 1.4145932778223381e-05, + "loss": 1.2302, + "step": 12853 + }, + { + "epoch": 3.8285150505407772, + "grad_norm": 0.26049691438674927, + "learning_rate": 1.4145054996966021e-05, + "loss": 1.2326, + "step": 12854 + }, + { + "epoch": 3.8288128967404456, + "grad_norm": 0.30729058384895325, + "learning_rate": 1.4144177177143554e-05, + "loss": 1.2334, + "step": 12855 + }, + { + "epoch": 3.8291107429401143, + "grad_norm": 0.2650429904460907, + "learning_rate": 1.4143299318764149e-05, + "loss": 1.2382, + "step": 12856 + }, + { + "epoch": 3.829408589139783, + "grad_norm": 0.2934802770614624, + "learning_rate": 1.4142421421835972e-05, + "loss": 1.2304, + "step": 12857 + }, + { + "epoch": 3.8297064353394514, + "grad_norm": 0.253642737865448, + "learning_rate": 1.4141543486367193e-05, + "loss": 1.2304, + "step": 12858 + }, + { + "epoch": 3.83000428153912, + "grad_norm": 0.3072591722011566, + "learning_rate": 1.414066551236598e-05, + "loss": 1.237, + "step": 12859 + }, + { + "epoch": 3.830302127738789, + "grad_norm": 0.3484978675842285, + "learning_rate": 1.41397874998405e-05, + "loss": 1.2316, + "step": 12860 + }, + { + "epoch": 3.8305999739384573, + "grad_norm": 0.3204314112663269, + "learning_rate": 1.413890944879892e-05, + "loss": 1.2345, + "step": 12861 + }, + { + "epoch": 3.830897820138126, + "grad_norm": 0.3025658130645752, + "learning_rate": 1.4138031359249416e-05, + "loss": 1.219, + "step": 12862 + }, + { + "epoch": 3.831195666337795, + "grad_norm": 0.45898616313934326, + "learning_rate": 1.413715323120015e-05, + "loss": 1.2281, + "step": 12863 + }, + { + "epoch": 3.8314935125374636, + "grad_norm": 0.2997315526008606, + "learning_rate": 1.4136275064659297e-05, + "loss": 1.2262, + "step": 12864 + }, + { + "epoch": 3.831791358737132, + "grad_norm": 0.2999708354473114, + "learning_rate": 1.4135396859635028e-05, + "loss": 1.2194, + "step": 12865 + }, + { + "epoch": 3.8320892049368007, + "grad_norm": 0.30110031366348267, + "learning_rate": 1.4134518616135507e-05, + "loss": 1.2303, + "step": 12866 + }, + { + "epoch": 3.8323870511364695, + "grad_norm": 0.27862995862960815, + "learning_rate": 1.4133640334168912e-05, + "loss": 1.2235, + "step": 12867 + }, + { + "epoch": 3.8326848973361383, + "grad_norm": 0.3227183222770691, + "learning_rate": 1.413276201374341e-05, + "loss": 1.2088, + "step": 12868 + }, + { + "epoch": 3.8329827435358066, + "grad_norm": 0.27088144421577454, + "learning_rate": 1.413188365486718e-05, + "loss": 1.2288, + "step": 12869 + }, + { + "epoch": 3.8332805897354754, + "grad_norm": 0.30756035447120667, + "learning_rate": 1.4131005257548387e-05, + "loss": 1.2166, + "step": 12870 + }, + { + "epoch": 3.833578435935144, + "grad_norm": 0.26481255888938904, + "learning_rate": 1.4130126821795202e-05, + "loss": 1.2493, + "step": 12871 + }, + { + "epoch": 3.8338762821348125, + "grad_norm": 0.32878783345222473, + "learning_rate": 1.4129248347615804e-05, + "loss": 1.2195, + "step": 12872 + }, + { + "epoch": 3.8341741283344812, + "grad_norm": 0.2628609538078308, + "learning_rate": 1.4128369835018365e-05, + "loss": 1.2375, + "step": 12873 + }, + { + "epoch": 3.83447197453415, + "grad_norm": 0.27789103984832764, + "learning_rate": 1.4127491284011054e-05, + "loss": 1.2126, + "step": 12874 + }, + { + "epoch": 3.8347698207338183, + "grad_norm": 0.26154825091362, + "learning_rate": 1.4126612694602049e-05, + "loss": 1.2341, + "step": 12875 + }, + { + "epoch": 3.835067666933487, + "grad_norm": 0.2882075905799866, + "learning_rate": 1.4125734066799526e-05, + "loss": 1.2393, + "step": 12876 + }, + { + "epoch": 3.835365513133156, + "grad_norm": 0.2781543731689453, + "learning_rate": 1.4124855400611655e-05, + "loss": 1.216, + "step": 12877 + }, + { + "epoch": 3.8356633593328247, + "grad_norm": 0.28504717350006104, + "learning_rate": 1.412397669604661e-05, + "loss": 1.2342, + "step": 12878 + }, + { + "epoch": 3.8359612055324934, + "grad_norm": 0.32166755199432373, + "learning_rate": 1.4123097953112574e-05, + "loss": 1.2228, + "step": 12879 + }, + { + "epoch": 3.8362590517321618, + "grad_norm": 0.27417391538619995, + "learning_rate": 1.4122219171817717e-05, + "loss": 1.2111, + "step": 12880 + }, + { + "epoch": 3.8365568979318305, + "grad_norm": 0.48461154103279114, + "learning_rate": 1.4121340352170217e-05, + "loss": 1.2248, + "step": 12881 + }, + { + "epoch": 3.8368547441314993, + "grad_norm": 0.3839823603630066, + "learning_rate": 1.412046149417825e-05, + "loss": 1.2247, + "step": 12882 + }, + { + "epoch": 3.8371525903311676, + "grad_norm": 0.29947030544281006, + "learning_rate": 1.4119582597849993e-05, + "loss": 1.2255, + "step": 12883 + }, + { + "epoch": 3.8374504365308364, + "grad_norm": 0.35313907265663147, + "learning_rate": 1.4118703663193615e-05, + "loss": 1.2265, + "step": 12884 + }, + { + "epoch": 3.837748282730505, + "grad_norm": 0.27523180842399597, + "learning_rate": 1.4117824690217307e-05, + "loss": 1.2319, + "step": 12885 + }, + { + "epoch": 3.8380461289301735, + "grad_norm": 0.31127822399139404, + "learning_rate": 1.4116945678929244e-05, + "loss": 1.2195, + "step": 12886 + }, + { + "epoch": 3.8383439751298423, + "grad_norm": 0.2907017767429352, + "learning_rate": 1.4116066629337596e-05, + "loss": 1.2471, + "step": 12887 + }, + { + "epoch": 3.838641821329511, + "grad_norm": 0.2989345192909241, + "learning_rate": 1.411518754145055e-05, + "loss": 1.2197, + "step": 12888 + }, + { + "epoch": 3.8389396675291794, + "grad_norm": 0.2512325942516327, + "learning_rate": 1.411430841527628e-05, + "loss": 1.2167, + "step": 12889 + }, + { + "epoch": 3.839237513728848, + "grad_norm": 0.30058085918426514, + "learning_rate": 1.4113429250822965e-05, + "loss": 1.2366, + "step": 12890 + }, + { + "epoch": 3.839535359928517, + "grad_norm": 0.2330739051103592, + "learning_rate": 1.411255004809879e-05, + "loss": 1.2098, + "step": 12891 + }, + { + "epoch": 3.8398332061281857, + "grad_norm": 0.2903881072998047, + "learning_rate": 1.411167080711193e-05, + "loss": 1.2274, + "step": 12892 + }, + { + "epoch": 3.8401310523278545, + "grad_norm": 0.246946781873703, + "learning_rate": 1.411079152787057e-05, + "loss": 1.2264, + "step": 12893 + }, + { + "epoch": 3.840428898527523, + "grad_norm": 0.2761788070201874, + "learning_rate": 1.4109912210382884e-05, + "loss": 1.2443, + "step": 12894 + }, + { + "epoch": 3.8407267447271916, + "grad_norm": 0.256935179233551, + "learning_rate": 1.410903285465706e-05, + "loss": 1.2195, + "step": 12895 + }, + { + "epoch": 3.8410245909268603, + "grad_norm": 0.2901683449745178, + "learning_rate": 1.4108153460701272e-05, + "loss": 1.2362, + "step": 12896 + }, + { + "epoch": 3.8413224371265287, + "grad_norm": 0.287685751914978, + "learning_rate": 1.4107274028523708e-05, + "loss": 1.2208, + "step": 12897 + }, + { + "epoch": 3.8416202833261974, + "grad_norm": 0.26763716340065, + "learning_rate": 1.4106394558132548e-05, + "loss": 1.2564, + "step": 12898 + }, + { + "epoch": 3.841918129525866, + "grad_norm": 0.2797291874885559, + "learning_rate": 1.4105515049535974e-05, + "loss": 1.2247, + "step": 12899 + }, + { + "epoch": 3.8422159757255345, + "grad_norm": 0.25272098183631897, + "learning_rate": 1.4104635502742172e-05, + "loss": 1.2265, + "step": 12900 + }, + { + "epoch": 3.8425138219252033, + "grad_norm": 0.2657968997955322, + "learning_rate": 1.4103755917759321e-05, + "loss": 1.2335, + "step": 12901 + }, + { + "epoch": 3.842811668124872, + "grad_norm": 0.3597632944583893, + "learning_rate": 1.4102876294595602e-05, + "loss": 1.2321, + "step": 12902 + }, + { + "epoch": 3.8431095143245404, + "grad_norm": 0.45975279808044434, + "learning_rate": 1.410199663325921e-05, + "loss": 1.2072, + "step": 12903 + }, + { + "epoch": 3.843407360524209, + "grad_norm": 0.26953503489494324, + "learning_rate": 1.4101116933758318e-05, + "loss": 1.239, + "step": 12904 + }, + { + "epoch": 3.843705206723878, + "grad_norm": 0.44539040327072144, + "learning_rate": 1.4100237196101116e-05, + "loss": 1.2255, + "step": 12905 + }, + { + "epoch": 3.8440030529235467, + "grad_norm": 0.4395925998687744, + "learning_rate": 1.4099357420295792e-05, + "loss": 1.2385, + "step": 12906 + }, + { + "epoch": 3.8443008991232155, + "grad_norm": 0.32635074853897095, + "learning_rate": 1.409847760635052e-05, + "loss": 1.2247, + "step": 12907 + }, + { + "epoch": 3.844598745322884, + "grad_norm": 1.0792405605316162, + "learning_rate": 1.4097597754273497e-05, + "loss": 1.2356, + "step": 12908 + }, + { + "epoch": 3.8448965915225526, + "grad_norm": 0.3065727651119232, + "learning_rate": 1.4096717864072904e-05, + "loss": 1.2288, + "step": 12909 + }, + { + "epoch": 3.8451944377222214, + "grad_norm": 0.26505017280578613, + "learning_rate": 1.409583793575693e-05, + "loss": 1.2308, + "step": 12910 + }, + { + "epoch": 3.8454922839218897, + "grad_norm": 0.2600894272327423, + "learning_rate": 1.409495796933376e-05, + "loss": 1.2391, + "step": 12911 + }, + { + "epoch": 3.8457901301215585, + "grad_norm": 0.2440338134765625, + "learning_rate": 1.4094077964811579e-05, + "loss": 1.2293, + "step": 12912 + }, + { + "epoch": 3.8460879763212272, + "grad_norm": 0.23576608300209045, + "learning_rate": 1.4093197922198577e-05, + "loss": 1.2277, + "step": 12913 + }, + { + "epoch": 3.8463858225208956, + "grad_norm": 0.24084974825382233, + "learning_rate": 1.4092317841502942e-05, + "loss": 1.2146, + "step": 12914 + }, + { + "epoch": 3.8466836687205643, + "grad_norm": 0.25695639848709106, + "learning_rate": 1.4091437722732863e-05, + "loss": 1.2404, + "step": 12915 + }, + { + "epoch": 3.846981514920233, + "grad_norm": 0.25280672311782837, + "learning_rate": 1.4090557565896526e-05, + "loss": 1.2367, + "step": 12916 + }, + { + "epoch": 3.847279361119902, + "grad_norm": 0.23778603971004486, + "learning_rate": 1.4089677371002124e-05, + "loss": 1.226, + "step": 12917 + }, + { + "epoch": 3.84757720731957, + "grad_norm": 0.2415882796049118, + "learning_rate": 1.4088797138057839e-05, + "loss": 1.2302, + "step": 12918 + }, + { + "epoch": 3.847875053519239, + "grad_norm": 0.24139143526554108, + "learning_rate": 1.4087916867071866e-05, + "loss": 1.2223, + "step": 12919 + }, + { + "epoch": 3.8481728997189077, + "grad_norm": 0.2485382854938507, + "learning_rate": 1.4087036558052396e-05, + "loss": 1.2245, + "step": 12920 + }, + { + "epoch": 3.8484707459185765, + "grad_norm": 0.25132110714912415, + "learning_rate": 1.4086156211007615e-05, + "loss": 1.2221, + "step": 12921 + }, + { + "epoch": 3.848768592118245, + "grad_norm": 0.24649682641029358, + "learning_rate": 1.4085275825945717e-05, + "loss": 1.2268, + "step": 12922 + }, + { + "epoch": 3.8490664383179136, + "grad_norm": 0.24931305646896362, + "learning_rate": 1.4084395402874894e-05, + "loss": 1.2265, + "step": 12923 + }, + { + "epoch": 3.8493642845175824, + "grad_norm": 0.25036677718162537, + "learning_rate": 1.4083514941803332e-05, + "loss": 1.2321, + "step": 12924 + }, + { + "epoch": 3.8496621307172507, + "grad_norm": 0.2317926436662674, + "learning_rate": 1.4082634442739223e-05, + "loss": 1.2157, + "step": 12925 + }, + { + "epoch": 3.8499599769169195, + "grad_norm": 0.24006804823875427, + "learning_rate": 1.4081753905690764e-05, + "loss": 1.2377, + "step": 12926 + }, + { + "epoch": 3.8502578231165883, + "grad_norm": 0.242327019572258, + "learning_rate": 1.4080873330666149e-05, + "loss": 1.2353, + "step": 12927 + }, + { + "epoch": 3.8505556693162566, + "grad_norm": 0.2397931069135666, + "learning_rate": 1.4079992717673563e-05, + "loss": 1.2315, + "step": 12928 + }, + { + "epoch": 3.8508535155159254, + "grad_norm": 0.24000605940818787, + "learning_rate": 1.4079112066721205e-05, + "loss": 1.2461, + "step": 12929 + }, + { + "epoch": 3.851151361715594, + "grad_norm": 0.2388916164636612, + "learning_rate": 1.4078231377817266e-05, + "loss": 1.2346, + "step": 12930 + }, + { + "epoch": 3.851449207915263, + "grad_norm": 0.22943438589572906, + "learning_rate": 1.4077350650969941e-05, + "loss": 1.2248, + "step": 12931 + }, + { + "epoch": 3.8517470541149317, + "grad_norm": 0.24493323266506195, + "learning_rate": 1.4076469886187423e-05, + "loss": 1.2306, + "step": 12932 + }, + { + "epoch": 3.8520449003146, + "grad_norm": 0.24133175611495972, + "learning_rate": 1.4075589083477907e-05, + "loss": 1.2031, + "step": 12933 + }, + { + "epoch": 3.852342746514269, + "grad_norm": 0.24110007286071777, + "learning_rate": 1.407470824284959e-05, + "loss": 1.2223, + "step": 12934 + }, + { + "epoch": 3.8526405927139376, + "grad_norm": 0.22775600850582123, + "learning_rate": 1.4073827364310666e-05, + "loss": 1.2221, + "step": 12935 + }, + { + "epoch": 3.852938438913606, + "grad_norm": 0.23863746225833893, + "learning_rate": 1.4072946447869326e-05, + "loss": 1.2306, + "step": 12936 + }, + { + "epoch": 3.8532362851132747, + "grad_norm": 0.2448456883430481, + "learning_rate": 1.4072065493533773e-05, + "loss": 1.2125, + "step": 12937 + }, + { + "epoch": 3.8535341313129434, + "grad_norm": 0.23308877646923065, + "learning_rate": 1.4071184501312199e-05, + "loss": 1.2172, + "step": 12938 + }, + { + "epoch": 3.8538319775126118, + "grad_norm": 0.23921458423137665, + "learning_rate": 1.40703034712128e-05, + "loss": 1.232, + "step": 12939 + }, + { + "epoch": 3.8541298237122805, + "grad_norm": 0.2406570017337799, + "learning_rate": 1.406942240324378e-05, + "loss": 1.2184, + "step": 12940 + }, + { + "epoch": 3.8544276699119493, + "grad_norm": 0.24633492529392242, + "learning_rate": 1.4068541297413327e-05, + "loss": 1.2451, + "step": 12941 + }, + { + "epoch": 3.8547255161116176, + "grad_norm": 0.24022212624549866, + "learning_rate": 1.4067660153729646e-05, + "loss": 1.2349, + "step": 12942 + }, + { + "epoch": 3.8550233623112864, + "grad_norm": 0.24970600008964539, + "learning_rate": 1.4066778972200926e-05, + "loss": 1.2585, + "step": 12943 + }, + { + "epoch": 3.855321208510955, + "grad_norm": 0.23601877689361572, + "learning_rate": 1.4065897752835378e-05, + "loss": 1.24, + "step": 12944 + }, + { + "epoch": 3.855619054710624, + "grad_norm": 0.23589526116847992, + "learning_rate": 1.406501649564119e-05, + "loss": 1.2319, + "step": 12945 + }, + { + "epoch": 3.8559169009102927, + "grad_norm": 0.24604082107543945, + "learning_rate": 1.4064135200626566e-05, + "loss": 1.2163, + "step": 12946 + }, + { + "epoch": 3.856214747109961, + "grad_norm": 0.24139080941677094, + "learning_rate": 1.4063253867799706e-05, + "loss": 1.225, + "step": 12947 + }, + { + "epoch": 3.85651259330963, + "grad_norm": 0.23471637070178986, + "learning_rate": 1.4062372497168805e-05, + "loss": 1.2327, + "step": 12948 + }, + { + "epoch": 3.8568104395092986, + "grad_norm": 0.23509523272514343, + "learning_rate": 1.406149108874207e-05, + "loss": 1.2252, + "step": 12949 + }, + { + "epoch": 3.857108285708967, + "grad_norm": 0.2365317940711975, + "learning_rate": 1.4060609642527696e-05, + "loss": 1.2307, + "step": 12950 + }, + { + "epoch": 3.8574061319086357, + "grad_norm": 0.2595860958099365, + "learning_rate": 1.405972815853389e-05, + "loss": 1.239, + "step": 12951 + }, + { + "epoch": 3.8577039781083045, + "grad_norm": 0.24657686054706573, + "learning_rate": 1.4058846636768845e-05, + "loss": 1.2149, + "step": 12952 + }, + { + "epoch": 3.858001824307973, + "grad_norm": 0.2447076141834259, + "learning_rate": 1.405796507724077e-05, + "loss": 1.2235, + "step": 12953 + }, + { + "epoch": 3.8582996705076416, + "grad_norm": 0.24253875017166138, + "learning_rate": 1.4057083479957863e-05, + "loss": 1.223, + "step": 12954 + }, + { + "epoch": 3.8585975167073103, + "grad_norm": 0.23274843394756317, + "learning_rate": 1.4056201844928325e-05, + "loss": 1.2222, + "step": 12955 + }, + { + "epoch": 3.8588953629069787, + "grad_norm": 0.23749485611915588, + "learning_rate": 1.4055320172160363e-05, + "loss": 1.2392, + "step": 12956 + }, + { + "epoch": 3.8591932091066474, + "grad_norm": 0.23494069278240204, + "learning_rate": 1.4054438461662177e-05, + "loss": 1.2264, + "step": 12957 + }, + { + "epoch": 3.859491055306316, + "grad_norm": 0.24517206847667694, + "learning_rate": 1.4053556713441972e-05, + "loss": 1.2362, + "step": 12958 + }, + { + "epoch": 3.859788901505985, + "grad_norm": 0.23095130920410156, + "learning_rate": 1.4052674927507947e-05, + "loss": 1.2294, + "step": 12959 + }, + { + "epoch": 3.8600867477056537, + "grad_norm": 0.23468315601348877, + "learning_rate": 1.4051793103868314e-05, + "loss": 1.2399, + "step": 12960 + }, + { + "epoch": 3.860384593905322, + "grad_norm": 0.2437102496623993, + "learning_rate": 1.405091124253127e-05, + "loss": 1.2366, + "step": 12961 + }, + { + "epoch": 3.860682440104991, + "grad_norm": 0.23301923274993896, + "learning_rate": 1.4050029343505025e-05, + "loss": 1.2254, + "step": 12962 + }, + { + "epoch": 3.8609802863046596, + "grad_norm": 0.2372865527868271, + "learning_rate": 1.404914740679778e-05, + "loss": 1.2249, + "step": 12963 + }, + { + "epoch": 3.861278132504328, + "grad_norm": 0.22793932259082794, + "learning_rate": 1.4048265432417745e-05, + "loss": 1.2417, + "step": 12964 + }, + { + "epoch": 3.8615759787039967, + "grad_norm": 0.23911087214946747, + "learning_rate": 1.4047383420373121e-05, + "loss": 1.2382, + "step": 12965 + }, + { + "epoch": 3.8618738249036655, + "grad_norm": 0.2349393218755722, + "learning_rate": 1.4046501370672114e-05, + "loss": 1.2265, + "step": 12966 + }, + { + "epoch": 3.862171671103334, + "grad_norm": 0.2519095540046692, + "learning_rate": 1.4045619283322936e-05, + "loss": 1.238, + "step": 12967 + }, + { + "epoch": 3.8624695173030026, + "grad_norm": 0.24281921982765198, + "learning_rate": 1.4044737158333793e-05, + "loss": 1.2064, + "step": 12968 + }, + { + "epoch": 3.8627673635026714, + "grad_norm": 0.24120020866394043, + "learning_rate": 1.4043854995712886e-05, + "loss": 1.2156, + "step": 12969 + }, + { + "epoch": 3.8630652097023397, + "grad_norm": 0.23576250672340393, + "learning_rate": 1.4042972795468428e-05, + "loss": 1.2127, + "step": 12970 + }, + { + "epoch": 3.8633630559020085, + "grad_norm": 0.24687890708446503, + "learning_rate": 1.4042090557608623e-05, + "loss": 1.2336, + "step": 12971 + }, + { + "epoch": 3.8636609021016772, + "grad_norm": 0.2488422989845276, + "learning_rate": 1.4041208282141683e-05, + "loss": 1.2291, + "step": 12972 + }, + { + "epoch": 3.863958748301346, + "grad_norm": 0.24812117218971252, + "learning_rate": 1.4040325969075814e-05, + "loss": 1.2138, + "step": 12973 + }, + { + "epoch": 3.8642565945010148, + "grad_norm": 0.3074995279312134, + "learning_rate": 1.4039443618419228e-05, + "loss": 1.2457, + "step": 12974 + }, + { + "epoch": 3.864554440700683, + "grad_norm": 0.2558381259441376, + "learning_rate": 1.4038561230180132e-05, + "loss": 1.2468, + "step": 12975 + }, + { + "epoch": 3.864852286900352, + "grad_norm": 0.26688438653945923, + "learning_rate": 1.4037678804366735e-05, + "loss": 1.2287, + "step": 12976 + }, + { + "epoch": 3.8651501331000206, + "grad_norm": 0.24483263492584229, + "learning_rate": 1.4036796340987248e-05, + "loss": 1.2433, + "step": 12977 + }, + { + "epoch": 3.865447979299689, + "grad_norm": 0.2557902932167053, + "learning_rate": 1.4035913840049882e-05, + "loss": 1.2298, + "step": 12978 + }, + { + "epoch": 3.8657458254993577, + "grad_norm": 0.22995924949645996, + "learning_rate": 1.4035031301562845e-05, + "loss": 1.2227, + "step": 12979 + }, + { + "epoch": 3.8660436716990265, + "grad_norm": 0.25961512327194214, + "learning_rate": 1.4034148725534351e-05, + "loss": 1.2325, + "step": 12980 + }, + { + "epoch": 3.866341517898695, + "grad_norm": 0.24411143362522125, + "learning_rate": 1.4033266111972613e-05, + "loss": 1.2315, + "step": 12981 + }, + { + "epoch": 3.8666393640983636, + "grad_norm": 0.2525519132614136, + "learning_rate": 1.4032383460885837e-05, + "loss": 1.2345, + "step": 12982 + }, + { + "epoch": 3.8669372102980324, + "grad_norm": 0.24101418256759644, + "learning_rate": 1.4031500772282238e-05, + "loss": 1.2236, + "step": 12983 + }, + { + "epoch": 3.867235056497701, + "grad_norm": 0.26651257276535034, + "learning_rate": 1.4030618046170026e-05, + "loss": 1.2273, + "step": 12984 + }, + { + "epoch": 3.8675329026973695, + "grad_norm": 0.2586762011051178, + "learning_rate": 1.4029735282557423e-05, + "loss": 1.2247, + "step": 12985 + }, + { + "epoch": 3.8678307488970383, + "grad_norm": 0.27988675236701965, + "learning_rate": 1.402885248145263e-05, + "loss": 1.2244, + "step": 12986 + }, + { + "epoch": 3.868128595096707, + "grad_norm": 0.2713130712509155, + "learning_rate": 1.4027969642863869e-05, + "loss": 1.2288, + "step": 12987 + }, + { + "epoch": 3.868426441296376, + "grad_norm": 0.3202548623085022, + "learning_rate": 1.4027086766799352e-05, + "loss": 1.2312, + "step": 12988 + }, + { + "epoch": 3.868724287496044, + "grad_norm": 0.2576482892036438, + "learning_rate": 1.4026203853267285e-05, + "loss": 1.2434, + "step": 12989 + }, + { + "epoch": 3.869022133695713, + "grad_norm": 0.287153035402298, + "learning_rate": 1.4025320902275895e-05, + "loss": 1.2261, + "step": 12990 + }, + { + "epoch": 3.8693199798953817, + "grad_norm": 0.299532413482666, + "learning_rate": 1.4024437913833391e-05, + "loss": 1.2214, + "step": 12991 + }, + { + "epoch": 3.86961782609505, + "grad_norm": 0.26357489824295044, + "learning_rate": 1.402355488794799e-05, + "loss": 1.229, + "step": 12992 + }, + { + "epoch": 3.869915672294719, + "grad_norm": 0.2971954047679901, + "learning_rate": 1.4022671824627902e-05, + "loss": 1.2239, + "step": 12993 + }, + { + "epoch": 3.8702135184943876, + "grad_norm": 0.2457713931798935, + "learning_rate": 1.402178872388135e-05, + "loss": 1.2296, + "step": 12994 + }, + { + "epoch": 3.870511364694056, + "grad_norm": 0.26199328899383545, + "learning_rate": 1.4020905585716546e-05, + "loss": 1.2322, + "step": 12995 + }, + { + "epoch": 3.8708092108937247, + "grad_norm": 0.24583016335964203, + "learning_rate": 1.402002241014171e-05, + "loss": 1.2309, + "step": 12996 + }, + { + "epoch": 3.8711070570933934, + "grad_norm": 0.25787654519081116, + "learning_rate": 1.4019139197165054e-05, + "loss": 1.2374, + "step": 12997 + }, + { + "epoch": 3.871404903293062, + "grad_norm": 0.26302674412727356, + "learning_rate": 1.4018255946794802e-05, + "loss": 1.2407, + "step": 12998 + }, + { + "epoch": 3.871702749492731, + "grad_norm": 0.2593098282814026, + "learning_rate": 1.4017372659039166e-05, + "loss": 1.2435, + "step": 12999 + }, + { + "epoch": 3.8720005956923993, + "grad_norm": 0.2577357888221741, + "learning_rate": 1.4016489333906365e-05, + "loss": 1.2303, + "step": 13000 + }, + { + "epoch": 3.8720005956923993, + "eval_loss": 1.3259975910186768, + "eval_runtime": 22.2707, + "eval_samples_per_second": 77.86, + "eval_steps_per_second": 4.894, + "step": 13000 + }, + { + "epoch": 3.872298441892068, + "grad_norm": 0.25295189023017883, + "learning_rate": 1.4015605971404618e-05, + "loss": 1.231, + "step": 13001 + }, + { + "epoch": 3.872596288091737, + "grad_norm": 0.2653793692588806, + "learning_rate": 1.4014722571542146e-05, + "loss": 1.2351, + "step": 13002 + }, + { + "epoch": 3.872894134291405, + "grad_norm": 0.2781165838241577, + "learning_rate": 1.4013839134327163e-05, + "loss": 1.2451, + "step": 13003 + }, + { + "epoch": 3.873191980491074, + "grad_norm": 0.24404959380626678, + "learning_rate": 1.4012955659767893e-05, + "loss": 1.2195, + "step": 13004 + }, + { + "epoch": 3.8734898266907427, + "grad_norm": 0.2575218081474304, + "learning_rate": 1.4012072147872555e-05, + "loss": 1.2222, + "step": 13005 + }, + { + "epoch": 3.873787672890411, + "grad_norm": 0.2504289746284485, + "learning_rate": 1.4011188598649368e-05, + "loss": 1.2299, + "step": 13006 + }, + { + "epoch": 3.87408551909008, + "grad_norm": 0.23770669102668762, + "learning_rate": 1.401030501210655e-05, + "loss": 1.2146, + "step": 13007 + }, + { + "epoch": 3.8743833652897486, + "grad_norm": 0.2796214818954468, + "learning_rate": 1.4009421388252326e-05, + "loss": 1.2219, + "step": 13008 + }, + { + "epoch": 3.874681211489417, + "grad_norm": 0.2567894756793976, + "learning_rate": 1.4008537727094918e-05, + "loss": 1.2271, + "step": 13009 + }, + { + "epoch": 3.8749790576890857, + "grad_norm": 0.29812344908714294, + "learning_rate": 1.4007654028642543e-05, + "loss": 1.2158, + "step": 13010 + }, + { + "epoch": 3.8752769038887545, + "grad_norm": 0.2508329451084137, + "learning_rate": 1.4006770292903425e-05, + "loss": 1.2244, + "step": 13011 + }, + { + "epoch": 3.8755747500884232, + "grad_norm": 0.32301008701324463, + "learning_rate": 1.4005886519885788e-05, + "loss": 1.2287, + "step": 13012 + }, + { + "epoch": 3.875872596288092, + "grad_norm": 0.25789791345596313, + "learning_rate": 1.4005002709597848e-05, + "loss": 1.2348, + "step": 13013 + }, + { + "epoch": 3.8761704424877603, + "grad_norm": 0.3174765706062317, + "learning_rate": 1.4004118862047836e-05, + "loss": 1.2159, + "step": 13014 + }, + { + "epoch": 3.876468288687429, + "grad_norm": 0.24706552922725677, + "learning_rate": 1.400323497724397e-05, + "loss": 1.2357, + "step": 13015 + }, + { + "epoch": 3.876766134887098, + "grad_norm": 0.2730309069156647, + "learning_rate": 1.4002351055194477e-05, + "loss": 1.2325, + "step": 13016 + }, + { + "epoch": 3.877063981086766, + "grad_norm": 0.24229326844215393, + "learning_rate": 1.4001467095907577e-05, + "loss": 1.2452, + "step": 13017 + }, + { + "epoch": 3.877361827286435, + "grad_norm": 0.24087953567504883, + "learning_rate": 1.4000583099391498e-05, + "loss": 1.2372, + "step": 13018 + }, + { + "epoch": 3.8776596734861037, + "grad_norm": 0.2412424385547638, + "learning_rate": 1.3999699065654462e-05, + "loss": 1.2286, + "step": 13019 + }, + { + "epoch": 3.877957519685772, + "grad_norm": 0.23994415998458862, + "learning_rate": 1.3998814994704696e-05, + "loss": 1.2291, + "step": 13020 + }, + { + "epoch": 3.878255365885441, + "grad_norm": 0.2591041922569275, + "learning_rate": 1.3997930886550421e-05, + "loss": 1.2072, + "step": 13021 + }, + { + "epoch": 3.8785532120851096, + "grad_norm": 0.2399747222661972, + "learning_rate": 1.3997046741199871e-05, + "loss": 1.2435, + "step": 13022 + }, + { + "epoch": 3.878851058284778, + "grad_norm": 0.23505285382270813, + "learning_rate": 1.3996162558661264e-05, + "loss": 1.2195, + "step": 13023 + }, + { + "epoch": 3.8791489044844467, + "grad_norm": 0.2627851963043213, + "learning_rate": 1.399527833894283e-05, + "loss": 1.2335, + "step": 13024 + }, + { + "epoch": 3.8794467506841155, + "grad_norm": 0.28813469409942627, + "learning_rate": 1.3994394082052791e-05, + "loss": 1.239, + "step": 13025 + }, + { + "epoch": 3.8797445968837843, + "grad_norm": 0.24302732944488525, + "learning_rate": 1.3993509787999383e-05, + "loss": 1.2214, + "step": 13026 + }, + { + "epoch": 3.880042443083453, + "grad_norm": 0.2919987440109253, + "learning_rate": 1.3992625456790827e-05, + "loss": 1.2259, + "step": 13027 + }, + { + "epoch": 3.8803402892831214, + "grad_norm": 0.25420111417770386, + "learning_rate": 1.3991741088435351e-05, + "loss": 1.2392, + "step": 13028 + }, + { + "epoch": 3.88063813548279, + "grad_norm": 0.31524187326431274, + "learning_rate": 1.3990856682941184e-05, + "loss": 1.2354, + "step": 13029 + }, + { + "epoch": 3.880935981682459, + "grad_norm": 0.33104392886161804, + "learning_rate": 1.3989972240316556e-05, + "loss": 1.2406, + "step": 13030 + }, + { + "epoch": 3.8812338278821272, + "grad_norm": 0.24215851724147797, + "learning_rate": 1.398908776056969e-05, + "loss": 1.2299, + "step": 13031 + }, + { + "epoch": 3.881531674081796, + "grad_norm": 0.32851579785346985, + "learning_rate": 1.3988203243708822e-05, + "loss": 1.2293, + "step": 13032 + }, + { + "epoch": 3.8818295202814648, + "grad_norm": 0.27357521653175354, + "learning_rate": 1.398731868974218e-05, + "loss": 1.218, + "step": 13033 + }, + { + "epoch": 3.882127366481133, + "grad_norm": 0.2925247251987457, + "learning_rate": 1.3986434098677992e-05, + "loss": 1.223, + "step": 13034 + }, + { + "epoch": 3.882425212680802, + "grad_norm": 0.31523144245147705, + "learning_rate": 1.3985549470524487e-05, + "loss": 1.2375, + "step": 13035 + }, + { + "epoch": 3.8827230588804706, + "grad_norm": 0.25969579815864563, + "learning_rate": 1.3984664805289898e-05, + "loss": 1.2343, + "step": 13036 + }, + { + "epoch": 3.883020905080139, + "grad_norm": 0.31095975637435913, + "learning_rate": 1.3983780102982454e-05, + "loss": 1.2279, + "step": 13037 + }, + { + "epoch": 3.8833187512798077, + "grad_norm": 0.28102242946624756, + "learning_rate": 1.398289536361039e-05, + "loss": 1.2278, + "step": 13038 + }, + { + "epoch": 3.8836165974794765, + "grad_norm": 0.2894362807273865, + "learning_rate": 1.3982010587181933e-05, + "loss": 1.2371, + "step": 13039 + }, + { + "epoch": 3.8839144436791453, + "grad_norm": 0.2896195948123932, + "learning_rate": 1.3981125773705316e-05, + "loss": 1.2161, + "step": 13040 + }, + { + "epoch": 3.884212289878814, + "grad_norm": 0.2916172444820404, + "learning_rate": 1.398024092318877e-05, + "loss": 1.2349, + "step": 13041 + }, + { + "epoch": 3.8845101360784824, + "grad_norm": 0.2779673635959625, + "learning_rate": 1.3979356035640532e-05, + "loss": 1.211, + "step": 13042 + }, + { + "epoch": 3.884807982278151, + "grad_norm": 0.29934269189834595, + "learning_rate": 1.3978471111068829e-05, + "loss": 1.2235, + "step": 13043 + }, + { + "epoch": 3.88510582847782, + "grad_norm": 0.26979440450668335, + "learning_rate": 1.3977586149481901e-05, + "loss": 1.2242, + "step": 13044 + }, + { + "epoch": 3.8854036746774883, + "grad_norm": 0.30681008100509644, + "learning_rate": 1.3976701150887975e-05, + "loss": 1.2336, + "step": 13045 + }, + { + "epoch": 3.885701520877157, + "grad_norm": 0.24776335060596466, + "learning_rate": 1.397581611529529e-05, + "loss": 1.2464, + "step": 13046 + }, + { + "epoch": 3.885999367076826, + "grad_norm": 0.2861195206642151, + "learning_rate": 1.3974931042712079e-05, + "loss": 1.2284, + "step": 13047 + }, + { + "epoch": 3.886297213276494, + "grad_norm": 0.28940054774284363, + "learning_rate": 1.3974045933146574e-05, + "loss": 1.2224, + "step": 13048 + }, + { + "epoch": 3.886595059476163, + "grad_norm": 0.2539926767349243, + "learning_rate": 1.3973160786607007e-05, + "loss": 1.2316, + "step": 13049 + }, + { + "epoch": 3.8868929056758317, + "grad_norm": 0.2622884213924408, + "learning_rate": 1.3972275603101624e-05, + "loss": 1.2289, + "step": 13050 + }, + { + "epoch": 3.8871907518755004, + "grad_norm": 0.2613528370857239, + "learning_rate": 1.3971390382638654e-05, + "loss": 1.2188, + "step": 13051 + }, + { + "epoch": 3.8874885980751688, + "grad_norm": 0.2652732729911804, + "learning_rate": 1.3970505125226332e-05, + "loss": 1.2392, + "step": 13052 + }, + { + "epoch": 3.8877864442748375, + "grad_norm": 0.26048338413238525, + "learning_rate": 1.3969619830872898e-05, + "loss": 1.2253, + "step": 13053 + }, + { + "epoch": 3.8880842904745063, + "grad_norm": 0.25450706481933594, + "learning_rate": 1.3968734499586582e-05, + "loss": 1.2321, + "step": 13054 + }, + { + "epoch": 3.888382136674175, + "grad_norm": 0.26873451471328735, + "learning_rate": 1.3967849131375631e-05, + "loss": 1.2354, + "step": 13055 + }, + { + "epoch": 3.8886799828738434, + "grad_norm": 0.24118025600910187, + "learning_rate": 1.3966963726248276e-05, + "loss": 1.2447, + "step": 13056 + }, + { + "epoch": 3.888977829073512, + "grad_norm": 0.32833707332611084, + "learning_rate": 1.3966078284212754e-05, + "loss": 1.2256, + "step": 13057 + }, + { + "epoch": 3.889275675273181, + "grad_norm": 0.242631733417511, + "learning_rate": 1.3965192805277306e-05, + "loss": 1.2471, + "step": 13058 + }, + { + "epoch": 3.8895735214728493, + "grad_norm": 0.2761871814727783, + "learning_rate": 1.3964307289450166e-05, + "loss": 1.2259, + "step": 13059 + }, + { + "epoch": 3.889871367672518, + "grad_norm": 0.250262975692749, + "learning_rate": 1.3963421736739578e-05, + "loss": 1.2243, + "step": 13060 + }, + { + "epoch": 3.890169213872187, + "grad_norm": 0.28379562497138977, + "learning_rate": 1.396253614715378e-05, + "loss": 1.2481, + "step": 13061 + }, + { + "epoch": 3.890467060071855, + "grad_norm": 0.23868994414806366, + "learning_rate": 1.396165052070101e-05, + "loss": 1.2245, + "step": 13062 + }, + { + "epoch": 3.890764906271524, + "grad_norm": 0.3253426253795624, + "learning_rate": 1.3960764857389509e-05, + "loss": 1.2375, + "step": 13063 + }, + { + "epoch": 3.8910627524711927, + "grad_norm": 0.2643393278121948, + "learning_rate": 1.3959879157227514e-05, + "loss": 1.2149, + "step": 13064 + }, + { + "epoch": 3.8913605986708615, + "grad_norm": 0.34511491656303406, + "learning_rate": 1.3958993420223268e-05, + "loss": 1.2409, + "step": 13065 + }, + { + "epoch": 3.8916584448705303, + "grad_norm": 0.25883370637893677, + "learning_rate": 1.3958107646385011e-05, + "loss": 1.2357, + "step": 13066 + }, + { + "epoch": 3.8919562910701986, + "grad_norm": 0.351248174905777, + "learning_rate": 1.3957221835720986e-05, + "loss": 1.2441, + "step": 13067 + }, + { + "epoch": 3.8922541372698674, + "grad_norm": 0.25470685958862305, + "learning_rate": 1.3956335988239431e-05, + "loss": 1.2149, + "step": 13068 + }, + { + "epoch": 3.892551983469536, + "grad_norm": 0.43083131313323975, + "learning_rate": 1.3955450103948592e-05, + "loss": 1.2292, + "step": 13069 + }, + { + "epoch": 3.8928498296692045, + "grad_norm": 0.31342896819114685, + "learning_rate": 1.3954564182856709e-05, + "loss": 1.2309, + "step": 13070 + }, + { + "epoch": 3.8931476758688732, + "grad_norm": 0.29841113090515137, + "learning_rate": 1.3953678224972025e-05, + "loss": 1.2335, + "step": 13071 + }, + { + "epoch": 3.893445522068542, + "grad_norm": 0.24150791764259338, + "learning_rate": 1.395279223030278e-05, + "loss": 1.2348, + "step": 13072 + }, + { + "epoch": 3.8937433682682103, + "grad_norm": 0.3425508439540863, + "learning_rate": 1.3951906198857221e-05, + "loss": 1.2453, + "step": 13073 + }, + { + "epoch": 3.894041214467879, + "grad_norm": 0.24360494315624237, + "learning_rate": 1.395102013064359e-05, + "loss": 1.2332, + "step": 13074 + }, + { + "epoch": 3.894339060667548, + "grad_norm": 0.27031195163726807, + "learning_rate": 1.3950134025670132e-05, + "loss": 1.219, + "step": 13075 + }, + { + "epoch": 3.894636906867216, + "grad_norm": 0.2692050635814667, + "learning_rate": 1.394924788394509e-05, + "loss": 1.2112, + "step": 13076 + }, + { + "epoch": 3.894934753066885, + "grad_norm": 0.2483391910791397, + "learning_rate": 1.394836170547671e-05, + "loss": 1.2255, + "step": 13077 + }, + { + "epoch": 3.8952325992665537, + "grad_norm": 0.256445050239563, + "learning_rate": 1.3947475490273232e-05, + "loss": 1.2399, + "step": 13078 + }, + { + "epoch": 3.8955304454662225, + "grad_norm": 0.2907178997993469, + "learning_rate": 1.3946589238342907e-05, + "loss": 1.2274, + "step": 13079 + }, + { + "epoch": 3.8958282916658913, + "grad_norm": 0.29144421219825745, + "learning_rate": 1.394570294969398e-05, + "loss": 1.2228, + "step": 13080 + }, + { + "epoch": 3.8961261378655596, + "grad_norm": 0.2566499710083008, + "learning_rate": 1.3944816624334693e-05, + "loss": 1.2242, + "step": 13081 + }, + { + "epoch": 3.8964239840652284, + "grad_norm": 0.252852201461792, + "learning_rate": 1.3943930262273297e-05, + "loss": 1.2238, + "step": 13082 + }, + { + "epoch": 3.896721830264897, + "grad_norm": 0.28006088733673096, + "learning_rate": 1.3943043863518036e-05, + "loss": 1.2393, + "step": 13083 + }, + { + "epoch": 3.8970196764645655, + "grad_norm": 0.2647339701652527, + "learning_rate": 1.3942157428077154e-05, + "loss": 1.2377, + "step": 13084 + }, + { + "epoch": 3.8973175226642343, + "grad_norm": 0.2689646780490875, + "learning_rate": 1.3941270955958905e-05, + "loss": 1.2193, + "step": 13085 + }, + { + "epoch": 3.897615368863903, + "grad_norm": 0.2609025835990906, + "learning_rate": 1.3940384447171531e-05, + "loss": 1.2239, + "step": 13086 + }, + { + "epoch": 3.8979132150635714, + "grad_norm": 0.2561178505420685, + "learning_rate": 1.3939497901723287e-05, + "loss": 1.2291, + "step": 13087 + }, + { + "epoch": 3.89821106126324, + "grad_norm": 0.2868281602859497, + "learning_rate": 1.393861131962241e-05, + "loss": 1.2249, + "step": 13088 + }, + { + "epoch": 3.898508907462909, + "grad_norm": 0.2537521421909332, + "learning_rate": 1.3937724700877157e-05, + "loss": 1.2163, + "step": 13089 + }, + { + "epoch": 3.8988067536625772, + "grad_norm": 0.3083294928073883, + "learning_rate": 1.3936838045495775e-05, + "loss": 1.2244, + "step": 13090 + }, + { + "epoch": 3.899104599862246, + "grad_norm": 0.26004940271377563, + "learning_rate": 1.3935951353486516e-05, + "loss": 1.2341, + "step": 13091 + }, + { + "epoch": 3.8994024460619148, + "grad_norm": 0.28579092025756836, + "learning_rate": 1.3935064624857626e-05, + "loss": 1.2276, + "step": 13092 + }, + { + "epoch": 3.8997002922615835, + "grad_norm": 0.28353142738342285, + "learning_rate": 1.3934177859617356e-05, + "loss": 1.2194, + "step": 13093 + }, + { + "epoch": 3.8999981384612523, + "grad_norm": 0.3470597565174103, + "learning_rate": 1.3933291057773959e-05, + "loss": 1.2221, + "step": 13094 + }, + { + "epoch": 3.9002959846609206, + "grad_norm": 0.27343669533729553, + "learning_rate": 1.3932404219335678e-05, + "loss": 1.2356, + "step": 13095 + }, + { + "epoch": 3.9005938308605894, + "grad_norm": 0.31600189208984375, + "learning_rate": 1.3931517344310772e-05, + "loss": 1.2453, + "step": 13096 + }, + { + "epoch": 3.900891677060258, + "grad_norm": 0.24470661580562592, + "learning_rate": 1.393063043270749e-05, + "loss": 1.2442, + "step": 13097 + }, + { + "epoch": 3.9011895232599265, + "grad_norm": 0.380693644285202, + "learning_rate": 1.3929743484534083e-05, + "loss": 1.2367, + "step": 13098 + }, + { + "epoch": 3.9014873694595953, + "grad_norm": 0.27079856395721436, + "learning_rate": 1.3928856499798803e-05, + "loss": 1.2318, + "step": 13099 + }, + { + "epoch": 3.901785215659264, + "grad_norm": 0.31574124097824097, + "learning_rate": 1.3927969478509904e-05, + "loss": 1.2325, + "step": 13100 + }, + { + "epoch": 3.9020830618589324, + "grad_norm": 0.28129833936691284, + "learning_rate": 1.3927082420675637e-05, + "loss": 1.2394, + "step": 13101 + }, + { + "epoch": 3.902380908058601, + "grad_norm": 0.2591829299926758, + "learning_rate": 1.3926195326304254e-05, + "loss": 1.229, + "step": 13102 + }, + { + "epoch": 3.90267875425827, + "grad_norm": 0.4169105291366577, + "learning_rate": 1.3925308195404013e-05, + "loss": 1.2144, + "step": 13103 + }, + { + "epoch": 3.9029766004579383, + "grad_norm": 0.32777151465415955, + "learning_rate": 1.3924421027983166e-05, + "loss": 1.2326, + "step": 13104 + }, + { + "epoch": 3.903274446657607, + "grad_norm": 0.3248552680015564, + "learning_rate": 1.3923533824049962e-05, + "loss": 1.2232, + "step": 13105 + }, + { + "epoch": 3.903572292857276, + "grad_norm": 0.371523916721344, + "learning_rate": 1.392264658361266e-05, + "loss": 1.2324, + "step": 13106 + }, + { + "epoch": 3.9038701390569446, + "grad_norm": 0.29270094633102417, + "learning_rate": 1.3921759306679516e-05, + "loss": 1.2272, + "step": 13107 + }, + { + "epoch": 3.9041679852566133, + "grad_norm": 0.31862056255340576, + "learning_rate": 1.3920871993258782e-05, + "loss": 1.2088, + "step": 13108 + }, + { + "epoch": 3.9044658314562817, + "grad_norm": 0.24687409400939941, + "learning_rate": 1.3919984643358715e-05, + "loss": 1.2067, + "step": 13109 + }, + { + "epoch": 3.9047636776559504, + "grad_norm": 0.2599547803401947, + "learning_rate": 1.391909725698757e-05, + "loss": 1.2238, + "step": 13110 + }, + { + "epoch": 3.905061523855619, + "grad_norm": 0.26793843507766724, + "learning_rate": 1.3918209834153606e-05, + "loss": 1.2377, + "step": 13111 + }, + { + "epoch": 3.9053593700552875, + "grad_norm": 0.23883236944675446, + "learning_rate": 1.3917322374865076e-05, + "loss": 1.2203, + "step": 13112 + }, + { + "epoch": 3.9056572162549563, + "grad_norm": 0.23982957005500793, + "learning_rate": 1.3916434879130233e-05, + "loss": 1.2224, + "step": 13113 + }, + { + "epoch": 3.905955062454625, + "grad_norm": 0.2505717873573303, + "learning_rate": 1.3915547346957348e-05, + "loss": 1.2332, + "step": 13114 + }, + { + "epoch": 3.9062529086542934, + "grad_norm": 0.2481008768081665, + "learning_rate": 1.3914659778354664e-05, + "loss": 1.226, + "step": 13115 + }, + { + "epoch": 3.906550754853962, + "grad_norm": 0.23589645326137543, + "learning_rate": 1.3913772173330445e-05, + "loss": 1.2554, + "step": 13116 + }, + { + "epoch": 3.906848601053631, + "grad_norm": 0.2520972192287445, + "learning_rate": 1.391288453189295e-05, + "loss": 1.2257, + "step": 13117 + }, + { + "epoch": 3.9071464472532997, + "grad_norm": 0.24034655094146729, + "learning_rate": 1.3911996854050436e-05, + "loss": 1.2379, + "step": 13118 + }, + { + "epoch": 3.907444293452968, + "grad_norm": 0.26680758595466614, + "learning_rate": 1.3911109139811161e-05, + "loss": 1.2263, + "step": 13119 + }, + { + "epoch": 3.907742139652637, + "grad_norm": 0.2632834315299988, + "learning_rate": 1.3910221389183384e-05, + "loss": 1.2191, + "step": 13120 + }, + { + "epoch": 3.9080399858523056, + "grad_norm": 0.26333409547805786, + "learning_rate": 1.390933360217537e-05, + "loss": 1.2396, + "step": 13121 + }, + { + "epoch": 3.9083378320519744, + "grad_norm": 0.24903489649295807, + "learning_rate": 1.3908445778795372e-05, + "loss": 1.2389, + "step": 13122 + }, + { + "epoch": 3.9086356782516427, + "grad_norm": 0.3399023115634918, + "learning_rate": 1.3907557919051654e-05, + "loss": 1.2339, + "step": 13123 + }, + { + "epoch": 3.9089335244513115, + "grad_norm": 0.37201112508773804, + "learning_rate": 1.3906670022952473e-05, + "loss": 1.2178, + "step": 13124 + }, + { + "epoch": 3.9092313706509803, + "grad_norm": 0.2407924383878708, + "learning_rate": 1.3905782090506095e-05, + "loss": 1.2169, + "step": 13125 + }, + { + "epoch": 3.9095292168506486, + "grad_norm": 0.37440383434295654, + "learning_rate": 1.3904894121720776e-05, + "loss": 1.2331, + "step": 13126 + }, + { + "epoch": 3.9098270630503174, + "grad_norm": 0.27032122015953064, + "learning_rate": 1.390400611660478e-05, + "loss": 1.2259, + "step": 13127 + }, + { + "epoch": 3.910124909249986, + "grad_norm": 0.3098152279853821, + "learning_rate": 1.3903118075166371e-05, + "loss": 1.2328, + "step": 13128 + }, + { + "epoch": 3.9104227554496545, + "grad_norm": 0.24604958295822144, + "learning_rate": 1.390222999741381e-05, + "loss": 1.2314, + "step": 13129 + }, + { + "epoch": 3.9107206016493232, + "grad_norm": 0.2745722830295563, + "learning_rate": 1.3901341883355356e-05, + "loss": 1.2192, + "step": 13130 + }, + { + "epoch": 3.911018447848992, + "grad_norm": 0.300998717546463, + "learning_rate": 1.3900453732999273e-05, + "loss": 1.2137, + "step": 13131 + }, + { + "epoch": 3.9113162940486608, + "grad_norm": 0.4329034686088562, + "learning_rate": 1.3899565546353828e-05, + "loss": 1.2376, + "step": 13132 + }, + { + "epoch": 3.9116141402483295, + "grad_norm": 0.29281479120254517, + "learning_rate": 1.3898677323427283e-05, + "loss": 1.2332, + "step": 13133 + }, + { + "epoch": 3.911911986447998, + "grad_norm": 0.27857616543769836, + "learning_rate": 1.38977890642279e-05, + "loss": 1.2366, + "step": 13134 + }, + { + "epoch": 3.9122098326476666, + "grad_norm": 0.3135628402233124, + "learning_rate": 1.3896900768763948e-05, + "loss": 1.2315, + "step": 13135 + }, + { + "epoch": 3.9125076788473354, + "grad_norm": 0.2370682656764984, + "learning_rate": 1.3896012437043685e-05, + "loss": 1.234, + "step": 13136 + }, + { + "epoch": 3.9128055250470037, + "grad_norm": 0.29846489429473877, + "learning_rate": 1.3895124069075378e-05, + "loss": 1.2391, + "step": 13137 + }, + { + "epoch": 3.9131033712466725, + "grad_norm": 0.2559567987918854, + "learning_rate": 1.3894235664867298e-05, + "loss": 1.2308, + "step": 13138 + }, + { + "epoch": 3.9134012174463413, + "grad_norm": 0.27251166105270386, + "learning_rate": 1.3893347224427703e-05, + "loss": 1.2277, + "step": 13139 + }, + { + "epoch": 3.9136990636460096, + "grad_norm": 0.24662406742572784, + "learning_rate": 1.3892458747764863e-05, + "loss": 1.2231, + "step": 13140 + }, + { + "epoch": 3.9139969098456784, + "grad_norm": 0.257978230714798, + "learning_rate": 1.3891570234887041e-05, + "loss": 1.222, + "step": 13141 + }, + { + "epoch": 3.914294756045347, + "grad_norm": 0.242611363530159, + "learning_rate": 1.3890681685802508e-05, + "loss": 1.2354, + "step": 13142 + }, + { + "epoch": 3.9145926022450155, + "grad_norm": 0.274141401052475, + "learning_rate": 1.3889793100519529e-05, + "loss": 1.2174, + "step": 13143 + }, + { + "epoch": 3.9148904484446843, + "grad_norm": 0.27475303411483765, + "learning_rate": 1.388890447904637e-05, + "loss": 1.2136, + "step": 13144 + }, + { + "epoch": 3.915188294644353, + "grad_norm": 0.2634553611278534, + "learning_rate": 1.38880158213913e-05, + "loss": 1.2218, + "step": 13145 + }, + { + "epoch": 3.915486140844022, + "grad_norm": 0.36685436964035034, + "learning_rate": 1.388712712756259e-05, + "loss": 1.2461, + "step": 13146 + }, + { + "epoch": 3.9157839870436906, + "grad_norm": 0.3333728611469269, + "learning_rate": 1.3886238397568498e-05, + "loss": 1.2325, + "step": 13147 + }, + { + "epoch": 3.916081833243359, + "grad_norm": 0.33060261607170105, + "learning_rate": 1.3885349631417307e-05, + "loss": 1.2172, + "step": 13148 + }, + { + "epoch": 3.9163796794430277, + "grad_norm": 0.2988325357437134, + "learning_rate": 1.3884460829117275e-05, + "loss": 1.2336, + "step": 13149 + }, + { + "epoch": 3.9166775256426964, + "grad_norm": 0.3582686185836792, + "learning_rate": 1.3883571990676675e-05, + "loss": 1.2411, + "step": 13150 + }, + { + "epoch": 3.9169753718423648, + "grad_norm": 0.298128604888916, + "learning_rate": 1.3882683116103777e-05, + "loss": 1.2524, + "step": 13151 + }, + { + "epoch": 3.9172732180420335, + "grad_norm": 0.3053963780403137, + "learning_rate": 1.3881794205406852e-05, + "loss": 1.2129, + "step": 13152 + }, + { + "epoch": 3.9175710642417023, + "grad_norm": 0.28138604760169983, + "learning_rate": 1.3880905258594166e-05, + "loss": 1.2205, + "step": 13153 + }, + { + "epoch": 3.9178689104413706, + "grad_norm": 0.33124756813049316, + "learning_rate": 1.388001627567399e-05, + "loss": 1.2279, + "step": 13154 + }, + { + "epoch": 3.9181667566410394, + "grad_norm": 0.3027435839176178, + "learning_rate": 1.3879127256654606e-05, + "loss": 1.2409, + "step": 13155 + }, + { + "epoch": 3.918464602840708, + "grad_norm": 0.27342790365219116, + "learning_rate": 1.3878238201544272e-05, + "loss": 1.2489, + "step": 13156 + }, + { + "epoch": 3.9187624490403765, + "grad_norm": 0.35694262385368347, + "learning_rate": 1.3877349110351263e-05, + "loss": 1.2568, + "step": 13157 + }, + { + "epoch": 3.9190602952400453, + "grad_norm": 0.28090721368789673, + "learning_rate": 1.3876459983083857e-05, + "loss": 1.2317, + "step": 13158 + }, + { + "epoch": 3.919358141439714, + "grad_norm": 0.3718603849411011, + "learning_rate": 1.3875570819750319e-05, + "loss": 1.2171, + "step": 13159 + }, + { + "epoch": 3.919655987639383, + "grad_norm": 0.33166900277137756, + "learning_rate": 1.3874681620358924e-05, + "loss": 1.2233, + "step": 13160 + }, + { + "epoch": 3.9199538338390516, + "grad_norm": 0.26730549335479736, + "learning_rate": 1.3873792384917945e-05, + "loss": 1.2252, + "step": 13161 + }, + { + "epoch": 3.92025168003872, + "grad_norm": 0.2688398063182831, + "learning_rate": 1.387290311343566e-05, + "loss": 1.2129, + "step": 13162 + }, + { + "epoch": 3.9205495262383887, + "grad_norm": 0.26750171184539795, + "learning_rate": 1.3872013805920337e-05, + "loss": 1.2331, + "step": 13163 + }, + { + "epoch": 3.9208473724380575, + "grad_norm": 0.32285237312316895, + "learning_rate": 1.387112446238025e-05, + "loss": 1.232, + "step": 13164 + }, + { + "epoch": 3.921145218637726, + "grad_norm": 0.2518673241138458, + "learning_rate": 1.3870235082823675e-05, + "loss": 1.2255, + "step": 13165 + }, + { + "epoch": 3.9214430648373946, + "grad_norm": 0.30436524748802185, + "learning_rate": 1.3869345667258887e-05, + "loss": 1.2371, + "step": 13166 + }, + { + "epoch": 3.9217409110370633, + "grad_norm": 0.2510192096233368, + "learning_rate": 1.386845621569416e-05, + "loss": 1.2338, + "step": 13167 + }, + { + "epoch": 3.9220387572367317, + "grad_norm": 0.3274816572666168, + "learning_rate": 1.3867566728137771e-05, + "loss": 1.2375, + "step": 13168 + }, + { + "epoch": 3.9223366034364004, + "grad_norm": 0.2782514989376068, + "learning_rate": 1.3866677204597997e-05, + "loss": 1.2405, + "step": 13169 + }, + { + "epoch": 3.922634449636069, + "grad_norm": 0.2858296036720276, + "learning_rate": 1.3865787645083111e-05, + "loss": 1.2219, + "step": 13170 + }, + { + "epoch": 3.9229322958357375, + "grad_norm": 0.25521355867385864, + "learning_rate": 1.3864898049601387e-05, + "loss": 1.2246, + "step": 13171 + }, + { + "epoch": 3.9232301420354063, + "grad_norm": 0.26153141260147095, + "learning_rate": 1.3864008418161106e-05, + "loss": 1.2382, + "step": 13172 + }, + { + "epoch": 3.923527988235075, + "grad_norm": 0.26130759716033936, + "learning_rate": 1.3863118750770543e-05, + "loss": 1.2202, + "step": 13173 + }, + { + "epoch": 3.923825834434744, + "grad_norm": 0.2539772093296051, + "learning_rate": 1.3862229047437979e-05, + "loss": 1.2287, + "step": 13174 + }, + { + "epoch": 3.9241236806344126, + "grad_norm": 0.27328330278396606, + "learning_rate": 1.3861339308171686e-05, + "loss": 1.2357, + "step": 13175 + }, + { + "epoch": 3.924421526834081, + "grad_norm": 0.2556736469268799, + "learning_rate": 1.3860449532979947e-05, + "loss": 1.2191, + "step": 13176 + }, + { + "epoch": 3.9247193730337497, + "grad_norm": 0.27171239256858826, + "learning_rate": 1.3859559721871037e-05, + "loss": 1.2467, + "step": 13177 + }, + { + "epoch": 3.9250172192334185, + "grad_norm": 0.24272269010543823, + "learning_rate": 1.3858669874853235e-05, + "loss": 1.2175, + "step": 13178 + }, + { + "epoch": 3.925315065433087, + "grad_norm": 0.26099082827568054, + "learning_rate": 1.3857779991934823e-05, + "loss": 1.2362, + "step": 13179 + }, + { + "epoch": 3.9256129116327556, + "grad_norm": 0.25095367431640625, + "learning_rate": 1.3856890073124077e-05, + "loss": 1.2364, + "step": 13180 + }, + { + "epoch": 3.9259107578324244, + "grad_norm": 0.29382559657096863, + "learning_rate": 1.3856000118429278e-05, + "loss": 1.2343, + "step": 13181 + }, + { + "epoch": 3.9262086040320927, + "grad_norm": 0.3270135819911957, + "learning_rate": 1.3855110127858706e-05, + "loss": 1.2265, + "step": 13182 + }, + { + "epoch": 3.9265064502317615, + "grad_norm": 0.2749212086200714, + "learning_rate": 1.3854220101420644e-05, + "loss": 1.2376, + "step": 13183 + }, + { + "epoch": 3.9268042964314303, + "grad_norm": 0.3886204957962036, + "learning_rate": 1.3853330039123365e-05, + "loss": 1.2241, + "step": 13184 + }, + { + "epoch": 3.927102142631099, + "grad_norm": 0.2546490430831909, + "learning_rate": 1.385243994097516e-05, + "loss": 1.227, + "step": 13185 + }, + { + "epoch": 3.9273999888307674, + "grad_norm": 0.4590868651866913, + "learning_rate": 1.3851549806984306e-05, + "loss": 1.2189, + "step": 13186 + }, + { + "epoch": 3.927697835030436, + "grad_norm": 0.3852839469909668, + "learning_rate": 1.3850659637159081e-05, + "loss": 1.2176, + "step": 13187 + }, + { + "epoch": 3.927995681230105, + "grad_norm": 0.4147535562515259, + "learning_rate": 1.3849769431507771e-05, + "loss": 1.2238, + "step": 13188 + }, + { + "epoch": 3.9282935274297737, + "grad_norm": 0.40568336844444275, + "learning_rate": 1.3848879190038658e-05, + "loss": 1.2503, + "step": 13189 + }, + { + "epoch": 3.928591373629442, + "grad_norm": 0.34892594814300537, + "learning_rate": 1.3847988912760025e-05, + "loss": 1.2395, + "step": 13190 + }, + { + "epoch": 3.9288892198291108, + "grad_norm": 0.31116774678230286, + "learning_rate": 1.3847098599680153e-05, + "loss": 1.2224, + "step": 13191 + }, + { + "epoch": 3.9291870660287795, + "grad_norm": 0.360795795917511, + "learning_rate": 1.3846208250807326e-05, + "loss": 1.2294, + "step": 13192 + }, + { + "epoch": 3.929484912228448, + "grad_norm": 0.2690966725349426, + "learning_rate": 1.3845317866149833e-05, + "loss": 1.2195, + "step": 13193 + }, + { + "epoch": 3.9297827584281166, + "grad_norm": 0.4607526957988739, + "learning_rate": 1.3844427445715952e-05, + "loss": 1.2321, + "step": 13194 + }, + { + "epoch": 3.9300806046277854, + "grad_norm": 0.2560616135597229, + "learning_rate": 1.3843536989513964e-05, + "loss": 1.2309, + "step": 13195 + }, + { + "epoch": 3.9303784508274537, + "grad_norm": 0.29613804817199707, + "learning_rate": 1.3842646497552164e-05, + "loss": 1.2468, + "step": 13196 + }, + { + "epoch": 3.9306762970271225, + "grad_norm": 0.28417542576789856, + "learning_rate": 1.3841755969838829e-05, + "loss": 1.2209, + "step": 13197 + }, + { + "epoch": 3.9309741432267913, + "grad_norm": 0.34782496094703674, + "learning_rate": 1.3840865406382246e-05, + "loss": 1.2255, + "step": 13198 + }, + { + "epoch": 3.93127198942646, + "grad_norm": 0.2893727123737335, + "learning_rate": 1.3839974807190703e-05, + "loss": 1.2479, + "step": 13199 + }, + { + "epoch": 3.931569835626129, + "grad_norm": 0.2869235873222351, + "learning_rate": 1.3839084172272487e-05, + "loss": 1.2209, + "step": 13200 + }, + { + "epoch": 3.931867681825797, + "grad_norm": 0.25480109453201294, + "learning_rate": 1.3838193501635878e-05, + "loss": 1.2301, + "step": 13201 + }, + { + "epoch": 3.932165528025466, + "grad_norm": 0.2833738625049591, + "learning_rate": 1.3837302795289169e-05, + "loss": 1.226, + "step": 13202 + }, + { + "epoch": 3.9324633742251347, + "grad_norm": 0.3162032961845398, + "learning_rate": 1.3836412053240644e-05, + "loss": 1.2398, + "step": 13203 + }, + { + "epoch": 3.932761220424803, + "grad_norm": 0.3235669434070587, + "learning_rate": 1.3835521275498593e-05, + "loss": 1.2317, + "step": 13204 + }, + { + "epoch": 3.933059066624472, + "grad_norm": 0.24424074590206146, + "learning_rate": 1.3834630462071301e-05, + "loss": 1.23, + "step": 13205 + }, + { + "epoch": 3.9333569128241406, + "grad_norm": 0.47108832001686096, + "learning_rate": 1.3833739612967054e-05, + "loss": 1.2338, + "step": 13206 + }, + { + "epoch": 3.933654759023809, + "grad_norm": 0.29592519998550415, + "learning_rate": 1.3832848728194146e-05, + "loss": 1.231, + "step": 13207 + }, + { + "epoch": 3.9339526052234777, + "grad_norm": 0.3220840394496918, + "learning_rate": 1.3831957807760862e-05, + "loss": 1.2255, + "step": 13208 + }, + { + "epoch": 3.9342504514231464, + "grad_norm": 0.26471439003944397, + "learning_rate": 1.3831066851675492e-05, + "loss": 1.2397, + "step": 13209 + }, + { + "epoch": 3.9345482976228148, + "grad_norm": 0.47105714678764343, + "learning_rate": 1.3830175859946327e-05, + "loss": 1.2334, + "step": 13210 + }, + { + "epoch": 3.9348461438224835, + "grad_norm": 0.23992817103862762, + "learning_rate": 1.3829284832581653e-05, + "loss": 1.2232, + "step": 13211 + }, + { + "epoch": 3.9351439900221523, + "grad_norm": 0.32074904441833496, + "learning_rate": 1.3828393769589764e-05, + "loss": 1.2217, + "step": 13212 + }, + { + "epoch": 3.935441836221821, + "grad_norm": 0.2720768451690674, + "learning_rate": 1.3827502670978945e-05, + "loss": 1.236, + "step": 13213 + }, + { + "epoch": 3.93573968242149, + "grad_norm": 0.31859278678894043, + "learning_rate": 1.3826611536757493e-05, + "loss": 1.2269, + "step": 13214 + }, + { + "epoch": 3.936037528621158, + "grad_norm": 0.2529468238353729, + "learning_rate": 1.3825720366933695e-05, + "loss": 1.2435, + "step": 13215 + }, + { + "epoch": 3.936335374820827, + "grad_norm": 0.28112754225730896, + "learning_rate": 1.382482916151584e-05, + "loss": 1.2279, + "step": 13216 + }, + { + "epoch": 3.9366332210204957, + "grad_norm": 0.28580331802368164, + "learning_rate": 1.382393792051223e-05, + "loss": 1.2195, + "step": 13217 + }, + { + "epoch": 3.936931067220164, + "grad_norm": 0.23452556133270264, + "learning_rate": 1.3823046643931144e-05, + "loss": 1.2286, + "step": 13218 + }, + { + "epoch": 3.937228913419833, + "grad_norm": 0.270723432302475, + "learning_rate": 1.3822155331780882e-05, + "loss": 1.2187, + "step": 13219 + }, + { + "epoch": 3.9375267596195016, + "grad_norm": 0.25173231959342957, + "learning_rate": 1.3821263984069737e-05, + "loss": 1.2416, + "step": 13220 + }, + { + "epoch": 3.93782460581917, + "grad_norm": 0.2495669424533844, + "learning_rate": 1.3820372600805998e-05, + "loss": 1.2041, + "step": 13221 + }, + { + "epoch": 3.9381224520188387, + "grad_norm": 0.2559252083301544, + "learning_rate": 1.381948118199796e-05, + "loss": 1.227, + "step": 13222 + }, + { + "epoch": 3.9384202982185075, + "grad_norm": 0.29392632842063904, + "learning_rate": 1.3818589727653918e-05, + "loss": 1.243, + "step": 13223 + }, + { + "epoch": 3.938718144418176, + "grad_norm": 0.2594864070415497, + "learning_rate": 1.3817698237782165e-05, + "loss": 1.225, + "step": 13224 + }, + { + "epoch": 3.9390159906178446, + "grad_norm": 0.25113070011138916, + "learning_rate": 1.3816806712390993e-05, + "loss": 1.2268, + "step": 13225 + }, + { + "epoch": 3.9393138368175133, + "grad_norm": 0.26626530289649963, + "learning_rate": 1.3815915151488702e-05, + "loss": 1.2248, + "step": 13226 + }, + { + "epoch": 3.939611683017182, + "grad_norm": 0.24988804757595062, + "learning_rate": 1.3815023555083584e-05, + "loss": 1.2278, + "step": 13227 + }, + { + "epoch": 3.939909529216851, + "grad_norm": 0.25075680017471313, + "learning_rate": 1.3814131923183934e-05, + "loss": 1.2287, + "step": 13228 + }, + { + "epoch": 3.940207375416519, + "grad_norm": 0.2675560712814331, + "learning_rate": 1.3813240255798046e-05, + "loss": 1.2194, + "step": 13229 + }, + { + "epoch": 3.940505221616188, + "grad_norm": 0.2381962090730667, + "learning_rate": 1.3812348552934219e-05, + "loss": 1.2323, + "step": 13230 + }, + { + "epoch": 3.9408030678158568, + "grad_norm": 0.24896834790706635, + "learning_rate": 1.3811456814600748e-05, + "loss": 1.2319, + "step": 13231 + }, + { + "epoch": 3.941100914015525, + "grad_norm": 0.25786885619163513, + "learning_rate": 1.3810565040805928e-05, + "loss": 1.227, + "step": 13232 + }, + { + "epoch": 3.941398760215194, + "grad_norm": 0.2563535273075104, + "learning_rate": 1.3809673231558058e-05, + "loss": 1.2305, + "step": 13233 + }, + { + "epoch": 3.9416966064148626, + "grad_norm": 0.2621895968914032, + "learning_rate": 1.380878138686544e-05, + "loss": 1.238, + "step": 13234 + }, + { + "epoch": 3.941994452614531, + "grad_norm": 0.30734968185424805, + "learning_rate": 1.3807889506736363e-05, + "loss": 1.2295, + "step": 13235 + }, + { + "epoch": 3.9422922988141997, + "grad_norm": 0.24150706827640533, + "learning_rate": 1.3806997591179128e-05, + "loss": 1.2265, + "step": 13236 + }, + { + "epoch": 3.9425901450138685, + "grad_norm": 0.35120949149131775, + "learning_rate": 1.3806105640202035e-05, + "loss": 1.2247, + "step": 13237 + }, + { + "epoch": 3.942887991213537, + "grad_norm": 0.315340131521225, + "learning_rate": 1.3805213653813378e-05, + "loss": 1.2356, + "step": 13238 + }, + { + "epoch": 3.9431858374132056, + "grad_norm": 0.28169921040534973, + "learning_rate": 1.3804321632021462e-05, + "loss": 1.2115, + "step": 13239 + }, + { + "epoch": 3.9434836836128744, + "grad_norm": 0.4050716161727905, + "learning_rate": 1.3803429574834583e-05, + "loss": 1.2306, + "step": 13240 + }, + { + "epoch": 3.943781529812543, + "grad_norm": 0.2576994001865387, + "learning_rate": 1.3802537482261044e-05, + "loss": 1.2318, + "step": 13241 + }, + { + "epoch": 3.944079376012212, + "grad_norm": 0.4222757816314697, + "learning_rate": 1.3801645354309138e-05, + "loss": 1.2308, + "step": 13242 + }, + { + "epoch": 3.9443772222118803, + "grad_norm": 0.4433141052722931, + "learning_rate": 1.3800753190987173e-05, + "loss": 1.2196, + "step": 13243 + }, + { + "epoch": 3.944675068411549, + "grad_norm": 0.259381502866745, + "learning_rate": 1.3799860992303447e-05, + "loss": 1.2299, + "step": 13244 + }, + { + "epoch": 3.944972914611218, + "grad_norm": 0.4008481204509735, + "learning_rate": 1.3798968758266256e-05, + "loss": 1.2271, + "step": 13245 + }, + { + "epoch": 3.945270760810886, + "grad_norm": 0.25758200883865356, + "learning_rate": 1.3798076488883907e-05, + "loss": 1.2356, + "step": 13246 + }, + { + "epoch": 3.945568607010555, + "grad_norm": 0.3300653100013733, + "learning_rate": 1.3797184184164699e-05, + "loss": 1.236, + "step": 13247 + }, + { + "epoch": 3.9458664532102237, + "grad_norm": 0.2681613564491272, + "learning_rate": 1.3796291844116935e-05, + "loss": 1.2247, + "step": 13248 + }, + { + "epoch": 3.946164299409892, + "grad_norm": 0.32309770584106445, + "learning_rate": 1.3795399468748916e-05, + "loss": 1.2465, + "step": 13249 + }, + { + "epoch": 3.9464621456095608, + "grad_norm": 0.3046136200428009, + "learning_rate": 1.3794507058068947e-05, + "loss": 1.225, + "step": 13250 + }, + { + "epoch": 3.9467599918092295, + "grad_norm": 0.3005814254283905, + "learning_rate": 1.3793614612085331e-05, + "loss": 1.2279, + "step": 13251 + }, + { + "epoch": 3.9470578380088983, + "grad_norm": 0.3176072835922241, + "learning_rate": 1.3792722130806366e-05, + "loss": 1.2109, + "step": 13252 + }, + { + "epoch": 3.9473556842085666, + "grad_norm": 0.30952659249305725, + "learning_rate": 1.379182961424036e-05, + "loss": 1.2341, + "step": 13253 + }, + { + "epoch": 3.9476535304082354, + "grad_norm": 0.34675028920173645, + "learning_rate": 1.3790937062395615e-05, + "loss": 1.2284, + "step": 13254 + }, + { + "epoch": 3.947951376607904, + "grad_norm": 0.29791009426116943, + "learning_rate": 1.3790044475280438e-05, + "loss": 1.2338, + "step": 13255 + }, + { + "epoch": 3.948249222807573, + "grad_norm": 0.37231138348579407, + "learning_rate": 1.378915185290313e-05, + "loss": 1.2312, + "step": 13256 + }, + { + "epoch": 3.9485470690072413, + "grad_norm": 0.28895992040634155, + "learning_rate": 1.3788259195271998e-05, + "loss": 1.2508, + "step": 13257 + }, + { + "epoch": 3.94884491520691, + "grad_norm": 0.2745595872402191, + "learning_rate": 1.378736650239535e-05, + "loss": 1.2536, + "step": 13258 + }, + { + "epoch": 3.949142761406579, + "grad_norm": 0.24945619702339172, + "learning_rate": 1.3786473774281484e-05, + "loss": 1.2237, + "step": 13259 + }, + { + "epoch": 3.949440607606247, + "grad_norm": 0.34441810846328735, + "learning_rate": 1.3785581010938709e-05, + "loss": 1.2449, + "step": 13260 + }, + { + "epoch": 3.949738453805916, + "grad_norm": 0.28372707962989807, + "learning_rate": 1.3784688212375337e-05, + "loss": 1.2267, + "step": 13261 + }, + { + "epoch": 3.9500363000055847, + "grad_norm": 0.2903372049331665, + "learning_rate": 1.3783795378599666e-05, + "loss": 1.253, + "step": 13262 + }, + { + "epoch": 3.950334146205253, + "grad_norm": 0.24565307796001434, + "learning_rate": 1.3782902509620009e-05, + "loss": 1.2213, + "step": 13263 + }, + { + "epoch": 3.950631992404922, + "grad_norm": 0.3678787052631378, + "learning_rate": 1.3782009605444669e-05, + "loss": 1.2338, + "step": 13264 + }, + { + "epoch": 3.9509298386045906, + "grad_norm": 0.24600066244602203, + "learning_rate": 1.3781116666081956e-05, + "loss": 1.2318, + "step": 13265 + }, + { + "epoch": 3.9512276848042593, + "grad_norm": 0.2908230721950531, + "learning_rate": 1.3780223691540174e-05, + "loss": 1.2265, + "step": 13266 + }, + { + "epoch": 3.951525531003928, + "grad_norm": 0.2514212429523468, + "learning_rate": 1.3779330681827637e-05, + "loss": 1.2078, + "step": 13267 + }, + { + "epoch": 3.9518233772035964, + "grad_norm": 0.25792211294174194, + "learning_rate": 1.3778437636952654e-05, + "loss": 1.2411, + "step": 13268 + }, + { + "epoch": 3.952121223403265, + "grad_norm": 0.284408837556839, + "learning_rate": 1.3777544556923523e-05, + "loss": 1.2243, + "step": 13269 + }, + { + "epoch": 3.952419069602934, + "grad_norm": 0.35277897119522095, + "learning_rate": 1.3776651441748565e-05, + "loss": 1.2177, + "step": 13270 + }, + { + "epoch": 3.9527169158026023, + "grad_norm": 0.32351380586624146, + "learning_rate": 1.3775758291436083e-05, + "loss": 1.2358, + "step": 13271 + }, + { + "epoch": 3.953014762002271, + "grad_norm": 0.3098355233669281, + "learning_rate": 1.3774865105994388e-05, + "loss": 1.2275, + "step": 13272 + }, + { + "epoch": 3.95331260820194, + "grad_norm": 0.28767481446266174, + "learning_rate": 1.3773971885431791e-05, + "loss": 1.2379, + "step": 13273 + }, + { + "epoch": 3.953610454401608, + "grad_norm": 0.3367336690425873, + "learning_rate": 1.3773078629756604e-05, + "loss": 1.2158, + "step": 13274 + }, + { + "epoch": 3.953908300601277, + "grad_norm": 0.32177621126174927, + "learning_rate": 1.3772185338977138e-05, + "loss": 1.2046, + "step": 13275 + }, + { + "epoch": 3.9542061468009457, + "grad_norm": 0.3031023442745209, + "learning_rate": 1.3771292013101698e-05, + "loss": 1.2189, + "step": 13276 + }, + { + "epoch": 3.954503993000614, + "grad_norm": 0.2863255739212036, + "learning_rate": 1.3770398652138598e-05, + "loss": 1.219, + "step": 13277 + }, + { + "epoch": 3.954801839200283, + "grad_norm": 0.2768959701061249, + "learning_rate": 1.3769505256096154e-05, + "loss": 1.2212, + "step": 13278 + }, + { + "epoch": 3.9550996853999516, + "grad_norm": 0.25252997875213623, + "learning_rate": 1.3768611824982675e-05, + "loss": 1.2127, + "step": 13279 + }, + { + "epoch": 3.9553975315996204, + "grad_norm": 0.3021676540374756, + "learning_rate": 1.3767718358806473e-05, + "loss": 1.2216, + "step": 13280 + }, + { + "epoch": 3.955695377799289, + "grad_norm": 0.2823593020439148, + "learning_rate": 1.376682485757586e-05, + "loss": 1.2118, + "step": 13281 + }, + { + "epoch": 3.9559932239989575, + "grad_norm": 0.2506415545940399, + "learning_rate": 1.3765931321299153e-05, + "loss": 1.2306, + "step": 13282 + }, + { + "epoch": 3.9562910701986262, + "grad_norm": 0.2492976188659668, + "learning_rate": 1.376503774998466e-05, + "loss": 1.2232, + "step": 13283 + }, + { + "epoch": 3.956588916398295, + "grad_norm": 0.24417439103126526, + "learning_rate": 1.3764144143640699e-05, + "loss": 1.2353, + "step": 13284 + }, + { + "epoch": 3.9568867625979633, + "grad_norm": 0.2732827067375183, + "learning_rate": 1.3763250502275583e-05, + "loss": 1.2343, + "step": 13285 + }, + { + "epoch": 3.957184608797632, + "grad_norm": 0.2529347836971283, + "learning_rate": 1.3762356825897622e-05, + "loss": 1.2083, + "step": 13286 + }, + { + "epoch": 3.957482454997301, + "grad_norm": 0.2552180290222168, + "learning_rate": 1.3761463114515137e-05, + "loss": 1.2236, + "step": 13287 + }, + { + "epoch": 3.957780301196969, + "grad_norm": 0.3384588956832886, + "learning_rate": 1.3760569368136443e-05, + "loss": 1.2244, + "step": 13288 + }, + { + "epoch": 3.958078147396638, + "grad_norm": 0.28780779242515564, + "learning_rate": 1.3759675586769844e-05, + "loss": 1.2139, + "step": 13289 + }, + { + "epoch": 3.9583759935963068, + "grad_norm": 0.5732924342155457, + "learning_rate": 1.375878177042367e-05, + "loss": 1.2133, + "step": 13290 + }, + { + "epoch": 3.958673839795975, + "grad_norm": 0.4588649570941925, + "learning_rate": 1.3757887919106232e-05, + "loss": 1.2275, + "step": 13291 + }, + { + "epoch": 3.958971685995644, + "grad_norm": 0.34352895617485046, + "learning_rate": 1.3756994032825846e-05, + "loss": 1.2292, + "step": 13292 + }, + { + "epoch": 3.9592695321953126, + "grad_norm": 0.29472851753234863, + "learning_rate": 1.3756100111590826e-05, + "loss": 1.2355, + "step": 13293 + }, + { + "epoch": 3.9595673783949814, + "grad_norm": 0.6480907201766968, + "learning_rate": 1.375520615540949e-05, + "loss": 1.2303, + "step": 13294 + }, + { + "epoch": 3.95986522459465, + "grad_norm": 0.33491411805152893, + "learning_rate": 1.3754312164290157e-05, + "loss": 1.2287, + "step": 13295 + }, + { + "epoch": 3.9601630707943185, + "grad_norm": 0.3167470395565033, + "learning_rate": 1.3753418138241146e-05, + "loss": 1.2268, + "step": 13296 + }, + { + "epoch": 3.9604609169939873, + "grad_norm": 0.28013840317726135, + "learning_rate": 1.375252407727077e-05, + "loss": 1.2193, + "step": 13297 + }, + { + "epoch": 3.960758763193656, + "grad_norm": 0.27401334047317505, + "learning_rate": 1.3751629981387352e-05, + "loss": 1.2147, + "step": 13298 + }, + { + "epoch": 3.9610566093933244, + "grad_norm": 0.26001015305519104, + "learning_rate": 1.3750735850599211e-05, + "loss": 1.2221, + "step": 13299 + }, + { + "epoch": 3.961354455592993, + "grad_norm": 0.3323241174221039, + "learning_rate": 1.3749841684914658e-05, + "loss": 1.2213, + "step": 13300 + }, + { + "epoch": 3.961652301792662, + "grad_norm": 0.25518015027046204, + "learning_rate": 1.374894748434202e-05, + "loss": 1.2397, + "step": 13301 + }, + { + "epoch": 3.9619501479923303, + "grad_norm": 0.2652130424976349, + "learning_rate": 1.3748053248889615e-05, + "loss": 1.2185, + "step": 13302 + }, + { + "epoch": 3.962247994191999, + "grad_norm": 0.2644560635089874, + "learning_rate": 1.374715897856576e-05, + "loss": 1.2348, + "step": 13303 + }, + { + "epoch": 3.962545840391668, + "grad_norm": 0.24957582354545593, + "learning_rate": 1.374626467337878e-05, + "loss": 1.2228, + "step": 13304 + }, + { + "epoch": 3.962843686591336, + "grad_norm": 0.25798946619033813, + "learning_rate": 1.374537033333699e-05, + "loss": 1.2235, + "step": 13305 + }, + { + "epoch": 3.963141532791005, + "grad_norm": 0.24982014298439026, + "learning_rate": 1.3744475958448715e-05, + "loss": 1.2253, + "step": 13306 + }, + { + "epoch": 3.9634393789906737, + "grad_norm": 0.2523110806941986, + "learning_rate": 1.3743581548722276e-05, + "loss": 1.2176, + "step": 13307 + }, + { + "epoch": 3.9637372251903424, + "grad_norm": 0.25925248861312866, + "learning_rate": 1.374268710416599e-05, + "loss": 1.2309, + "step": 13308 + }, + { + "epoch": 3.964035071390011, + "grad_norm": 0.25103744864463806, + "learning_rate": 1.3741792624788186e-05, + "loss": 1.2435, + "step": 13309 + }, + { + "epoch": 3.9643329175896795, + "grad_norm": 0.25214672088623047, + "learning_rate": 1.374089811059718e-05, + "loss": 1.2048, + "step": 13310 + }, + { + "epoch": 3.9646307637893483, + "grad_norm": 0.2590175271034241, + "learning_rate": 1.3740003561601298e-05, + "loss": 1.2388, + "step": 13311 + }, + { + "epoch": 3.964928609989017, + "grad_norm": 0.25053536891937256, + "learning_rate": 1.3739108977808862e-05, + "loss": 1.2362, + "step": 13312 + }, + { + "epoch": 3.9652264561886854, + "grad_norm": 0.26403069496154785, + "learning_rate": 1.3738214359228192e-05, + "loss": 1.241, + "step": 13313 + }, + { + "epoch": 3.965524302388354, + "grad_norm": 0.2543472349643707, + "learning_rate": 1.3737319705867615e-05, + "loss": 1.2288, + "step": 13314 + }, + { + "epoch": 3.965822148588023, + "grad_norm": 0.2844364047050476, + "learning_rate": 1.3736425017735453e-05, + "loss": 1.2332, + "step": 13315 + }, + { + "epoch": 3.9661199947876913, + "grad_norm": 0.292111873626709, + "learning_rate": 1.3735530294840034e-05, + "loss": 1.2458, + "step": 13316 + }, + { + "epoch": 3.96641784098736, + "grad_norm": 0.26022544503211975, + "learning_rate": 1.3734635537189675e-05, + "loss": 1.2287, + "step": 13317 + }, + { + "epoch": 3.966715687187029, + "grad_norm": 0.30310890078544617, + "learning_rate": 1.3733740744792708e-05, + "loss": 1.2266, + "step": 13318 + }, + { + "epoch": 3.9670135333866976, + "grad_norm": 0.26686811447143555, + "learning_rate": 1.3732845917657453e-05, + "loss": 1.2174, + "step": 13319 + }, + { + "epoch": 3.967311379586366, + "grad_norm": 0.2606055736541748, + "learning_rate": 1.3731951055792237e-05, + "loss": 1.221, + "step": 13320 + }, + { + "epoch": 3.9676092257860347, + "grad_norm": 0.2605314254760742, + "learning_rate": 1.3731056159205386e-05, + "loss": 1.2218, + "step": 13321 + }, + { + "epoch": 3.9679070719857035, + "grad_norm": 0.2701477110385895, + "learning_rate": 1.3730161227905227e-05, + "loss": 1.2232, + "step": 13322 + }, + { + "epoch": 3.9682049181853722, + "grad_norm": 0.2692610025405884, + "learning_rate": 1.3729266261900086e-05, + "loss": 1.2375, + "step": 13323 + }, + { + "epoch": 3.9685027643850406, + "grad_norm": 0.3172907829284668, + "learning_rate": 1.3728371261198287e-05, + "loss": 1.2231, + "step": 13324 + }, + { + "epoch": 3.9688006105847093, + "grad_norm": 0.2733261287212372, + "learning_rate": 1.372747622580816e-05, + "loss": 1.2124, + "step": 13325 + }, + { + "epoch": 3.969098456784378, + "grad_norm": 0.31666746735572815, + "learning_rate": 1.3726581155738035e-05, + "loss": 1.2273, + "step": 13326 + }, + { + "epoch": 3.9693963029840464, + "grad_norm": 0.30270305275917053, + "learning_rate": 1.3725686050996232e-05, + "loss": 1.2254, + "step": 13327 + }, + { + "epoch": 3.969694149183715, + "grad_norm": 0.2606399655342102, + "learning_rate": 1.3724790911591082e-05, + "loss": 1.2336, + "step": 13328 + }, + { + "epoch": 3.969991995383384, + "grad_norm": 0.35708358883857727, + "learning_rate": 1.3723895737530919e-05, + "loss": 1.2307, + "step": 13329 + }, + { + "epoch": 3.9702898415830523, + "grad_norm": 0.2547812759876251, + "learning_rate": 1.372300052882406e-05, + "loss": 1.2354, + "step": 13330 + }, + { + "epoch": 3.970587687782721, + "grad_norm": 0.2860596179962158, + "learning_rate": 1.3722105285478844e-05, + "loss": 1.2268, + "step": 13331 + }, + { + "epoch": 3.97088553398239, + "grad_norm": 0.260704904794693, + "learning_rate": 1.3721210007503596e-05, + "loss": 1.2144, + "step": 13332 + }, + { + "epoch": 3.9711833801820586, + "grad_norm": 0.2592894732952118, + "learning_rate": 1.372031469490665e-05, + "loss": 1.2393, + "step": 13333 + }, + { + "epoch": 3.9714812263817274, + "grad_norm": 0.3154374957084656, + "learning_rate": 1.371941934769633e-05, + "loss": 1.2124, + "step": 13334 + }, + { + "epoch": 3.9717790725813957, + "grad_norm": 0.27829208970069885, + "learning_rate": 1.3718523965880967e-05, + "loss": 1.2154, + "step": 13335 + }, + { + "epoch": 3.9720769187810645, + "grad_norm": 0.3362678289413452, + "learning_rate": 1.3717628549468893e-05, + "loss": 1.2308, + "step": 13336 + }, + { + "epoch": 3.9723747649807333, + "grad_norm": 0.25007420778274536, + "learning_rate": 1.3716733098468441e-05, + "loss": 1.2377, + "step": 13337 + }, + { + "epoch": 3.9726726111804016, + "grad_norm": 0.27721577882766724, + "learning_rate": 1.371583761288794e-05, + "loss": 1.2244, + "step": 13338 + }, + { + "epoch": 3.9729704573800704, + "grad_norm": 0.3188413381576538, + "learning_rate": 1.371494209273572e-05, + "loss": 1.2397, + "step": 13339 + }, + { + "epoch": 3.973268303579739, + "grad_norm": 0.2549150884151459, + "learning_rate": 1.3714046538020116e-05, + "loss": 1.2191, + "step": 13340 + }, + { + "epoch": 3.9735661497794075, + "grad_norm": 0.3306371569633484, + "learning_rate": 1.3713150948749459e-05, + "loss": 1.2211, + "step": 13341 + }, + { + "epoch": 3.9738639959790762, + "grad_norm": 0.2630874812602997, + "learning_rate": 1.3712255324932079e-05, + "loss": 1.2226, + "step": 13342 + }, + { + "epoch": 3.974161842178745, + "grad_norm": 0.2721898853778839, + "learning_rate": 1.3711359666576312e-05, + "loss": 1.2326, + "step": 13343 + }, + { + "epoch": 3.9744596883784133, + "grad_norm": 0.43239787220954895, + "learning_rate": 1.371046397369049e-05, + "loss": 1.2438, + "step": 13344 + }, + { + "epoch": 3.974757534578082, + "grad_norm": 0.2552736699581146, + "learning_rate": 1.3709568246282945e-05, + "loss": 1.2214, + "step": 13345 + }, + { + "epoch": 3.975055380777751, + "grad_norm": 0.34517979621887207, + "learning_rate": 1.3708672484362013e-05, + "loss": 1.2266, + "step": 13346 + }, + { + "epoch": 3.9753532269774197, + "grad_norm": 0.27155017852783203, + "learning_rate": 1.3707776687936028e-05, + "loss": 1.2294, + "step": 13347 + }, + { + "epoch": 3.9756510731770884, + "grad_norm": 0.271278977394104, + "learning_rate": 1.370688085701332e-05, + "loss": 1.2185, + "step": 13348 + }, + { + "epoch": 3.9759489193767568, + "grad_norm": 0.38098785281181335, + "learning_rate": 1.3705984991602229e-05, + "loss": 1.2276, + "step": 13349 + }, + { + "epoch": 3.9762467655764255, + "grad_norm": 0.30669933557510376, + "learning_rate": 1.370508909171109e-05, + "loss": 1.2249, + "step": 13350 + }, + { + "epoch": 3.9765446117760943, + "grad_norm": 0.3008941113948822, + "learning_rate": 1.3704193157348236e-05, + "loss": 1.2295, + "step": 13351 + }, + { + "epoch": 3.9768424579757626, + "grad_norm": 0.3231416344642639, + "learning_rate": 1.3703297188522002e-05, + "loss": 1.2201, + "step": 13352 + }, + { + "epoch": 3.9771403041754314, + "grad_norm": 0.3122359812259674, + "learning_rate": 1.3702401185240726e-05, + "loss": 1.2365, + "step": 13353 + }, + { + "epoch": 3.9774381503751, + "grad_norm": 0.28316250443458557, + "learning_rate": 1.3701505147512744e-05, + "loss": 1.2172, + "step": 13354 + }, + { + "epoch": 3.9777359965747685, + "grad_norm": 0.31386739015579224, + "learning_rate": 1.3700609075346391e-05, + "loss": 1.2197, + "step": 13355 + }, + { + "epoch": 3.9780338427744373, + "grad_norm": 0.2508314549922943, + "learning_rate": 1.3699712968750004e-05, + "loss": 1.2242, + "step": 13356 + }, + { + "epoch": 3.978331688974106, + "grad_norm": 0.40898066759109497, + "learning_rate": 1.3698816827731926e-05, + "loss": 1.2277, + "step": 13357 + }, + { + "epoch": 3.9786295351737744, + "grad_norm": 0.3118723928928375, + "learning_rate": 1.3697920652300487e-05, + "loss": 1.2267, + "step": 13358 + }, + { + "epoch": 3.978927381373443, + "grad_norm": 0.36608466506004333, + "learning_rate": 1.3697024442464027e-05, + "loss": 1.2314, + "step": 13359 + }, + { + "epoch": 3.979225227573112, + "grad_norm": 0.24628345668315887, + "learning_rate": 1.3696128198230888e-05, + "loss": 1.2358, + "step": 13360 + }, + { + "epoch": 3.9795230737727807, + "grad_norm": 0.5022084712982178, + "learning_rate": 1.3695231919609402e-05, + "loss": 1.2333, + "step": 13361 + }, + { + "epoch": 3.9798209199724495, + "grad_norm": 0.30987975001335144, + "learning_rate": 1.3694335606607913e-05, + "loss": 1.2212, + "step": 13362 + }, + { + "epoch": 3.980118766172118, + "grad_norm": 0.37688446044921875, + "learning_rate": 1.369343925923476e-05, + "loss": 1.2416, + "step": 13363 + }, + { + "epoch": 3.9804166123717866, + "grad_norm": 0.2744654417037964, + "learning_rate": 1.3692542877498281e-05, + "loss": 1.2261, + "step": 13364 + }, + { + "epoch": 3.9807144585714553, + "grad_norm": 0.5666722655296326, + "learning_rate": 1.3691646461406816e-05, + "loss": 1.2173, + "step": 13365 + }, + { + "epoch": 3.9810123047711237, + "grad_norm": 0.30332040786743164, + "learning_rate": 1.3690750010968703e-05, + "loss": 1.221, + "step": 13366 + }, + { + "epoch": 3.9813101509707924, + "grad_norm": 0.35051602125167847, + "learning_rate": 1.3689853526192287e-05, + "loss": 1.2212, + "step": 13367 + }, + { + "epoch": 3.981607997170461, + "grad_norm": 0.2537488341331482, + "learning_rate": 1.3688957007085906e-05, + "loss": 1.2155, + "step": 13368 + }, + { + "epoch": 3.9819058433701295, + "grad_norm": 0.308860182762146, + "learning_rate": 1.3688060453657901e-05, + "loss": 1.2237, + "step": 13369 + }, + { + "epoch": 3.9822036895697983, + "grad_norm": 0.32965436577796936, + "learning_rate": 1.3687163865916616e-05, + "loss": 1.2222, + "step": 13370 + }, + { + "epoch": 3.982501535769467, + "grad_norm": 0.252133846282959, + "learning_rate": 1.3686267243870385e-05, + "loss": 1.2289, + "step": 13371 + }, + { + "epoch": 3.9827993819691354, + "grad_norm": 0.33121633529663086, + "learning_rate": 1.3685370587527562e-05, + "loss": 1.2278, + "step": 13372 + }, + { + "epoch": 3.983097228168804, + "grad_norm": 0.25625285506248474, + "learning_rate": 1.368447389689648e-05, + "loss": 1.2375, + "step": 13373 + }, + { + "epoch": 3.983395074368473, + "grad_norm": 0.3637160658836365, + "learning_rate": 1.3683577171985487e-05, + "loss": 1.2124, + "step": 13374 + }, + { + "epoch": 3.9836929205681417, + "grad_norm": 0.28041210770606995, + "learning_rate": 1.3682680412802924e-05, + "loss": 1.2287, + "step": 13375 + }, + { + "epoch": 3.9839907667678105, + "grad_norm": 0.25311151146888733, + "learning_rate": 1.3681783619357134e-05, + "loss": 1.2147, + "step": 13376 + }, + { + "epoch": 3.984288612967479, + "grad_norm": 0.2525341510772705, + "learning_rate": 1.368088679165646e-05, + "loss": 1.2312, + "step": 13377 + }, + { + "epoch": 3.9845864591671476, + "grad_norm": 0.24177545309066772, + "learning_rate": 1.3679989929709247e-05, + "loss": 1.2156, + "step": 13378 + }, + { + "epoch": 3.9848843053668164, + "grad_norm": 0.2735190689563751, + "learning_rate": 1.3679093033523838e-05, + "loss": 1.2163, + "step": 13379 + }, + { + "epoch": 3.9851821515664847, + "grad_norm": 0.2353980392217636, + "learning_rate": 1.367819610310858e-05, + "loss": 1.2245, + "step": 13380 + }, + { + "epoch": 3.9854799977661535, + "grad_norm": 0.27483224868774414, + "learning_rate": 1.367729913847182e-05, + "loss": 1.2108, + "step": 13381 + }, + { + "epoch": 3.9857778439658222, + "grad_norm": 0.2524372935295105, + "learning_rate": 1.3676402139621896e-05, + "loss": 1.2127, + "step": 13382 + }, + { + "epoch": 3.9860756901654906, + "grad_norm": 0.2566528916358948, + "learning_rate": 1.367550510656716e-05, + "loss": 1.225, + "step": 13383 + }, + { + "epoch": 3.9863735363651593, + "grad_norm": 0.23759350180625916, + "learning_rate": 1.3674608039315954e-05, + "loss": 1.2175, + "step": 13384 + }, + { + "epoch": 3.986671382564828, + "grad_norm": 0.26291152834892273, + "learning_rate": 1.3673710937876626e-05, + "loss": 1.2137, + "step": 13385 + }, + { + "epoch": 3.986969228764497, + "grad_norm": 0.2782786786556244, + "learning_rate": 1.3672813802257521e-05, + "loss": 1.2349, + "step": 13386 + }, + { + "epoch": 3.987267074964165, + "grad_norm": 0.27542656660079956, + "learning_rate": 1.367191663246699e-05, + "loss": 1.2218, + "step": 13387 + }, + { + "epoch": 3.987564921163834, + "grad_norm": 0.26309019327163696, + "learning_rate": 1.3671019428513374e-05, + "loss": 1.2179, + "step": 13388 + }, + { + "epoch": 3.9878627673635028, + "grad_norm": 0.2406158298254013, + "learning_rate": 1.3670122190405026e-05, + "loss": 1.2357, + "step": 13389 + }, + { + "epoch": 3.9881606135631715, + "grad_norm": 0.26158133149147034, + "learning_rate": 1.366922491815029e-05, + "loss": 1.2268, + "step": 13390 + }, + { + "epoch": 3.98845845976284, + "grad_norm": 0.25136712193489075, + "learning_rate": 1.3668327611757516e-05, + "loss": 1.2362, + "step": 13391 + }, + { + "epoch": 3.9887563059625086, + "grad_norm": 0.3350224196910858, + "learning_rate": 1.3667430271235053e-05, + "loss": 1.2249, + "step": 13392 + }, + { + "epoch": 3.9890541521621774, + "grad_norm": 0.25642406940460205, + "learning_rate": 1.3666532896591249e-05, + "loss": 1.229, + "step": 13393 + }, + { + "epoch": 3.9893519983618457, + "grad_norm": 0.27974164485931396, + "learning_rate": 1.3665635487834453e-05, + "loss": 1.2324, + "step": 13394 + }, + { + "epoch": 3.9896498445615145, + "grad_norm": 0.3308431804180145, + "learning_rate": 1.3664738044973011e-05, + "loss": 1.2481, + "step": 13395 + }, + { + "epoch": 3.9899476907611833, + "grad_norm": 0.3376076817512512, + "learning_rate": 1.366384056801528e-05, + "loss": 1.2224, + "step": 13396 + }, + { + "epoch": 3.9902455369608516, + "grad_norm": 0.2756621837615967, + "learning_rate": 1.3662943056969605e-05, + "loss": 1.2265, + "step": 13397 + }, + { + "epoch": 3.9905433831605204, + "grad_norm": 0.250488817691803, + "learning_rate": 1.3662045511844338e-05, + "loss": 1.2201, + "step": 13398 + }, + { + "epoch": 3.990841229360189, + "grad_norm": 0.3583226799964905, + "learning_rate": 1.3661147932647826e-05, + "loss": 1.2415, + "step": 13399 + }, + { + "epoch": 3.991139075559858, + "grad_norm": 0.2988361120223999, + "learning_rate": 1.3660250319388427e-05, + "loss": 1.2195, + "step": 13400 + }, + { + "epoch": 3.9914369217595267, + "grad_norm": 0.2791658043861389, + "learning_rate": 1.3659352672074484e-05, + "loss": 1.23, + "step": 13401 + }, + { + "epoch": 3.991734767959195, + "grad_norm": 0.2548013925552368, + "learning_rate": 1.3658454990714356e-05, + "loss": 1.215, + "step": 13402 + }, + { + "epoch": 3.992032614158864, + "grad_norm": 0.25278958678245544, + "learning_rate": 1.365755727531639e-05, + "loss": 1.231, + "step": 13403 + }, + { + "epoch": 3.9923304603585326, + "grad_norm": 0.27782562375068665, + "learning_rate": 1.3656659525888942e-05, + "loss": 1.2275, + "step": 13404 + }, + { + "epoch": 3.992628306558201, + "grad_norm": 0.26534757018089294, + "learning_rate": 1.365576174244036e-05, + "loss": 1.2259, + "step": 13405 + }, + { + "epoch": 3.9929261527578697, + "grad_norm": 0.3131057322025299, + "learning_rate": 1.3654863924979002e-05, + "loss": 1.2205, + "step": 13406 + }, + { + "epoch": 3.9932239989575384, + "grad_norm": 0.2801111936569214, + "learning_rate": 1.3653966073513214e-05, + "loss": 1.238, + "step": 13407 + }, + { + "epoch": 3.9935218451572068, + "grad_norm": 0.26753294467926025, + "learning_rate": 1.3653068188051359e-05, + "loss": 1.235, + "step": 13408 + }, + { + "epoch": 3.9938196913568755, + "grad_norm": 0.2762524485588074, + "learning_rate": 1.3652170268601785e-05, + "loss": 1.2376, + "step": 13409 + }, + { + "epoch": 3.9941175375565443, + "grad_norm": 0.3896852731704712, + "learning_rate": 1.3651272315172846e-05, + "loss": 1.2169, + "step": 13410 + }, + { + "epoch": 3.9944153837562126, + "grad_norm": 0.36620697379112244, + "learning_rate": 1.36503743277729e-05, + "loss": 1.2215, + "step": 13411 + }, + { + "epoch": 3.9947132299558814, + "grad_norm": 0.35110583901405334, + "learning_rate": 1.3649476306410296e-05, + "loss": 1.2262, + "step": 13412 + }, + { + "epoch": 3.99501107615555, + "grad_norm": 0.46130087971687317, + "learning_rate": 1.3648578251093395e-05, + "loss": 1.222, + "step": 13413 + }, + { + "epoch": 3.995308922355219, + "grad_norm": 0.26714006066322327, + "learning_rate": 1.3647680161830546e-05, + "loss": 1.2269, + "step": 13414 + }, + { + "epoch": 3.9956067685548877, + "grad_norm": 0.346652626991272, + "learning_rate": 1.3646782038630114e-05, + "loss": 1.2301, + "step": 13415 + }, + { + "epoch": 3.995904614754556, + "grad_norm": 0.3244880735874176, + "learning_rate": 1.3645883881500445e-05, + "loss": 1.2105, + "step": 13416 + }, + { + "epoch": 3.996202460954225, + "grad_norm": 0.2955103814601898, + "learning_rate": 1.3644985690449901e-05, + "loss": 1.2225, + "step": 13417 + }, + { + "epoch": 3.9965003071538936, + "grad_norm": 0.3054068088531494, + "learning_rate": 1.3644087465486839e-05, + "loss": 1.2387, + "step": 13418 + }, + { + "epoch": 3.996798153353562, + "grad_norm": 0.29540181159973145, + "learning_rate": 1.3643189206619613e-05, + "loss": 1.2321, + "step": 13419 + }, + { + "epoch": 3.9970959995532307, + "grad_norm": 0.27125608921051025, + "learning_rate": 1.3642290913856582e-05, + "loss": 1.2144, + "step": 13420 + }, + { + "epoch": 3.9973938457528995, + "grad_norm": 0.25907444953918457, + "learning_rate": 1.3641392587206103e-05, + "loss": 1.2368, + "step": 13421 + }, + { + "epoch": 3.997691691952568, + "grad_norm": 0.266828328371048, + "learning_rate": 1.364049422667654e-05, + "loss": 1.2354, + "step": 13422 + }, + { + "epoch": 3.9979895381522366, + "grad_norm": 0.24839074909687042, + "learning_rate": 1.363959583227624e-05, + "loss": 1.2263, + "step": 13423 + }, + { + "epoch": 3.9982873843519053, + "grad_norm": 0.25622907280921936, + "learning_rate": 1.3638697404013566e-05, + "loss": 1.2206, + "step": 13424 + }, + { + "epoch": 3.9985852305515737, + "grad_norm": 0.27672067284584045, + "learning_rate": 1.363779894189688e-05, + "loss": 1.2309, + "step": 13425 + }, + { + "epoch": 3.9988830767512424, + "grad_norm": 0.29166603088378906, + "learning_rate": 1.363690044593454e-05, + "loss": 1.2258, + "step": 13426 + }, + { + "epoch": 3.999180922950911, + "grad_norm": 0.2589173913002014, + "learning_rate": 1.3636001916134904e-05, + "loss": 1.2542, + "step": 13427 + }, + { + "epoch": 3.99947876915058, + "grad_norm": 0.26252371072769165, + "learning_rate": 1.3635103352506333e-05, + "loss": 1.2101, + "step": 13428 + }, + { + "epoch": 3.9997766153502488, + "grad_norm": 0.2667730152606964, + "learning_rate": 1.3634204755057187e-05, + "loss": 1.2485, + "step": 13429 + }, + { + "epoch": 4.000074461549917, + "grad_norm": 0.24984438717365265, + "learning_rate": 1.3633306123795824e-05, + "loss": 1.2281, + "step": 13430 + }, + { + "epoch": 4.000372307749585, + "grad_norm": 0.2710225284099579, + "learning_rate": 1.3632407458730607e-05, + "loss": 1.2259, + "step": 13431 + }, + { + "epoch": 4.000670153949255, + "grad_norm": 0.24073953926563263, + "learning_rate": 1.3631508759869903e-05, + "loss": 1.2012, + "step": 13432 + }, + { + "epoch": 4.000968000148923, + "grad_norm": 0.2353154420852661, + "learning_rate": 1.3630610027222062e-05, + "loss": 1.241, + "step": 13433 + }, + { + "epoch": 4.001265846348592, + "grad_norm": 0.23412029445171356, + "learning_rate": 1.362971126079545e-05, + "loss": 1.2255, + "step": 13434 + }, + { + "epoch": 4.0015636925482605, + "grad_norm": 0.2648339867591858, + "learning_rate": 1.3628812460598431e-05, + "loss": 1.2337, + "step": 13435 + }, + { + "epoch": 4.001861538747929, + "grad_norm": 0.2483452707529068, + "learning_rate": 1.3627913626639368e-05, + "loss": 1.2289, + "step": 13436 + }, + { + "epoch": 4.002159384947598, + "grad_norm": 0.27604132890701294, + "learning_rate": 1.3627014758926622e-05, + "loss": 1.2204, + "step": 13437 + }, + { + "epoch": 4.002457231147266, + "grad_norm": 0.23962776362895966, + "learning_rate": 1.3626115857468553e-05, + "loss": 1.2272, + "step": 13438 + }, + { + "epoch": 4.002755077346935, + "grad_norm": 0.3925142288208008, + "learning_rate": 1.3625216922273532e-05, + "loss": 1.2256, + "step": 13439 + }, + { + "epoch": 4.003052923546604, + "grad_norm": 0.47054609656333923, + "learning_rate": 1.3624317953349916e-05, + "loss": 1.2337, + "step": 13440 + }, + { + "epoch": 4.003350769746272, + "grad_norm": 0.2619442045688629, + "learning_rate": 1.3623418950706068e-05, + "loss": 1.2356, + "step": 13441 + }, + { + "epoch": 4.003648615945941, + "grad_norm": 0.2708479166030884, + "learning_rate": 1.3622519914350358e-05, + "loss": 1.2059, + "step": 13442 + }, + { + "epoch": 4.00394646214561, + "grad_norm": 0.2544673979282379, + "learning_rate": 1.3621620844291147e-05, + "loss": 1.2081, + "step": 13443 + }, + { + "epoch": 4.004244308345278, + "grad_norm": 0.2590193748474121, + "learning_rate": 1.36207217405368e-05, + "loss": 1.2295, + "step": 13444 + }, + { + "epoch": 4.004542154544946, + "grad_norm": 0.24963033199310303, + "learning_rate": 1.3619822603095685e-05, + "loss": 1.2054, + "step": 13445 + }, + { + "epoch": 4.004840000744616, + "grad_norm": 0.2631826102733612, + "learning_rate": 1.3618923431976163e-05, + "loss": 1.2303, + "step": 13446 + }, + { + "epoch": 4.005137846944284, + "grad_norm": 0.31257209181785583, + "learning_rate": 1.3618024227186601e-05, + "loss": 1.2159, + "step": 13447 + }, + { + "epoch": 4.005435693143953, + "grad_norm": 0.27918773889541626, + "learning_rate": 1.3617124988735362e-05, + "loss": 1.2262, + "step": 13448 + }, + { + "epoch": 4.0057335393436215, + "grad_norm": 0.2980075180530548, + "learning_rate": 1.3616225716630824e-05, + "loss": 1.2444, + "step": 13449 + }, + { + "epoch": 4.00603138554329, + "grad_norm": 0.4892701208591461, + "learning_rate": 1.3615326410881342e-05, + "loss": 1.2293, + "step": 13450 + }, + { + "epoch": 4.006329231742959, + "grad_norm": 0.5278096795082092, + "learning_rate": 1.3614427071495287e-05, + "loss": 1.2284, + "step": 13451 + }, + { + "epoch": 4.006627077942627, + "grad_norm": 0.3616490662097931, + "learning_rate": 1.3613527698481029e-05, + "loss": 1.2318, + "step": 13452 + }, + { + "epoch": 4.006924924142296, + "grad_norm": 0.33913710713386536, + "learning_rate": 1.3612628291846927e-05, + "loss": 1.2125, + "step": 13453 + }, + { + "epoch": 4.007222770341965, + "grad_norm": 0.5846142768859863, + "learning_rate": 1.361172885160136e-05, + "loss": 1.217, + "step": 13454 + }, + { + "epoch": 4.007520616541633, + "grad_norm": 0.39744895696640015, + "learning_rate": 1.361082937775269e-05, + "loss": 1.2231, + "step": 13455 + }, + { + "epoch": 4.007818462741302, + "grad_norm": 0.38165080547332764, + "learning_rate": 1.3609929870309288e-05, + "loss": 1.2344, + "step": 13456 + }, + { + "epoch": 4.008116308940971, + "grad_norm": 0.28365325927734375, + "learning_rate": 1.3609030329279522e-05, + "loss": 1.2136, + "step": 13457 + }, + { + "epoch": 4.008414155140639, + "grad_norm": 0.5792515873908997, + "learning_rate": 1.360813075467176e-05, + "loss": 1.2273, + "step": 13458 + }, + { + "epoch": 4.0087120013403075, + "grad_norm": 0.3754171133041382, + "learning_rate": 1.360723114649437e-05, + "loss": 1.2409, + "step": 13459 + }, + { + "epoch": 4.009009847539977, + "grad_norm": 0.36889129877090454, + "learning_rate": 1.3606331504755727e-05, + "loss": 1.2193, + "step": 13460 + }, + { + "epoch": 4.009307693739645, + "grad_norm": 0.33019784092903137, + "learning_rate": 1.36054318294642e-05, + "loss": 1.2184, + "step": 13461 + }, + { + "epoch": 4.009605539939314, + "grad_norm": 0.3890586495399475, + "learning_rate": 1.3604532120628158e-05, + "loss": 1.2195, + "step": 13462 + }, + { + "epoch": 4.009903386138983, + "grad_norm": 0.4135071337223053, + "learning_rate": 1.3603632378255972e-05, + "loss": 1.2122, + "step": 13463 + }, + { + "epoch": 4.010201232338651, + "grad_norm": 0.2886429727077484, + "learning_rate": 1.360273260235601e-05, + "loss": 1.2232, + "step": 13464 + }, + { + "epoch": 4.01049907853832, + "grad_norm": 0.34025245904922485, + "learning_rate": 1.3601832792936646e-05, + "loss": 1.2191, + "step": 13465 + }, + { + "epoch": 4.010796924737988, + "grad_norm": 0.28943249583244324, + "learning_rate": 1.3600932950006254e-05, + "loss": 1.2253, + "step": 13466 + }, + { + "epoch": 4.011094770937657, + "grad_norm": 0.3403421938419342, + "learning_rate": 1.3600033073573204e-05, + "loss": 1.217, + "step": 13467 + }, + { + "epoch": 4.011392617137326, + "grad_norm": 0.3473210036754608, + "learning_rate": 1.3599133163645868e-05, + "loss": 1.2377, + "step": 13468 + }, + { + "epoch": 4.011690463336994, + "grad_norm": 0.33960193395614624, + "learning_rate": 1.3598233220232622e-05, + "loss": 1.2284, + "step": 13469 + }, + { + "epoch": 4.011988309536663, + "grad_norm": 0.31858521699905396, + "learning_rate": 1.3597333243341832e-05, + "loss": 1.2299, + "step": 13470 + }, + { + "epoch": 4.012286155736332, + "grad_norm": 0.2420596480369568, + "learning_rate": 1.3596433232981877e-05, + "loss": 1.2345, + "step": 13471 + }, + { + "epoch": 4.012584001936, + "grad_norm": 0.46303659677505493, + "learning_rate": 1.3595533189161128e-05, + "loss": 1.2074, + "step": 13472 + }, + { + "epoch": 4.0128818481356685, + "grad_norm": 0.26272690296173096, + "learning_rate": 1.359463311188796e-05, + "loss": 1.2163, + "step": 13473 + }, + { + "epoch": 4.013179694335338, + "grad_norm": 0.3758026957511902, + "learning_rate": 1.3593733001170749e-05, + "loss": 1.2184, + "step": 13474 + }, + { + "epoch": 4.013477540535006, + "grad_norm": 0.2465454787015915, + "learning_rate": 1.3592832857017865e-05, + "loss": 1.2257, + "step": 13475 + }, + { + "epoch": 4.013775386734675, + "grad_norm": 0.3642311692237854, + "learning_rate": 1.3591932679437689e-05, + "loss": 1.256, + "step": 13476 + }, + { + "epoch": 4.014073232934344, + "grad_norm": 0.28112563490867615, + "learning_rate": 1.3591032468438588e-05, + "loss": 1.2303, + "step": 13477 + }, + { + "epoch": 4.014371079134012, + "grad_norm": 0.2791341245174408, + "learning_rate": 1.3590132224028945e-05, + "loss": 1.2212, + "step": 13478 + }, + { + "epoch": 4.014668925333681, + "grad_norm": 0.35364946722984314, + "learning_rate": 1.358923194621713e-05, + "loss": 1.2257, + "step": 13479 + }, + { + "epoch": 4.0149667715333495, + "grad_norm": 0.25044015049934387, + "learning_rate": 1.3588331635011527e-05, + "loss": 1.2357, + "step": 13480 + }, + { + "epoch": 4.015264617733018, + "grad_norm": 0.397842675447464, + "learning_rate": 1.3587431290420502e-05, + "loss": 1.2275, + "step": 13481 + }, + { + "epoch": 4.015562463932687, + "grad_norm": 0.24611815810203552, + "learning_rate": 1.358653091245244e-05, + "loss": 1.2335, + "step": 13482 + }, + { + "epoch": 4.015860310132355, + "grad_norm": 0.35895636677742004, + "learning_rate": 1.3585630501115714e-05, + "loss": 1.2076, + "step": 13483 + }, + { + "epoch": 4.016158156332024, + "grad_norm": 0.2622363567352295, + "learning_rate": 1.3584730056418702e-05, + "loss": 1.2295, + "step": 13484 + }, + { + "epoch": 4.016456002531693, + "grad_norm": 0.3240194022655487, + "learning_rate": 1.3583829578369783e-05, + "loss": 1.2347, + "step": 13485 + }, + { + "epoch": 4.016753848731361, + "grad_norm": 0.2415049523115158, + "learning_rate": 1.3582929066977336e-05, + "loss": 1.2374, + "step": 13486 + }, + { + "epoch": 4.01705169493103, + "grad_norm": 0.262705534696579, + "learning_rate": 1.3582028522249732e-05, + "loss": 1.2085, + "step": 13487 + }, + { + "epoch": 4.017349541130699, + "grad_norm": 0.2660171687602997, + "learning_rate": 1.3581127944195356e-05, + "loss": 1.2117, + "step": 13488 + }, + { + "epoch": 4.017647387330367, + "grad_norm": 0.2507553696632385, + "learning_rate": 1.3580227332822584e-05, + "loss": 1.2341, + "step": 13489 + }, + { + "epoch": 4.017945233530036, + "grad_norm": 0.25532859563827515, + "learning_rate": 1.3579326688139802e-05, + "loss": 1.2244, + "step": 13490 + }, + { + "epoch": 4.018243079729705, + "grad_norm": 0.2624969184398651, + "learning_rate": 1.3578426010155381e-05, + "loss": 1.2357, + "step": 13491 + }, + { + "epoch": 4.018540925929373, + "grad_norm": 0.3329092860221863, + "learning_rate": 1.3577525298877705e-05, + "loss": 1.2153, + "step": 13492 + }, + { + "epoch": 4.018838772129042, + "grad_norm": 0.28040000796318054, + "learning_rate": 1.3576624554315154e-05, + "loss": 1.2216, + "step": 13493 + }, + { + "epoch": 4.0191366183287105, + "grad_norm": 0.2757415473461151, + "learning_rate": 1.3575723776476106e-05, + "loss": 1.2297, + "step": 13494 + }, + { + "epoch": 4.019434464528379, + "grad_norm": 0.30063891410827637, + "learning_rate": 1.3574822965368942e-05, + "loss": 1.2313, + "step": 13495 + }, + { + "epoch": 4.019732310728048, + "grad_norm": 0.26259714365005493, + "learning_rate": 1.3573922121002045e-05, + "loss": 1.2345, + "step": 13496 + }, + { + "epoch": 4.020030156927716, + "grad_norm": 0.2762293815612793, + "learning_rate": 1.3573021243383797e-05, + "loss": 1.2207, + "step": 13497 + }, + { + "epoch": 4.020328003127385, + "grad_norm": 0.2598060667514801, + "learning_rate": 1.3572120332522578e-05, + "loss": 1.2201, + "step": 13498 + }, + { + "epoch": 4.020625849327054, + "grad_norm": 0.2517746388912201, + "learning_rate": 1.357121938842677e-05, + "loss": 1.2226, + "step": 13499 + }, + { + "epoch": 4.020923695526722, + "grad_norm": 0.24112702906131744, + "learning_rate": 1.3570318411104756e-05, + "loss": 1.2254, + "step": 13500 + }, + { + "epoch": 4.020923695526722, + "eval_loss": 1.3276550769805908, + "eval_runtime": 21.9183, + "eval_samples_per_second": 79.112, + "eval_steps_per_second": 4.973, + "step": 13500 + }, + { + "epoch": 4.0212215417263915, + "grad_norm": 0.2654041647911072, + "learning_rate": 1.3569417400564917e-05, + "loss": 1.225, + "step": 13501 + }, + { + "epoch": 4.02151938792606, + "grad_norm": 0.2819170653820038, + "learning_rate": 1.3568516356815637e-05, + "loss": 1.2155, + "step": 13502 + }, + { + "epoch": 4.021817234125728, + "grad_norm": 0.3072175979614258, + "learning_rate": 1.3567615279865303e-05, + "loss": 1.2233, + "step": 13503 + }, + { + "epoch": 4.022115080325397, + "grad_norm": 0.24807359278202057, + "learning_rate": 1.3566714169722292e-05, + "loss": 1.2314, + "step": 13504 + }, + { + "epoch": 4.022412926525066, + "grad_norm": 0.2827318012714386, + "learning_rate": 1.3565813026394992e-05, + "loss": 1.222, + "step": 13505 + }, + { + "epoch": 4.022710772724734, + "grad_norm": 0.31187954545021057, + "learning_rate": 1.3564911849891785e-05, + "loss": 1.2238, + "step": 13506 + }, + { + "epoch": 4.023008618924403, + "grad_norm": 0.24782270193099976, + "learning_rate": 1.3564010640221052e-05, + "loss": 1.2349, + "step": 13507 + }, + { + "epoch": 4.0233064651240715, + "grad_norm": 0.29361018538475037, + "learning_rate": 1.3563109397391188e-05, + "loss": 1.222, + "step": 13508 + }, + { + "epoch": 4.02360431132374, + "grad_norm": 0.3015013635158539, + "learning_rate": 1.3562208121410568e-05, + "loss": 1.2186, + "step": 13509 + }, + { + "epoch": 4.023902157523409, + "grad_norm": 0.39766520261764526, + "learning_rate": 1.3561306812287584e-05, + "loss": 1.2016, + "step": 13510 + }, + { + "epoch": 4.024200003723077, + "grad_norm": 0.24687308073043823, + "learning_rate": 1.3560405470030617e-05, + "loss": 1.2216, + "step": 13511 + }, + { + "epoch": 4.024497849922746, + "grad_norm": 0.40833809971809387, + "learning_rate": 1.3559504094648055e-05, + "loss": 1.2143, + "step": 13512 + }, + { + "epoch": 4.024795696122415, + "grad_norm": 0.44821810722351074, + "learning_rate": 1.3558602686148285e-05, + "loss": 1.2134, + "step": 13513 + }, + { + "epoch": 4.025093542322083, + "grad_norm": 0.48145267367362976, + "learning_rate": 1.3557701244539696e-05, + "loss": 1.2478, + "step": 13514 + }, + { + "epoch": 4.0253913885217525, + "grad_norm": 0.2988905906677246, + "learning_rate": 1.3556799769830669e-05, + "loss": 1.2212, + "step": 13515 + }, + { + "epoch": 4.025689234721421, + "grad_norm": 0.3648199439048767, + "learning_rate": 1.3555898262029593e-05, + "loss": 1.2107, + "step": 13516 + }, + { + "epoch": 4.025987080921089, + "grad_norm": 0.2863132655620575, + "learning_rate": 1.355499672114486e-05, + "loss": 1.2067, + "step": 13517 + }, + { + "epoch": 4.026284927120758, + "grad_norm": 0.46540048718452454, + "learning_rate": 1.3554095147184849e-05, + "loss": 1.2273, + "step": 13518 + }, + { + "epoch": 4.026582773320427, + "grad_norm": 0.2618623673915863, + "learning_rate": 1.3553193540157956e-05, + "loss": 1.2197, + "step": 13519 + }, + { + "epoch": 4.026880619520095, + "grad_norm": 0.7541860342025757, + "learning_rate": 1.355229190007257e-05, + "loss": 1.2172, + "step": 13520 + }, + { + "epoch": 4.027178465719764, + "grad_norm": 0.46875596046447754, + "learning_rate": 1.3551390226937074e-05, + "loss": 1.2315, + "step": 13521 + }, + { + "epoch": 4.027476311919433, + "grad_norm": 0.30588921904563904, + "learning_rate": 1.355048852075986e-05, + "loss": 1.2422, + "step": 13522 + }, + { + "epoch": 4.027774158119101, + "grad_norm": 0.28768113255500793, + "learning_rate": 1.3549586781549318e-05, + "loss": 1.2218, + "step": 13523 + }, + { + "epoch": 4.02807200431877, + "grad_norm": 0.284957617521286, + "learning_rate": 1.3548685009313834e-05, + "loss": 1.2366, + "step": 13524 + }, + { + "epoch": 4.028369850518438, + "grad_norm": 0.2643536925315857, + "learning_rate": 1.3547783204061804e-05, + "loss": 1.2182, + "step": 13525 + }, + { + "epoch": 4.028667696718107, + "grad_norm": 0.29222121834754944, + "learning_rate": 1.3546881365801612e-05, + "loss": 1.2312, + "step": 13526 + }, + { + "epoch": 4.028965542917776, + "grad_norm": 0.3241350054740906, + "learning_rate": 1.3545979494541656e-05, + "loss": 1.2423, + "step": 13527 + }, + { + "epoch": 4.029263389117444, + "grad_norm": 0.25367748737335205, + "learning_rate": 1.3545077590290321e-05, + "loss": 1.2229, + "step": 13528 + }, + { + "epoch": 4.0295612353171135, + "grad_norm": 0.2630714178085327, + "learning_rate": 1.3544175653055997e-05, + "loss": 1.2149, + "step": 13529 + }, + { + "epoch": 4.029859081516782, + "grad_norm": 0.2765938937664032, + "learning_rate": 1.3543273682847078e-05, + "loss": 1.2138, + "step": 13530 + }, + { + "epoch": 4.03015692771645, + "grad_norm": 0.24845919013023376, + "learning_rate": 1.3542371679671961e-05, + "loss": 1.2226, + "step": 13531 + }, + { + "epoch": 4.030454773916119, + "grad_norm": 0.3170214295387268, + "learning_rate": 1.354146964353903e-05, + "loss": 1.22, + "step": 13532 + }, + { + "epoch": 4.030752620115788, + "grad_norm": 0.25234171748161316, + "learning_rate": 1.3540567574456683e-05, + "loss": 1.2161, + "step": 13533 + }, + { + "epoch": 4.031050466315456, + "grad_norm": 0.2870865762233734, + "learning_rate": 1.353966547243331e-05, + "loss": 1.2327, + "step": 13534 + }, + { + "epoch": 4.031348312515125, + "grad_norm": 0.30945733189582825, + "learning_rate": 1.3538763337477303e-05, + "loss": 1.2288, + "step": 13535 + }, + { + "epoch": 4.031646158714794, + "grad_norm": 0.2529684007167816, + "learning_rate": 1.3537861169597055e-05, + "loss": 1.2302, + "step": 13536 + }, + { + "epoch": 4.031944004914462, + "grad_norm": 0.29196882247924805, + "learning_rate": 1.3536958968800963e-05, + "loss": 1.2065, + "step": 13537 + }, + { + "epoch": 4.032241851114131, + "grad_norm": 0.264177143573761, + "learning_rate": 1.3536056735097423e-05, + "loss": 1.227, + "step": 13538 + }, + { + "epoch": 4.0325396973137995, + "grad_norm": 0.2778324484825134, + "learning_rate": 1.3535154468494822e-05, + "loss": 1.2007, + "step": 13539 + }, + { + "epoch": 4.032837543513468, + "grad_norm": 0.2707468271255493, + "learning_rate": 1.3534252169001561e-05, + "loss": 1.2514, + "step": 13540 + }, + { + "epoch": 4.033135389713137, + "grad_norm": 0.2863061726093292, + "learning_rate": 1.3533349836626031e-05, + "loss": 1.2478, + "step": 13541 + }, + { + "epoch": 4.033433235912805, + "grad_norm": 0.2589258849620819, + "learning_rate": 1.3532447471376628e-05, + "loss": 1.2259, + "step": 13542 + }, + { + "epoch": 4.0337310821124746, + "grad_norm": 0.27595287561416626, + "learning_rate": 1.3531545073261749e-05, + "loss": 1.238, + "step": 13543 + }, + { + "epoch": 4.034028928312143, + "grad_norm": 0.2699430286884308, + "learning_rate": 1.3530642642289789e-05, + "loss": 1.2228, + "step": 13544 + }, + { + "epoch": 4.034326774511811, + "grad_norm": 0.2584626078605652, + "learning_rate": 1.3529740178469144e-05, + "loss": 1.2238, + "step": 13545 + }, + { + "epoch": 4.03462462071148, + "grad_norm": 0.2900390326976776, + "learning_rate": 1.3528837681808206e-05, + "loss": 1.2167, + "step": 13546 + }, + { + "epoch": 4.034922466911149, + "grad_norm": 0.25129029154777527, + "learning_rate": 1.3527935152315381e-05, + "loss": 1.2191, + "step": 13547 + }, + { + "epoch": 4.035220313110817, + "grad_norm": 0.2600948214530945, + "learning_rate": 1.3527032589999058e-05, + "loss": 1.2234, + "step": 13548 + }, + { + "epoch": 4.035518159310486, + "grad_norm": 0.26066386699676514, + "learning_rate": 1.3526129994867639e-05, + "loss": 1.2198, + "step": 13549 + }, + { + "epoch": 4.035816005510155, + "grad_norm": 0.25908273458480835, + "learning_rate": 1.352522736692952e-05, + "loss": 1.2397, + "step": 13550 + }, + { + "epoch": 4.036113851709823, + "grad_norm": 0.2511611580848694, + "learning_rate": 1.3524324706193102e-05, + "loss": 1.2204, + "step": 13551 + }, + { + "epoch": 4.036411697909492, + "grad_norm": 0.2425713688135147, + "learning_rate": 1.3523422012666776e-05, + "loss": 1.2196, + "step": 13552 + }, + { + "epoch": 4.0367095441091605, + "grad_norm": 0.2602206766605377, + "learning_rate": 1.3522519286358944e-05, + "loss": 1.2331, + "step": 13553 + }, + { + "epoch": 4.03700739030883, + "grad_norm": 0.24712972342967987, + "learning_rate": 1.3521616527278006e-05, + "loss": 1.2283, + "step": 13554 + }, + { + "epoch": 4.037305236508498, + "grad_norm": 0.2583823502063751, + "learning_rate": 1.3520713735432365e-05, + "loss": 1.2206, + "step": 13555 + }, + { + "epoch": 4.037603082708166, + "grad_norm": 0.2572380304336548, + "learning_rate": 1.3519810910830411e-05, + "loss": 1.2191, + "step": 13556 + }, + { + "epoch": 4.037900928907836, + "grad_norm": 0.25522580742836, + "learning_rate": 1.3518908053480553e-05, + "loss": 1.2156, + "step": 13557 + }, + { + "epoch": 4.038198775107504, + "grad_norm": 0.261445015668869, + "learning_rate": 1.3518005163391185e-05, + "loss": 1.2348, + "step": 13558 + }, + { + "epoch": 4.038496621307172, + "grad_norm": 0.25178179144859314, + "learning_rate": 1.351710224057071e-05, + "loss": 1.2256, + "step": 13559 + }, + { + "epoch": 4.0387944675068415, + "grad_norm": 0.2590835988521576, + "learning_rate": 1.3516199285027527e-05, + "loss": 1.2259, + "step": 13560 + }, + { + "epoch": 4.03909231370651, + "grad_norm": 0.24784936010837555, + "learning_rate": 1.3515296296770041e-05, + "loss": 1.2336, + "step": 13561 + }, + { + "epoch": 4.039390159906178, + "grad_norm": 0.2646029591560364, + "learning_rate": 1.351439327580665e-05, + "loss": 1.2354, + "step": 13562 + }, + { + "epoch": 4.039688006105847, + "grad_norm": 0.25064483284950256, + "learning_rate": 1.3513490222145756e-05, + "loss": 1.2173, + "step": 13563 + }, + { + "epoch": 4.039985852305516, + "grad_norm": 0.24869340658187866, + "learning_rate": 1.3512587135795759e-05, + "loss": 1.2298, + "step": 13564 + }, + { + "epoch": 4.040283698505184, + "grad_norm": 0.27413302659988403, + "learning_rate": 1.3511684016765063e-05, + "loss": 1.2127, + "step": 13565 + }, + { + "epoch": 4.040581544704853, + "grad_norm": 0.25885817408561707, + "learning_rate": 1.3510780865062072e-05, + "loss": 1.2192, + "step": 13566 + }, + { + "epoch": 4.0408793909045215, + "grad_norm": 0.3105272948741913, + "learning_rate": 1.3509877680695187e-05, + "loss": 1.2232, + "step": 13567 + }, + { + "epoch": 4.041177237104191, + "grad_norm": 0.34326842427253723, + "learning_rate": 1.3508974463672814e-05, + "loss": 1.2114, + "step": 13568 + }, + { + "epoch": 4.041475083303859, + "grad_norm": 0.33438393473625183, + "learning_rate": 1.3508071214003353e-05, + "loss": 1.2284, + "step": 13569 + }, + { + "epoch": 4.041772929503527, + "grad_norm": 0.8982435464859009, + "learning_rate": 1.3507167931695206e-05, + "loss": 1.2093, + "step": 13570 + }, + { + "epoch": 4.042070775703197, + "grad_norm": 0.3824581205844879, + "learning_rate": 1.3506264616756784e-05, + "loss": 1.2331, + "step": 13571 + }, + { + "epoch": 4.042368621902865, + "grad_norm": 0.2815983295440674, + "learning_rate": 1.3505361269196482e-05, + "loss": 1.2305, + "step": 13572 + }, + { + "epoch": 4.042666468102533, + "grad_norm": 0.24957147240638733, + "learning_rate": 1.3504457889022713e-05, + "loss": 1.2327, + "step": 13573 + }, + { + "epoch": 4.0429643143022025, + "grad_norm": 0.2587170898914337, + "learning_rate": 1.3503554476243878e-05, + "loss": 1.227, + "step": 13574 + }, + { + "epoch": 4.043262160501871, + "grad_norm": 0.2537778913974762, + "learning_rate": 1.3502651030868386e-05, + "loss": 1.2342, + "step": 13575 + }, + { + "epoch": 4.043560006701539, + "grad_norm": 0.24664580821990967, + "learning_rate": 1.3501747552904636e-05, + "loss": 1.2229, + "step": 13576 + }, + { + "epoch": 4.043857852901208, + "grad_norm": 0.25332629680633545, + "learning_rate": 1.3500844042361036e-05, + "loss": 1.2416, + "step": 13577 + }, + { + "epoch": 4.044155699100877, + "grad_norm": 0.24326634407043457, + "learning_rate": 1.3499940499245998e-05, + "loss": 1.2323, + "step": 13578 + }, + { + "epoch": 4.044453545300545, + "grad_norm": 0.239749014377594, + "learning_rate": 1.3499036923567923e-05, + "loss": 1.2184, + "step": 13579 + }, + { + "epoch": 4.044751391500214, + "grad_norm": 0.24345549941062927, + "learning_rate": 1.3498133315335219e-05, + "loss": 1.2173, + "step": 13580 + }, + { + "epoch": 4.045049237699883, + "grad_norm": 0.24426348507404327, + "learning_rate": 1.349722967455629e-05, + "loss": 1.2177, + "step": 13581 + }, + { + "epoch": 4.045347083899552, + "grad_norm": 0.24000655114650726, + "learning_rate": 1.3496326001239547e-05, + "loss": 1.2185, + "step": 13582 + }, + { + "epoch": 4.04564493009922, + "grad_norm": 0.24266929924488068, + "learning_rate": 1.3495422295393399e-05, + "loss": 1.2074, + "step": 13583 + }, + { + "epoch": 4.045942776298888, + "grad_norm": 0.25049179792404175, + "learning_rate": 1.349451855702625e-05, + "loss": 1.2434, + "step": 13584 + }, + { + "epoch": 4.046240622498558, + "grad_norm": 0.24727383255958557, + "learning_rate": 1.3493614786146512e-05, + "loss": 1.2248, + "step": 13585 + }, + { + "epoch": 4.046538468698226, + "grad_norm": 0.24171684682369232, + "learning_rate": 1.3492710982762592e-05, + "loss": 1.2278, + "step": 13586 + }, + { + "epoch": 4.046836314897894, + "grad_norm": 0.24876807630062103, + "learning_rate": 1.3491807146882897e-05, + "loss": 1.2247, + "step": 13587 + }, + { + "epoch": 4.0471341610975635, + "grad_norm": 0.24070468544960022, + "learning_rate": 1.3490903278515837e-05, + "loss": 1.2244, + "step": 13588 + }, + { + "epoch": 4.047432007297232, + "grad_norm": 0.24079173803329468, + "learning_rate": 1.3489999377669823e-05, + "loss": 1.2276, + "step": 13589 + }, + { + "epoch": 4.0477298534969, + "grad_norm": 0.24558314681053162, + "learning_rate": 1.3489095444353261e-05, + "loss": 1.2237, + "step": 13590 + }, + { + "epoch": 4.048027699696569, + "grad_norm": 0.24840916693210602, + "learning_rate": 1.3488191478574568e-05, + "loss": 1.2273, + "step": 13591 + }, + { + "epoch": 4.048325545896238, + "grad_norm": 0.26182103157043457, + "learning_rate": 1.3487287480342152e-05, + "loss": 1.2173, + "step": 13592 + }, + { + "epoch": 4.048623392095906, + "grad_norm": 0.24094253778457642, + "learning_rate": 1.348638344966442e-05, + "loss": 1.2288, + "step": 13593 + }, + { + "epoch": 4.048921238295575, + "grad_norm": 0.24000322818756104, + "learning_rate": 1.3485479386549785e-05, + "loss": 1.2288, + "step": 13594 + }, + { + "epoch": 4.049219084495244, + "grad_norm": 0.2488178163766861, + "learning_rate": 1.3484575291006656e-05, + "loss": 1.2378, + "step": 13595 + }, + { + "epoch": 4.049516930694913, + "grad_norm": 0.24873749911785126, + "learning_rate": 1.3483671163043453e-05, + "loss": 1.2274, + "step": 13596 + }, + { + "epoch": 4.049814776894581, + "grad_norm": 0.2439805418252945, + "learning_rate": 1.3482767002668578e-05, + "loss": 1.2433, + "step": 13597 + }, + { + "epoch": 4.0501126230942495, + "grad_norm": 0.24247656762599945, + "learning_rate": 1.3481862809890447e-05, + "loss": 1.2142, + "step": 13598 + }, + { + "epoch": 4.050410469293919, + "grad_norm": 0.25325271487236023, + "learning_rate": 1.3480958584717476e-05, + "loss": 1.2266, + "step": 13599 + }, + { + "epoch": 4.050708315493587, + "grad_norm": 0.2725006341934204, + "learning_rate": 1.3480054327158069e-05, + "loss": 1.2292, + "step": 13600 + }, + { + "epoch": 4.051006161693255, + "grad_norm": 0.24800099432468414, + "learning_rate": 1.347915003722065e-05, + "loss": 1.2427, + "step": 13601 + }, + { + "epoch": 4.0513040078929246, + "grad_norm": 0.2671442925930023, + "learning_rate": 1.3478245714913626e-05, + "loss": 1.2367, + "step": 13602 + }, + { + "epoch": 4.051601854092593, + "grad_norm": 0.2499389946460724, + "learning_rate": 1.347734136024541e-05, + "loss": 1.2211, + "step": 13603 + }, + { + "epoch": 4.051899700292261, + "grad_norm": 0.2426474392414093, + "learning_rate": 1.3476436973224418e-05, + "loss": 1.2198, + "step": 13604 + }, + { + "epoch": 4.05219754649193, + "grad_norm": 0.2303323596715927, + "learning_rate": 1.3475532553859065e-05, + "loss": 1.2242, + "step": 13605 + }, + { + "epoch": 4.052495392691599, + "grad_norm": 0.27949002385139465, + "learning_rate": 1.3474628102157765e-05, + "loss": 1.2382, + "step": 13606 + }, + { + "epoch": 4.052793238891267, + "grad_norm": 0.3251260817050934, + "learning_rate": 1.3473723618128931e-05, + "loss": 1.2418, + "step": 13607 + }, + { + "epoch": 4.053091085090936, + "grad_norm": 0.2503196597099304, + "learning_rate": 1.3472819101780981e-05, + "loss": 1.2272, + "step": 13608 + }, + { + "epoch": 4.053388931290605, + "grad_norm": 0.28937655687332153, + "learning_rate": 1.3471914553122329e-05, + "loss": 1.2353, + "step": 13609 + }, + { + "epoch": 4.053686777490274, + "grad_norm": 0.2629067003726959, + "learning_rate": 1.3471009972161392e-05, + "loss": 1.2341, + "step": 13610 + }, + { + "epoch": 4.053984623689942, + "grad_norm": 0.2914303243160248, + "learning_rate": 1.3470105358906585e-05, + "loss": 1.2251, + "step": 13611 + }, + { + "epoch": 4.0542824698896105, + "grad_norm": 0.28820618987083435, + "learning_rate": 1.3469200713366323e-05, + "loss": 1.2296, + "step": 13612 + }, + { + "epoch": 4.05458031608928, + "grad_norm": 0.27909815311431885, + "learning_rate": 1.3468296035549026e-05, + "loss": 1.2254, + "step": 13613 + }, + { + "epoch": 4.054878162288948, + "grad_norm": 0.278202086687088, + "learning_rate": 1.3467391325463109e-05, + "loss": 1.2136, + "step": 13614 + }, + { + "epoch": 4.055176008488616, + "grad_norm": 0.2600470185279846, + "learning_rate": 1.3466486583116989e-05, + "loss": 1.225, + "step": 13615 + }, + { + "epoch": 4.055473854688286, + "grad_norm": 0.23804350197315216, + "learning_rate": 1.3465581808519086e-05, + "loss": 1.2346, + "step": 13616 + }, + { + "epoch": 4.055771700887954, + "grad_norm": 0.2773174047470093, + "learning_rate": 1.3464677001677815e-05, + "loss": 1.2341, + "step": 13617 + }, + { + "epoch": 4.056069547087622, + "grad_norm": 0.29110240936279297, + "learning_rate": 1.3463772162601594e-05, + "loss": 1.2143, + "step": 13618 + }, + { + "epoch": 4.0563673932872915, + "grad_norm": 0.2538909614086151, + "learning_rate": 1.3462867291298846e-05, + "loss": 1.2311, + "step": 13619 + }, + { + "epoch": 4.05666523948696, + "grad_norm": 0.3023252487182617, + "learning_rate": 1.3461962387777985e-05, + "loss": 1.2374, + "step": 13620 + }, + { + "epoch": 4.056963085686629, + "grad_norm": 0.23743636906147003, + "learning_rate": 1.3461057452047433e-05, + "loss": 1.2103, + "step": 13621 + }, + { + "epoch": 4.057260931886297, + "grad_norm": 0.26855871081352234, + "learning_rate": 1.3460152484115607e-05, + "loss": 1.2317, + "step": 13622 + }, + { + "epoch": 4.057558778085966, + "grad_norm": 0.2674756646156311, + "learning_rate": 1.3459247483990926e-05, + "loss": 1.2232, + "step": 13623 + }, + { + "epoch": 4.057856624285635, + "grad_norm": 0.25979241728782654, + "learning_rate": 1.3458342451681816e-05, + "loss": 1.2366, + "step": 13624 + }, + { + "epoch": 4.058154470485303, + "grad_norm": 0.29046401381492615, + "learning_rate": 1.3457437387196692e-05, + "loss": 1.227, + "step": 13625 + }, + { + "epoch": 4.0584523166849715, + "grad_norm": 0.27793610095977783, + "learning_rate": 1.3456532290543978e-05, + "loss": 1.2254, + "step": 13626 + }, + { + "epoch": 4.058750162884641, + "grad_norm": 0.3121613562107086, + "learning_rate": 1.3455627161732092e-05, + "loss": 1.2294, + "step": 13627 + }, + { + "epoch": 4.059048009084309, + "grad_norm": 0.25026342272758484, + "learning_rate": 1.3454722000769453e-05, + "loss": 1.2257, + "step": 13628 + }, + { + "epoch": 4.059345855283977, + "grad_norm": 0.28906697034835815, + "learning_rate": 1.3453816807664488e-05, + "loss": 1.2368, + "step": 13629 + }, + { + "epoch": 4.059643701483647, + "grad_norm": 0.29712942242622375, + "learning_rate": 1.3452911582425616e-05, + "loss": 1.2152, + "step": 13630 + }, + { + "epoch": 4.059941547683315, + "grad_norm": 0.28302890062332153, + "learning_rate": 1.345200632506126e-05, + "loss": 1.2133, + "step": 13631 + }, + { + "epoch": 4.060239393882983, + "grad_norm": 0.26594823598861694, + "learning_rate": 1.3451101035579841e-05, + "loss": 1.2086, + "step": 13632 + }, + { + "epoch": 4.0605372400826525, + "grad_norm": 0.29638671875, + "learning_rate": 1.3450195713989787e-05, + "loss": 1.2129, + "step": 13633 + }, + { + "epoch": 4.060835086282321, + "grad_norm": 0.2815476953983307, + "learning_rate": 1.3449290360299512e-05, + "loss": 1.2259, + "step": 13634 + }, + { + "epoch": 4.06113293248199, + "grad_norm": 0.31807461380958557, + "learning_rate": 1.3448384974517446e-05, + "loss": 1.2243, + "step": 13635 + }, + { + "epoch": 4.061430778681658, + "grad_norm": 0.26353856921195984, + "learning_rate": 1.3447479556652008e-05, + "loss": 1.232, + "step": 13636 + }, + { + "epoch": 4.061728624881327, + "grad_norm": 0.3384415805339813, + "learning_rate": 1.3446574106711625e-05, + "loss": 1.2311, + "step": 13637 + }, + { + "epoch": 4.062026471080996, + "grad_norm": 0.2682347595691681, + "learning_rate": 1.3445668624704722e-05, + "loss": 1.2031, + "step": 13638 + }, + { + "epoch": 4.062324317280664, + "grad_norm": 0.25412505865097046, + "learning_rate": 1.3444763110639722e-05, + "loss": 1.2228, + "step": 13639 + }, + { + "epoch": 4.062622163480333, + "grad_norm": 0.2534717917442322, + "learning_rate": 1.3443857564525051e-05, + "loss": 1.2201, + "step": 13640 + }, + { + "epoch": 4.062920009680002, + "grad_norm": 0.2565198540687561, + "learning_rate": 1.3442951986369131e-05, + "loss": 1.2268, + "step": 13641 + }, + { + "epoch": 4.06321785587967, + "grad_norm": 0.28886285424232483, + "learning_rate": 1.3442046376180388e-05, + "loss": 1.2391, + "step": 13642 + }, + { + "epoch": 4.063515702079338, + "grad_norm": 0.25802603363990784, + "learning_rate": 1.3441140733967254e-05, + "loss": 1.218, + "step": 13643 + }, + { + "epoch": 4.063813548279008, + "grad_norm": 0.256026029586792, + "learning_rate": 1.3440235059738147e-05, + "loss": 1.2096, + "step": 13644 + }, + { + "epoch": 4.064111394478676, + "grad_norm": 0.23858126997947693, + "learning_rate": 1.3439329353501497e-05, + "loss": 1.2446, + "step": 13645 + }, + { + "epoch": 4.064409240678344, + "grad_norm": 0.23307913541793823, + "learning_rate": 1.343842361526573e-05, + "loss": 1.2249, + "step": 13646 + }, + { + "epoch": 4.0647070868780135, + "grad_norm": 0.2645309269428253, + "learning_rate": 1.3437517845039273e-05, + "loss": 1.2106, + "step": 13647 + }, + { + "epoch": 4.065004933077682, + "grad_norm": 0.2464878261089325, + "learning_rate": 1.3436612042830552e-05, + "loss": 1.2319, + "step": 13648 + }, + { + "epoch": 4.065302779277351, + "grad_norm": 0.2564888894557953, + "learning_rate": 1.3435706208647998e-05, + "loss": 1.2243, + "step": 13649 + }, + { + "epoch": 4.065600625477019, + "grad_norm": 0.2656373977661133, + "learning_rate": 1.3434800342500036e-05, + "loss": 1.2244, + "step": 13650 + }, + { + "epoch": 4.065898471676688, + "grad_norm": 0.27228906750679016, + "learning_rate": 1.3433894444395093e-05, + "loss": 1.2307, + "step": 13651 + }, + { + "epoch": 4.066196317876357, + "grad_norm": 0.29931968450546265, + "learning_rate": 1.3432988514341598e-05, + "loss": 1.2191, + "step": 13652 + }, + { + "epoch": 4.066494164076025, + "grad_norm": 0.27375850081443787, + "learning_rate": 1.3432082552347983e-05, + "loss": 1.2256, + "step": 13653 + }, + { + "epoch": 4.066792010275694, + "grad_norm": 0.3032185733318329, + "learning_rate": 1.3431176558422671e-05, + "loss": 1.2123, + "step": 13654 + }, + { + "epoch": 4.067089856475363, + "grad_norm": 0.25765061378479004, + "learning_rate": 1.3430270532574097e-05, + "loss": 1.2333, + "step": 13655 + }, + { + "epoch": 4.067387702675031, + "grad_norm": 0.2828814387321472, + "learning_rate": 1.3429364474810689e-05, + "loss": 1.2352, + "step": 13656 + }, + { + "epoch": 4.0676855488746995, + "grad_norm": 0.2466113418340683, + "learning_rate": 1.3428458385140877e-05, + "loss": 1.2229, + "step": 13657 + }, + { + "epoch": 4.067983395074369, + "grad_norm": 0.298318088054657, + "learning_rate": 1.3427552263573087e-05, + "loss": 1.2168, + "step": 13658 + }, + { + "epoch": 4.068281241274037, + "grad_norm": 0.2558857798576355, + "learning_rate": 1.342664611011575e-05, + "loss": 1.2198, + "step": 13659 + }, + { + "epoch": 4.068579087473705, + "grad_norm": 0.28958582878112793, + "learning_rate": 1.3425739924777307e-05, + "loss": 1.2361, + "step": 13660 + }, + { + "epoch": 4.0688769336733746, + "grad_norm": 0.27416858077049255, + "learning_rate": 1.3424833707566176e-05, + "loss": 1.2117, + "step": 13661 + }, + { + "epoch": 4.069174779873043, + "grad_norm": 0.2728113532066345, + "learning_rate": 1.3423927458490795e-05, + "loss": 1.2275, + "step": 13662 + }, + { + "epoch": 4.069472626072712, + "grad_norm": 0.2720509171485901, + "learning_rate": 1.3423021177559596e-05, + "loss": 1.217, + "step": 13663 + }, + { + "epoch": 4.06977047227238, + "grad_norm": 0.27941185235977173, + "learning_rate": 1.3422114864781008e-05, + "loss": 1.2243, + "step": 13664 + }, + { + "epoch": 4.070068318472049, + "grad_norm": 0.24686238169670105, + "learning_rate": 1.3421208520163465e-05, + "loss": 1.2179, + "step": 13665 + }, + { + "epoch": 4.070366164671718, + "grad_norm": 0.2613919675350189, + "learning_rate": 1.3420302143715397e-05, + "loss": 1.2332, + "step": 13666 + }, + { + "epoch": 4.070664010871386, + "grad_norm": 0.2512933313846588, + "learning_rate": 1.3419395735445244e-05, + "loss": 1.2263, + "step": 13667 + }, + { + "epoch": 4.070961857071055, + "grad_norm": 0.2590857148170471, + "learning_rate": 1.3418489295361429e-05, + "loss": 1.225, + "step": 13668 + }, + { + "epoch": 4.071259703270724, + "grad_norm": 0.2707737982273102, + "learning_rate": 1.3417582823472395e-05, + "loss": 1.2137, + "step": 13669 + }, + { + "epoch": 4.071557549470392, + "grad_norm": 0.2573646605014801, + "learning_rate": 1.3416676319786568e-05, + "loss": 1.2313, + "step": 13670 + }, + { + "epoch": 4.0718553956700605, + "grad_norm": 0.27899056673049927, + "learning_rate": 1.3415769784312385e-05, + "loss": 1.2485, + "step": 13671 + }, + { + "epoch": 4.07215324186973, + "grad_norm": 0.2648579180240631, + "learning_rate": 1.3414863217058281e-05, + "loss": 1.2279, + "step": 13672 + }, + { + "epoch": 4.072451088069398, + "grad_norm": 0.2523336112499237, + "learning_rate": 1.3413956618032691e-05, + "loss": 1.2337, + "step": 13673 + }, + { + "epoch": 4.072748934269066, + "grad_norm": 0.24383379518985748, + "learning_rate": 1.3413049987244048e-05, + "loss": 1.2249, + "step": 13674 + }, + { + "epoch": 4.073046780468736, + "grad_norm": 0.2558092474937439, + "learning_rate": 1.3412143324700791e-05, + "loss": 1.229, + "step": 13675 + }, + { + "epoch": 4.073344626668404, + "grad_norm": 0.31439778208732605, + "learning_rate": 1.3411236630411349e-05, + "loss": 1.2237, + "step": 13676 + }, + { + "epoch": 4.073642472868073, + "grad_norm": 0.29358455538749695, + "learning_rate": 1.3410329904384161e-05, + "loss": 1.2266, + "step": 13677 + }, + { + "epoch": 4.0739403190677415, + "grad_norm": 0.2612122893333435, + "learning_rate": 1.3409423146627665e-05, + "loss": 1.2365, + "step": 13678 + }, + { + "epoch": 4.07423816526741, + "grad_norm": 0.3495548367500305, + "learning_rate": 1.3408516357150296e-05, + "loss": 1.2245, + "step": 13679 + }, + { + "epoch": 4.074536011467079, + "grad_norm": 0.2837928533554077, + "learning_rate": 1.340760953596049e-05, + "loss": 1.2136, + "step": 13680 + }, + { + "epoch": 4.074833857666747, + "grad_norm": 0.2951454222202301, + "learning_rate": 1.3406702683066688e-05, + "loss": 1.2296, + "step": 13681 + }, + { + "epoch": 4.075131703866416, + "grad_norm": 0.3578106164932251, + "learning_rate": 1.340579579847732e-05, + "loss": 1.2283, + "step": 13682 + }, + { + "epoch": 4.075429550066085, + "grad_norm": 0.30266961455345154, + "learning_rate": 1.3404888882200827e-05, + "loss": 1.2353, + "step": 13683 + }, + { + "epoch": 4.075727396265753, + "grad_norm": 0.29476091265678406, + "learning_rate": 1.3403981934245649e-05, + "loss": 1.2171, + "step": 13684 + }, + { + "epoch": 4.0760252424654215, + "grad_norm": 0.30868008732795715, + "learning_rate": 1.3403074954620219e-05, + "loss": 1.2048, + "step": 13685 + }, + { + "epoch": 4.076323088665091, + "grad_norm": 0.4029812216758728, + "learning_rate": 1.3402167943332982e-05, + "loss": 1.2165, + "step": 13686 + }, + { + "epoch": 4.076620934864759, + "grad_norm": 0.3996765613555908, + "learning_rate": 1.3401260900392373e-05, + "loss": 1.2177, + "step": 13687 + }, + { + "epoch": 4.076918781064428, + "grad_norm": 0.3624470829963684, + "learning_rate": 1.340035382580683e-05, + "loss": 1.2348, + "step": 13688 + }, + { + "epoch": 4.077216627264097, + "grad_norm": 0.9039879441261292, + "learning_rate": 1.3399446719584792e-05, + "loss": 1.22, + "step": 13689 + }, + { + "epoch": 4.077514473463765, + "grad_norm": 0.4560149908065796, + "learning_rate": 1.3398539581734704e-05, + "loss": 1.2212, + "step": 13690 + }, + { + "epoch": 4.077812319663434, + "grad_norm": 0.35248881578445435, + "learning_rate": 1.3397632412265002e-05, + "loss": 1.2291, + "step": 13691 + }, + { + "epoch": 4.0781101658631025, + "grad_norm": 0.3275471329689026, + "learning_rate": 1.3396725211184128e-05, + "loss": 1.2441, + "step": 13692 + }, + { + "epoch": 4.078408012062771, + "grad_norm": 0.3053240180015564, + "learning_rate": 1.3395817978500515e-05, + "loss": 1.2276, + "step": 13693 + }, + { + "epoch": 4.07870585826244, + "grad_norm": 0.6420515179634094, + "learning_rate": 1.3394910714222615e-05, + "loss": 1.2334, + "step": 13694 + }, + { + "epoch": 4.079003704462108, + "grad_norm": 0.3115173280239105, + "learning_rate": 1.339400341835886e-05, + "loss": 1.2163, + "step": 13695 + }, + { + "epoch": 4.079301550661777, + "grad_norm": 0.2744782865047455, + "learning_rate": 1.3393096090917699e-05, + "loss": 1.2232, + "step": 13696 + }, + { + "epoch": 4.079599396861446, + "grad_norm": 0.2513817548751831, + "learning_rate": 1.3392188731907567e-05, + "loss": 1.2228, + "step": 13697 + }, + { + "epoch": 4.079897243061114, + "grad_norm": 0.25459542870521545, + "learning_rate": 1.3391281341336913e-05, + "loss": 1.229, + "step": 13698 + }, + { + "epoch": 4.080195089260783, + "grad_norm": 0.258196622133255, + "learning_rate": 1.339037391921417e-05, + "loss": 1.2279, + "step": 13699 + }, + { + "epoch": 4.080492935460452, + "grad_norm": 0.24485677480697632, + "learning_rate": 1.338946646554779e-05, + "loss": 1.218, + "step": 13700 + }, + { + "epoch": 4.08079078166012, + "grad_norm": 0.24852995574474335, + "learning_rate": 1.3388558980346207e-05, + "loss": 1.2279, + "step": 13701 + }, + { + "epoch": 4.081088627859789, + "grad_norm": 0.25261256098747253, + "learning_rate": 1.3387651463617873e-05, + "loss": 1.2307, + "step": 13702 + }, + { + "epoch": 4.081386474059458, + "grad_norm": 0.27149641513824463, + "learning_rate": 1.3386743915371223e-05, + "loss": 1.2225, + "step": 13703 + }, + { + "epoch": 4.081684320259126, + "grad_norm": 0.2802477478981018, + "learning_rate": 1.3385836335614706e-05, + "loss": 1.2299, + "step": 13704 + }, + { + "epoch": 4.081982166458795, + "grad_norm": 0.24667353928089142, + "learning_rate": 1.3384928724356767e-05, + "loss": 1.2318, + "step": 13705 + }, + { + "epoch": 4.0822800126584635, + "grad_norm": 0.25782522559165955, + "learning_rate": 1.3384021081605844e-05, + "loss": 1.2312, + "step": 13706 + }, + { + "epoch": 4.082577858858132, + "grad_norm": 0.26943254470825195, + "learning_rate": 1.3383113407370387e-05, + "loss": 1.2036, + "step": 13707 + }, + { + "epoch": 4.082875705057801, + "grad_norm": 0.25699976086616516, + "learning_rate": 1.3382205701658843e-05, + "loss": 1.2087, + "step": 13708 + }, + { + "epoch": 4.083173551257469, + "grad_norm": 0.25810468196868896, + "learning_rate": 1.3381297964479654e-05, + "loss": 1.2139, + "step": 13709 + }, + { + "epoch": 4.083471397457138, + "grad_norm": 0.2752602994441986, + "learning_rate": 1.3380390195841262e-05, + "loss": 1.228, + "step": 13710 + }, + { + "epoch": 4.083769243656807, + "grad_norm": 0.26506638526916504, + "learning_rate": 1.3379482395752115e-05, + "loss": 1.2203, + "step": 13711 + }, + { + "epoch": 4.084067089856475, + "grad_norm": 0.28648409247398376, + "learning_rate": 1.337857456422066e-05, + "loss": 1.2204, + "step": 13712 + }, + { + "epoch": 4.084364936056144, + "grad_norm": 0.25765904784202576, + "learning_rate": 1.3377666701255347e-05, + "loss": 1.2178, + "step": 13713 + }, + { + "epoch": 4.084662782255813, + "grad_norm": 0.3073035478591919, + "learning_rate": 1.3376758806864618e-05, + "loss": 1.2279, + "step": 13714 + }, + { + "epoch": 4.084960628455481, + "grad_norm": 0.23774142563343048, + "learning_rate": 1.3375850881056922e-05, + "loss": 1.2182, + "step": 13715 + }, + { + "epoch": 4.08525847465515, + "grad_norm": 0.33706215023994446, + "learning_rate": 1.3374942923840703e-05, + "loss": 1.2248, + "step": 13716 + }, + { + "epoch": 4.085556320854819, + "grad_norm": 0.2777893841266632, + "learning_rate": 1.337403493522441e-05, + "loss": 1.2372, + "step": 13717 + }, + { + "epoch": 4.085854167054487, + "grad_norm": 0.25480255484580994, + "learning_rate": 1.3373126915216494e-05, + "loss": 1.2383, + "step": 13718 + }, + { + "epoch": 4.086152013254156, + "grad_norm": 0.24534185230731964, + "learning_rate": 1.3372218863825399e-05, + "loss": 1.2254, + "step": 13719 + }, + { + "epoch": 4.0864498594538246, + "grad_norm": 0.25387948751449585, + "learning_rate": 1.3371310781059576e-05, + "loss": 1.2358, + "step": 13720 + }, + { + "epoch": 4.086747705653493, + "grad_norm": 0.25739654898643494, + "learning_rate": 1.3370402666927473e-05, + "loss": 1.2351, + "step": 13721 + }, + { + "epoch": 4.087045551853162, + "grad_norm": 0.2902533710002899, + "learning_rate": 1.3369494521437541e-05, + "loss": 1.2252, + "step": 13722 + }, + { + "epoch": 4.08734339805283, + "grad_norm": 0.2753237783908844, + "learning_rate": 1.3368586344598225e-05, + "loss": 1.2322, + "step": 13723 + }, + { + "epoch": 4.087641244252499, + "grad_norm": 0.23811596632003784, + "learning_rate": 1.3367678136417974e-05, + "loss": 1.2352, + "step": 13724 + }, + { + "epoch": 4.087939090452168, + "grad_norm": 0.26392996311187744, + "learning_rate": 1.3366769896905245e-05, + "loss": 1.2156, + "step": 13725 + }, + { + "epoch": 4.088236936651836, + "grad_norm": 0.27758848667144775, + "learning_rate": 1.336586162606848e-05, + "loss": 1.2388, + "step": 13726 + }, + { + "epoch": 4.088534782851505, + "grad_norm": 0.2551907002925873, + "learning_rate": 1.3364953323916134e-05, + "loss": 1.2271, + "step": 13727 + }, + { + "epoch": 4.088832629051174, + "grad_norm": 0.26844868063926697, + "learning_rate": 1.3364044990456659e-05, + "loss": 1.2307, + "step": 13728 + }, + { + "epoch": 4.089130475250842, + "grad_norm": 0.29414692521095276, + "learning_rate": 1.3363136625698503e-05, + "loss": 1.2266, + "step": 13729 + }, + { + "epoch": 4.089428321450511, + "grad_norm": 0.42714792490005493, + "learning_rate": 1.3362228229650116e-05, + "loss": 1.2107, + "step": 13730 + }, + { + "epoch": 4.08972616765018, + "grad_norm": 0.3364658057689667, + "learning_rate": 1.3361319802319954e-05, + "loss": 1.2176, + "step": 13731 + }, + { + "epoch": 4.090024013849848, + "grad_norm": 0.315278023481369, + "learning_rate": 1.3360411343716468e-05, + "loss": 1.2365, + "step": 13732 + }, + { + "epoch": 4.090321860049517, + "grad_norm": 0.29315799474716187, + "learning_rate": 1.3359502853848107e-05, + "loss": 1.2126, + "step": 13733 + }, + { + "epoch": 4.090619706249186, + "grad_norm": 0.3561355471611023, + "learning_rate": 1.3358594332723325e-05, + "loss": 1.2337, + "step": 13734 + }, + { + "epoch": 4.090917552448854, + "grad_norm": 0.4308493137359619, + "learning_rate": 1.3357685780350575e-05, + "loss": 1.2385, + "step": 13735 + }, + { + "epoch": 4.091215398648523, + "grad_norm": 0.28329983353614807, + "learning_rate": 1.3356777196738312e-05, + "loss": 1.1968, + "step": 13736 + }, + { + "epoch": 4.0915132448481915, + "grad_norm": 0.2651686370372772, + "learning_rate": 1.3355868581894987e-05, + "loss": 1.2327, + "step": 13737 + }, + { + "epoch": 4.09181109104786, + "grad_norm": 0.3283056914806366, + "learning_rate": 1.3354959935829052e-05, + "loss": 1.223, + "step": 13738 + }, + { + "epoch": 4.092108937247529, + "grad_norm": 0.25197744369506836, + "learning_rate": 1.3354051258548968e-05, + "loss": 1.2141, + "step": 13739 + }, + { + "epoch": 4.092406783447197, + "grad_norm": 0.2821015417575836, + "learning_rate": 1.335314255006318e-05, + "loss": 1.2397, + "step": 13740 + }, + { + "epoch": 4.092704629646866, + "grad_norm": 0.24738141894340515, + "learning_rate": 1.3352233810380149e-05, + "loss": 1.2313, + "step": 13741 + }, + { + "epoch": 4.093002475846535, + "grad_norm": 0.2650309205055237, + "learning_rate": 1.3351325039508328e-05, + "loss": 1.218, + "step": 13742 + }, + { + "epoch": 4.093300322046203, + "grad_norm": 0.3236358165740967, + "learning_rate": 1.335041623745617e-05, + "loss": 1.2216, + "step": 13743 + }, + { + "epoch": 4.093598168245872, + "grad_norm": 0.30074071884155273, + "learning_rate": 1.3349507404232131e-05, + "loss": 1.2025, + "step": 13744 + }, + { + "epoch": 4.093896014445541, + "grad_norm": 0.2564895749092102, + "learning_rate": 1.3348598539844671e-05, + "loss": 1.2137, + "step": 13745 + }, + { + "epoch": 4.094193860645209, + "grad_norm": 0.24864192306995392, + "learning_rate": 1.3347689644302244e-05, + "loss": 1.2224, + "step": 13746 + }, + { + "epoch": 4.094491706844878, + "grad_norm": 0.25647157430648804, + "learning_rate": 1.3346780717613302e-05, + "loss": 1.2349, + "step": 13747 + }, + { + "epoch": 4.094789553044547, + "grad_norm": 0.28633326292037964, + "learning_rate": 1.3345871759786303e-05, + "loss": 1.2402, + "step": 13748 + }, + { + "epoch": 4.095087399244215, + "grad_norm": 0.25307002663612366, + "learning_rate": 1.3344962770829712e-05, + "loss": 1.2242, + "step": 13749 + }, + { + "epoch": 4.095385245443884, + "grad_norm": 0.28082963824272156, + "learning_rate": 1.3344053750751976e-05, + "loss": 1.2064, + "step": 13750 + }, + { + "epoch": 4.0956830916435525, + "grad_norm": 0.24947533011436462, + "learning_rate": 1.3343144699561555e-05, + "loss": 1.2471, + "step": 13751 + }, + { + "epoch": 4.095980937843221, + "grad_norm": 0.30670779943466187, + "learning_rate": 1.3342235617266908e-05, + "loss": 1.2178, + "step": 13752 + }, + { + "epoch": 4.09627878404289, + "grad_norm": 0.27580228447914124, + "learning_rate": 1.3341326503876495e-05, + "loss": 1.2266, + "step": 13753 + }, + { + "epoch": 4.096576630242558, + "grad_norm": 0.2716881334781647, + "learning_rate": 1.334041735939877e-05, + "loss": 1.212, + "step": 13754 + }, + { + "epoch": 4.096874476442228, + "grad_norm": 0.2548467516899109, + "learning_rate": 1.3339508183842194e-05, + "loss": 1.2271, + "step": 13755 + }, + { + "epoch": 4.097172322641896, + "grad_norm": 0.29218921065330505, + "learning_rate": 1.3338598977215224e-05, + "loss": 1.2217, + "step": 13756 + }, + { + "epoch": 4.097470168841564, + "grad_norm": 0.2565087676048279, + "learning_rate": 1.3337689739526325e-05, + "loss": 1.2173, + "step": 13757 + }, + { + "epoch": 4.0977680150412334, + "grad_norm": 0.39434826374053955, + "learning_rate": 1.3336780470783947e-05, + "loss": 1.2382, + "step": 13758 + }, + { + "epoch": 4.098065861240902, + "grad_norm": 0.3490411341190338, + "learning_rate": 1.3335871170996558e-05, + "loss": 1.2271, + "step": 13759 + }, + { + "epoch": 4.09836370744057, + "grad_norm": 0.4308874309062958, + "learning_rate": 1.3334961840172612e-05, + "loss": 1.2153, + "step": 13760 + }, + { + "epoch": 4.098661553640239, + "grad_norm": 0.39875128865242004, + "learning_rate": 1.3334052478320576e-05, + "loss": 1.2196, + "step": 13761 + }, + { + "epoch": 4.098959399839908, + "grad_norm": 0.3732142746448517, + "learning_rate": 1.3333143085448903e-05, + "loss": 1.2354, + "step": 13762 + }, + { + "epoch": 4.099257246039576, + "grad_norm": 0.2714923024177551, + "learning_rate": 1.333223366156606e-05, + "loss": 1.219, + "step": 13763 + }, + { + "epoch": 4.099555092239245, + "grad_norm": 0.6516803503036499, + "learning_rate": 1.3331324206680508e-05, + "loss": 1.2223, + "step": 13764 + }, + { + "epoch": 4.0998529384389135, + "grad_norm": 0.4559103846549988, + "learning_rate": 1.33304147208007e-05, + "loss": 1.2185, + "step": 13765 + }, + { + "epoch": 4.100150784638582, + "grad_norm": 0.339886873960495, + "learning_rate": 1.3329505203935109e-05, + "loss": 1.2091, + "step": 13766 + }, + { + "epoch": 4.100448630838251, + "grad_norm": 0.3002610504627228, + "learning_rate": 1.332859565609219e-05, + "loss": 1.2272, + "step": 13767 + }, + { + "epoch": 4.100746477037919, + "grad_norm": 0.4801383316516876, + "learning_rate": 1.3327686077280406e-05, + "loss": 1.2335, + "step": 13768 + }, + { + "epoch": 4.101044323237589, + "grad_norm": 0.34596481919288635, + "learning_rate": 1.3326776467508223e-05, + "loss": 1.2301, + "step": 13769 + }, + { + "epoch": 4.101342169437257, + "grad_norm": 0.34558749198913574, + "learning_rate": 1.33258668267841e-05, + "loss": 1.2378, + "step": 13770 + }, + { + "epoch": 4.101640015636925, + "grad_norm": 0.33433011174201965, + "learning_rate": 1.3324957155116503e-05, + "loss": 1.2129, + "step": 13771 + }, + { + "epoch": 4.1019378618365945, + "grad_norm": 0.3223732113838196, + "learning_rate": 1.3324047452513894e-05, + "loss": 1.2243, + "step": 13772 + }, + { + "epoch": 4.102235708036263, + "grad_norm": 0.539328932762146, + "learning_rate": 1.3323137718984743e-05, + "loss": 1.2165, + "step": 13773 + }, + { + "epoch": 4.102533554235931, + "grad_norm": 0.40519529581069946, + "learning_rate": 1.3322227954537503e-05, + "loss": 1.2218, + "step": 13774 + }, + { + "epoch": 4.1028314004356, + "grad_norm": 0.4319937825202942, + "learning_rate": 1.3321318159180643e-05, + "loss": 1.2247, + "step": 13775 + }, + { + "epoch": 4.103129246635269, + "grad_norm": 0.27791622281074524, + "learning_rate": 1.332040833292263e-05, + "loss": 1.2223, + "step": 13776 + }, + { + "epoch": 4.103427092834937, + "grad_norm": 0.5481389760971069, + "learning_rate": 1.3319498475771926e-05, + "loss": 1.2423, + "step": 13777 + }, + { + "epoch": 4.103724939034606, + "grad_norm": 0.2930184304714203, + "learning_rate": 1.3318588587736997e-05, + "loss": 1.2218, + "step": 13778 + }, + { + "epoch": 4.1040227852342746, + "grad_norm": 0.3152657151222229, + "learning_rate": 1.331767866882631e-05, + "loss": 1.2216, + "step": 13779 + }, + { + "epoch": 4.104320631433943, + "grad_norm": 0.35558372735977173, + "learning_rate": 1.3316768719048332e-05, + "loss": 1.2266, + "step": 13780 + }, + { + "epoch": 4.104618477633612, + "grad_norm": 0.2518128752708435, + "learning_rate": 1.3315858738411525e-05, + "loss": 1.24, + "step": 13781 + }, + { + "epoch": 4.10491632383328, + "grad_norm": 0.4876645803451538, + "learning_rate": 1.3314948726924358e-05, + "loss": 1.223, + "step": 13782 + }, + { + "epoch": 4.10521417003295, + "grad_norm": 0.24637188017368317, + "learning_rate": 1.3314038684595295e-05, + "loss": 1.2364, + "step": 13783 + }, + { + "epoch": 4.105512016232618, + "grad_norm": 0.3102695047855377, + "learning_rate": 1.3313128611432804e-05, + "loss": 1.2281, + "step": 13784 + }, + { + "epoch": 4.105809862432286, + "grad_norm": 0.32695600390434265, + "learning_rate": 1.3312218507445355e-05, + "loss": 1.2261, + "step": 13785 + }, + { + "epoch": 4.1061077086319555, + "grad_norm": 0.26559188961982727, + "learning_rate": 1.331130837264141e-05, + "loss": 1.2301, + "step": 13786 + }, + { + "epoch": 4.106405554831624, + "grad_norm": 0.3279210925102234, + "learning_rate": 1.3310398207029447e-05, + "loss": 1.224, + "step": 13787 + }, + { + "epoch": 4.106703401031292, + "grad_norm": 0.2661599814891815, + "learning_rate": 1.3309488010617921e-05, + "loss": 1.2074, + "step": 13788 + }, + { + "epoch": 4.107001247230961, + "grad_norm": 0.25574296712875366, + "learning_rate": 1.3308577783415308e-05, + "loss": 1.2262, + "step": 13789 + }, + { + "epoch": 4.10729909343063, + "grad_norm": 0.3278314173221588, + "learning_rate": 1.330766752543008e-05, + "loss": 1.2244, + "step": 13790 + }, + { + "epoch": 4.107596939630298, + "grad_norm": 0.2441791296005249, + "learning_rate": 1.3306757236670696e-05, + "loss": 1.2199, + "step": 13791 + }, + { + "epoch": 4.107894785829967, + "grad_norm": 0.3597651720046997, + "learning_rate": 1.330584691714563e-05, + "loss": 1.2252, + "step": 13792 + }, + { + "epoch": 4.108192632029636, + "grad_norm": 0.25514233112335205, + "learning_rate": 1.3304936566863358e-05, + "loss": 1.2293, + "step": 13793 + }, + { + "epoch": 4.108490478229304, + "grad_norm": 0.2969241142272949, + "learning_rate": 1.3304026185832335e-05, + "loss": 1.231, + "step": 13794 + }, + { + "epoch": 4.108788324428973, + "grad_norm": 0.27977922558784485, + "learning_rate": 1.3303115774061044e-05, + "loss": 1.2138, + "step": 13795 + }, + { + "epoch": 4.1090861706286415, + "grad_norm": 0.38240185379981995, + "learning_rate": 1.3302205331557952e-05, + "loss": 1.2334, + "step": 13796 + }, + { + "epoch": 4.109384016828311, + "grad_norm": 0.25447720289230347, + "learning_rate": 1.3301294858331531e-05, + "loss": 1.241, + "step": 13797 + }, + { + "epoch": 4.109681863027979, + "grad_norm": 0.3108261525630951, + "learning_rate": 1.3300384354390245e-05, + "loss": 1.2187, + "step": 13798 + }, + { + "epoch": 4.109979709227647, + "grad_norm": 0.2546389102935791, + "learning_rate": 1.3299473819742572e-05, + "loss": 1.2207, + "step": 13799 + }, + { + "epoch": 4.1102775554273165, + "grad_norm": 0.3126170039176941, + "learning_rate": 1.3298563254396983e-05, + "loss": 1.2232, + "step": 13800 + }, + { + "epoch": 4.110575401626985, + "grad_norm": 0.2660098969936371, + "learning_rate": 1.3297652658361945e-05, + "loss": 1.2191, + "step": 13801 + }, + { + "epoch": 4.110873247826653, + "grad_norm": 0.3079351782798767, + "learning_rate": 1.3296742031645934e-05, + "loss": 1.2434, + "step": 13802 + }, + { + "epoch": 4.111171094026322, + "grad_norm": 0.35410577058792114, + "learning_rate": 1.3295831374257424e-05, + "loss": 1.2086, + "step": 13803 + }, + { + "epoch": 4.111468940225991, + "grad_norm": 0.2747192978858948, + "learning_rate": 1.3294920686204886e-05, + "loss": 1.2276, + "step": 13804 + }, + { + "epoch": 4.111766786425659, + "grad_norm": 0.3622715473175049, + "learning_rate": 1.329400996749679e-05, + "loss": 1.2238, + "step": 13805 + }, + { + "epoch": 4.112064632625328, + "grad_norm": 0.26115700602531433, + "learning_rate": 1.329309921814161e-05, + "loss": 1.2171, + "step": 13806 + }, + { + "epoch": 4.112362478824997, + "grad_norm": 0.3849085569381714, + "learning_rate": 1.3292188438147826e-05, + "loss": 1.2181, + "step": 13807 + }, + { + "epoch": 4.112660325024665, + "grad_norm": 0.2930947244167328, + "learning_rate": 1.3291277627523905e-05, + "loss": 1.2191, + "step": 13808 + }, + { + "epoch": 4.112958171224334, + "grad_norm": 0.4109828472137451, + "learning_rate": 1.3290366786278321e-05, + "loss": 1.2389, + "step": 13809 + }, + { + "epoch": 4.1132560174240025, + "grad_norm": 0.41322556138038635, + "learning_rate": 1.3289455914419552e-05, + "loss": 1.2278, + "step": 13810 + }, + { + "epoch": 4.113553863623672, + "grad_norm": 0.3003167510032654, + "learning_rate": 1.3288545011956071e-05, + "loss": 1.2425, + "step": 13811 + }, + { + "epoch": 4.11385170982334, + "grad_norm": 0.25727447867393494, + "learning_rate": 1.328763407889635e-05, + "loss": 1.2172, + "step": 13812 + }, + { + "epoch": 4.114149556023008, + "grad_norm": 0.4414829909801483, + "learning_rate": 1.3286723115248867e-05, + "loss": 1.2267, + "step": 13813 + }, + { + "epoch": 4.114447402222678, + "grad_norm": 0.3486790955066681, + "learning_rate": 1.3285812121022101e-05, + "loss": 1.2093, + "step": 13814 + }, + { + "epoch": 4.114745248422346, + "grad_norm": 0.36375582218170166, + "learning_rate": 1.3284901096224525e-05, + "loss": 1.2105, + "step": 13815 + }, + { + "epoch": 4.115043094622014, + "grad_norm": 0.2826632559299469, + "learning_rate": 1.3283990040864611e-05, + "loss": 1.2295, + "step": 13816 + }, + { + "epoch": 4.1153409408216834, + "grad_norm": 0.47464892268180847, + "learning_rate": 1.3283078954950842e-05, + "loss": 1.2225, + "step": 13817 + }, + { + "epoch": 4.115638787021352, + "grad_norm": 0.34501388669013977, + "learning_rate": 1.3282167838491687e-05, + "loss": 1.2306, + "step": 13818 + }, + { + "epoch": 4.11593663322102, + "grad_norm": 0.34228000044822693, + "learning_rate": 1.328125669149563e-05, + "loss": 1.2193, + "step": 13819 + }, + { + "epoch": 4.116234479420689, + "grad_norm": 0.3054259121417999, + "learning_rate": 1.3280345513971144e-05, + "loss": 1.235, + "step": 13820 + }, + { + "epoch": 4.116532325620358, + "grad_norm": 0.39247167110443115, + "learning_rate": 1.3279434305926713e-05, + "loss": 1.1979, + "step": 13821 + }, + { + "epoch": 4.116830171820027, + "grad_norm": 0.27253589034080505, + "learning_rate": 1.3278523067370805e-05, + "loss": 1.2, + "step": 13822 + }, + { + "epoch": 4.117128018019695, + "grad_norm": 0.3247166574001312, + "learning_rate": 1.3277611798311902e-05, + "loss": 1.225, + "step": 13823 + }, + { + "epoch": 4.1174258642193635, + "grad_norm": 0.27334272861480713, + "learning_rate": 1.3276700498758486e-05, + "loss": 1.2293, + "step": 13824 + }, + { + "epoch": 4.117723710419033, + "grad_norm": 0.2963360846042633, + "learning_rate": 1.3275789168719033e-05, + "loss": 1.2196, + "step": 13825 + }, + { + "epoch": 4.118021556618701, + "grad_norm": 0.36786139011383057, + "learning_rate": 1.327487780820202e-05, + "loss": 1.2256, + "step": 13826 + }, + { + "epoch": 4.118319402818369, + "grad_norm": 0.32970741391181946, + "learning_rate": 1.3273966417215926e-05, + "loss": 1.2125, + "step": 13827 + }, + { + "epoch": 4.118617249018039, + "grad_norm": 0.33740895986557007, + "learning_rate": 1.3273054995769238e-05, + "loss": 1.2051, + "step": 13828 + }, + { + "epoch": 4.118915095217707, + "grad_norm": 0.26221126317977905, + "learning_rate": 1.3272143543870427e-05, + "loss": 1.2054, + "step": 13829 + }, + { + "epoch": 4.119212941417375, + "grad_norm": 0.37126147747039795, + "learning_rate": 1.3271232061527974e-05, + "loss": 1.2165, + "step": 13830 + }, + { + "epoch": 4.1195107876170445, + "grad_norm": 0.24958691000938416, + "learning_rate": 1.3270320548750366e-05, + "loss": 1.2261, + "step": 13831 + }, + { + "epoch": 4.119808633816713, + "grad_norm": 0.31862881779670715, + "learning_rate": 1.3269409005546077e-05, + "loss": 1.2204, + "step": 13832 + }, + { + "epoch": 4.120106480016381, + "grad_norm": 0.25110116600990295, + "learning_rate": 1.3268497431923591e-05, + "loss": 1.2428, + "step": 13833 + }, + { + "epoch": 4.12040432621605, + "grad_norm": 0.4147375822067261, + "learning_rate": 1.3267585827891387e-05, + "loss": 1.2206, + "step": 13834 + }, + { + "epoch": 4.120702172415719, + "grad_norm": 0.3040507733821869, + "learning_rate": 1.3266674193457944e-05, + "loss": 1.2347, + "step": 13835 + }, + { + "epoch": 4.121000018615388, + "grad_norm": 0.2682487964630127, + "learning_rate": 1.3265762528631752e-05, + "loss": 1.2152, + "step": 13836 + }, + { + "epoch": 4.121297864815056, + "grad_norm": 0.26606062054634094, + "learning_rate": 1.3264850833421288e-05, + "loss": 1.2188, + "step": 13837 + }, + { + "epoch": 4.1215957110147246, + "grad_norm": 0.2641187310218811, + "learning_rate": 1.3263939107835036e-05, + "loss": 1.2058, + "step": 13838 + }, + { + "epoch": 4.121893557214394, + "grad_norm": 0.3513612449169159, + "learning_rate": 1.3263027351881475e-05, + "loss": 1.2338, + "step": 13839 + }, + { + "epoch": 4.122191403414062, + "grad_norm": 0.31655916571617126, + "learning_rate": 1.3262115565569088e-05, + "loss": 1.2186, + "step": 13840 + }, + { + "epoch": 4.12248924961373, + "grad_norm": 0.340635746717453, + "learning_rate": 1.3261203748906363e-05, + "loss": 1.2047, + "step": 13841 + }, + { + "epoch": 4.1227870958134, + "grad_norm": 0.2886393964290619, + "learning_rate": 1.3260291901901783e-05, + "loss": 1.2451, + "step": 13842 + }, + { + "epoch": 4.123084942013068, + "grad_norm": 0.3992542624473572, + "learning_rate": 1.3259380024563826e-05, + "loss": 1.2129, + "step": 13843 + }, + { + "epoch": 4.123382788212736, + "grad_norm": 0.28295430541038513, + "learning_rate": 1.325846811690098e-05, + "loss": 1.2276, + "step": 13844 + }, + { + "epoch": 4.1236806344124055, + "grad_norm": 0.33247432112693787, + "learning_rate": 1.3257556178921732e-05, + "loss": 1.2315, + "step": 13845 + }, + { + "epoch": 4.123978480612074, + "grad_norm": 0.3000031113624573, + "learning_rate": 1.325664421063456e-05, + "loss": 1.2202, + "step": 13846 + }, + { + "epoch": 4.124276326811742, + "grad_norm": 0.5144345164299011, + "learning_rate": 1.325573221204795e-05, + "loss": 1.2314, + "step": 13847 + }, + { + "epoch": 4.124574173011411, + "grad_norm": 0.35543328523635864, + "learning_rate": 1.3254820183170394e-05, + "loss": 1.2247, + "step": 13848 + }, + { + "epoch": 4.12487201921108, + "grad_norm": 0.29800620675086975, + "learning_rate": 1.325390812401037e-05, + "loss": 1.2249, + "step": 13849 + }, + { + "epoch": 4.125169865410749, + "grad_norm": 0.2537350058555603, + "learning_rate": 1.3252996034576368e-05, + "loss": 1.214, + "step": 13850 + }, + { + "epoch": 4.125467711610417, + "grad_norm": 0.37921464443206787, + "learning_rate": 1.3252083914876873e-05, + "loss": 1.2276, + "step": 13851 + }, + { + "epoch": 4.125765557810086, + "grad_norm": 0.26001298427581787, + "learning_rate": 1.3251171764920369e-05, + "loss": 1.2152, + "step": 13852 + }, + { + "epoch": 4.126063404009755, + "grad_norm": 0.3361720144748688, + "learning_rate": 1.3250259584715341e-05, + "loss": 1.2226, + "step": 13853 + }, + { + "epoch": 4.126361250209423, + "grad_norm": 0.24085834622383118, + "learning_rate": 1.3249347374270282e-05, + "loss": 1.2331, + "step": 13854 + }, + { + "epoch": 4.1266590964090915, + "grad_norm": 0.2936878502368927, + "learning_rate": 1.324843513359368e-05, + "loss": 1.2185, + "step": 13855 + }, + { + "epoch": 4.126956942608761, + "grad_norm": 0.2679438889026642, + "learning_rate": 1.3247522862694014e-05, + "loss": 1.2318, + "step": 13856 + }, + { + "epoch": 4.127254788808429, + "grad_norm": 0.26150068640708923, + "learning_rate": 1.3246610561579776e-05, + "loss": 1.2457, + "step": 13857 + }, + { + "epoch": 4.127552635008097, + "grad_norm": 0.3115221858024597, + "learning_rate": 1.3245698230259456e-05, + "loss": 1.2204, + "step": 13858 + }, + { + "epoch": 4.1278504812077665, + "grad_norm": 0.2776271104812622, + "learning_rate": 1.3244785868741539e-05, + "loss": 1.2215, + "step": 13859 + }, + { + "epoch": 4.128148327407435, + "grad_norm": 0.37744003534317017, + "learning_rate": 1.3243873477034514e-05, + "loss": 1.2316, + "step": 13860 + }, + { + "epoch": 4.128446173607104, + "grad_norm": 0.28370046615600586, + "learning_rate": 1.3242961055146873e-05, + "loss": 1.233, + "step": 13861 + }, + { + "epoch": 4.128744019806772, + "grad_norm": 0.3110189139842987, + "learning_rate": 1.32420486030871e-05, + "loss": 1.2228, + "step": 13862 + }, + { + "epoch": 4.129041866006441, + "grad_norm": 0.30852046608924866, + "learning_rate": 1.324113612086369e-05, + "loss": 1.2175, + "step": 13863 + }, + { + "epoch": 4.12933971220611, + "grad_norm": 0.28926947712898254, + "learning_rate": 1.3240223608485127e-05, + "loss": 1.2121, + "step": 13864 + }, + { + "epoch": 4.129637558405778, + "grad_norm": 0.3092958331108093, + "learning_rate": 1.3239311065959903e-05, + "loss": 1.2142, + "step": 13865 + }, + { + "epoch": 4.129935404605447, + "grad_norm": 0.2516164779663086, + "learning_rate": 1.3238398493296511e-05, + "loss": 1.2047, + "step": 13866 + }, + { + "epoch": 4.130233250805116, + "grad_norm": 0.30716443061828613, + "learning_rate": 1.323748589050344e-05, + "loss": 1.2194, + "step": 13867 + }, + { + "epoch": 4.130531097004784, + "grad_norm": 0.2708042562007904, + "learning_rate": 1.3236573257589178e-05, + "loss": 1.22, + "step": 13868 + }, + { + "epoch": 4.1308289432044525, + "grad_norm": 0.38280943036079407, + "learning_rate": 1.3235660594562219e-05, + "loss": 1.2159, + "step": 13869 + }, + { + "epoch": 4.131126789404122, + "grad_norm": 0.36915719509124756, + "learning_rate": 1.323474790143105e-05, + "loss": 1.2203, + "step": 13870 + }, + { + "epoch": 4.13142463560379, + "grad_norm": 0.25969138741493225, + "learning_rate": 1.3233835178204166e-05, + "loss": 1.221, + "step": 13871 + }, + { + "epoch": 4.131722481803458, + "grad_norm": 0.36319831013679504, + "learning_rate": 1.3232922424890063e-05, + "loss": 1.2302, + "step": 13872 + }, + { + "epoch": 4.132020328003128, + "grad_norm": 0.24696652591228485, + "learning_rate": 1.3232009641497227e-05, + "loss": 1.2209, + "step": 13873 + }, + { + "epoch": 4.132318174202796, + "grad_norm": 0.395910382270813, + "learning_rate": 1.3231096828034151e-05, + "loss": 1.2239, + "step": 13874 + }, + { + "epoch": 4.132616020402464, + "grad_norm": 0.24994641542434692, + "learning_rate": 1.3230183984509333e-05, + "loss": 1.2199, + "step": 13875 + }, + { + "epoch": 4.1329138666021334, + "grad_norm": 0.8485873937606812, + "learning_rate": 1.3229271110931254e-05, + "loss": 1.2243, + "step": 13876 + }, + { + "epoch": 4.133211712801802, + "grad_norm": 0.51860511302948, + "learning_rate": 1.322835820730842e-05, + "loss": 1.2158, + "step": 13877 + }, + { + "epoch": 4.133509559001471, + "grad_norm": 0.43362873792648315, + "learning_rate": 1.3227445273649323e-05, + "loss": 1.2246, + "step": 13878 + }, + { + "epoch": 4.133807405201139, + "grad_norm": 0.3828974664211273, + "learning_rate": 1.3226532309962453e-05, + "loss": 1.2237, + "step": 13879 + }, + { + "epoch": 4.134105251400808, + "grad_norm": 0.6636733412742615, + "learning_rate": 1.32256193162563e-05, + "loss": 1.2219, + "step": 13880 + }, + { + "epoch": 4.134403097600477, + "grad_norm": 0.3177591860294342, + "learning_rate": 1.3224706292539366e-05, + "loss": 1.2144, + "step": 13881 + }, + { + "epoch": 4.134700943800145, + "grad_norm": 0.33369359374046326, + "learning_rate": 1.3223793238820141e-05, + "loss": 1.2169, + "step": 13882 + }, + { + "epoch": 4.1349987899998135, + "grad_norm": 0.30640286207199097, + "learning_rate": 1.3222880155107124e-05, + "loss": 1.233, + "step": 13883 + }, + { + "epoch": 4.135296636199483, + "grad_norm": 0.24687960743904114, + "learning_rate": 1.3221967041408808e-05, + "loss": 1.2352, + "step": 13884 + }, + { + "epoch": 4.135594482399151, + "grad_norm": 0.32527896761894226, + "learning_rate": 1.3221053897733686e-05, + "loss": 1.2094, + "step": 13885 + }, + { + "epoch": 4.135892328598819, + "grad_norm": 0.33888906240463257, + "learning_rate": 1.322014072409026e-05, + "loss": 1.2253, + "step": 13886 + }, + { + "epoch": 4.136190174798489, + "grad_norm": 0.25949883460998535, + "learning_rate": 1.3219227520487019e-05, + "loss": 1.2213, + "step": 13887 + }, + { + "epoch": 4.136488020998157, + "grad_norm": 0.29973429441452026, + "learning_rate": 1.3218314286932465e-05, + "loss": 1.2214, + "step": 13888 + }, + { + "epoch": 4.136785867197826, + "grad_norm": 0.2880532443523407, + "learning_rate": 1.321740102343509e-05, + "loss": 1.2469, + "step": 13889 + }, + { + "epoch": 4.1370837133974945, + "grad_norm": 0.26244160532951355, + "learning_rate": 1.3216487730003396e-05, + "loss": 1.2477, + "step": 13890 + }, + { + "epoch": 4.137381559597163, + "grad_norm": 0.29476916790008545, + "learning_rate": 1.3215574406645874e-05, + "loss": 1.2189, + "step": 13891 + }, + { + "epoch": 4.137679405796832, + "grad_norm": 0.27913185954093933, + "learning_rate": 1.3214661053371027e-05, + "loss": 1.2226, + "step": 13892 + }, + { + "epoch": 4.1379772519965, + "grad_norm": 0.27448925375938416, + "learning_rate": 1.321374767018735e-05, + "loss": 1.2203, + "step": 13893 + }, + { + "epoch": 4.138275098196169, + "grad_norm": 0.29136985540390015, + "learning_rate": 1.3212834257103339e-05, + "loss": 1.2114, + "step": 13894 + }, + { + "epoch": 4.138572944395838, + "grad_norm": 0.2683270275592804, + "learning_rate": 1.3211920814127499e-05, + "loss": 1.236, + "step": 13895 + }, + { + "epoch": 4.138870790595506, + "grad_norm": 0.3383072316646576, + "learning_rate": 1.3211007341268324e-05, + "loss": 1.2139, + "step": 13896 + }, + { + "epoch": 4.1391686367951745, + "grad_norm": 0.2853613793849945, + "learning_rate": 1.3210093838534313e-05, + "loss": 1.2348, + "step": 13897 + }, + { + "epoch": 4.139466482994844, + "grad_norm": 0.2859134376049042, + "learning_rate": 1.3209180305933961e-05, + "loss": 1.2372, + "step": 13898 + }, + { + "epoch": 4.139764329194512, + "grad_norm": 0.3109267055988312, + "learning_rate": 1.3208266743475777e-05, + "loss": 1.2296, + "step": 13899 + }, + { + "epoch": 4.14006217539418, + "grad_norm": 0.26448291540145874, + "learning_rate": 1.3207353151168254e-05, + "loss": 1.2289, + "step": 13900 + }, + { + "epoch": 4.14036002159385, + "grad_norm": 0.3694174587726593, + "learning_rate": 1.3206439529019892e-05, + "loss": 1.2336, + "step": 13901 + }, + { + "epoch": 4.140657867793518, + "grad_norm": 0.2503821551799774, + "learning_rate": 1.3205525877039194e-05, + "loss": 1.2256, + "step": 13902 + }, + { + "epoch": 4.140955713993187, + "grad_norm": 0.2735968828201294, + "learning_rate": 1.320461219523466e-05, + "loss": 1.2217, + "step": 13903 + }, + { + "epoch": 4.1412535601928555, + "grad_norm": 0.2927398383617401, + "learning_rate": 1.320369848361479e-05, + "loss": 1.2367, + "step": 13904 + }, + { + "epoch": 4.141551406392524, + "grad_norm": 0.2513747215270996, + "learning_rate": 1.3202784742188081e-05, + "loss": 1.2328, + "step": 13905 + }, + { + "epoch": 4.141849252592193, + "grad_norm": 0.29553186893463135, + "learning_rate": 1.3201870970963042e-05, + "loss": 1.2373, + "step": 13906 + }, + { + "epoch": 4.142147098791861, + "grad_norm": 0.2649666368961334, + "learning_rate": 1.3200957169948169e-05, + "loss": 1.2308, + "step": 13907 + }, + { + "epoch": 4.14244494499153, + "grad_norm": 0.29517480731010437, + "learning_rate": 1.3200043339151967e-05, + "loss": 1.2306, + "step": 13908 + }, + { + "epoch": 4.142742791191199, + "grad_norm": 0.282366544008255, + "learning_rate": 1.3199129478582937e-05, + "loss": 1.226, + "step": 13909 + }, + { + "epoch": 4.143040637390867, + "grad_norm": 0.28966495394706726, + "learning_rate": 1.319821558824958e-05, + "loss": 1.2118, + "step": 13910 + }, + { + "epoch": 4.143338483590536, + "grad_norm": 0.35091632604599, + "learning_rate": 1.3197301668160405e-05, + "loss": 1.2323, + "step": 13911 + }, + { + "epoch": 4.143636329790205, + "grad_norm": 0.3068065643310547, + "learning_rate": 1.3196387718323905e-05, + "loss": 1.2225, + "step": 13912 + }, + { + "epoch": 4.143934175989873, + "grad_norm": 0.33353498578071594, + "learning_rate": 1.319547373874859e-05, + "loss": 1.2139, + "step": 13913 + }, + { + "epoch": 4.1442320221895415, + "grad_norm": 0.28323015570640564, + "learning_rate": 1.3194559729442962e-05, + "loss": 1.2158, + "step": 13914 + }, + { + "epoch": 4.144529868389211, + "grad_norm": 0.44431978464126587, + "learning_rate": 1.3193645690415524e-05, + "loss": 1.223, + "step": 13915 + }, + { + "epoch": 4.144827714588879, + "grad_norm": 0.32332706451416016, + "learning_rate": 1.3192731621674786e-05, + "loss": 1.2252, + "step": 13916 + }, + { + "epoch": 4.145125560788548, + "grad_norm": 0.403382807970047, + "learning_rate": 1.319181752322924e-05, + "loss": 1.2375, + "step": 13917 + }, + { + "epoch": 4.1454234069882165, + "grad_norm": 0.2784106731414795, + "learning_rate": 1.3190903395087403e-05, + "loss": 1.2145, + "step": 13918 + }, + { + "epoch": 4.145721253187885, + "grad_norm": 0.5025516748428345, + "learning_rate": 1.3189989237257771e-05, + "loss": 1.2113, + "step": 13919 + }, + { + "epoch": 4.146019099387554, + "grad_norm": 0.3577045798301697, + "learning_rate": 1.3189075049748858e-05, + "loss": 1.2258, + "step": 13920 + }, + { + "epoch": 4.146316945587222, + "grad_norm": 0.3309796452522278, + "learning_rate": 1.3188160832569164e-05, + "loss": 1.2167, + "step": 13921 + }, + { + "epoch": 4.146614791786891, + "grad_norm": 0.270114004611969, + "learning_rate": 1.3187246585727192e-05, + "loss": 1.232, + "step": 13922 + }, + { + "epoch": 4.14691263798656, + "grad_norm": 0.4047696888446808, + "learning_rate": 1.3186332309231453e-05, + "loss": 1.222, + "step": 13923 + }, + { + "epoch": 4.147210484186228, + "grad_norm": 0.25433439016342163, + "learning_rate": 1.3185418003090454e-05, + "loss": 1.224, + "step": 13924 + }, + { + "epoch": 4.147508330385897, + "grad_norm": 0.3170766532421112, + "learning_rate": 1.3184503667312695e-05, + "loss": 1.2381, + "step": 13925 + }, + { + "epoch": 4.147806176585566, + "grad_norm": 0.2565918266773224, + "learning_rate": 1.3183589301906695e-05, + "loss": 1.219, + "step": 13926 + }, + { + "epoch": 4.148104022785234, + "grad_norm": 0.2843881845474243, + "learning_rate": 1.3182674906880948e-05, + "loss": 1.219, + "step": 13927 + }, + { + "epoch": 4.148401868984903, + "grad_norm": 0.329664945602417, + "learning_rate": 1.3181760482243967e-05, + "loss": 1.2375, + "step": 13928 + }, + { + "epoch": 4.148699715184572, + "grad_norm": 0.2986266314983368, + "learning_rate": 1.318084602800426e-05, + "loss": 1.2276, + "step": 13929 + }, + { + "epoch": 4.14899756138424, + "grad_norm": 0.3061686158180237, + "learning_rate": 1.3179931544170332e-05, + "loss": 1.2247, + "step": 13930 + }, + { + "epoch": 4.149295407583909, + "grad_norm": 0.26146790385246277, + "learning_rate": 1.3179017030750698e-05, + "loss": 1.2186, + "step": 13931 + }, + { + "epoch": 4.149593253783578, + "grad_norm": 0.4060761332511902, + "learning_rate": 1.3178102487753859e-05, + "loss": 1.2242, + "step": 13932 + }, + { + "epoch": 4.149891099983246, + "grad_norm": 0.2841872274875641, + "learning_rate": 1.3177187915188331e-05, + "loss": 1.2341, + "step": 13933 + }, + { + "epoch": 4.150188946182915, + "grad_norm": 0.3375382423400879, + "learning_rate": 1.3176273313062617e-05, + "loss": 1.2207, + "step": 13934 + }, + { + "epoch": 4.1504867923825834, + "grad_norm": 0.2719001770019531, + "learning_rate": 1.3175358681385227e-05, + "loss": 1.2392, + "step": 13935 + }, + { + "epoch": 4.150784638582252, + "grad_norm": 0.35442888736724854, + "learning_rate": 1.3174444020164674e-05, + "loss": 1.2207, + "step": 13936 + }, + { + "epoch": 4.151082484781921, + "grad_norm": 0.3728295862674713, + "learning_rate": 1.3173529329409466e-05, + "loss": 1.2188, + "step": 13937 + }, + { + "epoch": 4.151380330981589, + "grad_norm": 0.25157758593559265, + "learning_rate": 1.3172614609128114e-05, + "loss": 1.2388, + "step": 13938 + }, + { + "epoch": 4.151678177181258, + "grad_norm": 0.2839163541793823, + "learning_rate": 1.3171699859329125e-05, + "loss": 1.2331, + "step": 13939 + }, + { + "epoch": 4.151976023380927, + "grad_norm": 0.24345000088214874, + "learning_rate": 1.3170785080021019e-05, + "loss": 1.2087, + "step": 13940 + }, + { + "epoch": 4.152273869580595, + "grad_norm": 0.2459232360124588, + "learning_rate": 1.3169870271212293e-05, + "loss": 1.2345, + "step": 13941 + }, + { + "epoch": 4.1525717157802635, + "grad_norm": 0.29808467626571655, + "learning_rate": 1.3168955432911467e-05, + "loss": 1.2244, + "step": 13942 + }, + { + "epoch": 4.152869561979933, + "grad_norm": 0.2867465913295746, + "learning_rate": 1.3168040565127055e-05, + "loss": 1.1921, + "step": 13943 + }, + { + "epoch": 4.153167408179601, + "grad_norm": 0.25989869236946106, + "learning_rate": 1.3167125667867562e-05, + "loss": 1.2328, + "step": 13944 + }, + { + "epoch": 4.15346525437927, + "grad_norm": 0.2782016694545746, + "learning_rate": 1.3166210741141505e-05, + "loss": 1.225, + "step": 13945 + }, + { + "epoch": 4.153763100578939, + "grad_norm": 0.2776489555835724, + "learning_rate": 1.3165295784957393e-05, + "loss": 1.2262, + "step": 13946 + }, + { + "epoch": 4.154060946778607, + "grad_norm": 0.2627328336238861, + "learning_rate": 1.316438079932374e-05, + "loss": 1.2092, + "step": 13947 + }, + { + "epoch": 4.154358792978276, + "grad_norm": 0.2466094046831131, + "learning_rate": 1.316346578424906e-05, + "loss": 1.2425, + "step": 13948 + }, + { + "epoch": 4.1546566391779445, + "grad_norm": 0.26793643832206726, + "learning_rate": 1.3162550739741864e-05, + "loss": 1.22, + "step": 13949 + }, + { + "epoch": 4.154954485377613, + "grad_norm": 0.28842565417289734, + "learning_rate": 1.316163566581067e-05, + "loss": 1.2268, + "step": 13950 + }, + { + "epoch": 4.155252331577282, + "grad_norm": 0.3220886290073395, + "learning_rate": 1.3160720562463986e-05, + "loss": 1.2127, + "step": 13951 + }, + { + "epoch": 4.15555017777695, + "grad_norm": 0.36773040890693665, + "learning_rate": 1.315980542971033e-05, + "loss": 1.2324, + "step": 13952 + }, + { + "epoch": 4.155848023976619, + "grad_norm": 0.36914122104644775, + "learning_rate": 1.315889026755821e-05, + "loss": 1.1924, + "step": 13953 + }, + { + "epoch": 4.156145870176288, + "grad_norm": 0.24720020592212677, + "learning_rate": 1.3157975076016154e-05, + "loss": 1.2391, + "step": 13954 + }, + { + "epoch": 4.156443716375956, + "grad_norm": 0.26182740926742554, + "learning_rate": 1.3157059855092662e-05, + "loss": 1.223, + "step": 13955 + }, + { + "epoch": 4.156741562575625, + "grad_norm": 0.30646032094955444, + "learning_rate": 1.3156144604796257e-05, + "loss": 1.2277, + "step": 13956 + }, + { + "epoch": 4.157039408775294, + "grad_norm": 0.27570775151252747, + "learning_rate": 1.3155229325135455e-05, + "loss": 1.2372, + "step": 13957 + }, + { + "epoch": 4.157337254974962, + "grad_norm": 0.47832658886909485, + "learning_rate": 1.3154314016118763e-05, + "loss": 1.207, + "step": 13958 + }, + { + "epoch": 4.157635101174631, + "grad_norm": 0.3838043510913849, + "learning_rate": 1.3153398677754707e-05, + "loss": 1.2133, + "step": 13959 + }, + { + "epoch": 4.1579329473743, + "grad_norm": 0.3545500338077545, + "learning_rate": 1.31524833100518e-05, + "loss": 1.2345, + "step": 13960 + }, + { + "epoch": 4.158230793573968, + "grad_norm": 0.2836247682571411, + "learning_rate": 1.315156791301856e-05, + "loss": 1.2229, + "step": 13961 + }, + { + "epoch": 4.158528639773637, + "grad_norm": 0.26212188601493835, + "learning_rate": 1.3150652486663498e-05, + "loss": 1.233, + "step": 13962 + }, + { + "epoch": 4.1588264859733055, + "grad_norm": 0.3060859441757202, + "learning_rate": 1.3149737030995136e-05, + "loss": 1.2049, + "step": 13963 + }, + { + "epoch": 4.159124332172974, + "grad_norm": 0.2617625594139099, + "learning_rate": 1.3148821546021991e-05, + "loss": 1.2229, + "step": 13964 + }, + { + "epoch": 4.159422178372643, + "grad_norm": 0.32073497772216797, + "learning_rate": 1.3147906031752578e-05, + "loss": 1.2253, + "step": 13965 + }, + { + "epoch": 4.159720024572311, + "grad_norm": 0.44537511467933655, + "learning_rate": 1.3146990488195415e-05, + "loss": 1.2287, + "step": 13966 + }, + { + "epoch": 4.16001787077198, + "grad_norm": 0.3504146933555603, + "learning_rate": 1.3146074915359026e-05, + "loss": 1.2299, + "step": 13967 + }, + { + "epoch": 4.160315716971649, + "grad_norm": 0.26046764850616455, + "learning_rate": 1.3145159313251922e-05, + "loss": 1.217, + "step": 13968 + }, + { + "epoch": 4.160613563171317, + "grad_norm": 0.3015144467353821, + "learning_rate": 1.3144243681882625e-05, + "loss": 1.217, + "step": 13969 + }, + { + "epoch": 4.1609114093709865, + "grad_norm": 0.38506749272346497, + "learning_rate": 1.3143328021259654e-05, + "loss": 1.2119, + "step": 13970 + }, + { + "epoch": 4.161209255570655, + "grad_norm": 0.3009447753429413, + "learning_rate": 1.3142412331391528e-05, + "loss": 1.2172, + "step": 13971 + }, + { + "epoch": 4.161507101770323, + "grad_norm": 0.25323185324668884, + "learning_rate": 1.3141496612286763e-05, + "loss": 1.2244, + "step": 13972 + }, + { + "epoch": 4.161804947969992, + "grad_norm": 0.34163016080856323, + "learning_rate": 1.3140580863953885e-05, + "loss": 1.2385, + "step": 13973 + }, + { + "epoch": 4.162102794169661, + "grad_norm": 0.25774526596069336, + "learning_rate": 1.3139665086401413e-05, + "loss": 1.2332, + "step": 13974 + }, + { + "epoch": 4.162400640369329, + "grad_norm": 0.40288153290748596, + "learning_rate": 1.313874927963786e-05, + "loss": 1.2274, + "step": 13975 + }, + { + "epoch": 4.162698486568998, + "grad_norm": 0.30704954266548157, + "learning_rate": 1.3137833443671753e-05, + "loss": 1.2231, + "step": 13976 + }, + { + "epoch": 4.1629963327686665, + "grad_norm": 0.30630987882614136, + "learning_rate": 1.3136917578511612e-05, + "loss": 1.2277, + "step": 13977 + }, + { + "epoch": 4.163294178968335, + "grad_norm": 0.2474551796913147, + "learning_rate": 1.313600168416596e-05, + "loss": 1.2126, + "step": 13978 + }, + { + "epoch": 4.163592025168004, + "grad_norm": 0.30916136503219604, + "learning_rate": 1.3135085760643316e-05, + "loss": 1.2065, + "step": 13979 + }, + { + "epoch": 4.163889871367672, + "grad_norm": 0.2527836263179779, + "learning_rate": 1.31341698079522e-05, + "loss": 1.227, + "step": 13980 + }, + { + "epoch": 4.164187717567341, + "grad_norm": 0.2940084636211395, + "learning_rate": 1.3133253826101138e-05, + "loss": 1.2185, + "step": 13981 + }, + { + "epoch": 4.16448556376701, + "grad_norm": 0.2573065757751465, + "learning_rate": 1.3132337815098646e-05, + "loss": 1.2043, + "step": 13982 + }, + { + "epoch": 4.164783409966678, + "grad_norm": 0.28639015555381775, + "learning_rate": 1.313142177495325e-05, + "loss": 1.2074, + "step": 13983 + }, + { + "epoch": 4.1650812561663475, + "grad_norm": 0.3267950713634491, + "learning_rate": 1.3130505705673478e-05, + "loss": 1.2213, + "step": 13984 + }, + { + "epoch": 4.165379102366016, + "grad_norm": 0.25070884823799133, + "learning_rate": 1.3129589607267846e-05, + "loss": 1.2164, + "step": 13985 + }, + { + "epoch": 4.165676948565684, + "grad_norm": 0.3283660411834717, + "learning_rate": 1.3128673479744878e-05, + "loss": 1.2177, + "step": 13986 + }, + { + "epoch": 4.165974794765353, + "grad_norm": 0.4489118754863739, + "learning_rate": 1.3127757323113097e-05, + "loss": 1.2288, + "step": 13987 + }, + { + "epoch": 4.166272640965022, + "grad_norm": 0.4153156876564026, + "learning_rate": 1.3126841137381033e-05, + "loss": 1.2365, + "step": 13988 + }, + { + "epoch": 4.16657048716469, + "grad_norm": 0.2731744647026062, + "learning_rate": 1.3125924922557203e-05, + "loss": 1.2207, + "step": 13989 + }, + { + "epoch": 4.166868333364359, + "grad_norm": 0.37405744194984436, + "learning_rate": 1.3125008678650133e-05, + "loss": 1.2232, + "step": 13990 + }, + { + "epoch": 4.167166179564028, + "grad_norm": 0.35321325063705444, + "learning_rate": 1.3124092405668351e-05, + "loss": 1.2007, + "step": 13991 + }, + { + "epoch": 4.167464025763696, + "grad_norm": 0.2919888198375702, + "learning_rate": 1.312317610362038e-05, + "loss": 1.2098, + "step": 13992 + }, + { + "epoch": 4.167761871963365, + "grad_norm": 0.489874929189682, + "learning_rate": 1.3122259772514744e-05, + "loss": 1.2241, + "step": 13993 + }, + { + "epoch": 4.1680597181630334, + "grad_norm": 0.3767636716365814, + "learning_rate": 1.3121343412359966e-05, + "loss": 1.2304, + "step": 13994 + }, + { + "epoch": 4.168357564362703, + "grad_norm": 0.24852719902992249, + "learning_rate": 1.3120427023164579e-05, + "loss": 1.2294, + "step": 13995 + }, + { + "epoch": 4.168655410562371, + "grad_norm": 0.30303964018821716, + "learning_rate": 1.3119510604937105e-05, + "loss": 1.2189, + "step": 13996 + }, + { + "epoch": 4.168953256762039, + "grad_norm": 0.25402772426605225, + "learning_rate": 1.311859415768607e-05, + "loss": 1.2216, + "step": 13997 + }, + { + "epoch": 4.1692511029617085, + "grad_norm": 0.323548823595047, + "learning_rate": 1.311767768142e-05, + "loss": 1.2204, + "step": 13998 + }, + { + "epoch": 4.169548949161377, + "grad_norm": 0.5445820689201355, + "learning_rate": 1.3116761176147421e-05, + "loss": 1.2239, + "step": 13999 + }, + { + "epoch": 4.169846795361045, + "grad_norm": 0.4276760220527649, + "learning_rate": 1.311584464187686e-05, + "loss": 1.2277, + "step": 14000 + }, + { + "epoch": 4.169846795361045, + "eval_loss": 1.3272372484207153, + "eval_runtime": 20.3261, + "eval_samples_per_second": 85.309, + "eval_steps_per_second": 5.363, + "step": 14000 + }, + { + "epoch": 4.170144641560714, + "grad_norm": 0.3908666968345642, + "learning_rate": 1.3114928078616848e-05, + "loss": 1.2366, + "step": 14001 + }, + { + "epoch": 4.170442487760383, + "grad_norm": 0.811020016670227, + "learning_rate": 1.3114011486375912e-05, + "loss": 1.2227, + "step": 14002 + }, + { + "epoch": 4.170740333960051, + "grad_norm": 0.5054917335510254, + "learning_rate": 1.311309486516258e-05, + "loss": 1.2273, + "step": 14003 + }, + { + "epoch": 4.17103818015972, + "grad_norm": 0.5227484107017517, + "learning_rate": 1.3112178214985374e-05, + "loss": 1.215, + "step": 14004 + }, + { + "epoch": 4.171336026359389, + "grad_norm": 0.581238329410553, + "learning_rate": 1.3111261535852828e-05, + "loss": 1.228, + "step": 14005 + }, + { + "epoch": 4.171633872559057, + "grad_norm": 0.33035385608673096, + "learning_rate": 1.3110344827773469e-05, + "loss": 1.2347, + "step": 14006 + }, + { + "epoch": 4.171931718758726, + "grad_norm": 0.4523831009864807, + "learning_rate": 1.310942809075583e-05, + "loss": 1.2375, + "step": 14007 + }, + { + "epoch": 4.1722295649583945, + "grad_norm": 0.4137898087501526, + "learning_rate": 1.3108511324808433e-05, + "loss": 1.2254, + "step": 14008 + }, + { + "epoch": 4.172527411158063, + "grad_norm": 0.34187790751457214, + "learning_rate": 1.3107594529939814e-05, + "loss": 1.2237, + "step": 14009 + }, + { + "epoch": 4.172825257357732, + "grad_norm": 0.36907291412353516, + "learning_rate": 1.3106677706158496e-05, + "loss": 1.2112, + "step": 14010 + }, + { + "epoch": 4.1731231035574, + "grad_norm": 0.2730240821838379, + "learning_rate": 1.3105760853473015e-05, + "loss": 1.2328, + "step": 14011 + }, + { + "epoch": 4.17342094975707, + "grad_norm": 0.30101391673088074, + "learning_rate": 1.3104843971891898e-05, + "loss": 1.2301, + "step": 14012 + }, + { + "epoch": 4.173718795956738, + "grad_norm": 0.3022208511829376, + "learning_rate": 1.310392706142368e-05, + "loss": 1.2206, + "step": 14013 + }, + { + "epoch": 4.174016642156406, + "grad_norm": 0.2642434537410736, + "learning_rate": 1.3103010122076888e-05, + "loss": 1.213, + "step": 14014 + }, + { + "epoch": 4.174314488356075, + "grad_norm": 0.42281395196914673, + "learning_rate": 1.3102093153860052e-05, + "loss": 1.2185, + "step": 14015 + }, + { + "epoch": 4.174612334555744, + "grad_norm": 0.26686713099479675, + "learning_rate": 1.3101176156781706e-05, + "loss": 1.2045, + "step": 14016 + }, + { + "epoch": 4.174910180755412, + "grad_norm": 0.3635387420654297, + "learning_rate": 1.3100259130850378e-05, + "loss": 1.2232, + "step": 14017 + }, + { + "epoch": 4.175208026955081, + "grad_norm": 0.2564242482185364, + "learning_rate": 1.3099342076074604e-05, + "loss": 1.2125, + "step": 14018 + }, + { + "epoch": 4.17550587315475, + "grad_norm": 0.4493139684200287, + "learning_rate": 1.3098424992462918e-05, + "loss": 1.2257, + "step": 14019 + }, + { + "epoch": 4.175803719354418, + "grad_norm": 0.27888286113739014, + "learning_rate": 1.3097507880023846e-05, + "loss": 1.2272, + "step": 14020 + }, + { + "epoch": 4.176101565554087, + "grad_norm": 0.28345227241516113, + "learning_rate": 1.3096590738765925e-05, + "loss": 1.2192, + "step": 14021 + }, + { + "epoch": 4.1763994117537555, + "grad_norm": 0.30897077918052673, + "learning_rate": 1.3095673568697687e-05, + "loss": 1.2022, + "step": 14022 + }, + { + "epoch": 4.176697257953425, + "grad_norm": 0.34538790583610535, + "learning_rate": 1.3094756369827661e-05, + "loss": 1.2117, + "step": 14023 + }, + { + "epoch": 4.176995104153093, + "grad_norm": 0.29366397857666016, + "learning_rate": 1.309383914216439e-05, + "loss": 1.2301, + "step": 14024 + }, + { + "epoch": 4.177292950352761, + "grad_norm": 0.34857288002967834, + "learning_rate": 1.3092921885716404e-05, + "loss": 1.201, + "step": 14025 + }, + { + "epoch": 4.177590796552431, + "grad_norm": 0.2962677478790283, + "learning_rate": 1.309200460049223e-05, + "loss": 1.223, + "step": 14026 + }, + { + "epoch": 4.177888642752099, + "grad_norm": 0.2779518663883209, + "learning_rate": 1.309108728650041e-05, + "loss": 1.2353, + "step": 14027 + }, + { + "epoch": 4.178186488951767, + "grad_norm": 0.2660236358642578, + "learning_rate": 1.3090169943749475e-05, + "loss": 1.2174, + "step": 14028 + }, + { + "epoch": 4.1784843351514365, + "grad_norm": 0.2925933003425598, + "learning_rate": 1.3089252572247963e-05, + "loss": 1.2096, + "step": 14029 + }, + { + "epoch": 4.178782181351105, + "grad_norm": 0.29860958456993103, + "learning_rate": 1.3088335172004408e-05, + "loss": 1.2192, + "step": 14030 + }, + { + "epoch": 4.179080027550773, + "grad_norm": 0.35349053144454956, + "learning_rate": 1.3087417743027341e-05, + "loss": 1.2231, + "step": 14031 + }, + { + "epoch": 4.179377873750442, + "grad_norm": 0.28386175632476807, + "learning_rate": 1.3086500285325307e-05, + "loss": 1.2453, + "step": 14032 + }, + { + "epoch": 4.179675719950111, + "grad_norm": 0.36909663677215576, + "learning_rate": 1.3085582798906834e-05, + "loss": 1.2282, + "step": 14033 + }, + { + "epoch": 4.179973566149779, + "grad_norm": 0.2573685050010681, + "learning_rate": 1.3084665283780458e-05, + "loss": 1.2265, + "step": 14034 + }, + { + "epoch": 4.180271412349448, + "grad_norm": 0.4504440724849701, + "learning_rate": 1.308374773995472e-05, + "loss": 1.2124, + "step": 14035 + }, + { + "epoch": 4.1805692585491165, + "grad_norm": 0.32981064915657043, + "learning_rate": 1.3082830167438154e-05, + "loss": 1.2332, + "step": 14036 + }, + { + "epoch": 4.180867104748786, + "grad_norm": 0.4069383442401886, + "learning_rate": 1.3081912566239296e-05, + "loss": 1.2135, + "step": 14037 + }, + { + "epoch": 4.181164950948454, + "grad_norm": 0.35591837763786316, + "learning_rate": 1.3080994936366689e-05, + "loss": 1.2217, + "step": 14038 + }, + { + "epoch": 4.181462797148122, + "grad_norm": 0.3362163007259369, + "learning_rate": 1.3080077277828864e-05, + "loss": 1.2325, + "step": 14039 + }, + { + "epoch": 4.181760643347792, + "grad_norm": 0.3613108992576599, + "learning_rate": 1.3079159590634363e-05, + "loss": 1.2328, + "step": 14040 + }, + { + "epoch": 4.18205848954746, + "grad_norm": 0.33352869749069214, + "learning_rate": 1.3078241874791718e-05, + "loss": 1.2261, + "step": 14041 + }, + { + "epoch": 4.182356335747128, + "grad_norm": 0.5207704305648804, + "learning_rate": 1.3077324130309475e-05, + "loss": 1.2294, + "step": 14042 + }, + { + "epoch": 4.1826541819467975, + "grad_norm": 0.25804591178894043, + "learning_rate": 1.3076406357196168e-05, + "loss": 1.2166, + "step": 14043 + }, + { + "epoch": 4.182952028146466, + "grad_norm": 0.4659302532672882, + "learning_rate": 1.3075488555460338e-05, + "loss": 1.2197, + "step": 14044 + }, + { + "epoch": 4.183249874346134, + "grad_norm": 0.3011718690395355, + "learning_rate": 1.3074570725110523e-05, + "loss": 1.2216, + "step": 14045 + }, + { + "epoch": 4.183547720545803, + "grad_norm": 0.4368714988231659, + "learning_rate": 1.307365286615526e-05, + "loss": 1.2253, + "step": 14046 + }, + { + "epoch": 4.183845566745472, + "grad_norm": 0.3222927153110504, + "learning_rate": 1.3072734978603094e-05, + "loss": 1.2158, + "step": 14047 + }, + { + "epoch": 4.18414341294514, + "grad_norm": 0.3827953040599823, + "learning_rate": 1.3071817062462562e-05, + "loss": 1.2255, + "step": 14048 + }, + { + "epoch": 4.184441259144809, + "grad_norm": 0.3645618259906769, + "learning_rate": 1.3070899117742204e-05, + "loss": 1.1938, + "step": 14049 + }, + { + "epoch": 4.184739105344478, + "grad_norm": 0.2756417393684387, + "learning_rate": 1.3069981144450561e-05, + "loss": 1.2309, + "step": 14050 + }, + { + "epoch": 4.185036951544147, + "grad_norm": 0.25370144844055176, + "learning_rate": 1.3069063142596173e-05, + "loss": 1.2215, + "step": 14051 + }, + { + "epoch": 4.185334797743815, + "grad_norm": 0.2705208957195282, + "learning_rate": 1.306814511218758e-05, + "loss": 1.2258, + "step": 14052 + }, + { + "epoch": 4.1856326439434834, + "grad_norm": 0.2705756425857544, + "learning_rate": 1.3067227053233325e-05, + "loss": 1.217, + "step": 14053 + }, + { + "epoch": 4.185930490143153, + "grad_norm": 0.24070847034454346, + "learning_rate": 1.306630896574195e-05, + "loss": 1.2357, + "step": 14054 + }, + { + "epoch": 4.186228336342821, + "grad_norm": 0.2530656158924103, + "learning_rate": 1.3065390849721995e-05, + "loss": 1.2222, + "step": 14055 + }, + { + "epoch": 4.186526182542489, + "grad_norm": 0.3035649359226227, + "learning_rate": 1.3064472705182008e-05, + "loss": 1.2152, + "step": 14056 + }, + { + "epoch": 4.1868240287421585, + "grad_norm": 0.2673301100730896, + "learning_rate": 1.306355453213052e-05, + "loss": 1.2228, + "step": 14057 + }, + { + "epoch": 4.187121874941827, + "grad_norm": 0.27571043372154236, + "learning_rate": 1.3062636330576081e-05, + "loss": 1.1945, + "step": 14058 + }, + { + "epoch": 4.187419721141495, + "grad_norm": 0.25011956691741943, + "learning_rate": 1.3061718100527232e-05, + "loss": 1.1996, + "step": 14059 + }, + { + "epoch": 4.187717567341164, + "grad_norm": 0.2861412465572357, + "learning_rate": 1.3060799841992521e-05, + "loss": 1.2173, + "step": 14060 + }, + { + "epoch": 4.188015413540833, + "grad_norm": 0.25167977809906006, + "learning_rate": 1.3059881554980485e-05, + "loss": 1.2421, + "step": 14061 + }, + { + "epoch": 4.188313259740502, + "grad_norm": 0.2569390535354614, + "learning_rate": 1.3058963239499669e-05, + "loss": 1.2221, + "step": 14062 + }, + { + "epoch": 4.18861110594017, + "grad_norm": 0.26285067200660706, + "learning_rate": 1.3058044895558621e-05, + "loss": 1.2129, + "step": 14063 + }, + { + "epoch": 4.188908952139839, + "grad_norm": 0.26100948452949524, + "learning_rate": 1.3057126523165877e-05, + "loss": 1.2097, + "step": 14064 + }, + { + "epoch": 4.189206798339508, + "grad_norm": 0.278222918510437, + "learning_rate": 1.3056208122329988e-05, + "loss": 1.215, + "step": 14065 + }, + { + "epoch": 4.189504644539176, + "grad_norm": 0.2587638199329376, + "learning_rate": 1.3055289693059501e-05, + "loss": 1.2135, + "step": 14066 + }, + { + "epoch": 4.1898024907388445, + "grad_norm": 0.2603953778743744, + "learning_rate": 1.3054371235362952e-05, + "loss": 1.2125, + "step": 14067 + }, + { + "epoch": 4.190100336938514, + "grad_norm": 0.27730056643486023, + "learning_rate": 1.3053452749248892e-05, + "loss": 1.2363, + "step": 14068 + }, + { + "epoch": 4.190398183138182, + "grad_norm": 0.2562923729419708, + "learning_rate": 1.3052534234725866e-05, + "loss": 1.2342, + "step": 14069 + }, + { + "epoch": 4.19069602933785, + "grad_norm": 0.25303590297698975, + "learning_rate": 1.3051615691802421e-05, + "loss": 1.2091, + "step": 14070 + }, + { + "epoch": 4.19099387553752, + "grad_norm": 0.2474832534790039, + "learning_rate": 1.30506971204871e-05, + "loss": 1.2249, + "step": 14071 + }, + { + "epoch": 4.191291721737188, + "grad_norm": 0.2717093527317047, + "learning_rate": 1.304977852078845e-05, + "loss": 1.2165, + "step": 14072 + }, + { + "epoch": 4.191589567936856, + "grad_norm": 0.2634011209011078, + "learning_rate": 1.3048859892715022e-05, + "loss": 1.231, + "step": 14073 + }, + { + "epoch": 4.191887414136525, + "grad_norm": 0.38878726959228516, + "learning_rate": 1.3047941236275356e-05, + "loss": 1.2132, + "step": 14074 + }, + { + "epoch": 4.192185260336194, + "grad_norm": 0.47264716029167175, + "learning_rate": 1.3047022551478004e-05, + "loss": 1.2169, + "step": 14075 + }, + { + "epoch": 4.192483106535862, + "grad_norm": 0.2845783829689026, + "learning_rate": 1.304610383833151e-05, + "loss": 1.2115, + "step": 14076 + }, + { + "epoch": 4.192780952735531, + "grad_norm": 0.33200255036354065, + "learning_rate": 1.3045185096844422e-05, + "loss": 1.238, + "step": 14077 + }, + { + "epoch": 4.1930787989352, + "grad_norm": 0.2838309705257416, + "learning_rate": 1.304426632702529e-05, + "loss": 1.2281, + "step": 14078 + }, + { + "epoch": 4.193376645134869, + "grad_norm": 0.30562594532966614, + "learning_rate": 1.3043347528882662e-05, + "loss": 1.2167, + "step": 14079 + }, + { + "epoch": 4.193674491334537, + "grad_norm": 0.3289188742637634, + "learning_rate": 1.3042428702425084e-05, + "loss": 1.2449, + "step": 14080 + }, + { + "epoch": 4.1939723375342055, + "grad_norm": 0.3240574598312378, + "learning_rate": 1.3041509847661109e-05, + "loss": 1.219, + "step": 14081 + }, + { + "epoch": 4.194270183733875, + "grad_norm": 0.2809533178806305, + "learning_rate": 1.3040590964599278e-05, + "loss": 1.2265, + "step": 14082 + }, + { + "epoch": 4.194568029933543, + "grad_norm": 0.24093203246593475, + "learning_rate": 1.3039672053248148e-05, + "loss": 1.229, + "step": 14083 + }, + { + "epoch": 4.194865876133211, + "grad_norm": 0.26057231426239014, + "learning_rate": 1.3038753113616267e-05, + "loss": 1.2378, + "step": 14084 + }, + { + "epoch": 4.195163722332881, + "grad_norm": 0.26788297295570374, + "learning_rate": 1.3037834145712183e-05, + "loss": 1.2179, + "step": 14085 + }, + { + "epoch": 4.195461568532549, + "grad_norm": 0.24841520190238953, + "learning_rate": 1.3036915149544444e-05, + "loss": 1.2367, + "step": 14086 + }, + { + "epoch": 4.195759414732217, + "grad_norm": 0.33649685978889465, + "learning_rate": 1.3035996125121604e-05, + "loss": 1.2086, + "step": 14087 + }, + { + "epoch": 4.1960572609318865, + "grad_norm": 0.3129236400127411, + "learning_rate": 1.3035077072452214e-05, + "loss": 1.2353, + "step": 14088 + }, + { + "epoch": 4.196355107131555, + "grad_norm": 0.2614390552043915, + "learning_rate": 1.303415799154482e-05, + "loss": 1.2247, + "step": 14089 + }, + { + "epoch": 4.196652953331224, + "grad_norm": 0.46155256032943726, + "learning_rate": 1.3033238882407977e-05, + "loss": 1.2074, + "step": 14090 + }, + { + "epoch": 4.196950799530892, + "grad_norm": 0.3769415318965912, + "learning_rate": 1.3032319745050237e-05, + "loss": 1.2206, + "step": 14091 + }, + { + "epoch": 4.197248645730561, + "grad_norm": 0.3116456866264343, + "learning_rate": 1.3031400579480147e-05, + "loss": 1.2176, + "step": 14092 + }, + { + "epoch": 4.19754649193023, + "grad_norm": 0.5263674855232239, + "learning_rate": 1.3030481385706263e-05, + "loss": 1.2134, + "step": 14093 + }, + { + "epoch": 4.197844338129898, + "grad_norm": 0.34529250860214233, + "learning_rate": 1.3029562163737133e-05, + "loss": 1.2373, + "step": 14094 + }, + { + "epoch": 4.1981421843295665, + "grad_norm": 0.3057539463043213, + "learning_rate": 1.3028642913581313e-05, + "loss": 1.2365, + "step": 14095 + }, + { + "epoch": 4.198440030529236, + "grad_norm": 0.2712446451187134, + "learning_rate": 1.3027723635247355e-05, + "loss": 1.218, + "step": 14096 + }, + { + "epoch": 4.198737876728904, + "grad_norm": 0.27539825439453125, + "learning_rate": 1.3026804328743813e-05, + "loss": 1.24, + "step": 14097 + }, + { + "epoch": 4.199035722928572, + "grad_norm": 0.27623793482780457, + "learning_rate": 1.3025884994079237e-05, + "loss": 1.2323, + "step": 14098 + }, + { + "epoch": 4.199333569128242, + "grad_norm": 0.2889955937862396, + "learning_rate": 1.302496563126218e-05, + "loss": 1.2234, + "step": 14099 + }, + { + "epoch": 4.19963141532791, + "grad_norm": 0.2603602707386017, + "learning_rate": 1.3024046240301201e-05, + "loss": 1.238, + "step": 14100 + }, + { + "epoch": 4.199929261527578, + "grad_norm": 0.24106217920780182, + "learning_rate": 1.3023126821204848e-05, + "loss": 1.2285, + "step": 14101 + }, + { + "epoch": 4.2002271077272475, + "grad_norm": 0.24685721099376678, + "learning_rate": 1.3022207373981676e-05, + "loss": 1.2341, + "step": 14102 + }, + { + "epoch": 4.200524953926916, + "grad_norm": 0.2619490325450897, + "learning_rate": 1.3021287898640245e-05, + "loss": 1.2277, + "step": 14103 + }, + { + "epoch": 4.200822800126585, + "grad_norm": 0.2629368305206299, + "learning_rate": 1.3020368395189107e-05, + "loss": 1.2363, + "step": 14104 + }, + { + "epoch": 4.201120646326253, + "grad_norm": 0.3153458833694458, + "learning_rate": 1.301944886363681e-05, + "loss": 1.226, + "step": 14105 + }, + { + "epoch": 4.201418492525922, + "grad_norm": 0.5970566868782043, + "learning_rate": 1.3018529303991915e-05, + "loss": 1.2054, + "step": 14106 + }, + { + "epoch": 4.201716338725591, + "grad_norm": 0.5265555381774902, + "learning_rate": 1.3017609716262981e-05, + "loss": 1.2322, + "step": 14107 + }, + { + "epoch": 4.202014184925259, + "grad_norm": 0.3880898058414459, + "learning_rate": 1.301669010045856e-05, + "loss": 1.2163, + "step": 14108 + }, + { + "epoch": 4.202312031124928, + "grad_norm": 0.8552187085151672, + "learning_rate": 1.3015770456587206e-05, + "loss": 1.2292, + "step": 14109 + }, + { + "epoch": 4.202609877324597, + "grad_norm": 0.503619372844696, + "learning_rate": 1.3014850784657478e-05, + "loss": 1.2387, + "step": 14110 + }, + { + "epoch": 4.202907723524265, + "grad_norm": 0.4142906963825226, + "learning_rate": 1.3013931084677934e-05, + "loss": 1.2342, + "step": 14111 + }, + { + "epoch": 4.203205569723933, + "grad_norm": 0.41472697257995605, + "learning_rate": 1.3013011356657126e-05, + "loss": 1.2136, + "step": 14112 + }, + { + "epoch": 4.203503415923603, + "grad_norm": 0.35401034355163574, + "learning_rate": 1.3012091600603613e-05, + "loss": 1.2189, + "step": 14113 + }, + { + "epoch": 4.203801262123271, + "grad_norm": 0.30135732889175415, + "learning_rate": 1.3011171816525955e-05, + "loss": 1.2154, + "step": 14114 + }, + { + "epoch": 4.204099108322939, + "grad_norm": 0.3800581991672516, + "learning_rate": 1.3010252004432707e-05, + "loss": 1.2442, + "step": 14115 + }, + { + "epoch": 4.2043969545226085, + "grad_norm": 0.27238842844963074, + "learning_rate": 1.3009332164332427e-05, + "loss": 1.2241, + "step": 14116 + }, + { + "epoch": 4.204694800722277, + "grad_norm": 0.44803619384765625, + "learning_rate": 1.3008412296233673e-05, + "loss": 1.2229, + "step": 14117 + }, + { + "epoch": 4.204992646921946, + "grad_norm": 0.267049640417099, + "learning_rate": 1.3007492400145005e-05, + "loss": 1.2225, + "step": 14118 + }, + { + "epoch": 4.205290493121614, + "grad_norm": 0.41336846351623535, + "learning_rate": 1.3006572476074978e-05, + "loss": 1.2422, + "step": 14119 + }, + { + "epoch": 4.205588339321283, + "grad_norm": 0.2532871663570404, + "learning_rate": 1.3005652524032156e-05, + "loss": 1.2167, + "step": 14120 + }, + { + "epoch": 4.205886185520952, + "grad_norm": 0.2909505367279053, + "learning_rate": 1.3004732544025096e-05, + "loss": 1.2452, + "step": 14121 + }, + { + "epoch": 4.20618403172062, + "grad_norm": 0.3960720896720886, + "learning_rate": 1.3003812536062355e-05, + "loss": 1.2214, + "step": 14122 + }, + { + "epoch": 4.206481877920289, + "grad_norm": 0.24674756824970245, + "learning_rate": 1.3002892500152493e-05, + "loss": 1.2286, + "step": 14123 + }, + { + "epoch": 4.206779724119958, + "grad_norm": 0.37536731362342834, + "learning_rate": 1.3001972436304073e-05, + "loss": 1.2241, + "step": 14124 + }, + { + "epoch": 4.207077570319626, + "grad_norm": 0.27816709876060486, + "learning_rate": 1.3001052344525652e-05, + "loss": 1.243, + "step": 14125 + }, + { + "epoch": 4.2073754165192945, + "grad_norm": 0.2934020459651947, + "learning_rate": 1.3000132224825794e-05, + "loss": 1.2319, + "step": 14126 + }, + { + "epoch": 4.207673262718964, + "grad_norm": 0.35045763850212097, + "learning_rate": 1.2999212077213057e-05, + "loss": 1.2158, + "step": 14127 + }, + { + "epoch": 4.207971108918632, + "grad_norm": 0.31835460662841797, + "learning_rate": 1.2998291901696006e-05, + "loss": 1.2182, + "step": 14128 + }, + { + "epoch": 4.208268955118301, + "grad_norm": 0.4084266424179077, + "learning_rate": 1.2997371698283192e-05, + "loss": 1.2191, + "step": 14129 + }, + { + "epoch": 4.20856680131797, + "grad_norm": 0.2961149215698242, + "learning_rate": 1.2996451466983185e-05, + "loss": 1.2226, + "step": 14130 + }, + { + "epoch": 4.208864647517638, + "grad_norm": 0.5896714329719543, + "learning_rate": 1.2995531207804549e-05, + "loss": 1.2208, + "step": 14131 + }, + { + "epoch": 4.209162493717307, + "grad_norm": 0.33247706294059753, + "learning_rate": 1.2994610920755839e-05, + "loss": 1.2405, + "step": 14132 + }, + { + "epoch": 4.209460339916975, + "grad_norm": 0.317874550819397, + "learning_rate": 1.299369060584562e-05, + "loss": 1.2235, + "step": 14133 + }, + { + "epoch": 4.209758186116644, + "grad_norm": 0.30994951725006104, + "learning_rate": 1.2992770263082455e-05, + "loss": 1.2207, + "step": 14134 + }, + { + "epoch": 4.210056032316313, + "grad_norm": 0.2707478106021881, + "learning_rate": 1.2991849892474905e-05, + "loss": 1.2281, + "step": 14135 + }, + { + "epoch": 4.210353878515981, + "grad_norm": 0.4367784261703491, + "learning_rate": 1.2990929494031537e-05, + "loss": 1.2324, + "step": 14136 + }, + { + "epoch": 4.21065172471565, + "grad_norm": 0.39060741662979126, + "learning_rate": 1.2990009067760908e-05, + "loss": 1.2371, + "step": 14137 + }, + { + "epoch": 4.210949570915319, + "grad_norm": 0.2724064588546753, + "learning_rate": 1.298908861367159e-05, + "loss": 1.2203, + "step": 14138 + }, + { + "epoch": 4.211247417114987, + "grad_norm": 0.3767302930355072, + "learning_rate": 1.2988168131772141e-05, + "loss": 1.2293, + "step": 14139 + }, + { + "epoch": 4.2115452633146555, + "grad_norm": 0.32680049538612366, + "learning_rate": 1.2987247622071124e-05, + "loss": 1.2237, + "step": 14140 + }, + { + "epoch": 4.211843109514325, + "grad_norm": 0.33390113711357117, + "learning_rate": 1.2986327084577106e-05, + "loss": 1.2128, + "step": 14141 + }, + { + "epoch": 4.212140955713993, + "grad_norm": 0.4874330461025238, + "learning_rate": 1.2985406519298652e-05, + "loss": 1.2256, + "step": 14142 + }, + { + "epoch": 4.212438801913661, + "grad_norm": 0.2610390782356262, + "learning_rate": 1.2984485926244326e-05, + "loss": 1.2243, + "step": 14143 + }, + { + "epoch": 4.212736648113331, + "grad_norm": 0.28563764691352844, + "learning_rate": 1.298356530542269e-05, + "loss": 1.2334, + "step": 14144 + }, + { + "epoch": 4.213034494312999, + "grad_norm": 0.32336580753326416, + "learning_rate": 1.2982644656842318e-05, + "loss": 1.223, + "step": 14145 + }, + { + "epoch": 4.213332340512668, + "grad_norm": 0.270393431186676, + "learning_rate": 1.2981723980511764e-05, + "loss": 1.2202, + "step": 14146 + }, + { + "epoch": 4.2136301867123365, + "grad_norm": 0.2720387578010559, + "learning_rate": 1.2980803276439602e-05, + "loss": 1.2232, + "step": 14147 + }, + { + "epoch": 4.213928032912005, + "grad_norm": 0.284346342086792, + "learning_rate": 1.2979882544634397e-05, + "loss": 1.2377, + "step": 14148 + }, + { + "epoch": 4.214225879111674, + "grad_norm": 0.2852690815925598, + "learning_rate": 1.2978961785104714e-05, + "loss": 1.2122, + "step": 14149 + }, + { + "epoch": 4.214523725311342, + "grad_norm": 0.25768882036209106, + "learning_rate": 1.2978040997859118e-05, + "loss": 1.2211, + "step": 14150 + }, + { + "epoch": 4.214821571511011, + "grad_norm": 0.2937699854373932, + "learning_rate": 1.2977120182906178e-05, + "loss": 1.2263, + "step": 14151 + }, + { + "epoch": 4.21511941771068, + "grad_norm": 0.4234558045864105, + "learning_rate": 1.297619934025446e-05, + "loss": 1.2104, + "step": 14152 + }, + { + "epoch": 4.215417263910348, + "grad_norm": 0.46062833070755005, + "learning_rate": 1.2975278469912536e-05, + "loss": 1.2197, + "step": 14153 + }, + { + "epoch": 4.2157151101100165, + "grad_norm": 0.3027644455432892, + "learning_rate": 1.2974357571888966e-05, + "loss": 1.2188, + "step": 14154 + }, + { + "epoch": 4.216012956309686, + "grad_norm": 0.32012224197387695, + "learning_rate": 1.2973436646192328e-05, + "loss": 1.2381, + "step": 14155 + }, + { + "epoch": 4.216310802509354, + "grad_norm": 0.3370289206504822, + "learning_rate": 1.2972515692831178e-05, + "loss": 1.2266, + "step": 14156 + }, + { + "epoch": 4.216608648709023, + "grad_norm": 0.3698771595954895, + "learning_rate": 1.2971594711814093e-05, + "loss": 1.2333, + "step": 14157 + }, + { + "epoch": 4.216906494908692, + "grad_norm": 0.3698229193687439, + "learning_rate": 1.297067370314964e-05, + "loss": 1.2148, + "step": 14158 + }, + { + "epoch": 4.21720434110836, + "grad_norm": 0.4046650230884552, + "learning_rate": 1.2969752666846384e-05, + "loss": 1.2057, + "step": 14159 + }, + { + "epoch": 4.217502187308029, + "grad_norm": 0.3872756063938141, + "learning_rate": 1.2968831602912902e-05, + "loss": 1.2098, + "step": 14160 + }, + { + "epoch": 4.2178000335076975, + "grad_norm": 0.3703349232673645, + "learning_rate": 1.2967910511357755e-05, + "loss": 1.2177, + "step": 14161 + }, + { + "epoch": 4.218097879707366, + "grad_norm": 0.28257542848587036, + "learning_rate": 1.296698939218952e-05, + "loss": 1.2281, + "step": 14162 + }, + { + "epoch": 4.218395725907035, + "grad_norm": 0.3109702467918396, + "learning_rate": 1.2966068245416762e-05, + "loss": 1.2281, + "step": 14163 + }, + { + "epoch": 4.218693572106703, + "grad_norm": 0.28986650705337524, + "learning_rate": 1.296514707104805e-05, + "loss": 1.2254, + "step": 14164 + }, + { + "epoch": 4.218991418306372, + "grad_norm": 0.2609320878982544, + "learning_rate": 1.2964225869091962e-05, + "loss": 1.226, + "step": 14165 + }, + { + "epoch": 4.219289264506041, + "grad_norm": 0.32675209641456604, + "learning_rate": 1.2963304639557061e-05, + "loss": 1.2322, + "step": 14166 + }, + { + "epoch": 4.219587110705709, + "grad_norm": 0.2727464437484741, + "learning_rate": 1.2962383382451922e-05, + "loss": 1.2296, + "step": 14167 + }, + { + "epoch": 4.219884956905378, + "grad_norm": 0.29681476950645447, + "learning_rate": 1.2961462097785116e-05, + "loss": 1.2254, + "step": 14168 + }, + { + "epoch": 4.220182803105047, + "grad_norm": 0.26645466685295105, + "learning_rate": 1.2960540785565216e-05, + "loss": 1.2223, + "step": 14169 + }, + { + "epoch": 4.220480649304715, + "grad_norm": 0.28149473667144775, + "learning_rate": 1.2959619445800786e-05, + "loss": 1.245, + "step": 14170 + }, + { + "epoch": 4.220778495504384, + "grad_norm": 0.2772013247013092, + "learning_rate": 1.295869807850041e-05, + "loss": 1.2414, + "step": 14171 + }, + { + "epoch": 4.221076341704053, + "grad_norm": 0.32597294449806213, + "learning_rate": 1.2957776683672651e-05, + "loss": 1.2328, + "step": 14172 + }, + { + "epoch": 4.221374187903721, + "grad_norm": 0.2925127148628235, + "learning_rate": 1.2956855261326085e-05, + "loss": 1.2263, + "step": 14173 + }, + { + "epoch": 4.22167203410339, + "grad_norm": 0.4555399715900421, + "learning_rate": 1.2955933811469284e-05, + "loss": 1.2173, + "step": 14174 + }, + { + "epoch": 4.2219698803030585, + "grad_norm": 0.42685988545417786, + "learning_rate": 1.295501233411082e-05, + "loss": 1.2259, + "step": 14175 + }, + { + "epoch": 4.222267726502727, + "grad_norm": 0.29122740030288696, + "learning_rate": 1.295409082925927e-05, + "loss": 1.24, + "step": 14176 + }, + { + "epoch": 4.222565572702396, + "grad_norm": 0.35038653016090393, + "learning_rate": 1.2953169296923202e-05, + "loss": 1.2337, + "step": 14177 + }, + { + "epoch": 4.222863418902064, + "grad_norm": 0.2838687300682068, + "learning_rate": 1.2952247737111196e-05, + "loss": 1.2201, + "step": 14178 + }, + { + "epoch": 4.223161265101733, + "grad_norm": 0.3801766037940979, + "learning_rate": 1.2951326149831826e-05, + "loss": 1.2299, + "step": 14179 + }, + { + "epoch": 4.223459111301402, + "grad_norm": 0.2637985050678253, + "learning_rate": 1.2950404535093659e-05, + "loss": 1.2275, + "step": 14180 + }, + { + "epoch": 4.22375695750107, + "grad_norm": 0.6842769980430603, + "learning_rate": 1.2949482892905276e-05, + "loss": 1.2197, + "step": 14181 + }, + { + "epoch": 4.224054803700739, + "grad_norm": 0.5419729948043823, + "learning_rate": 1.294856122327525e-05, + "loss": 1.2285, + "step": 14182 + }, + { + "epoch": 4.224352649900408, + "grad_norm": 0.4426734149456024, + "learning_rate": 1.2947639526212157e-05, + "loss": 1.2243, + "step": 14183 + }, + { + "epoch": 4.224650496100076, + "grad_norm": 0.5549633502960205, + "learning_rate": 1.2946717801724571e-05, + "loss": 1.214, + "step": 14184 + }, + { + "epoch": 4.224948342299745, + "grad_norm": 0.29709017276763916, + "learning_rate": 1.2945796049821066e-05, + "loss": 1.2121, + "step": 14185 + }, + { + "epoch": 4.225246188499414, + "grad_norm": 0.4249708652496338, + "learning_rate": 1.2944874270510223e-05, + "loss": 1.2439, + "step": 14186 + }, + { + "epoch": 4.225544034699082, + "grad_norm": 0.29149335622787476, + "learning_rate": 1.2943952463800616e-05, + "loss": 1.2213, + "step": 14187 + }, + { + "epoch": 4.225841880898751, + "grad_norm": 0.3362438976764679, + "learning_rate": 1.2943030629700815e-05, + "loss": 1.2235, + "step": 14188 + }, + { + "epoch": 4.22613972709842, + "grad_norm": 0.26623889803886414, + "learning_rate": 1.2942108768219406e-05, + "loss": 1.2314, + "step": 14189 + }, + { + "epoch": 4.226437573298088, + "grad_norm": 0.31887322664260864, + "learning_rate": 1.2941186879364961e-05, + "loss": 1.215, + "step": 14190 + }, + { + "epoch": 4.226735419497757, + "grad_norm": 0.2840903103351593, + "learning_rate": 1.2940264963146059e-05, + "loss": 1.208, + "step": 14191 + }, + { + "epoch": 4.227033265697425, + "grad_norm": 0.33410122990608215, + "learning_rate": 1.2939343019571275e-05, + "loss": 1.2239, + "step": 14192 + }, + { + "epoch": 4.227331111897094, + "grad_norm": 0.28891879320144653, + "learning_rate": 1.293842104864919e-05, + "loss": 1.2237, + "step": 14193 + }, + { + "epoch": 4.227628958096763, + "grad_norm": 0.3174760341644287, + "learning_rate": 1.2937499050388377e-05, + "loss": 1.2228, + "step": 14194 + }, + { + "epoch": 4.227926804296431, + "grad_norm": 0.2664237916469574, + "learning_rate": 1.2936577024797421e-05, + "loss": 1.2337, + "step": 14195 + }, + { + "epoch": 4.2282246504961005, + "grad_norm": 0.3116903007030487, + "learning_rate": 1.2935654971884897e-05, + "loss": 1.2027, + "step": 14196 + }, + { + "epoch": 4.228522496695769, + "grad_norm": 0.36390095949172974, + "learning_rate": 1.2934732891659382e-05, + "loss": 1.2338, + "step": 14197 + }, + { + "epoch": 4.228820342895437, + "grad_norm": 0.2807815968990326, + "learning_rate": 1.2933810784129457e-05, + "loss": 1.2329, + "step": 14198 + }, + { + "epoch": 4.229118189095106, + "grad_norm": 0.36470362544059753, + "learning_rate": 1.2932888649303699e-05, + "loss": 1.2241, + "step": 14199 + }, + { + "epoch": 4.229416035294775, + "grad_norm": 0.25736430287361145, + "learning_rate": 1.2931966487190691e-05, + "loss": 1.2186, + "step": 14200 + }, + { + "epoch": 4.229713881494443, + "grad_norm": 0.39754676818847656, + "learning_rate": 1.2931044297799007e-05, + "loss": 1.2139, + "step": 14201 + }, + { + "epoch": 4.230011727694112, + "grad_norm": 0.3175043761730194, + "learning_rate": 1.2930122081137234e-05, + "loss": 1.2213, + "step": 14202 + }, + { + "epoch": 4.230309573893781, + "grad_norm": 0.43965598940849304, + "learning_rate": 1.2929199837213949e-05, + "loss": 1.2295, + "step": 14203 + }, + { + "epoch": 4.230607420093449, + "grad_norm": 0.29722118377685547, + "learning_rate": 1.292827756603773e-05, + "loss": 1.2271, + "step": 14204 + }, + { + "epoch": 4.230905266293118, + "grad_norm": 0.28060266375541687, + "learning_rate": 1.292735526761716e-05, + "loss": 1.2311, + "step": 14205 + }, + { + "epoch": 4.2312031124927865, + "grad_norm": 0.2804664373397827, + "learning_rate": 1.2926432941960822e-05, + "loss": 1.2219, + "step": 14206 + }, + { + "epoch": 4.231500958692455, + "grad_norm": 0.27232399582862854, + "learning_rate": 1.2925510589077292e-05, + "loss": 1.2217, + "step": 14207 + }, + { + "epoch": 4.231798804892124, + "grad_norm": 0.2631955146789551, + "learning_rate": 1.2924588208975158e-05, + "loss": 1.2178, + "step": 14208 + }, + { + "epoch": 4.232096651091792, + "grad_norm": 0.29186931252479553, + "learning_rate": 1.2923665801662995e-05, + "loss": 1.2332, + "step": 14209 + }, + { + "epoch": 4.232394497291461, + "grad_norm": 0.3281896412372589, + "learning_rate": 1.292274336714939e-05, + "loss": 1.2254, + "step": 14210 + }, + { + "epoch": 4.23269234349113, + "grad_norm": 0.25704681873321533, + "learning_rate": 1.2921820905442923e-05, + "loss": 1.2138, + "step": 14211 + }, + { + "epoch": 4.232990189690798, + "grad_norm": 0.2990695536136627, + "learning_rate": 1.2920898416552177e-05, + "loss": 1.2292, + "step": 14212 + }, + { + "epoch": 4.233288035890467, + "grad_norm": 0.2677648067474365, + "learning_rate": 1.2919975900485735e-05, + "loss": 1.2259, + "step": 14213 + }, + { + "epoch": 4.233585882090136, + "grad_norm": 0.25805628299713135, + "learning_rate": 1.2919053357252181e-05, + "loss": 1.223, + "step": 14214 + }, + { + "epoch": 4.233883728289804, + "grad_norm": 0.25021156668663025, + "learning_rate": 1.2918130786860094e-05, + "loss": 1.2151, + "step": 14215 + }, + { + "epoch": 4.234181574489473, + "grad_norm": 0.34445720911026, + "learning_rate": 1.2917208189318063e-05, + "loss": 1.2344, + "step": 14216 + }, + { + "epoch": 4.234479420689142, + "grad_norm": 0.26035237312316895, + "learning_rate": 1.2916285564634667e-05, + "loss": 1.2167, + "step": 14217 + }, + { + "epoch": 4.23477726688881, + "grad_norm": 0.31001225113868713, + "learning_rate": 1.2915362912818492e-05, + "loss": 1.2054, + "step": 14218 + }, + { + "epoch": 4.235075113088479, + "grad_norm": 0.26624149084091187, + "learning_rate": 1.2914440233878123e-05, + "loss": 1.2323, + "step": 14219 + }, + { + "epoch": 4.2353729592881475, + "grad_norm": 0.28126007318496704, + "learning_rate": 1.2913517527822146e-05, + "loss": 1.2271, + "step": 14220 + }, + { + "epoch": 4.235670805487816, + "grad_norm": 0.28897514939308167, + "learning_rate": 1.2912594794659141e-05, + "loss": 1.2294, + "step": 14221 + }, + { + "epoch": 4.235968651687485, + "grad_norm": 0.27257728576660156, + "learning_rate": 1.2911672034397695e-05, + "loss": 1.2132, + "step": 14222 + }, + { + "epoch": 4.236266497887153, + "grad_norm": 0.42830389738082886, + "learning_rate": 1.2910749247046397e-05, + "loss": 1.2183, + "step": 14223 + }, + { + "epoch": 4.236564344086823, + "grad_norm": 0.25782597064971924, + "learning_rate": 1.2909826432613825e-05, + "loss": 1.2186, + "step": 14224 + }, + { + "epoch": 4.236862190286491, + "grad_norm": 0.2812288701534271, + "learning_rate": 1.2908903591108572e-05, + "loss": 1.2252, + "step": 14225 + }, + { + "epoch": 4.237160036486159, + "grad_norm": 0.30920568108558655, + "learning_rate": 1.2907980722539219e-05, + "loss": 1.2352, + "step": 14226 + }, + { + "epoch": 4.2374578826858285, + "grad_norm": 0.3106936812400818, + "learning_rate": 1.2907057826914357e-05, + "loss": 1.2217, + "step": 14227 + }, + { + "epoch": 4.237755728885497, + "grad_norm": 0.3300728499889374, + "learning_rate": 1.290613490424257e-05, + "loss": 1.2418, + "step": 14228 + }, + { + "epoch": 4.238053575085165, + "grad_norm": 0.39398500323295593, + "learning_rate": 1.2905211954532441e-05, + "loss": 1.2237, + "step": 14229 + }, + { + "epoch": 4.238351421284834, + "grad_norm": 0.3569934666156769, + "learning_rate": 1.2904288977792566e-05, + "loss": 1.2324, + "step": 14230 + }, + { + "epoch": 4.238649267484503, + "grad_norm": 0.6860139966011047, + "learning_rate": 1.2903365974031523e-05, + "loss": 1.2127, + "step": 14231 + }, + { + "epoch": 4.238947113684171, + "grad_norm": 0.4754354655742645, + "learning_rate": 1.2902442943257905e-05, + "loss": 1.232, + "step": 14232 + }, + { + "epoch": 4.23924495988384, + "grad_norm": 0.5190975069999695, + "learning_rate": 1.2901519885480297e-05, + "loss": 1.2271, + "step": 14233 + }, + { + "epoch": 4.2395428060835085, + "grad_norm": 0.47939419746398926, + "learning_rate": 1.2900596800707288e-05, + "loss": 1.2143, + "step": 14234 + }, + { + "epoch": 4.239840652283177, + "grad_norm": 0.4061165452003479, + "learning_rate": 1.2899673688947467e-05, + "loss": 1.226, + "step": 14235 + }, + { + "epoch": 4.240138498482846, + "grad_norm": 0.3673345446586609, + "learning_rate": 1.2898750550209422e-05, + "loss": 1.2187, + "step": 14236 + }, + { + "epoch": 4.240436344682514, + "grad_norm": 0.4002215564250946, + "learning_rate": 1.2897827384501744e-05, + "loss": 1.2104, + "step": 14237 + }, + { + "epoch": 4.240734190882184, + "grad_norm": 0.266471266746521, + "learning_rate": 1.2896904191833018e-05, + "loss": 1.2061, + "step": 14238 + }, + { + "epoch": 4.241032037081852, + "grad_norm": 0.3146986663341522, + "learning_rate": 1.2895980972211835e-05, + "loss": 1.2267, + "step": 14239 + }, + { + "epoch": 4.24132988328152, + "grad_norm": 0.25907179713249207, + "learning_rate": 1.2895057725646785e-05, + "loss": 1.2282, + "step": 14240 + }, + { + "epoch": 4.2416277294811895, + "grad_norm": 0.2718636989593506, + "learning_rate": 1.2894134452146457e-05, + "loss": 1.2281, + "step": 14241 + }, + { + "epoch": 4.241925575680858, + "grad_norm": 0.27919113636016846, + "learning_rate": 1.2893211151719441e-05, + "loss": 1.2328, + "step": 14242 + }, + { + "epoch": 4.242223421880526, + "grad_norm": 0.2516349256038666, + "learning_rate": 1.2892287824374328e-05, + "loss": 1.2097, + "step": 14243 + }, + { + "epoch": 4.242521268080195, + "grad_norm": 0.3680274784564972, + "learning_rate": 1.2891364470119711e-05, + "loss": 1.2298, + "step": 14244 + }, + { + "epoch": 4.242819114279864, + "grad_norm": 0.26119786500930786, + "learning_rate": 1.2890441088964174e-05, + "loss": 1.2286, + "step": 14245 + }, + { + "epoch": 4.243116960479532, + "grad_norm": 0.307192862033844, + "learning_rate": 1.2889517680916314e-05, + "loss": 1.2067, + "step": 14246 + }, + { + "epoch": 4.243414806679201, + "grad_norm": 0.3036113977432251, + "learning_rate": 1.288859424598472e-05, + "loss": 1.2199, + "step": 14247 + }, + { + "epoch": 4.24371265287887, + "grad_norm": 0.34055301547050476, + "learning_rate": 1.2887670784177983e-05, + "loss": 1.2278, + "step": 14248 + }, + { + "epoch": 4.244010499078538, + "grad_norm": 0.27632489800453186, + "learning_rate": 1.2886747295504698e-05, + "loss": 1.2235, + "step": 14249 + }, + { + "epoch": 4.244308345278207, + "grad_norm": 0.27771714329719543, + "learning_rate": 1.2885823779973451e-05, + "loss": 1.2332, + "step": 14250 + }, + { + "epoch": 4.244606191477875, + "grad_norm": 0.2673502266407013, + "learning_rate": 1.2884900237592843e-05, + "loss": 1.2194, + "step": 14251 + }, + { + "epoch": 4.244904037677545, + "grad_norm": 0.25433048605918884, + "learning_rate": 1.2883976668371456e-05, + "loss": 1.2304, + "step": 14252 + }, + { + "epoch": 4.245201883877213, + "grad_norm": 0.2676333487033844, + "learning_rate": 1.2883053072317891e-05, + "loss": 1.2256, + "step": 14253 + }, + { + "epoch": 4.245499730076881, + "grad_norm": 0.2577076554298401, + "learning_rate": 1.288212944944074e-05, + "loss": 1.2295, + "step": 14254 + }, + { + "epoch": 4.2457975762765505, + "grad_norm": 0.31689655780792236, + "learning_rate": 1.2881205799748593e-05, + "loss": 1.2179, + "step": 14255 + }, + { + "epoch": 4.246095422476219, + "grad_norm": 0.280807226896286, + "learning_rate": 1.2880282123250045e-05, + "loss": 1.2518, + "step": 14256 + }, + { + "epoch": 4.246393268675887, + "grad_norm": 0.30824899673461914, + "learning_rate": 1.2879358419953691e-05, + "loss": 1.2263, + "step": 14257 + }, + { + "epoch": 4.246691114875556, + "grad_norm": 0.2893029451370239, + "learning_rate": 1.287843468986812e-05, + "loss": 1.2277, + "step": 14258 + }, + { + "epoch": 4.246988961075225, + "grad_norm": 0.3122074007987976, + "learning_rate": 1.2877510933001933e-05, + "loss": 1.2029, + "step": 14259 + }, + { + "epoch": 4.247286807274893, + "grad_norm": 0.36463871598243713, + "learning_rate": 1.2876587149363722e-05, + "loss": 1.2354, + "step": 14260 + }, + { + "epoch": 4.247584653474562, + "grad_norm": 0.29337945580482483, + "learning_rate": 1.287566333896208e-05, + "loss": 1.2265, + "step": 14261 + }, + { + "epoch": 4.247882499674231, + "grad_norm": 0.270013689994812, + "learning_rate": 1.2874739501805607e-05, + "loss": 1.2264, + "step": 14262 + }, + { + "epoch": 4.2481803458739, + "grad_norm": 0.28883761167526245, + "learning_rate": 1.287381563790289e-05, + "loss": 1.2119, + "step": 14263 + }, + { + "epoch": 4.248478192073568, + "grad_norm": 0.5028244853019714, + "learning_rate": 1.2872891747262534e-05, + "loss": 1.2346, + "step": 14264 + }, + { + "epoch": 4.2487760382732365, + "grad_norm": 0.267386794090271, + "learning_rate": 1.2871967829893128e-05, + "loss": 1.2188, + "step": 14265 + }, + { + "epoch": 4.249073884472906, + "grad_norm": 0.4257965683937073, + "learning_rate": 1.287104388580327e-05, + "loss": 1.2298, + "step": 14266 + }, + { + "epoch": 4.249371730672574, + "grad_norm": 0.3177397549152374, + "learning_rate": 1.2870119915001555e-05, + "loss": 1.2432, + "step": 14267 + }, + { + "epoch": 4.249669576872242, + "grad_norm": 0.3956317603588104, + "learning_rate": 1.2869195917496585e-05, + "loss": 1.2344, + "step": 14268 + }, + { + "epoch": 4.2499674230719116, + "grad_norm": 0.35344603657722473, + "learning_rate": 1.2868271893296949e-05, + "loss": 1.206, + "step": 14269 + }, + { + "epoch": 4.25026526927158, + "grad_norm": 0.3135831356048584, + "learning_rate": 1.2867347842411246e-05, + "loss": 1.2191, + "step": 14270 + }, + { + "epoch": 4.250563115471248, + "grad_norm": 0.3514217734336853, + "learning_rate": 1.2866423764848079e-05, + "loss": 1.2314, + "step": 14271 + }, + { + "epoch": 4.250860961670917, + "grad_norm": 0.2604618966579437, + "learning_rate": 1.2865499660616039e-05, + "loss": 1.2224, + "step": 14272 + }, + { + "epoch": 4.251158807870586, + "grad_norm": 0.31919756531715393, + "learning_rate": 1.2864575529723726e-05, + "loss": 1.2254, + "step": 14273 + }, + { + "epoch": 4.251456654070254, + "grad_norm": 0.24949854612350464, + "learning_rate": 1.2863651372179742e-05, + "loss": 1.2261, + "step": 14274 + }, + { + "epoch": 4.251754500269923, + "grad_norm": 0.254134863615036, + "learning_rate": 1.2862727187992676e-05, + "loss": 1.2409, + "step": 14275 + }, + { + "epoch": 4.252052346469592, + "grad_norm": 0.2743396461009979, + "learning_rate": 1.2861802977171133e-05, + "loss": 1.2387, + "step": 14276 + }, + { + "epoch": 4.25235019266926, + "grad_norm": 0.44552743434906006, + "learning_rate": 1.2860878739723713e-05, + "loss": 1.2083, + "step": 14277 + }, + { + "epoch": 4.252648038868929, + "grad_norm": 0.3872476816177368, + "learning_rate": 1.2859954475659013e-05, + "loss": 1.2043, + "step": 14278 + }, + { + "epoch": 4.2529458850685975, + "grad_norm": 0.3023822009563446, + "learning_rate": 1.285903018498563e-05, + "loss": 1.2316, + "step": 14279 + }, + { + "epoch": 4.253243731268267, + "grad_norm": 0.453902930021286, + "learning_rate": 1.2858105867712167e-05, + "loss": 1.2175, + "step": 14280 + }, + { + "epoch": 4.253541577467935, + "grad_norm": 0.25171342492103577, + "learning_rate": 1.2857181523847222e-05, + "loss": 1.2347, + "step": 14281 + }, + { + "epoch": 4.253839423667603, + "grad_norm": 0.47940343618392944, + "learning_rate": 1.2856257153399395e-05, + "loss": 1.2295, + "step": 14282 + }, + { + "epoch": 4.254137269867273, + "grad_norm": 0.2452671080827713, + "learning_rate": 1.2855332756377284e-05, + "loss": 1.2132, + "step": 14283 + }, + { + "epoch": 4.254435116066941, + "grad_norm": 0.5154544115066528, + "learning_rate": 1.2854408332789494e-05, + "loss": 1.2238, + "step": 14284 + }, + { + "epoch": 4.254732962266609, + "grad_norm": 0.31857696175575256, + "learning_rate": 1.2853483882644625e-05, + "loss": 1.2058, + "step": 14285 + }, + { + "epoch": 4.2550308084662785, + "grad_norm": 0.3479495942592621, + "learning_rate": 1.2852559405951274e-05, + "loss": 1.2229, + "step": 14286 + }, + { + "epoch": 4.255328654665947, + "grad_norm": 0.26344847679138184, + "learning_rate": 1.2851634902718047e-05, + "loss": 1.2437, + "step": 14287 + }, + { + "epoch": 4.255626500865615, + "grad_norm": 0.3922542929649353, + "learning_rate": 1.2850710372953541e-05, + "loss": 1.2248, + "step": 14288 + }, + { + "epoch": 4.255924347065284, + "grad_norm": 0.27182137966156006, + "learning_rate": 1.284978581666636e-05, + "loss": 1.2076, + "step": 14289 + }, + { + "epoch": 4.256222193264953, + "grad_norm": 0.3713935613632202, + "learning_rate": 1.2848861233865107e-05, + "loss": 1.2165, + "step": 14290 + }, + { + "epoch": 4.256520039464622, + "grad_norm": 0.306208074092865, + "learning_rate": 1.2847936624558383e-05, + "loss": 1.234, + "step": 14291 + }, + { + "epoch": 4.25681788566429, + "grad_norm": 0.3006187379360199, + "learning_rate": 1.2847011988754794e-05, + "loss": 1.2088, + "step": 14292 + }, + { + "epoch": 4.2571157318639585, + "grad_norm": 0.2920163869857788, + "learning_rate": 1.2846087326462932e-05, + "loss": 1.2279, + "step": 14293 + }, + { + "epoch": 4.257413578063628, + "grad_norm": 0.3001825213432312, + "learning_rate": 1.2845162637691414e-05, + "loss": 1.2356, + "step": 14294 + }, + { + "epoch": 4.257711424263296, + "grad_norm": 0.290507972240448, + "learning_rate": 1.2844237922448835e-05, + "loss": 1.2277, + "step": 14295 + }, + { + "epoch": 4.258009270462964, + "grad_norm": 0.3379744589328766, + "learning_rate": 1.2843313180743797e-05, + "loss": 1.2146, + "step": 14296 + }, + { + "epoch": 4.258307116662634, + "grad_norm": 0.26328563690185547, + "learning_rate": 1.2842388412584907e-05, + "loss": 1.2225, + "step": 14297 + }, + { + "epoch": 4.258604962862302, + "grad_norm": 0.32128599286079407, + "learning_rate": 1.2841463617980772e-05, + "loss": 1.2375, + "step": 14298 + }, + { + "epoch": 4.25890280906197, + "grad_norm": 0.2736668884754181, + "learning_rate": 1.2840538796939987e-05, + "loss": 1.216, + "step": 14299 + }, + { + "epoch": 4.2592006552616395, + "grad_norm": 0.26555588841438293, + "learning_rate": 1.2839613949471168e-05, + "loss": 1.2213, + "step": 14300 + }, + { + "epoch": 4.259498501461308, + "grad_norm": 0.2700282037258148, + "learning_rate": 1.2838689075582911e-05, + "loss": 1.2363, + "step": 14301 + }, + { + "epoch": 4.259796347660977, + "grad_norm": 0.2413858026266098, + "learning_rate": 1.2837764175283827e-05, + "loss": 1.2367, + "step": 14302 + }, + { + "epoch": 4.260094193860645, + "grad_norm": 0.2933841347694397, + "learning_rate": 1.2836839248582514e-05, + "loss": 1.2148, + "step": 14303 + }, + { + "epoch": 4.260392040060314, + "grad_norm": 0.24752715229988098, + "learning_rate": 1.2835914295487581e-05, + "loss": 1.2238, + "step": 14304 + }, + { + "epoch": 4.260689886259983, + "grad_norm": 0.24317148327827454, + "learning_rate": 1.2834989316007635e-05, + "loss": 1.2135, + "step": 14305 + }, + { + "epoch": 4.260987732459651, + "grad_norm": 0.2943892180919647, + "learning_rate": 1.2834064310151283e-05, + "loss": 1.228, + "step": 14306 + }, + { + "epoch": 4.26128557865932, + "grad_norm": 0.2483043521642685, + "learning_rate": 1.2833139277927125e-05, + "loss": 1.2301, + "step": 14307 + }, + { + "epoch": 4.261583424858989, + "grad_norm": 0.28642261028289795, + "learning_rate": 1.2832214219343773e-05, + "loss": 1.2229, + "step": 14308 + }, + { + "epoch": 4.261881271058657, + "grad_norm": 0.2563004195690155, + "learning_rate": 1.2831289134409834e-05, + "loss": 1.2426, + "step": 14309 + }, + { + "epoch": 4.262179117258325, + "grad_norm": 0.30432990193367004, + "learning_rate": 1.2830364023133913e-05, + "loss": 1.2275, + "step": 14310 + }, + { + "epoch": 4.262476963457995, + "grad_norm": 0.29737430810928345, + "learning_rate": 1.2829438885524611e-05, + "loss": 1.2241, + "step": 14311 + }, + { + "epoch": 4.262774809657663, + "grad_norm": 0.2994697690010071, + "learning_rate": 1.2828513721590547e-05, + "loss": 1.2146, + "step": 14312 + }, + { + "epoch": 4.263072655857331, + "grad_norm": 0.30888044834136963, + "learning_rate": 1.2827588531340323e-05, + "loss": 1.2396, + "step": 14313 + }, + { + "epoch": 4.2633705020570005, + "grad_norm": 0.42900556325912476, + "learning_rate": 1.2826663314782547e-05, + "loss": 1.2519, + "step": 14314 + }, + { + "epoch": 4.263668348256669, + "grad_norm": 0.29265686869621277, + "learning_rate": 1.2825738071925827e-05, + "loss": 1.2235, + "step": 14315 + }, + { + "epoch": 4.263966194456337, + "grad_norm": 0.31383705139160156, + "learning_rate": 1.282481280277877e-05, + "loss": 1.2239, + "step": 14316 + }, + { + "epoch": 4.264264040656006, + "grad_norm": 0.298958957195282, + "learning_rate": 1.2823887507349983e-05, + "loss": 1.2237, + "step": 14317 + }, + { + "epoch": 4.264561886855675, + "grad_norm": 0.33606821298599243, + "learning_rate": 1.282296218564808e-05, + "loss": 1.2116, + "step": 14318 + }, + { + "epoch": 4.264859733055344, + "grad_norm": 0.3485826551914215, + "learning_rate": 1.282203683768167e-05, + "loss": 1.2226, + "step": 14319 + }, + { + "epoch": 4.265157579255012, + "grad_norm": 0.3396190106868744, + "learning_rate": 1.282111146345936e-05, + "loss": 1.2279, + "step": 14320 + }, + { + "epoch": 4.265455425454681, + "grad_norm": 0.24842680990695953, + "learning_rate": 1.2820186062989759e-05, + "loss": 1.2215, + "step": 14321 + }, + { + "epoch": 4.26575327165435, + "grad_norm": 0.2768348157405853, + "learning_rate": 1.2819260636281477e-05, + "loss": 1.2108, + "step": 14322 + }, + { + "epoch": 4.266051117854018, + "grad_norm": 0.30374816060066223, + "learning_rate": 1.2818335183343125e-05, + "loss": 1.2133, + "step": 14323 + }, + { + "epoch": 4.2663489640536865, + "grad_norm": 0.6033021807670593, + "learning_rate": 1.2817409704183312e-05, + "loss": 1.2129, + "step": 14324 + }, + { + "epoch": 4.266646810253356, + "grad_norm": 0.5600675344467163, + "learning_rate": 1.281648419881065e-05, + "loss": 1.2161, + "step": 14325 + }, + { + "epoch": 4.266944656453024, + "grad_norm": 0.27175745368003845, + "learning_rate": 1.2815558667233751e-05, + "loss": 1.2339, + "step": 14326 + }, + { + "epoch": 4.267242502652692, + "grad_norm": 0.5855613350868225, + "learning_rate": 1.2814633109461224e-05, + "loss": 1.2074, + "step": 14327 + }, + { + "epoch": 4.2675403488523616, + "grad_norm": 0.29754626750946045, + "learning_rate": 1.281370752550168e-05, + "loss": 1.2185, + "step": 14328 + }, + { + "epoch": 4.26783819505203, + "grad_norm": 0.520555853843689, + "learning_rate": 1.281278191536373e-05, + "loss": 1.219, + "step": 14329 + }, + { + "epoch": 4.268136041251699, + "grad_norm": 0.35054296255111694, + "learning_rate": 1.2811856279055985e-05, + "loss": 1.2157, + "step": 14330 + }, + { + "epoch": 4.268433887451367, + "grad_norm": 0.36541903018951416, + "learning_rate": 1.2810930616587063e-05, + "loss": 1.2146, + "step": 14331 + }, + { + "epoch": 4.268731733651036, + "grad_norm": 0.37231627106666565, + "learning_rate": 1.2810004927965568e-05, + "loss": 1.2109, + "step": 14332 + }, + { + "epoch": 4.269029579850705, + "grad_norm": 0.30162325501441956, + "learning_rate": 1.2809079213200124e-05, + "loss": 1.2299, + "step": 14333 + }, + { + "epoch": 4.269327426050373, + "grad_norm": 0.2992917001247406, + "learning_rate": 1.2808153472299329e-05, + "loss": 1.2135, + "step": 14334 + }, + { + "epoch": 4.269625272250042, + "grad_norm": 0.34873780608177185, + "learning_rate": 1.2807227705271806e-05, + "loss": 1.2145, + "step": 14335 + }, + { + "epoch": 4.269923118449711, + "grad_norm": 0.2616593837738037, + "learning_rate": 1.2806301912126166e-05, + "loss": 1.2382, + "step": 14336 + }, + { + "epoch": 4.270220964649379, + "grad_norm": 0.536194920539856, + "learning_rate": 1.2805376092871022e-05, + "loss": 1.2171, + "step": 14337 + }, + { + "epoch": 4.2705188108490475, + "grad_norm": 0.3619884252548218, + "learning_rate": 1.2804450247514988e-05, + "loss": 1.216, + "step": 14338 + }, + { + "epoch": 4.270816657048717, + "grad_norm": 0.42572298645973206, + "learning_rate": 1.280352437606668e-05, + "loss": 1.2324, + "step": 14339 + }, + { + "epoch": 4.271114503248385, + "grad_norm": 0.2787296175956726, + "learning_rate": 1.2802598478534705e-05, + "loss": 1.2221, + "step": 14340 + }, + { + "epoch": 4.271412349448053, + "grad_norm": 0.4477073550224304, + "learning_rate": 1.2801672554927684e-05, + "loss": 1.2267, + "step": 14341 + }, + { + "epoch": 4.271710195647723, + "grad_norm": 0.24937401711940765, + "learning_rate": 1.280074660525423e-05, + "loss": 1.2149, + "step": 14342 + }, + { + "epoch": 4.272008041847391, + "grad_norm": 0.40457332134246826, + "learning_rate": 1.279982062952296e-05, + "loss": 1.2414, + "step": 14343 + }, + { + "epoch": 4.272305888047059, + "grad_norm": 0.37612777948379517, + "learning_rate": 1.2798894627742486e-05, + "loss": 1.2288, + "step": 14344 + }, + { + "epoch": 4.2726037342467285, + "grad_norm": 0.4070606231689453, + "learning_rate": 1.2797968599921426e-05, + "loss": 1.2226, + "step": 14345 + }, + { + "epoch": 4.272901580446397, + "grad_norm": 0.380602091550827, + "learning_rate": 1.2797042546068392e-05, + "loss": 1.2367, + "step": 14346 + }, + { + "epoch": 4.273199426646066, + "grad_norm": 0.3362421691417694, + "learning_rate": 1.2796116466192004e-05, + "loss": 1.2326, + "step": 14347 + }, + { + "epoch": 4.273497272845734, + "grad_norm": 0.37402573227882385, + "learning_rate": 1.2795190360300875e-05, + "loss": 1.2382, + "step": 14348 + }, + { + "epoch": 4.273795119045403, + "grad_norm": 0.2479506880044937, + "learning_rate": 1.2794264228403624e-05, + "loss": 1.22, + "step": 14349 + }, + { + "epoch": 4.274092965245072, + "grad_norm": 0.7761954069137573, + "learning_rate": 1.2793338070508865e-05, + "loss": 1.2152, + "step": 14350 + }, + { + "epoch": 4.27439081144474, + "grad_norm": 0.4211745262145996, + "learning_rate": 1.2792411886625215e-05, + "loss": 1.2271, + "step": 14351 + }, + { + "epoch": 4.2746886576444085, + "grad_norm": 0.45816418528556824, + "learning_rate": 1.2791485676761295e-05, + "loss": 1.2294, + "step": 14352 + }, + { + "epoch": 4.274986503844078, + "grad_norm": 0.4153323173522949, + "learning_rate": 1.2790559440925718e-05, + "loss": 1.2453, + "step": 14353 + }, + { + "epoch": 4.275284350043746, + "grad_norm": 0.40850386023521423, + "learning_rate": 1.2789633179127103e-05, + "loss": 1.2236, + "step": 14354 + }, + { + "epoch": 4.275582196243414, + "grad_norm": 0.6133487224578857, + "learning_rate": 1.2788706891374068e-05, + "loss": 1.2135, + "step": 14355 + }, + { + "epoch": 4.275880042443084, + "grad_norm": 0.30651339888572693, + "learning_rate": 1.2787780577675235e-05, + "loss": 1.2315, + "step": 14356 + }, + { + "epoch": 4.276177888642752, + "grad_norm": 0.2998017370700836, + "learning_rate": 1.2786854238039215e-05, + "loss": 1.233, + "step": 14357 + }, + { + "epoch": 4.276475734842421, + "grad_norm": 0.3250484764575958, + "learning_rate": 1.2785927872474627e-05, + "loss": 1.2161, + "step": 14358 + }, + { + "epoch": 4.2767735810420895, + "grad_norm": 0.28351834416389465, + "learning_rate": 1.2785001480990095e-05, + "loss": 1.2064, + "step": 14359 + }, + { + "epoch": 4.277071427241758, + "grad_norm": 0.25644004344940186, + "learning_rate": 1.278407506359424e-05, + "loss": 1.2167, + "step": 14360 + }, + { + "epoch": 4.277369273441427, + "grad_norm": 0.30562421679496765, + "learning_rate": 1.2783148620295674e-05, + "loss": 1.2369, + "step": 14361 + }, + { + "epoch": 4.277667119641095, + "grad_norm": 0.3682887852191925, + "learning_rate": 1.2782222151103017e-05, + "loss": 1.2195, + "step": 14362 + }, + { + "epoch": 4.277964965840764, + "grad_norm": 0.25823503732681274, + "learning_rate": 1.2781295656024895e-05, + "loss": 1.2285, + "step": 14363 + }, + { + "epoch": 4.278262812040433, + "grad_norm": 0.30589163303375244, + "learning_rate": 1.2780369135069921e-05, + "loss": 1.224, + "step": 14364 + }, + { + "epoch": 4.278560658240101, + "grad_norm": 0.28790926933288574, + "learning_rate": 1.2779442588246723e-05, + "loss": 1.2241, + "step": 14365 + }, + { + "epoch": 4.27885850443977, + "grad_norm": 0.24648842215538025, + "learning_rate": 1.2778516015563914e-05, + "loss": 1.2176, + "step": 14366 + }, + { + "epoch": 4.279156350639439, + "grad_norm": 0.3285800516605377, + "learning_rate": 1.277758941703012e-05, + "loss": 1.2218, + "step": 14367 + }, + { + "epoch": 4.279454196839107, + "grad_norm": 0.2547835409641266, + "learning_rate": 1.2776662792653957e-05, + "loss": 1.2142, + "step": 14368 + }, + { + "epoch": 4.279752043038776, + "grad_norm": 0.2792496085166931, + "learning_rate": 1.277573614244405e-05, + "loss": 1.2032, + "step": 14369 + }, + { + "epoch": 4.280049889238445, + "grad_norm": 0.25271403789520264, + "learning_rate": 1.2774809466409019e-05, + "loss": 1.222, + "step": 14370 + }, + { + "epoch": 4.280347735438113, + "grad_norm": 0.2696061134338379, + "learning_rate": 1.2773882764557489e-05, + "loss": 1.24, + "step": 14371 + }, + { + "epoch": 4.280645581637782, + "grad_norm": 0.24025243520736694, + "learning_rate": 1.2772956036898076e-05, + "loss": 1.2281, + "step": 14372 + }, + { + "epoch": 4.2809434278374505, + "grad_norm": 0.26664793491363525, + "learning_rate": 1.277202928343941e-05, + "loss": 1.2175, + "step": 14373 + }, + { + "epoch": 4.281241274037119, + "grad_norm": 0.2511109411716461, + "learning_rate": 1.2771102504190103e-05, + "loss": 1.2258, + "step": 14374 + }, + { + "epoch": 4.281539120236788, + "grad_norm": 0.2659510374069214, + "learning_rate": 1.2770175699158784e-05, + "loss": 1.2186, + "step": 14375 + }, + { + "epoch": 4.281836966436456, + "grad_norm": 0.2575090229511261, + "learning_rate": 1.2769248868354074e-05, + "loss": 1.2288, + "step": 14376 + }, + { + "epoch": 4.282134812636125, + "grad_norm": 0.2587834298610687, + "learning_rate": 1.2768322011784601e-05, + "loss": 1.2122, + "step": 14377 + }, + { + "epoch": 4.282432658835794, + "grad_norm": 0.2907363176345825, + "learning_rate": 1.2767395129458983e-05, + "loss": 1.2136, + "step": 14378 + }, + { + "epoch": 4.282730505035462, + "grad_norm": 0.2576514184474945, + "learning_rate": 1.2766468221385843e-05, + "loss": 1.2219, + "step": 14379 + }, + { + "epoch": 4.283028351235131, + "grad_norm": 0.3286265730857849, + "learning_rate": 1.2765541287573814e-05, + "loss": 1.228, + "step": 14380 + }, + { + "epoch": 4.2833261974348, + "grad_norm": 0.3233201801776886, + "learning_rate": 1.2764614328031506e-05, + "loss": 1.2254, + "step": 14381 + }, + { + "epoch": 4.283624043634468, + "grad_norm": 0.3771904408931732, + "learning_rate": 1.2763687342767553e-05, + "loss": 1.2163, + "step": 14382 + }, + { + "epoch": 4.2839218898341365, + "grad_norm": 0.26839783787727356, + "learning_rate": 1.2762760331790576e-05, + "loss": 1.2104, + "step": 14383 + }, + { + "epoch": 4.284219736033806, + "grad_norm": 0.3020113408565521, + "learning_rate": 1.2761833295109205e-05, + "loss": 1.239, + "step": 14384 + }, + { + "epoch": 4.284517582233474, + "grad_norm": 0.2633955478668213, + "learning_rate": 1.2760906232732055e-05, + "loss": 1.228, + "step": 14385 + }, + { + "epoch": 4.284815428433143, + "grad_norm": 0.28546205163002014, + "learning_rate": 1.275997914466776e-05, + "loss": 1.2173, + "step": 14386 + }, + { + "epoch": 4.2851132746328116, + "grad_norm": 0.3386612832546234, + "learning_rate": 1.2759052030924944e-05, + "loss": 1.2262, + "step": 14387 + }, + { + "epoch": 4.28541112083248, + "grad_norm": 0.3652389645576477, + "learning_rate": 1.275812489151223e-05, + "loss": 1.2142, + "step": 14388 + }, + { + "epoch": 4.285708967032149, + "grad_norm": 0.3611365854740143, + "learning_rate": 1.2757197726438245e-05, + "loss": 1.2137, + "step": 14389 + }, + { + "epoch": 4.286006813231817, + "grad_norm": 0.28039783239364624, + "learning_rate": 1.2756270535711615e-05, + "loss": 1.2216, + "step": 14390 + }, + { + "epoch": 4.286304659431486, + "grad_norm": 0.3334992825984955, + "learning_rate": 1.2755343319340971e-05, + "loss": 1.233, + "step": 14391 + }, + { + "epoch": 4.286602505631155, + "grad_norm": 0.3214553892612457, + "learning_rate": 1.2754416077334933e-05, + "loss": 1.2095, + "step": 14392 + }, + { + "epoch": 4.286900351830823, + "grad_norm": 0.3099585771560669, + "learning_rate": 1.275348880970213e-05, + "loss": 1.2207, + "step": 14393 + }, + { + "epoch": 4.287198198030492, + "grad_norm": 0.43240270018577576, + "learning_rate": 1.2752561516451191e-05, + "loss": 1.2227, + "step": 14394 + }, + { + "epoch": 4.287496044230161, + "grad_norm": 0.5841981172561646, + "learning_rate": 1.2751634197590743e-05, + "loss": 1.217, + "step": 14395 + }, + { + "epoch": 4.287793890429829, + "grad_norm": 0.2586600184440613, + "learning_rate": 1.2750706853129414e-05, + "loss": 1.2301, + "step": 14396 + }, + { + "epoch": 4.288091736629498, + "grad_norm": 0.4778662621974945, + "learning_rate": 1.2749779483075831e-05, + "loss": 1.218, + "step": 14397 + }, + { + "epoch": 4.288389582829167, + "grad_norm": 0.35863474011421204, + "learning_rate": 1.274885208743862e-05, + "loss": 1.2102, + "step": 14398 + }, + { + "epoch": 4.288687429028835, + "grad_norm": 0.566791832447052, + "learning_rate": 1.274792466622641e-05, + "loss": 1.2299, + "step": 14399 + }, + { + "epoch": 4.288985275228504, + "grad_norm": 0.36866599321365356, + "learning_rate": 1.2746997219447833e-05, + "loss": 1.2211, + "step": 14400 + }, + { + "epoch": 4.289283121428173, + "grad_norm": 0.9903460741043091, + "learning_rate": 1.2746069747111518e-05, + "loss": 1.235, + "step": 14401 + }, + { + "epoch": 4.289580967627841, + "grad_norm": 0.47457778453826904, + "learning_rate": 1.274514224922609e-05, + "loss": 1.2186, + "step": 14402 + }, + { + "epoch": 4.28987881382751, + "grad_norm": 0.5852064490318298, + "learning_rate": 1.2744214725800182e-05, + "loss": 1.2257, + "step": 14403 + }, + { + "epoch": 4.2901766600271785, + "grad_norm": 0.26925256848335266, + "learning_rate": 1.274328717684242e-05, + "loss": 1.2183, + "step": 14404 + }, + { + "epoch": 4.290474506226847, + "grad_norm": 0.554728090763092, + "learning_rate": 1.2742359602361436e-05, + "loss": 1.2098, + "step": 14405 + }, + { + "epoch": 4.290772352426516, + "grad_norm": 0.35576191544532776, + "learning_rate": 1.2741432002365862e-05, + "loss": 1.2225, + "step": 14406 + }, + { + "epoch": 4.291070198626184, + "grad_norm": 0.2810397744178772, + "learning_rate": 1.2740504376864322e-05, + "loss": 1.2048, + "step": 14407 + }, + { + "epoch": 4.291368044825853, + "grad_norm": 0.3317042589187622, + "learning_rate": 1.2739576725865457e-05, + "loss": 1.1954, + "step": 14408 + }, + { + "epoch": 4.291665891025522, + "grad_norm": 0.2716878354549408, + "learning_rate": 1.2738649049377887e-05, + "loss": 1.2193, + "step": 14409 + }, + { + "epoch": 4.29196373722519, + "grad_norm": 0.24953965842723846, + "learning_rate": 1.2737721347410247e-05, + "loss": 1.2269, + "step": 14410 + }, + { + "epoch": 4.2922615834248585, + "grad_norm": 0.26282334327697754, + "learning_rate": 1.273679361997117e-05, + "loss": 1.2088, + "step": 14411 + }, + { + "epoch": 4.292559429624528, + "grad_norm": 0.2759837806224823, + "learning_rate": 1.2735865867069285e-05, + "loss": 1.222, + "step": 14412 + }, + { + "epoch": 4.292857275824196, + "grad_norm": 0.25941190123558044, + "learning_rate": 1.2734938088713226e-05, + "loss": 1.2109, + "step": 14413 + }, + { + "epoch": 4.293155122023865, + "grad_norm": 0.24825970828533173, + "learning_rate": 1.2734010284911627e-05, + "loss": 1.2357, + "step": 14414 + }, + { + "epoch": 4.293452968223534, + "grad_norm": 0.2763080298900604, + "learning_rate": 1.2733082455673111e-05, + "loss": 1.2293, + "step": 14415 + }, + { + "epoch": 4.293750814423202, + "grad_norm": 0.2807149291038513, + "learning_rate": 1.273215460100632e-05, + "loss": 1.2109, + "step": 14416 + }, + { + "epoch": 4.294048660622871, + "grad_norm": 0.25246360898017883, + "learning_rate": 1.2731226720919877e-05, + "loss": 1.2116, + "step": 14417 + }, + { + "epoch": 4.2943465068225395, + "grad_norm": 0.2973587214946747, + "learning_rate": 1.2730298815422428e-05, + "loss": 1.2133, + "step": 14418 + }, + { + "epoch": 4.294644353022208, + "grad_norm": 0.27922913432121277, + "learning_rate": 1.2729370884522597e-05, + "loss": 1.2135, + "step": 14419 + }, + { + "epoch": 4.294942199221877, + "grad_norm": 0.29587066173553467, + "learning_rate": 1.2728442928229019e-05, + "loss": 1.2293, + "step": 14420 + }, + { + "epoch": 4.295240045421545, + "grad_norm": 0.28535765409469604, + "learning_rate": 1.272751494655033e-05, + "loss": 1.2109, + "step": 14421 + }, + { + "epoch": 4.295537891621214, + "grad_norm": 0.25837650895118713, + "learning_rate": 1.2726586939495156e-05, + "loss": 1.2152, + "step": 14422 + }, + { + "epoch": 4.295835737820883, + "grad_norm": 0.3583243191242218, + "learning_rate": 1.2725658907072141e-05, + "loss": 1.2325, + "step": 14423 + }, + { + "epoch": 4.296133584020551, + "grad_norm": 0.28593847155570984, + "learning_rate": 1.2724730849289913e-05, + "loss": 1.2143, + "step": 14424 + }, + { + "epoch": 4.2964314302202204, + "grad_norm": 0.29528507590293884, + "learning_rate": 1.2723802766157114e-05, + "loss": 1.2114, + "step": 14425 + }, + { + "epoch": 4.296729276419889, + "grad_norm": 0.3226224184036255, + "learning_rate": 1.272287465768237e-05, + "loss": 1.2178, + "step": 14426 + }, + { + "epoch": 4.297027122619557, + "grad_norm": 0.27137264609336853, + "learning_rate": 1.2721946523874318e-05, + "loss": 1.2291, + "step": 14427 + }, + { + "epoch": 4.297324968819226, + "grad_norm": 0.6712373495101929, + "learning_rate": 1.2721018364741596e-05, + "loss": 1.2111, + "step": 14428 + }, + { + "epoch": 4.297622815018895, + "grad_norm": 0.3394869863986969, + "learning_rate": 1.2720090180292836e-05, + "loss": 1.2155, + "step": 14429 + }, + { + "epoch": 4.297920661218563, + "grad_norm": 0.45824187994003296, + "learning_rate": 1.2719161970536677e-05, + "loss": 1.2143, + "step": 14430 + }, + { + "epoch": 4.298218507418232, + "grad_norm": 0.3214802145957947, + "learning_rate": 1.2718233735481759e-05, + "loss": 1.2471, + "step": 14431 + }, + { + "epoch": 4.2985163536179005, + "grad_norm": 0.5555426478385925, + "learning_rate": 1.2717305475136708e-05, + "loss": 1.2186, + "step": 14432 + }, + { + "epoch": 4.298814199817569, + "grad_norm": 0.34570813179016113, + "learning_rate": 1.2716377189510167e-05, + "loss": 1.2216, + "step": 14433 + }, + { + "epoch": 4.299112046017238, + "grad_norm": 0.3341878056526184, + "learning_rate": 1.2715448878610769e-05, + "loss": 1.226, + "step": 14434 + }, + { + "epoch": 4.299409892216906, + "grad_norm": 0.4007425904273987, + "learning_rate": 1.2714520542447155e-05, + "loss": 1.2323, + "step": 14435 + }, + { + "epoch": 4.299707738416576, + "grad_norm": 0.27162936329841614, + "learning_rate": 1.271359218102796e-05, + "loss": 1.2418, + "step": 14436 + }, + { + "epoch": 4.300005584616244, + "grad_norm": 0.4114162027835846, + "learning_rate": 1.2712663794361821e-05, + "loss": 1.2138, + "step": 14437 + }, + { + "epoch": 4.300303430815912, + "grad_norm": 0.26284918189048767, + "learning_rate": 1.271173538245738e-05, + "loss": 1.2077, + "step": 14438 + }, + { + "epoch": 4.3006012770155815, + "grad_norm": 0.2583646774291992, + "learning_rate": 1.2710806945323269e-05, + "loss": 1.2364, + "step": 14439 + }, + { + "epoch": 4.30089912321525, + "grad_norm": 0.3204265236854553, + "learning_rate": 1.2709878482968124e-05, + "loss": 1.2261, + "step": 14440 + }, + { + "epoch": 4.301196969414918, + "grad_norm": 0.26713070273399353, + "learning_rate": 1.2708949995400589e-05, + "loss": 1.2366, + "step": 14441 + }, + { + "epoch": 4.301494815614587, + "grad_norm": 0.26088231801986694, + "learning_rate": 1.2708021482629302e-05, + "loss": 1.2177, + "step": 14442 + }, + { + "epoch": 4.301792661814256, + "grad_norm": 0.4549257457256317, + "learning_rate": 1.2707092944662902e-05, + "loss": 1.2052, + "step": 14443 + }, + { + "epoch": 4.302090508013924, + "grad_norm": 0.33362963795661926, + "learning_rate": 1.2706164381510026e-05, + "loss": 1.2259, + "step": 14444 + }, + { + "epoch": 4.302388354213593, + "grad_norm": 0.34627485275268555, + "learning_rate": 1.2705235793179313e-05, + "loss": 1.2352, + "step": 14445 + }, + { + "epoch": 4.3026862004132616, + "grad_norm": 0.27429014444351196, + "learning_rate": 1.2704307179679402e-05, + "loss": 1.222, + "step": 14446 + }, + { + "epoch": 4.30298404661293, + "grad_norm": 0.34523606300354004, + "learning_rate": 1.2703378541018936e-05, + "loss": 1.2322, + "step": 14447 + }, + { + "epoch": 4.303281892812599, + "grad_norm": 0.2928785979747772, + "learning_rate": 1.2702449877206552e-05, + "loss": 1.2133, + "step": 14448 + }, + { + "epoch": 4.303579739012267, + "grad_norm": 0.3015083968639374, + "learning_rate": 1.2701521188250897e-05, + "loss": 1.2218, + "step": 14449 + }, + { + "epoch": 4.303877585211936, + "grad_norm": 0.34831950068473816, + "learning_rate": 1.27005924741606e-05, + "loss": 1.2263, + "step": 14450 + }, + { + "epoch": 4.304175431411605, + "grad_norm": 0.2671651244163513, + "learning_rate": 1.2699663734944307e-05, + "loss": 1.2203, + "step": 14451 + }, + { + "epoch": 4.304473277611273, + "grad_norm": 0.5656589269638062, + "learning_rate": 1.269873497061066e-05, + "loss": 1.2161, + "step": 14452 + }, + { + "epoch": 4.3047711238109425, + "grad_norm": 0.2613632380962372, + "learning_rate": 1.26978061811683e-05, + "loss": 1.2227, + "step": 14453 + }, + { + "epoch": 4.305068970010611, + "grad_norm": 0.4541975259780884, + "learning_rate": 1.2696877366625869e-05, + "loss": 1.2298, + "step": 14454 + }, + { + "epoch": 4.305366816210279, + "grad_norm": 0.2899562418460846, + "learning_rate": 1.2695948526992005e-05, + "loss": 1.2363, + "step": 14455 + }, + { + "epoch": 4.305664662409948, + "grad_norm": 0.3821897804737091, + "learning_rate": 1.2695019662275354e-05, + "loss": 1.206, + "step": 14456 + }, + { + "epoch": 4.305962508609617, + "grad_norm": 0.2865409255027771, + "learning_rate": 1.2694090772484553e-05, + "loss": 1.2306, + "step": 14457 + }, + { + "epoch": 4.306260354809285, + "grad_norm": 0.29775720834732056, + "learning_rate": 1.2693161857628246e-05, + "loss": 1.2178, + "step": 14458 + }, + { + "epoch": 4.306558201008954, + "grad_norm": 0.34338074922561646, + "learning_rate": 1.2692232917715081e-05, + "loss": 1.2248, + "step": 14459 + }, + { + "epoch": 4.306856047208623, + "grad_norm": 0.27675661444664, + "learning_rate": 1.2691303952753697e-05, + "loss": 1.234, + "step": 14460 + }, + { + "epoch": 4.307153893408291, + "grad_norm": 0.31029143929481506, + "learning_rate": 1.2690374962752734e-05, + "loss": 1.2366, + "step": 14461 + }, + { + "epoch": 4.30745173960796, + "grad_norm": 0.29586637020111084, + "learning_rate": 1.2689445947720842e-05, + "loss": 1.22, + "step": 14462 + }, + { + "epoch": 4.3077495858076285, + "grad_norm": 0.2669788599014282, + "learning_rate": 1.2688516907666657e-05, + "loss": 1.21, + "step": 14463 + }, + { + "epoch": 4.308047432007298, + "grad_norm": 0.3429713547229767, + "learning_rate": 1.2687587842598826e-05, + "loss": 1.2196, + "step": 14464 + }, + { + "epoch": 4.308345278206966, + "grad_norm": 0.26067858934402466, + "learning_rate": 1.2686658752525992e-05, + "loss": 1.2266, + "step": 14465 + }, + { + "epoch": 4.308643124406634, + "grad_norm": 0.4082125127315521, + "learning_rate": 1.2685729637456803e-05, + "loss": 1.2159, + "step": 14466 + }, + { + "epoch": 4.3089409706063035, + "grad_norm": 0.2630990445613861, + "learning_rate": 1.26848004973999e-05, + "loss": 1.2253, + "step": 14467 + }, + { + "epoch": 4.309238816805972, + "grad_norm": 0.3698793351650238, + "learning_rate": 1.2683871332363924e-05, + "loss": 1.2233, + "step": 14468 + }, + { + "epoch": 4.30953666300564, + "grad_norm": 0.2409800887107849, + "learning_rate": 1.2682942142357526e-05, + "loss": 1.217, + "step": 14469 + }, + { + "epoch": 4.309834509205309, + "grad_norm": 0.29655179381370544, + "learning_rate": 1.268201292738935e-05, + "loss": 1.2208, + "step": 14470 + }, + { + "epoch": 4.310132355404978, + "grad_norm": 0.2768140733242035, + "learning_rate": 1.2681083687468038e-05, + "loss": 1.2265, + "step": 14471 + }, + { + "epoch": 4.310430201604646, + "grad_norm": 0.3115537464618683, + "learning_rate": 1.2680154422602242e-05, + "loss": 1.2583, + "step": 14472 + }, + { + "epoch": 4.310728047804315, + "grad_norm": 0.2554987370967865, + "learning_rate": 1.2679225132800601e-05, + "loss": 1.2266, + "step": 14473 + }, + { + "epoch": 4.311025894003984, + "grad_norm": 0.31973329186439514, + "learning_rate": 1.2678295818071764e-05, + "loss": 1.2199, + "step": 14474 + }, + { + "epoch": 4.311323740203652, + "grad_norm": 0.27614399790763855, + "learning_rate": 1.2677366478424375e-05, + "loss": 1.2281, + "step": 14475 + }, + { + "epoch": 4.311621586403321, + "grad_norm": 0.3766910135746002, + "learning_rate": 1.2676437113867084e-05, + "loss": 1.2227, + "step": 14476 + }, + { + "epoch": 4.3119194326029895, + "grad_norm": 0.24778923392295837, + "learning_rate": 1.2675507724408536e-05, + "loss": 1.2343, + "step": 14477 + }, + { + "epoch": 4.312217278802658, + "grad_norm": 0.4000803232192993, + "learning_rate": 1.2674578310057375e-05, + "loss": 1.2177, + "step": 14478 + }, + { + "epoch": 4.312515125002327, + "grad_norm": 0.2522578835487366, + "learning_rate": 1.2673648870822257e-05, + "loss": 1.2127, + "step": 14479 + }, + { + "epoch": 4.312812971201995, + "grad_norm": 0.3849125802516937, + "learning_rate": 1.267271940671182e-05, + "loss": 1.2254, + "step": 14480 + }, + { + "epoch": 4.313110817401665, + "grad_norm": 0.2478676736354828, + "learning_rate": 1.2671789917734714e-05, + "loss": 1.2318, + "step": 14481 + }, + { + "epoch": 4.313408663601333, + "grad_norm": 0.3485409617424011, + "learning_rate": 1.2670860403899585e-05, + "loss": 1.2119, + "step": 14482 + }, + { + "epoch": 4.313706509801001, + "grad_norm": 0.2599025368690491, + "learning_rate": 1.266993086521509e-05, + "loss": 1.2133, + "step": 14483 + }, + { + "epoch": 4.3140043560006704, + "grad_norm": 0.3301621973514557, + "learning_rate": 1.2669001301689869e-05, + "loss": 1.2207, + "step": 14484 + }, + { + "epoch": 4.314302202200339, + "grad_norm": 0.2623264491558075, + "learning_rate": 1.2668071713332573e-05, + "loss": 1.2236, + "step": 14485 + }, + { + "epoch": 4.314600048400007, + "grad_norm": 0.2623007893562317, + "learning_rate": 1.2667142100151852e-05, + "loss": 1.2153, + "step": 14486 + }, + { + "epoch": 4.314897894599676, + "grad_norm": 0.25490888953208923, + "learning_rate": 1.2666212462156352e-05, + "loss": 1.2198, + "step": 14487 + }, + { + "epoch": 4.315195740799345, + "grad_norm": 0.27106112241744995, + "learning_rate": 1.2665282799354723e-05, + "loss": 1.2259, + "step": 14488 + }, + { + "epoch": 4.315493586999013, + "grad_norm": 0.2589944303035736, + "learning_rate": 1.2664353111755618e-05, + "loss": 1.2164, + "step": 14489 + }, + { + "epoch": 4.315791433198682, + "grad_norm": 0.3240913450717926, + "learning_rate": 1.2663423399367682e-05, + "loss": 1.2363, + "step": 14490 + }, + { + "epoch": 4.3160892793983505, + "grad_norm": 0.4749835729598999, + "learning_rate": 1.2662493662199569e-05, + "loss": 1.2277, + "step": 14491 + }, + { + "epoch": 4.31638712559802, + "grad_norm": 0.3653053045272827, + "learning_rate": 1.2661563900259925e-05, + "loss": 1.2445, + "step": 14492 + }, + { + "epoch": 4.316684971797688, + "grad_norm": 0.294147789478302, + "learning_rate": 1.2660634113557401e-05, + "loss": 1.2202, + "step": 14493 + }, + { + "epoch": 4.316982817997356, + "grad_norm": 0.32274118065834045, + "learning_rate": 1.2659704302100653e-05, + "loss": 1.2303, + "step": 14494 + }, + { + "epoch": 4.317280664197026, + "grad_norm": 0.3582480549812317, + "learning_rate": 1.2658774465898327e-05, + "loss": 1.2286, + "step": 14495 + }, + { + "epoch": 4.317578510396694, + "grad_norm": 0.4758828580379486, + "learning_rate": 1.2657844604959077e-05, + "loss": 1.2373, + "step": 14496 + }, + { + "epoch": 4.317876356596362, + "grad_norm": 0.2943171560764313, + "learning_rate": 1.2656914719291548e-05, + "loss": 1.1998, + "step": 14497 + }, + { + "epoch": 4.3181742027960315, + "grad_norm": 0.4365615248680115, + "learning_rate": 1.26559848089044e-05, + "loss": 1.2392, + "step": 14498 + }, + { + "epoch": 4.3184720489957, + "grad_norm": 0.34083935618400574, + "learning_rate": 1.2655054873806276e-05, + "loss": 1.2236, + "step": 14499 + }, + { + "epoch": 4.318769895195368, + "grad_norm": 0.3708080053329468, + "learning_rate": 1.2654124914005839e-05, + "loss": 1.2295, + "step": 14500 + }, + { + "epoch": 4.318769895195368, + "eval_loss": 1.3240220546722412, + "eval_runtime": 20.9016, + "eval_samples_per_second": 82.96, + "eval_steps_per_second": 5.215, + "step": 14500 + }, + { + "epoch": 4.319067741395037, + "grad_norm": 0.35728931427001953, + "learning_rate": 1.2653194929511732e-05, + "loss": 1.2367, + "step": 14501 + }, + { + "epoch": 4.319365587594706, + "grad_norm": 0.3800946772098541, + "learning_rate": 1.2652264920332611e-05, + "loss": 1.2061, + "step": 14502 + }, + { + "epoch": 4.319663433794375, + "grad_norm": 0.3083878457546234, + "learning_rate": 1.2651334886477129e-05, + "loss": 1.227, + "step": 14503 + }, + { + "epoch": 4.319961279994043, + "grad_norm": 0.36207953095436096, + "learning_rate": 1.2650404827953938e-05, + "loss": 1.23, + "step": 14504 + }, + { + "epoch": 4.3202591261937116, + "grad_norm": 0.3269515931606293, + "learning_rate": 1.2649474744771686e-05, + "loss": 1.2238, + "step": 14505 + }, + { + "epoch": 4.320556972393381, + "grad_norm": 0.3152298331260681, + "learning_rate": 1.2648544636939035e-05, + "loss": 1.2121, + "step": 14506 + }, + { + "epoch": 4.320854818593049, + "grad_norm": 0.264396607875824, + "learning_rate": 1.2647614504464638e-05, + "loss": 1.2223, + "step": 14507 + }, + { + "epoch": 4.321152664792717, + "grad_norm": 0.2580949366092682, + "learning_rate": 1.2646684347357146e-05, + "loss": 1.2268, + "step": 14508 + }, + { + "epoch": 4.321450510992387, + "grad_norm": 0.29828542470932007, + "learning_rate": 1.264575416562521e-05, + "loss": 1.2439, + "step": 14509 + }, + { + "epoch": 4.321748357192055, + "grad_norm": 0.29618167877197266, + "learning_rate": 1.2644823959277489e-05, + "loss": 1.2107, + "step": 14510 + }, + { + "epoch": 4.322046203391723, + "grad_norm": 0.28344979882240295, + "learning_rate": 1.2643893728322635e-05, + "loss": 1.2285, + "step": 14511 + }, + { + "epoch": 4.3223440495913925, + "grad_norm": 0.3486150801181793, + "learning_rate": 1.2642963472769305e-05, + "loss": 1.229, + "step": 14512 + }, + { + "epoch": 4.322641895791061, + "grad_norm": 0.28179824352264404, + "learning_rate": 1.2642033192626153e-05, + "loss": 1.2207, + "step": 14513 + }, + { + "epoch": 4.322939741990729, + "grad_norm": 0.36442992091178894, + "learning_rate": 1.2641102887901835e-05, + "loss": 1.2305, + "step": 14514 + }, + { + "epoch": 4.323237588190398, + "grad_norm": 0.2730322480201721, + "learning_rate": 1.2640172558605005e-05, + "loss": 1.237, + "step": 14515 + }, + { + "epoch": 4.323535434390067, + "grad_norm": 0.31957578659057617, + "learning_rate": 1.2639242204744315e-05, + "loss": 1.2193, + "step": 14516 + }, + { + "epoch": 4.323833280589735, + "grad_norm": 0.2447746843099594, + "learning_rate": 1.2638311826328429e-05, + "loss": 1.2309, + "step": 14517 + }, + { + "epoch": 4.324131126789404, + "grad_norm": 0.3349570333957672, + "learning_rate": 1.2637381423365998e-05, + "loss": 1.2134, + "step": 14518 + }, + { + "epoch": 4.324428972989073, + "grad_norm": 0.26600414514541626, + "learning_rate": 1.2636450995865679e-05, + "loss": 1.2064, + "step": 14519 + }, + { + "epoch": 4.324726819188742, + "grad_norm": 0.30910712480545044, + "learning_rate": 1.2635520543836133e-05, + "loss": 1.2337, + "step": 14520 + }, + { + "epoch": 4.32502466538841, + "grad_norm": 0.31716904044151306, + "learning_rate": 1.2634590067286007e-05, + "loss": 1.2266, + "step": 14521 + }, + { + "epoch": 4.3253225115880785, + "grad_norm": 0.24567840993404388, + "learning_rate": 1.2633659566223968e-05, + "loss": 1.2174, + "step": 14522 + }, + { + "epoch": 4.325620357787748, + "grad_norm": 0.3010712265968323, + "learning_rate": 1.2632729040658665e-05, + "loss": 1.1963, + "step": 14523 + }, + { + "epoch": 4.325918203987416, + "grad_norm": 0.29390281438827515, + "learning_rate": 1.2631798490598765e-05, + "loss": 1.2116, + "step": 14524 + }, + { + "epoch": 4.326216050187084, + "grad_norm": 0.2399107664823532, + "learning_rate": 1.2630867916052918e-05, + "loss": 1.2192, + "step": 14525 + }, + { + "epoch": 4.3265138963867535, + "grad_norm": 0.28184759616851807, + "learning_rate": 1.2629937317029786e-05, + "loss": 1.2272, + "step": 14526 + }, + { + "epoch": 4.326811742586422, + "grad_norm": 0.253844290971756, + "learning_rate": 1.2629006693538024e-05, + "loss": 1.2086, + "step": 14527 + }, + { + "epoch": 4.32710958878609, + "grad_norm": 0.30625760555267334, + "learning_rate": 1.2628076045586291e-05, + "loss": 1.2377, + "step": 14528 + }, + { + "epoch": 4.327407434985759, + "grad_norm": 0.29708221554756165, + "learning_rate": 1.2627145373183248e-05, + "loss": 1.2166, + "step": 14529 + }, + { + "epoch": 4.327705281185428, + "grad_norm": 0.3452737331390381, + "learning_rate": 1.2626214676337554e-05, + "loss": 1.2224, + "step": 14530 + }, + { + "epoch": 4.328003127385097, + "grad_norm": 0.3978636562824249, + "learning_rate": 1.2625283955057865e-05, + "loss": 1.2122, + "step": 14531 + }, + { + "epoch": 4.328300973584765, + "grad_norm": 0.27863913774490356, + "learning_rate": 1.2624353209352842e-05, + "loss": 1.2273, + "step": 14532 + }, + { + "epoch": 4.328598819784434, + "grad_norm": 0.391391396522522, + "learning_rate": 1.2623422439231142e-05, + "loss": 1.2359, + "step": 14533 + }, + { + "epoch": 4.328896665984103, + "grad_norm": 0.31735673546791077, + "learning_rate": 1.2622491644701432e-05, + "loss": 1.2185, + "step": 14534 + }, + { + "epoch": 4.329194512183771, + "grad_norm": 0.2565832734107971, + "learning_rate": 1.2621560825772363e-05, + "loss": 1.2176, + "step": 14535 + }, + { + "epoch": 4.3294923583834395, + "grad_norm": 0.26544708013534546, + "learning_rate": 1.26206299824526e-05, + "loss": 1.2096, + "step": 14536 + }, + { + "epoch": 4.329790204583109, + "grad_norm": 0.27407172322273254, + "learning_rate": 1.2619699114750807e-05, + "loss": 1.2315, + "step": 14537 + }, + { + "epoch": 4.330088050782777, + "grad_norm": 0.2865266799926758, + "learning_rate": 1.2618768222675639e-05, + "loss": 1.2255, + "step": 14538 + }, + { + "epoch": 4.330385896982445, + "grad_norm": 0.28279930353164673, + "learning_rate": 1.2617837306235756e-05, + "loss": 1.2155, + "step": 14539 + }, + { + "epoch": 4.330683743182115, + "grad_norm": 0.32242903113365173, + "learning_rate": 1.2616906365439823e-05, + "loss": 1.2194, + "step": 14540 + }, + { + "epoch": 4.330981589381783, + "grad_norm": 0.30280429124832153, + "learning_rate": 1.2615975400296501e-05, + "loss": 1.2324, + "step": 14541 + }, + { + "epoch": 4.331279435581451, + "grad_norm": 0.24647094309329987, + "learning_rate": 1.261504441081445e-05, + "loss": 1.2266, + "step": 14542 + }, + { + "epoch": 4.3315772817811204, + "grad_norm": 0.2495497465133667, + "learning_rate": 1.2614113397002332e-05, + "loss": 1.2317, + "step": 14543 + }, + { + "epoch": 4.331875127980789, + "grad_norm": 0.24952080845832825, + "learning_rate": 1.2613182358868812e-05, + "loss": 1.2297, + "step": 14544 + }, + { + "epoch": 4.332172974180457, + "grad_norm": 0.256031334400177, + "learning_rate": 1.2612251296422546e-05, + "loss": 1.229, + "step": 14545 + }, + { + "epoch": 4.332470820380126, + "grad_norm": 0.2596443295478821, + "learning_rate": 1.26113202096722e-05, + "loss": 1.2321, + "step": 14546 + }, + { + "epoch": 4.332768666579795, + "grad_norm": 0.2813296318054199, + "learning_rate": 1.2610389098626441e-05, + "loss": 1.2191, + "step": 14547 + }, + { + "epoch": 4.333066512779464, + "grad_norm": 0.3075757622718811, + "learning_rate": 1.2609457963293927e-05, + "loss": 1.2109, + "step": 14548 + }, + { + "epoch": 4.333364358979132, + "grad_norm": 0.27286309003829956, + "learning_rate": 1.2608526803683318e-05, + "loss": 1.2178, + "step": 14549 + }, + { + "epoch": 4.3336622051788005, + "grad_norm": 0.2939545810222626, + "learning_rate": 1.2607595619803285e-05, + "loss": 1.2312, + "step": 14550 + }, + { + "epoch": 4.33396005137847, + "grad_norm": 0.30843645334243774, + "learning_rate": 1.2606664411662489e-05, + "loss": 1.2324, + "step": 14551 + }, + { + "epoch": 4.334257897578138, + "grad_norm": 0.3237903416156769, + "learning_rate": 1.260573317926959e-05, + "loss": 1.2329, + "step": 14552 + }, + { + "epoch": 4.334555743777806, + "grad_norm": 0.5066404938697815, + "learning_rate": 1.2604801922633256e-05, + "loss": 1.2204, + "step": 14553 + }, + { + "epoch": 4.334853589977476, + "grad_norm": 0.37350624799728394, + "learning_rate": 1.2603870641762155e-05, + "loss": 1.2224, + "step": 14554 + }, + { + "epoch": 4.335151436177144, + "grad_norm": 0.3123527467250824, + "learning_rate": 1.2602939336664943e-05, + "loss": 1.2187, + "step": 14555 + }, + { + "epoch": 4.335449282376812, + "grad_norm": 0.27443909645080566, + "learning_rate": 1.2602008007350289e-05, + "loss": 1.211, + "step": 14556 + }, + { + "epoch": 4.3357471285764815, + "grad_norm": 0.32020115852355957, + "learning_rate": 1.2601076653826856e-05, + "loss": 1.2229, + "step": 14557 + }, + { + "epoch": 4.33604497477615, + "grad_norm": 0.4458685517311096, + "learning_rate": 1.2600145276103311e-05, + "loss": 1.2174, + "step": 14558 + }, + { + "epoch": 4.336342820975819, + "grad_norm": 0.39707452058792114, + "learning_rate": 1.259921387418832e-05, + "loss": 1.2136, + "step": 14559 + }, + { + "epoch": 4.336640667175487, + "grad_norm": 0.27815672755241394, + "learning_rate": 1.2598282448090548e-05, + "loss": 1.2096, + "step": 14560 + }, + { + "epoch": 4.336938513375156, + "grad_norm": 0.372768759727478, + "learning_rate": 1.2597350997818663e-05, + "loss": 1.2405, + "step": 14561 + }, + { + "epoch": 4.337236359574825, + "grad_norm": 0.31767967343330383, + "learning_rate": 1.2596419523381327e-05, + "loss": 1.2335, + "step": 14562 + }, + { + "epoch": 4.337534205774493, + "grad_norm": 0.274199903011322, + "learning_rate": 1.2595488024787208e-05, + "loss": 1.2161, + "step": 14563 + }, + { + "epoch": 4.3378320519741616, + "grad_norm": 0.3076978623867035, + "learning_rate": 1.259455650204497e-05, + "loss": 1.2185, + "step": 14564 + }, + { + "epoch": 4.338129898173831, + "grad_norm": 0.34883975982666016, + "learning_rate": 1.2593624955163288e-05, + "loss": 1.2095, + "step": 14565 + }, + { + "epoch": 4.338427744373499, + "grad_norm": 0.6107051968574524, + "learning_rate": 1.259269338415082e-05, + "loss": 1.2316, + "step": 14566 + }, + { + "epoch": 4.338725590573167, + "grad_norm": 0.35214030742645264, + "learning_rate": 1.2591761789016239e-05, + "loss": 1.2213, + "step": 14567 + }, + { + "epoch": 4.339023436772837, + "grad_norm": 0.3620079755783081, + "learning_rate": 1.2590830169768208e-05, + "loss": 1.2292, + "step": 14568 + }, + { + "epoch": 4.339321282972505, + "grad_norm": 0.33429720997810364, + "learning_rate": 1.2589898526415399e-05, + "loss": 1.2295, + "step": 14569 + }, + { + "epoch": 4.339619129172174, + "grad_norm": 0.3748376667499542, + "learning_rate": 1.2588966858966476e-05, + "loss": 1.2166, + "step": 14570 + }, + { + "epoch": 4.3399169753718425, + "grad_norm": 0.5470907688140869, + "learning_rate": 1.258803516743011e-05, + "loss": 1.2227, + "step": 14571 + }, + { + "epoch": 4.340214821571511, + "grad_norm": 0.2732674479484558, + "learning_rate": 1.2587103451814969e-05, + "loss": 1.2258, + "step": 14572 + }, + { + "epoch": 4.34051266777118, + "grad_norm": 0.5472095012664795, + "learning_rate": 1.258617171212972e-05, + "loss": 1.2192, + "step": 14573 + }, + { + "epoch": 4.340810513970848, + "grad_norm": 0.3168555796146393, + "learning_rate": 1.2585239948383032e-05, + "loss": 1.2207, + "step": 14574 + }, + { + "epoch": 4.341108360170517, + "grad_norm": 0.37085288763046265, + "learning_rate": 1.2584308160583573e-05, + "loss": 1.2261, + "step": 14575 + }, + { + "epoch": 4.341406206370186, + "grad_norm": 0.25340375304222107, + "learning_rate": 1.2583376348740017e-05, + "loss": 1.2153, + "step": 14576 + }, + { + "epoch": 4.341704052569854, + "grad_norm": 0.3801701068878174, + "learning_rate": 1.258244451286103e-05, + "loss": 1.2231, + "step": 14577 + }, + { + "epoch": 4.342001898769523, + "grad_norm": 0.2782164514064789, + "learning_rate": 1.2581512652955284e-05, + "loss": 1.2226, + "step": 14578 + }, + { + "epoch": 4.342299744969192, + "grad_norm": 0.37056881189346313, + "learning_rate": 1.2580580769031443e-05, + "loss": 1.222, + "step": 14579 + }, + { + "epoch": 4.34259759116886, + "grad_norm": 0.2865006625652313, + "learning_rate": 1.2579648861098184e-05, + "loss": 1.2277, + "step": 14580 + }, + { + "epoch": 4.3428954373685285, + "grad_norm": 0.3121490776538849, + "learning_rate": 1.2578716929164171e-05, + "loss": 1.2316, + "step": 14581 + }, + { + "epoch": 4.343193283568198, + "grad_norm": 0.28218358755111694, + "learning_rate": 1.2577784973238081e-05, + "loss": 1.2252, + "step": 14582 + }, + { + "epoch": 4.343491129767866, + "grad_norm": 0.24253541231155396, + "learning_rate": 1.2576852993328582e-05, + "loss": 1.2256, + "step": 14583 + }, + { + "epoch": 4.343788975967534, + "grad_norm": 0.25248849391937256, + "learning_rate": 1.2575920989444343e-05, + "loss": 1.2127, + "step": 14584 + }, + { + "epoch": 4.3440868221672035, + "grad_norm": 0.2888668477535248, + "learning_rate": 1.2574988961594041e-05, + "loss": 1.2166, + "step": 14585 + }, + { + "epoch": 4.344384668366872, + "grad_norm": 0.25354546308517456, + "learning_rate": 1.257405690978634e-05, + "loss": 1.2393, + "step": 14586 + }, + { + "epoch": 4.344682514566541, + "grad_norm": 0.3220248818397522, + "learning_rate": 1.2573124834029915e-05, + "loss": 1.2179, + "step": 14587 + }, + { + "epoch": 4.344980360766209, + "grad_norm": 0.3816487193107605, + "learning_rate": 1.2572192734333441e-05, + "loss": 1.214, + "step": 14588 + }, + { + "epoch": 4.345278206965878, + "grad_norm": 0.31159496307373047, + "learning_rate": 1.2571260610705585e-05, + "loss": 1.2192, + "step": 14589 + }, + { + "epoch": 4.345576053165547, + "grad_norm": 0.3061455190181732, + "learning_rate": 1.2570328463155024e-05, + "loss": 1.2293, + "step": 14590 + }, + { + "epoch": 4.345873899365215, + "grad_norm": 0.30936703085899353, + "learning_rate": 1.2569396291690428e-05, + "loss": 1.2183, + "step": 14591 + }, + { + "epoch": 4.346171745564884, + "grad_norm": 0.25135019421577454, + "learning_rate": 1.2568464096320467e-05, + "loss": 1.2224, + "step": 14592 + }, + { + "epoch": 4.346469591764553, + "grad_norm": 0.2868601679801941, + "learning_rate": 1.256753187705382e-05, + "loss": 1.2156, + "step": 14593 + }, + { + "epoch": 4.346767437964221, + "grad_norm": 0.2719976305961609, + "learning_rate": 1.2566599633899158e-05, + "loss": 1.2332, + "step": 14594 + }, + { + "epoch": 4.3470652841638895, + "grad_norm": 0.2508276700973511, + "learning_rate": 1.2565667366865155e-05, + "loss": 1.2245, + "step": 14595 + }, + { + "epoch": 4.347363130363559, + "grad_norm": 0.3190496861934662, + "learning_rate": 1.256473507596048e-05, + "loss": 1.2302, + "step": 14596 + }, + { + "epoch": 4.347660976563227, + "grad_norm": 0.25456708669662476, + "learning_rate": 1.2563802761193812e-05, + "loss": 1.2182, + "step": 14597 + }, + { + "epoch": 4.347958822762896, + "grad_norm": 0.28232961893081665, + "learning_rate": 1.2562870422573825e-05, + "loss": 1.2129, + "step": 14598 + }, + { + "epoch": 4.348256668962565, + "grad_norm": 0.2526226341724396, + "learning_rate": 1.2561938060109191e-05, + "loss": 1.22, + "step": 14599 + }, + { + "epoch": 4.348554515162233, + "grad_norm": 0.24340102076530457, + "learning_rate": 1.2561005673808586e-05, + "loss": 1.2165, + "step": 14600 + }, + { + "epoch": 4.348852361361902, + "grad_norm": 0.27539387345314026, + "learning_rate": 1.2560073263680685e-05, + "loss": 1.2056, + "step": 14601 + }, + { + "epoch": 4.3491502075615704, + "grad_norm": 0.25885745882987976, + "learning_rate": 1.2559140829734164e-05, + "loss": 1.2287, + "step": 14602 + }, + { + "epoch": 4.349448053761239, + "grad_norm": 0.25479578971862793, + "learning_rate": 1.2558208371977693e-05, + "loss": 1.2182, + "step": 14603 + }, + { + "epoch": 4.349745899960908, + "grad_norm": 0.2609451413154602, + "learning_rate": 1.2557275890419956e-05, + "loss": 1.2159, + "step": 14604 + }, + { + "epoch": 4.350043746160576, + "grad_norm": 0.3177453875541687, + "learning_rate": 1.255634338506962e-05, + "loss": 1.2213, + "step": 14605 + }, + { + "epoch": 4.350341592360245, + "grad_norm": 0.3020949363708496, + "learning_rate": 1.2555410855935366e-05, + "loss": 1.214, + "step": 14606 + }, + { + "epoch": 4.350639438559914, + "grad_norm": 0.25208285450935364, + "learning_rate": 1.255447830302587e-05, + "loss": 1.238, + "step": 14607 + }, + { + "epoch": 4.350937284759582, + "grad_norm": 0.25857990980148315, + "learning_rate": 1.2553545726349805e-05, + "loss": 1.2154, + "step": 14608 + }, + { + "epoch": 4.3512351309592505, + "grad_norm": 0.296636700630188, + "learning_rate": 1.2552613125915853e-05, + "loss": 1.2176, + "step": 14609 + }, + { + "epoch": 4.35153297715892, + "grad_norm": 0.3673078417778015, + "learning_rate": 1.2551680501732686e-05, + "loss": 1.2135, + "step": 14610 + }, + { + "epoch": 4.351830823358588, + "grad_norm": 0.25487521290779114, + "learning_rate": 1.2550747853808982e-05, + "loss": 1.2153, + "step": 14611 + }, + { + "epoch": 4.352128669558256, + "grad_norm": 0.5622789263725281, + "learning_rate": 1.2549815182153421e-05, + "loss": 1.2221, + "step": 14612 + }, + { + "epoch": 4.352426515757926, + "grad_norm": 0.8438968062400818, + "learning_rate": 1.2548882486774678e-05, + "loss": 1.2127, + "step": 14613 + }, + { + "epoch": 4.352724361957594, + "grad_norm": 0.6250828504562378, + "learning_rate": 1.2547949767681433e-05, + "loss": 1.2239, + "step": 14614 + }, + { + "epoch": 4.353022208157263, + "grad_norm": 0.35660964250564575, + "learning_rate": 1.254701702488236e-05, + "loss": 1.2245, + "step": 14615 + }, + { + "epoch": 4.3533200543569315, + "grad_norm": 0.47316092252731323, + "learning_rate": 1.254608425838614e-05, + "loss": 1.2436, + "step": 14616 + }, + { + "epoch": 4.3536179005566, + "grad_norm": 0.3825356066226959, + "learning_rate": 1.2545151468201448e-05, + "loss": 1.2214, + "step": 14617 + }, + { + "epoch": 4.353915746756269, + "grad_norm": 0.49703580141067505, + "learning_rate": 1.2544218654336969e-05, + "loss": 1.198, + "step": 14618 + }, + { + "epoch": 4.354213592955937, + "grad_norm": 0.3626794219017029, + "learning_rate": 1.2543285816801379e-05, + "loss": 1.2271, + "step": 14619 + }, + { + "epoch": 4.354511439155606, + "grad_norm": 0.40533241629600525, + "learning_rate": 1.2542352955603353e-05, + "loss": 1.2355, + "step": 14620 + }, + { + "epoch": 4.354809285355275, + "grad_norm": 0.2636505365371704, + "learning_rate": 1.2541420070751572e-05, + "loss": 1.2303, + "step": 14621 + }, + { + "epoch": 4.355107131554943, + "grad_norm": 0.3172827959060669, + "learning_rate": 1.254048716225472e-05, + "loss": 1.2101, + "step": 14622 + }, + { + "epoch": 4.3554049777546116, + "grad_norm": 0.2429002970457077, + "learning_rate": 1.253955423012147e-05, + "loss": 1.2391, + "step": 14623 + }, + { + "epoch": 4.355702823954281, + "grad_norm": 0.2845412492752075, + "learning_rate": 1.2538621274360507e-05, + "loss": 1.2211, + "step": 14624 + }, + { + "epoch": 4.356000670153949, + "grad_norm": 0.26535719633102417, + "learning_rate": 1.2537688294980507e-05, + "loss": 1.2361, + "step": 14625 + }, + { + "epoch": 4.356298516353618, + "grad_norm": 0.2541870176792145, + "learning_rate": 1.2536755291990158e-05, + "loss": 1.2195, + "step": 14626 + }, + { + "epoch": 4.356596362553287, + "grad_norm": 0.31307801604270935, + "learning_rate": 1.2535822265398134e-05, + "loss": 1.2348, + "step": 14627 + }, + { + "epoch": 4.356894208752955, + "grad_norm": 0.3025127947330475, + "learning_rate": 1.2534889215213112e-05, + "loss": 1.219, + "step": 14628 + }, + { + "epoch": 4.357192054952624, + "grad_norm": 0.2949710190296173, + "learning_rate": 1.2533956141443784e-05, + "loss": 1.2225, + "step": 14629 + }, + { + "epoch": 4.3574899011522925, + "grad_norm": 0.3623819053173065, + "learning_rate": 1.2533023044098822e-05, + "loss": 1.238, + "step": 14630 + }, + { + "epoch": 4.357787747351961, + "grad_norm": 0.27650731801986694, + "learning_rate": 1.253208992318691e-05, + "loss": 1.2272, + "step": 14631 + }, + { + "epoch": 4.35808559355163, + "grad_norm": 0.39150604605674744, + "learning_rate": 1.253115677871673e-05, + "loss": 1.2322, + "step": 14632 + }, + { + "epoch": 4.358383439751298, + "grad_norm": 0.26151758432388306, + "learning_rate": 1.253022361069697e-05, + "loss": 1.2142, + "step": 14633 + }, + { + "epoch": 4.358681285950967, + "grad_norm": 0.4499976336956024, + "learning_rate": 1.2529290419136297e-05, + "loss": 1.2102, + "step": 14634 + }, + { + "epoch": 4.358979132150636, + "grad_norm": 0.2854762077331543, + "learning_rate": 1.2528357204043409e-05, + "loss": 1.2182, + "step": 14635 + }, + { + "epoch": 4.359276978350304, + "grad_norm": 0.6315831542015076, + "learning_rate": 1.2527423965426982e-05, + "loss": 1.2133, + "step": 14636 + }, + { + "epoch": 4.3595748245499735, + "grad_norm": 0.44361525774002075, + "learning_rate": 1.2526490703295697e-05, + "loss": 1.2365, + "step": 14637 + }, + { + "epoch": 4.359872670749642, + "grad_norm": 0.48205363750457764, + "learning_rate": 1.2525557417658238e-05, + "loss": 1.2322, + "step": 14638 + }, + { + "epoch": 4.36017051694931, + "grad_norm": 0.49180200695991516, + "learning_rate": 1.252462410852329e-05, + "loss": 1.1915, + "step": 14639 + }, + { + "epoch": 4.360468363148979, + "grad_norm": 0.4336889088153839, + "learning_rate": 1.2523690775899535e-05, + "loss": 1.2137, + "step": 14640 + }, + { + "epoch": 4.360766209348648, + "grad_norm": 0.47530874609947205, + "learning_rate": 1.2522757419795657e-05, + "loss": 1.1999, + "step": 14641 + }, + { + "epoch": 4.361064055548316, + "grad_norm": 0.37315914034843445, + "learning_rate": 1.2521824040220341e-05, + "loss": 1.2209, + "step": 14642 + }, + { + "epoch": 4.361361901747985, + "grad_norm": 0.4032912254333496, + "learning_rate": 1.2520890637182271e-05, + "loss": 1.2333, + "step": 14643 + }, + { + "epoch": 4.3616597479476535, + "grad_norm": 0.45708635449409485, + "learning_rate": 1.2519957210690126e-05, + "loss": 1.2334, + "step": 14644 + }, + { + "epoch": 4.361957594147322, + "grad_norm": 0.33763477206230164, + "learning_rate": 1.2519023760752597e-05, + "loss": 1.2052, + "step": 14645 + }, + { + "epoch": 4.362255440346991, + "grad_norm": 0.4702033996582031, + "learning_rate": 1.2518090287378366e-05, + "loss": 1.239, + "step": 14646 + }, + { + "epoch": 4.362553286546659, + "grad_norm": 0.34722065925598145, + "learning_rate": 1.2517156790576117e-05, + "loss": 1.2212, + "step": 14647 + }, + { + "epoch": 4.362851132746328, + "grad_norm": 0.6574146747589111, + "learning_rate": 1.2516223270354538e-05, + "loss": 1.2235, + "step": 14648 + }, + { + "epoch": 4.363148978945997, + "grad_norm": 0.28901952505111694, + "learning_rate": 1.2515289726722312e-05, + "loss": 1.2374, + "step": 14649 + }, + { + "epoch": 4.363446825145665, + "grad_norm": 0.5301523208618164, + "learning_rate": 1.2514356159688128e-05, + "loss": 1.2225, + "step": 14650 + }, + { + "epoch": 4.363744671345334, + "grad_norm": 0.2746599018573761, + "learning_rate": 1.2513422569260665e-05, + "loss": 1.2323, + "step": 14651 + }, + { + "epoch": 4.364042517545003, + "grad_norm": 0.4509832262992859, + "learning_rate": 1.2512488955448614e-05, + "loss": 1.2179, + "step": 14652 + }, + { + "epoch": 4.364340363744671, + "grad_norm": 0.3198435604572296, + "learning_rate": 1.2511555318260662e-05, + "loss": 1.2294, + "step": 14653 + }, + { + "epoch": 4.36463820994434, + "grad_norm": 0.36141619086265564, + "learning_rate": 1.2510621657705496e-05, + "loss": 1.2276, + "step": 14654 + }, + { + "epoch": 4.364936056144009, + "grad_norm": 0.29216331243515015, + "learning_rate": 1.2509687973791795e-05, + "loss": 1.2161, + "step": 14655 + }, + { + "epoch": 4.365233902343677, + "grad_norm": 0.2578272521495819, + "learning_rate": 1.2508754266528253e-05, + "loss": 1.2204, + "step": 14656 + }, + { + "epoch": 4.365531748543346, + "grad_norm": 0.27996453642845154, + "learning_rate": 1.2507820535923559e-05, + "loss": 1.2278, + "step": 14657 + }, + { + "epoch": 4.365829594743015, + "grad_norm": 0.2756759226322174, + "learning_rate": 1.2506886781986394e-05, + "loss": 1.2079, + "step": 14658 + }, + { + "epoch": 4.366127440942683, + "grad_norm": 0.2989519536495209, + "learning_rate": 1.2505953004725448e-05, + "loss": 1.2228, + "step": 14659 + }, + { + "epoch": 4.366425287142352, + "grad_norm": 0.280781090259552, + "learning_rate": 1.2505019204149411e-05, + "loss": 1.2263, + "step": 14660 + }, + { + "epoch": 4.3667231333420204, + "grad_norm": 0.2653757631778717, + "learning_rate": 1.2504085380266968e-05, + "loss": 1.2195, + "step": 14661 + }, + { + "epoch": 4.367020979541689, + "grad_norm": 0.2949097752571106, + "learning_rate": 1.2503151533086808e-05, + "loss": 1.2244, + "step": 14662 + }, + { + "epoch": 4.367318825741358, + "grad_norm": 0.30720022320747375, + "learning_rate": 1.250221766261762e-05, + "loss": 1.222, + "step": 14663 + }, + { + "epoch": 4.367616671941026, + "grad_norm": 0.33239036798477173, + "learning_rate": 1.2501283768868092e-05, + "loss": 1.209, + "step": 14664 + }, + { + "epoch": 4.3679145181406955, + "grad_norm": 0.4349282383918762, + "learning_rate": 1.2500349851846914e-05, + "loss": 1.234, + "step": 14665 + }, + { + "epoch": 4.368212364340364, + "grad_norm": 0.26425737142562866, + "learning_rate": 1.249941591156277e-05, + "loss": 1.2171, + "step": 14666 + }, + { + "epoch": 4.368510210540032, + "grad_norm": 0.3702079951763153, + "learning_rate": 1.249848194802436e-05, + "loss": 1.2326, + "step": 14667 + }, + { + "epoch": 4.368808056739701, + "grad_norm": 0.25778859853744507, + "learning_rate": 1.2497547961240363e-05, + "loss": 1.2194, + "step": 14668 + }, + { + "epoch": 4.36910590293937, + "grad_norm": 0.28464213013648987, + "learning_rate": 1.2496613951219474e-05, + "loss": 1.2273, + "step": 14669 + }, + { + "epoch": 4.369403749139038, + "grad_norm": 0.27053070068359375, + "learning_rate": 1.2495679917970382e-05, + "loss": 1.2256, + "step": 14670 + }, + { + "epoch": 4.369701595338707, + "grad_norm": 0.27638304233551025, + "learning_rate": 1.2494745861501777e-05, + "loss": 1.2225, + "step": 14671 + }, + { + "epoch": 4.369999441538376, + "grad_norm": 0.3397147059440613, + "learning_rate": 1.2493811781822344e-05, + "loss": 1.2281, + "step": 14672 + }, + { + "epoch": 4.370297287738044, + "grad_norm": 0.2771948277950287, + "learning_rate": 1.2492877678940784e-05, + "loss": 1.2182, + "step": 14673 + }, + { + "epoch": 4.370595133937713, + "grad_norm": 0.33297523856163025, + "learning_rate": 1.2491943552865781e-05, + "loss": 1.2129, + "step": 14674 + }, + { + "epoch": 4.3708929801373815, + "grad_norm": 0.4865691065788269, + "learning_rate": 1.2491009403606026e-05, + "loss": 1.2157, + "step": 14675 + }, + { + "epoch": 4.37119082633705, + "grad_norm": 0.32315999269485474, + "learning_rate": 1.2490075231170213e-05, + "loss": 1.2364, + "step": 14676 + }, + { + "epoch": 4.371488672536719, + "grad_norm": 0.551485002040863, + "learning_rate": 1.2489141035567035e-05, + "loss": 1.2303, + "step": 14677 + }, + { + "epoch": 4.371786518736387, + "grad_norm": 0.30194327235221863, + "learning_rate": 1.2488206816805177e-05, + "loss": 1.2164, + "step": 14678 + }, + { + "epoch": 4.372084364936056, + "grad_norm": 0.5695297122001648, + "learning_rate": 1.2487272574893335e-05, + "loss": 1.2349, + "step": 14679 + }, + { + "epoch": 4.372382211135725, + "grad_norm": 0.3976595997810364, + "learning_rate": 1.24863383098402e-05, + "loss": 1.2175, + "step": 14680 + }, + { + "epoch": 4.372680057335393, + "grad_norm": 0.43812090158462524, + "learning_rate": 1.2485404021654465e-05, + "loss": 1.2435, + "step": 14681 + }, + { + "epoch": 4.372977903535062, + "grad_norm": 0.3519086241722107, + "learning_rate": 1.2484469710344824e-05, + "loss": 1.2352, + "step": 14682 + }, + { + "epoch": 4.373275749734731, + "grad_norm": 0.3912816345691681, + "learning_rate": 1.2483535375919967e-05, + "loss": 1.2341, + "step": 14683 + }, + { + "epoch": 4.373573595934399, + "grad_norm": 0.30880996584892273, + "learning_rate": 1.248260101838859e-05, + "loss": 1.2118, + "step": 14684 + }, + { + "epoch": 4.373871442134068, + "grad_norm": 0.29675379395484924, + "learning_rate": 1.2481666637759381e-05, + "loss": 1.2349, + "step": 14685 + }, + { + "epoch": 4.374169288333737, + "grad_norm": 0.24437694251537323, + "learning_rate": 1.248073223404104e-05, + "loss": 1.2284, + "step": 14686 + }, + { + "epoch": 4.374467134533405, + "grad_norm": 0.31316179037094116, + "learning_rate": 1.2479797807242255e-05, + "loss": 1.2168, + "step": 14687 + }, + { + "epoch": 4.374764980733074, + "grad_norm": 0.2598910331726074, + "learning_rate": 1.2478863357371722e-05, + "loss": 1.2359, + "step": 14688 + }, + { + "epoch": 4.3750628269327425, + "grad_norm": 0.298622727394104, + "learning_rate": 1.2477928884438135e-05, + "loss": 1.2252, + "step": 14689 + }, + { + "epoch": 4.375360673132411, + "grad_norm": 0.25254201889038086, + "learning_rate": 1.247699438845019e-05, + "loss": 1.219, + "step": 14690 + }, + { + "epoch": 4.37565851933208, + "grad_norm": 0.26050952076911926, + "learning_rate": 1.2476059869416579e-05, + "loss": 1.2312, + "step": 14691 + }, + { + "epoch": 4.375956365531748, + "grad_norm": 0.2809820771217346, + "learning_rate": 1.2475125327345998e-05, + "loss": 1.2136, + "step": 14692 + }, + { + "epoch": 4.376254211731418, + "grad_norm": 0.28439196944236755, + "learning_rate": 1.2474190762247136e-05, + "loss": 1.2141, + "step": 14693 + }, + { + "epoch": 4.376552057931086, + "grad_norm": 0.3428935706615448, + "learning_rate": 1.2473256174128701e-05, + "loss": 1.2341, + "step": 14694 + }, + { + "epoch": 4.376849904130754, + "grad_norm": 0.32866135239601135, + "learning_rate": 1.2472321562999376e-05, + "loss": 1.2228, + "step": 14695 + }, + { + "epoch": 4.3771477503304235, + "grad_norm": 0.27710849046707153, + "learning_rate": 1.2471386928867862e-05, + "loss": 1.2275, + "step": 14696 + }, + { + "epoch": 4.377445596530092, + "grad_norm": 0.29330718517303467, + "learning_rate": 1.2470452271742853e-05, + "loss": 1.2241, + "step": 14697 + }, + { + "epoch": 4.37774344272976, + "grad_norm": 0.25902777910232544, + "learning_rate": 1.2469517591633047e-05, + "loss": 1.2269, + "step": 14698 + }, + { + "epoch": 4.378041288929429, + "grad_norm": 0.30511945486068726, + "learning_rate": 1.2468582888547139e-05, + "loss": 1.2243, + "step": 14699 + }, + { + "epoch": 4.378339135129098, + "grad_norm": 0.27240312099456787, + "learning_rate": 1.2467648162493823e-05, + "loss": 1.22, + "step": 14700 + }, + { + "epoch": 4.378636981328766, + "grad_norm": 0.30048826336860657, + "learning_rate": 1.24667134134818e-05, + "loss": 1.2165, + "step": 14701 + }, + { + "epoch": 4.378934827528435, + "grad_norm": 0.29394248127937317, + "learning_rate": 1.2465778641519766e-05, + "loss": 1.2471, + "step": 14702 + }, + { + "epoch": 4.3792326737281035, + "grad_norm": 0.29744812846183777, + "learning_rate": 1.2464843846616414e-05, + "loss": 1.2063, + "step": 14703 + }, + { + "epoch": 4.379530519927773, + "grad_norm": 0.4182775020599365, + "learning_rate": 1.2463909028780446e-05, + "loss": 1.2098, + "step": 14704 + }, + { + "epoch": 4.379828366127441, + "grad_norm": 0.41468486189842224, + "learning_rate": 1.2462974188020557e-05, + "loss": 1.2181, + "step": 14705 + }, + { + "epoch": 4.380126212327109, + "grad_norm": 0.3401853144168854, + "learning_rate": 1.2462039324345443e-05, + "loss": 1.2221, + "step": 14706 + }, + { + "epoch": 4.380424058526779, + "grad_norm": 0.29607275128364563, + "learning_rate": 1.2461104437763804e-05, + "loss": 1.2118, + "step": 14707 + }, + { + "epoch": 4.380721904726447, + "grad_norm": 0.37843289971351624, + "learning_rate": 1.246016952828434e-05, + "loss": 1.2261, + "step": 14708 + }, + { + "epoch": 4.381019750926115, + "grad_norm": 0.26002398133277893, + "learning_rate": 1.2459234595915746e-05, + "loss": 1.2039, + "step": 14709 + }, + { + "epoch": 4.3813175971257845, + "grad_norm": 0.3562231957912445, + "learning_rate": 1.2458299640666722e-05, + "loss": 1.2253, + "step": 14710 + }, + { + "epoch": 4.381615443325453, + "grad_norm": 0.24358078837394714, + "learning_rate": 1.2457364662545965e-05, + "loss": 1.2195, + "step": 14711 + }, + { + "epoch": 4.381913289525121, + "grad_norm": 0.4411725401878357, + "learning_rate": 1.2456429661562177e-05, + "loss": 1.2374, + "step": 14712 + }, + { + "epoch": 4.38221113572479, + "grad_norm": 0.41273796558380127, + "learning_rate": 1.2455494637724055e-05, + "loss": 1.243, + "step": 14713 + }, + { + "epoch": 4.382508981924459, + "grad_norm": 0.29247844219207764, + "learning_rate": 1.2454559591040299e-05, + "loss": 1.2326, + "step": 14714 + }, + { + "epoch": 4.382806828124127, + "grad_norm": 0.25195637345314026, + "learning_rate": 1.245362452151961e-05, + "loss": 1.208, + "step": 14715 + }, + { + "epoch": 4.383104674323796, + "grad_norm": 0.30427172780036926, + "learning_rate": 1.2452689429170681e-05, + "loss": 1.2134, + "step": 14716 + }, + { + "epoch": 4.383402520523465, + "grad_norm": 0.2622573673725128, + "learning_rate": 1.2451754314002223e-05, + "loss": 1.2037, + "step": 14717 + }, + { + "epoch": 4.383700366723133, + "grad_norm": 0.2846639156341553, + "learning_rate": 1.2450819176022928e-05, + "loss": 1.235, + "step": 14718 + }, + { + "epoch": 4.383998212922802, + "grad_norm": 0.2838877737522125, + "learning_rate": 1.2449884015241498e-05, + "loss": 1.2351, + "step": 14719 + }, + { + "epoch": 4.3842960591224704, + "grad_norm": 0.2814798355102539, + "learning_rate": 1.2448948831666636e-05, + "loss": 1.2326, + "step": 14720 + }, + { + "epoch": 4.38459390532214, + "grad_norm": 0.2451288402080536, + "learning_rate": 1.244801362530704e-05, + "loss": 1.2131, + "step": 14721 + }, + { + "epoch": 4.384891751521808, + "grad_norm": 0.4032379686832428, + "learning_rate": 1.2447078396171412e-05, + "loss": 1.2264, + "step": 14722 + }, + { + "epoch": 4.385189597721476, + "grad_norm": 0.40528061985969543, + "learning_rate": 1.2446143144268454e-05, + "loss": 1.2213, + "step": 14723 + }, + { + "epoch": 4.3854874439211455, + "grad_norm": 0.37685254216194153, + "learning_rate": 1.2445207869606868e-05, + "loss": 1.2422, + "step": 14724 + }, + { + "epoch": 4.385785290120814, + "grad_norm": 0.2608760893344879, + "learning_rate": 1.2444272572195354e-05, + "loss": 1.2256, + "step": 14725 + }, + { + "epoch": 4.386083136320482, + "grad_norm": 0.42907699942588806, + "learning_rate": 1.2443337252042615e-05, + "loss": 1.2364, + "step": 14726 + }, + { + "epoch": 4.386380982520151, + "grad_norm": 0.3708975315093994, + "learning_rate": 1.2442401909157353e-05, + "loss": 1.2305, + "step": 14727 + }, + { + "epoch": 4.38667882871982, + "grad_norm": 0.4012928307056427, + "learning_rate": 1.244146654354827e-05, + "loss": 1.2076, + "step": 14728 + }, + { + "epoch": 4.386976674919488, + "grad_norm": 0.6299765706062317, + "learning_rate": 1.2440531155224067e-05, + "loss": 1.2276, + "step": 14729 + }, + { + "epoch": 4.387274521119157, + "grad_norm": 0.29901444911956787, + "learning_rate": 1.2439595744193449e-05, + "loss": 1.2369, + "step": 14730 + }, + { + "epoch": 4.387572367318826, + "grad_norm": 0.4860142171382904, + "learning_rate": 1.2438660310465118e-05, + "loss": 1.2252, + "step": 14731 + }, + { + "epoch": 4.387870213518495, + "grad_norm": 0.5218364000320435, + "learning_rate": 1.2437724854047781e-05, + "loss": 1.2136, + "step": 14732 + }, + { + "epoch": 4.388168059718163, + "grad_norm": 0.2555951476097107, + "learning_rate": 1.2436789374950133e-05, + "loss": 1.2207, + "step": 14733 + }, + { + "epoch": 4.3884659059178315, + "grad_norm": 0.369017094373703, + "learning_rate": 1.2435853873180884e-05, + "loss": 1.2143, + "step": 14734 + }, + { + "epoch": 4.388763752117501, + "grad_norm": 0.2541263997554779, + "learning_rate": 1.243491834874874e-05, + "loss": 1.2129, + "step": 14735 + }, + { + "epoch": 4.389061598317169, + "grad_norm": 0.31892648339271545, + "learning_rate": 1.2433982801662397e-05, + "loss": 1.223, + "step": 14736 + }, + { + "epoch": 4.389359444516837, + "grad_norm": 0.2780649960041046, + "learning_rate": 1.2433047231930562e-05, + "loss": 1.225, + "step": 14737 + }, + { + "epoch": 4.389657290716507, + "grad_norm": 0.3346238136291504, + "learning_rate": 1.2432111639561943e-05, + "loss": 1.2218, + "step": 14738 + }, + { + "epoch": 4.389955136916175, + "grad_norm": 0.2518664598464966, + "learning_rate": 1.2431176024565241e-05, + "loss": 1.2035, + "step": 14739 + }, + { + "epoch": 4.390252983115843, + "grad_norm": 0.3096642792224884, + "learning_rate": 1.2430240386949165e-05, + "loss": 1.238, + "step": 14740 + }, + { + "epoch": 4.390550829315512, + "grad_norm": 0.3540990948677063, + "learning_rate": 1.2429304726722416e-05, + "loss": 1.2294, + "step": 14741 + }, + { + "epoch": 4.390848675515181, + "grad_norm": 0.27909091114997864, + "learning_rate": 1.2428369043893703e-05, + "loss": 1.2262, + "step": 14742 + }, + { + "epoch": 4.391146521714849, + "grad_norm": 0.5325915813446045, + "learning_rate": 1.2427433338471726e-05, + "loss": 1.2236, + "step": 14743 + }, + { + "epoch": 4.391444367914518, + "grad_norm": 0.3627501130104065, + "learning_rate": 1.2426497610465196e-05, + "loss": 1.2344, + "step": 14744 + }, + { + "epoch": 4.391742214114187, + "grad_norm": 0.3426806926727295, + "learning_rate": 1.2425561859882815e-05, + "loss": 1.2282, + "step": 14745 + }, + { + "epoch": 4.392040060313855, + "grad_norm": 0.32188719511032104, + "learning_rate": 1.2424626086733291e-05, + "loss": 1.212, + "step": 14746 + }, + { + "epoch": 4.392337906513524, + "grad_norm": 0.27562034130096436, + "learning_rate": 1.2423690291025332e-05, + "loss": 1.2317, + "step": 14747 + }, + { + "epoch": 4.3926357527131925, + "grad_norm": 0.3530782461166382, + "learning_rate": 1.2422754472767641e-05, + "loss": 1.2185, + "step": 14748 + }, + { + "epoch": 4.392933598912862, + "grad_norm": 0.2722001373767853, + "learning_rate": 1.242181863196893e-05, + "loss": 1.2306, + "step": 14749 + }, + { + "epoch": 4.39323144511253, + "grad_norm": 0.46160808205604553, + "learning_rate": 1.2420882768637899e-05, + "loss": 1.2161, + "step": 14750 + }, + { + "epoch": 4.393529291312198, + "grad_norm": 0.32977479696273804, + "learning_rate": 1.2419946882783259e-05, + "loss": 1.2435, + "step": 14751 + }, + { + "epoch": 4.393827137511868, + "grad_norm": 0.30083370208740234, + "learning_rate": 1.2419010974413717e-05, + "loss": 1.2118, + "step": 14752 + }, + { + "epoch": 4.394124983711536, + "grad_norm": 0.37337085604667664, + "learning_rate": 1.241807504353798e-05, + "loss": 1.2238, + "step": 14753 + }, + { + "epoch": 4.394422829911204, + "grad_norm": 0.27815932035446167, + "learning_rate": 1.2417139090164758e-05, + "loss": 1.2384, + "step": 14754 + }, + { + "epoch": 4.3947206761108735, + "grad_norm": 0.30677685141563416, + "learning_rate": 1.2416203114302756e-05, + "loss": 1.2222, + "step": 14755 + }, + { + "epoch": 4.395018522310542, + "grad_norm": 0.3042740225791931, + "learning_rate": 1.2415267115960685e-05, + "loss": 1.2222, + "step": 14756 + }, + { + "epoch": 4.39531636851021, + "grad_norm": 0.3490796387195587, + "learning_rate": 1.2414331095147249e-05, + "loss": 1.2113, + "step": 14757 + }, + { + "epoch": 4.395614214709879, + "grad_norm": 0.3328080177307129, + "learning_rate": 1.2413395051871163e-05, + "loss": 1.227, + "step": 14758 + }, + { + "epoch": 4.395912060909548, + "grad_norm": 0.2872603237628937, + "learning_rate": 1.2412458986141131e-05, + "loss": 1.2232, + "step": 14759 + }, + { + "epoch": 4.396209907109217, + "grad_norm": 0.27765852212905884, + "learning_rate": 1.2411522897965865e-05, + "loss": 1.2288, + "step": 14760 + }, + { + "epoch": 4.396507753308885, + "grad_norm": 0.34384989738464355, + "learning_rate": 1.2410586787354068e-05, + "loss": 1.2071, + "step": 14761 + }, + { + "epoch": 4.3968055995085535, + "grad_norm": 0.326244056224823, + "learning_rate": 1.2409650654314462e-05, + "loss": 1.2303, + "step": 14762 + }, + { + "epoch": 4.397103445708223, + "grad_norm": 0.3647850751876831, + "learning_rate": 1.2408714498855741e-05, + "loss": 1.2216, + "step": 14763 + }, + { + "epoch": 4.397401291907891, + "grad_norm": 0.3525470495223999, + "learning_rate": 1.2407778320986625e-05, + "loss": 1.2219, + "step": 14764 + }, + { + "epoch": 4.397699138107559, + "grad_norm": 0.24576127529144287, + "learning_rate": 1.2406842120715825e-05, + "loss": 1.2218, + "step": 14765 + }, + { + "epoch": 4.397996984307229, + "grad_norm": 0.4573582410812378, + "learning_rate": 1.2405905898052047e-05, + "loss": 1.2293, + "step": 14766 + }, + { + "epoch": 4.398294830506897, + "grad_norm": 0.4853639006614685, + "learning_rate": 1.2404969653004002e-05, + "loss": 1.2209, + "step": 14767 + }, + { + "epoch": 4.398592676706565, + "grad_norm": 0.2845476269721985, + "learning_rate": 1.2404033385580401e-05, + "loss": 1.2371, + "step": 14768 + }, + { + "epoch": 4.3988905229062345, + "grad_norm": 0.8294855952262878, + "learning_rate": 1.2403097095789955e-05, + "loss": 1.2102, + "step": 14769 + }, + { + "epoch": 4.399188369105903, + "grad_norm": 0.4614257216453552, + "learning_rate": 1.2402160783641374e-05, + "loss": 1.2323, + "step": 14770 + }, + { + "epoch": 4.399486215305572, + "grad_norm": 0.7635871767997742, + "learning_rate": 1.2401224449143374e-05, + "loss": 1.2091, + "step": 14771 + }, + { + "epoch": 4.39978406150524, + "grad_norm": 0.576861560344696, + "learning_rate": 1.2400288092304663e-05, + "loss": 1.233, + "step": 14772 + }, + { + "epoch": 4.400081907704909, + "grad_norm": 0.48361676931381226, + "learning_rate": 1.2399351713133953e-05, + "loss": 1.2192, + "step": 14773 + }, + { + "epoch": 4.400379753904578, + "grad_norm": 0.26259520649909973, + "learning_rate": 1.2398415311639954e-05, + "loss": 1.2013, + "step": 14774 + }, + { + "epoch": 4.400677600104246, + "grad_norm": 0.5118993520736694, + "learning_rate": 1.239747888783138e-05, + "loss": 1.2351, + "step": 14775 + }, + { + "epoch": 4.400975446303915, + "grad_norm": 0.40735262632369995, + "learning_rate": 1.2396542441716946e-05, + "loss": 1.214, + "step": 14776 + }, + { + "epoch": 4.401273292503584, + "grad_norm": 0.25754430890083313, + "learning_rate": 1.2395605973305362e-05, + "loss": 1.2058, + "step": 14777 + }, + { + "epoch": 4.401571138703252, + "grad_norm": 0.4109981656074524, + "learning_rate": 1.239466948260534e-05, + "loss": 1.2302, + "step": 14778 + }, + { + "epoch": 4.4018689849029204, + "grad_norm": 0.3392815589904785, + "learning_rate": 1.2393732969625597e-05, + "loss": 1.2222, + "step": 14779 + }, + { + "epoch": 4.40216683110259, + "grad_norm": 0.2568325102329254, + "learning_rate": 1.2392796434374836e-05, + "loss": 1.2215, + "step": 14780 + }, + { + "epoch": 4.402464677302258, + "grad_norm": 0.6553801894187927, + "learning_rate": 1.239185987686178e-05, + "loss": 1.2087, + "step": 14781 + }, + { + "epoch": 4.402762523501926, + "grad_norm": 0.3319759964942932, + "learning_rate": 1.2390923297095142e-05, + "loss": 1.2298, + "step": 14782 + }, + { + "epoch": 4.4030603697015955, + "grad_norm": 0.39988401532173157, + "learning_rate": 1.2389986695083636e-05, + "loss": 1.2386, + "step": 14783 + }, + { + "epoch": 4.403358215901264, + "grad_norm": 0.3550228774547577, + "learning_rate": 1.2389050070835972e-05, + "loss": 1.2066, + "step": 14784 + }, + { + "epoch": 4.403656062100932, + "grad_norm": 0.2917831242084503, + "learning_rate": 1.2388113424360865e-05, + "loss": 1.2204, + "step": 14785 + }, + { + "epoch": 4.403953908300601, + "grad_norm": 0.6269718408584595, + "learning_rate": 1.2387176755667032e-05, + "loss": 1.2186, + "step": 14786 + }, + { + "epoch": 4.40425175450027, + "grad_norm": 0.31222423911094666, + "learning_rate": 1.2386240064763187e-05, + "loss": 1.2076, + "step": 14787 + }, + { + "epoch": 4.404549600699939, + "grad_norm": 0.4469805061817169, + "learning_rate": 1.2385303351658042e-05, + "loss": 1.2248, + "step": 14788 + }, + { + "epoch": 4.404847446899607, + "grad_norm": 0.3180270195007324, + "learning_rate": 1.2384366616360317e-05, + "loss": 1.2218, + "step": 14789 + }, + { + "epoch": 4.405145293099276, + "grad_norm": 0.3021213412284851, + "learning_rate": 1.2383429858878723e-05, + "loss": 1.2304, + "step": 14790 + }, + { + "epoch": 4.405443139298945, + "grad_norm": 0.36727070808410645, + "learning_rate": 1.238249307922198e-05, + "loss": 1.2406, + "step": 14791 + }, + { + "epoch": 4.405740985498613, + "grad_norm": 0.33401358127593994, + "learning_rate": 1.2381556277398796e-05, + "loss": 1.2157, + "step": 14792 + }, + { + "epoch": 4.4060388316982815, + "grad_norm": 0.24210068583488464, + "learning_rate": 1.2380619453417895e-05, + "loss": 1.2189, + "step": 14793 + }, + { + "epoch": 4.406336677897951, + "grad_norm": 0.35449934005737305, + "learning_rate": 1.2379682607287988e-05, + "loss": 1.2329, + "step": 14794 + }, + { + "epoch": 4.406634524097619, + "grad_norm": 0.3415757715702057, + "learning_rate": 1.2378745739017795e-05, + "loss": 1.2203, + "step": 14795 + }, + { + "epoch": 4.406932370297287, + "grad_norm": 0.305342435836792, + "learning_rate": 1.2377808848616029e-05, + "loss": 1.2099, + "step": 14796 + }, + { + "epoch": 4.407230216496957, + "grad_norm": 0.5291569828987122, + "learning_rate": 1.237687193609141e-05, + "loss": 1.2111, + "step": 14797 + }, + { + "epoch": 4.407528062696625, + "grad_norm": 0.3190738260746002, + "learning_rate": 1.2375935001452652e-05, + "loss": 1.2317, + "step": 14798 + }, + { + "epoch": 4.407825908896294, + "grad_norm": 0.42422544956207275, + "learning_rate": 1.2374998044708471e-05, + "loss": 1.2084, + "step": 14799 + }, + { + "epoch": 4.408123755095962, + "grad_norm": 0.3175092041492462, + "learning_rate": 1.2374061065867592e-05, + "loss": 1.2224, + "step": 14800 + }, + { + "epoch": 4.408421601295631, + "grad_norm": 0.3442769944667816, + "learning_rate": 1.2373124064938722e-05, + "loss": 1.2054, + "step": 14801 + }, + { + "epoch": 4.4087194474953, + "grad_norm": 0.44270092248916626, + "learning_rate": 1.2372187041930588e-05, + "loss": 1.2225, + "step": 14802 + }, + { + "epoch": 4.409017293694968, + "grad_norm": 0.25048506259918213, + "learning_rate": 1.2371249996851903e-05, + "loss": 1.2283, + "step": 14803 + }, + { + "epoch": 4.409315139894637, + "grad_norm": 0.36463814973831177, + "learning_rate": 1.2370312929711383e-05, + "loss": 1.2071, + "step": 14804 + }, + { + "epoch": 4.409612986094306, + "grad_norm": 0.34449633955955505, + "learning_rate": 1.2369375840517752e-05, + "loss": 1.2133, + "step": 14805 + }, + { + "epoch": 4.409910832293974, + "grad_norm": 0.26221033930778503, + "learning_rate": 1.2368438729279725e-05, + "loss": 1.209, + "step": 14806 + }, + { + "epoch": 4.4102086784936425, + "grad_norm": 0.4529891610145569, + "learning_rate": 1.2367501596006023e-05, + "loss": 1.2131, + "step": 14807 + }, + { + "epoch": 4.410506524693312, + "grad_norm": 0.26924896240234375, + "learning_rate": 1.2366564440705363e-05, + "loss": 1.2267, + "step": 14808 + }, + { + "epoch": 4.41080437089298, + "grad_norm": 0.35140368342399597, + "learning_rate": 1.2365627263386468e-05, + "loss": 1.2187, + "step": 14809 + }, + { + "epoch": 4.411102217092648, + "grad_norm": 0.33502423763275146, + "learning_rate": 1.2364690064058052e-05, + "loss": 1.2154, + "step": 14810 + }, + { + "epoch": 4.411400063292318, + "grad_norm": 0.3482076823711395, + "learning_rate": 1.2363752842728836e-05, + "loss": 1.229, + "step": 14811 + }, + { + "epoch": 4.411697909491986, + "grad_norm": 0.40956559777259827, + "learning_rate": 1.236281559940754e-05, + "loss": 1.222, + "step": 14812 + }, + { + "epoch": 4.411995755691654, + "grad_norm": 0.32557663321495056, + "learning_rate": 1.2361878334102885e-05, + "loss": 1.2239, + "step": 14813 + }, + { + "epoch": 4.4122936018913235, + "grad_norm": 0.42252591252326965, + "learning_rate": 1.2360941046823596e-05, + "loss": 1.2478, + "step": 14814 + }, + { + "epoch": 4.412591448090992, + "grad_norm": 0.2798241674900055, + "learning_rate": 1.2360003737578383e-05, + "loss": 1.2122, + "step": 14815 + }, + { + "epoch": 4.412889294290661, + "grad_norm": 0.35809919238090515, + "learning_rate": 1.2359066406375973e-05, + "loss": 1.2075, + "step": 14816 + }, + { + "epoch": 4.413187140490329, + "grad_norm": 0.24428319931030273, + "learning_rate": 1.2358129053225088e-05, + "loss": 1.2215, + "step": 14817 + }, + { + "epoch": 4.413484986689998, + "grad_norm": 0.38702455163002014, + "learning_rate": 1.2357191678134443e-05, + "loss": 1.2066, + "step": 14818 + }, + { + "epoch": 4.413782832889667, + "grad_norm": 0.265264630317688, + "learning_rate": 1.2356254281112766e-05, + "loss": 1.2069, + "step": 14819 + }, + { + "epoch": 4.414080679089335, + "grad_norm": 0.3403869569301605, + "learning_rate": 1.2355316862168776e-05, + "loss": 1.217, + "step": 14820 + }, + { + "epoch": 4.4143785252890035, + "grad_norm": 0.27034205198287964, + "learning_rate": 1.2354379421311192e-05, + "loss": 1.224, + "step": 14821 + }, + { + "epoch": 4.414676371488673, + "grad_norm": 0.34566351771354675, + "learning_rate": 1.2353441958548736e-05, + "loss": 1.2247, + "step": 14822 + }, + { + "epoch": 4.414974217688341, + "grad_norm": 0.2551047205924988, + "learning_rate": 1.2352504473890135e-05, + "loss": 1.2262, + "step": 14823 + }, + { + "epoch": 4.415272063888009, + "grad_norm": 0.3116854727268219, + "learning_rate": 1.2351566967344109e-05, + "loss": 1.2063, + "step": 14824 + }, + { + "epoch": 4.415569910087679, + "grad_norm": 0.29129716753959656, + "learning_rate": 1.2350629438919379e-05, + "loss": 1.2205, + "step": 14825 + }, + { + "epoch": 4.415867756287347, + "grad_norm": 0.28079909086227417, + "learning_rate": 1.2349691888624667e-05, + "loss": 1.2141, + "step": 14826 + }, + { + "epoch": 4.416165602487016, + "grad_norm": 0.2966746389865875, + "learning_rate": 1.2348754316468699e-05, + "loss": 1.2275, + "step": 14827 + }, + { + "epoch": 4.4164634486866845, + "grad_norm": 0.24404703080654144, + "learning_rate": 1.2347816722460196e-05, + "loss": 1.2214, + "step": 14828 + }, + { + "epoch": 4.416761294886353, + "grad_norm": 0.286882609128952, + "learning_rate": 1.2346879106607878e-05, + "loss": 1.2331, + "step": 14829 + }, + { + "epoch": 4.417059141086022, + "grad_norm": 0.26083606481552124, + "learning_rate": 1.2345941468920476e-05, + "loss": 1.2353, + "step": 14830 + }, + { + "epoch": 4.41735698728569, + "grad_norm": 0.27248531579971313, + "learning_rate": 1.234500380940671e-05, + "loss": 1.2112, + "step": 14831 + }, + { + "epoch": 4.417654833485359, + "grad_norm": 0.253886878490448, + "learning_rate": 1.2344066128075303e-05, + "loss": 1.2263, + "step": 14832 + }, + { + "epoch": 4.417952679685028, + "grad_norm": 0.27620503306388855, + "learning_rate": 1.2343128424934978e-05, + "loss": 1.2071, + "step": 14833 + }, + { + "epoch": 4.418250525884696, + "grad_norm": 0.2539706528186798, + "learning_rate": 1.2342190699994461e-05, + "loss": 1.2246, + "step": 14834 + }, + { + "epoch": 4.418548372084365, + "grad_norm": 0.32821041345596313, + "learning_rate": 1.2341252953262477e-05, + "loss": 1.2081, + "step": 14835 + }, + { + "epoch": 4.418846218284034, + "grad_norm": 0.2732594311237335, + "learning_rate": 1.2340315184747749e-05, + "loss": 1.2381, + "step": 14836 + }, + { + "epoch": 4.419144064483702, + "grad_norm": 0.35821712017059326, + "learning_rate": 1.2339377394459006e-05, + "loss": 1.2319, + "step": 14837 + }, + { + "epoch": 4.419441910683371, + "grad_norm": 0.2847093641757965, + "learning_rate": 1.2338439582404969e-05, + "loss": 1.2155, + "step": 14838 + }, + { + "epoch": 4.41973975688304, + "grad_norm": 0.41309723258018494, + "learning_rate": 1.2337501748594362e-05, + "loss": 1.2232, + "step": 14839 + }, + { + "epoch": 4.420037603082708, + "grad_norm": 0.29065367579460144, + "learning_rate": 1.2336563893035913e-05, + "loss": 1.2236, + "step": 14840 + }, + { + "epoch": 4.420335449282377, + "grad_norm": 0.3107321262359619, + "learning_rate": 1.2335626015738352e-05, + "loss": 1.2126, + "step": 14841 + }, + { + "epoch": 4.4206332954820455, + "grad_norm": 0.2843754291534424, + "learning_rate": 1.2334688116710396e-05, + "loss": 1.2189, + "step": 14842 + }, + { + "epoch": 4.420931141681714, + "grad_norm": 0.26646536588668823, + "learning_rate": 1.2333750195960776e-05, + "loss": 1.2059, + "step": 14843 + }, + { + "epoch": 4.421228987881383, + "grad_norm": 0.2937961220741272, + "learning_rate": 1.233281225349822e-05, + "loss": 1.2224, + "step": 14844 + }, + { + "epoch": 4.421526834081051, + "grad_norm": 0.3047434687614441, + "learning_rate": 1.2331874289331449e-05, + "loss": 1.2273, + "step": 14845 + }, + { + "epoch": 4.42182468028072, + "grad_norm": 0.27585846185684204, + "learning_rate": 1.2330936303469194e-05, + "loss": 1.2048, + "step": 14846 + }, + { + "epoch": 4.422122526480389, + "grad_norm": 0.2681923806667328, + "learning_rate": 1.2329998295920183e-05, + "loss": 1.2067, + "step": 14847 + }, + { + "epoch": 4.422420372680057, + "grad_norm": 0.27037373185157776, + "learning_rate": 1.2329060266693142e-05, + "loss": 1.2038, + "step": 14848 + }, + { + "epoch": 4.422718218879726, + "grad_norm": 0.28603503108024597, + "learning_rate": 1.2328122215796797e-05, + "loss": 1.2197, + "step": 14849 + }, + { + "epoch": 4.423016065079395, + "grad_norm": 0.27683189511299133, + "learning_rate": 1.2327184143239872e-05, + "loss": 1.2242, + "step": 14850 + }, + { + "epoch": 4.423313911279063, + "grad_norm": 0.27266234159469604, + "learning_rate": 1.2326246049031102e-05, + "loss": 1.2123, + "step": 14851 + }, + { + "epoch": 4.4236117574787315, + "grad_norm": 0.2785188853740692, + "learning_rate": 1.232530793317921e-05, + "loss": 1.2184, + "step": 14852 + }, + { + "epoch": 4.423909603678401, + "grad_norm": 0.2624119818210602, + "learning_rate": 1.2324369795692925e-05, + "loss": 1.2185, + "step": 14853 + }, + { + "epoch": 4.424207449878069, + "grad_norm": 0.33412596583366394, + "learning_rate": 1.2323431636580979e-05, + "loss": 1.2289, + "step": 14854 + }, + { + "epoch": 4.424505296077738, + "grad_norm": 0.2909590005874634, + "learning_rate": 1.2322493455852096e-05, + "loss": 1.2244, + "step": 14855 + }, + { + "epoch": 4.424803142277407, + "grad_norm": 0.33464330434799194, + "learning_rate": 1.2321555253515005e-05, + "loss": 1.2262, + "step": 14856 + }, + { + "epoch": 4.425100988477075, + "grad_norm": 0.38978371024131775, + "learning_rate": 1.2320617029578438e-05, + "loss": 1.2211, + "step": 14857 + }, + { + "epoch": 4.425398834676744, + "grad_norm": 0.3329388201236725, + "learning_rate": 1.2319678784051121e-05, + "loss": 1.2378, + "step": 14858 + }, + { + "epoch": 4.425696680876412, + "grad_norm": 0.3889275789260864, + "learning_rate": 1.2318740516941786e-05, + "loss": 1.2158, + "step": 14859 + }, + { + "epoch": 4.425994527076081, + "grad_norm": 0.34946542978286743, + "learning_rate": 1.231780222825916e-05, + "loss": 1.222, + "step": 14860 + }, + { + "epoch": 4.42629237327575, + "grad_norm": 0.315692275762558, + "learning_rate": 1.2316863918011975e-05, + "loss": 1.2102, + "step": 14861 + }, + { + "epoch": 4.426590219475418, + "grad_norm": 0.2709731161594391, + "learning_rate": 1.2315925586208958e-05, + "loss": 1.2273, + "step": 14862 + }, + { + "epoch": 4.426888065675087, + "grad_norm": 0.3175421953201294, + "learning_rate": 1.231498723285884e-05, + "loss": 1.2106, + "step": 14863 + }, + { + "epoch": 4.427185911874756, + "grad_norm": 0.2550317049026489, + "learning_rate": 1.2314048857970354e-05, + "loss": 1.2278, + "step": 14864 + }, + { + "epoch": 4.427483758074424, + "grad_norm": 0.365434467792511, + "learning_rate": 1.231311046155223e-05, + "loss": 1.2143, + "step": 14865 + }, + { + "epoch": 4.427781604274093, + "grad_norm": 0.3138810396194458, + "learning_rate": 1.2312172043613197e-05, + "loss": 1.2314, + "step": 14866 + }, + { + "epoch": 4.428079450473762, + "grad_norm": 0.32338428497314453, + "learning_rate": 1.2311233604161984e-05, + "loss": 1.2055, + "step": 14867 + }, + { + "epoch": 4.42837729667343, + "grad_norm": 0.3391285240650177, + "learning_rate": 1.2310295143207328e-05, + "loss": 1.2074, + "step": 14868 + }, + { + "epoch": 4.428675142873099, + "grad_norm": 0.3374027609825134, + "learning_rate": 1.2309356660757953e-05, + "loss": 1.2198, + "step": 14869 + }, + { + "epoch": 4.428972989072768, + "grad_norm": 0.29991164803504944, + "learning_rate": 1.2308418156822599e-05, + "loss": 1.2398, + "step": 14870 + }, + { + "epoch": 4.429270835272436, + "grad_norm": 0.36310943961143494, + "learning_rate": 1.2307479631409992e-05, + "loss": 1.2222, + "step": 14871 + }, + { + "epoch": 4.429568681472105, + "grad_norm": 0.35218074917793274, + "learning_rate": 1.2306541084528864e-05, + "loss": 1.2158, + "step": 14872 + }, + { + "epoch": 4.4298665276717735, + "grad_norm": 0.2511431872844696, + "learning_rate": 1.230560251618795e-05, + "loss": 1.2144, + "step": 14873 + }, + { + "epoch": 4.430164373871442, + "grad_norm": 0.34200379252433777, + "learning_rate": 1.2304663926395977e-05, + "loss": 1.249, + "step": 14874 + }, + { + "epoch": 4.430462220071111, + "grad_norm": 0.33715832233428955, + "learning_rate": 1.2303725315161682e-05, + "loss": 1.2203, + "step": 14875 + }, + { + "epoch": 4.430760066270779, + "grad_norm": 0.3251497447490692, + "learning_rate": 1.2302786682493797e-05, + "loss": 1.2208, + "step": 14876 + }, + { + "epoch": 4.431057912470448, + "grad_norm": 0.3092595636844635, + "learning_rate": 1.2301848028401055e-05, + "loss": 1.2414, + "step": 14877 + }, + { + "epoch": 4.431355758670117, + "grad_norm": 0.26255378127098083, + "learning_rate": 1.2300909352892192e-05, + "loss": 1.2122, + "step": 14878 + }, + { + "epoch": 4.431653604869785, + "grad_norm": 0.3438512980937958, + "learning_rate": 1.2299970655975935e-05, + "loss": 1.2061, + "step": 14879 + }, + { + "epoch": 4.4319514510694535, + "grad_norm": 0.2744996249675751, + "learning_rate": 1.229903193766102e-05, + "loss": 1.2267, + "step": 14880 + }, + { + "epoch": 4.432249297269123, + "grad_norm": 0.3187209665775299, + "learning_rate": 1.229809319795618e-05, + "loss": 1.2147, + "step": 14881 + }, + { + "epoch": 4.432547143468791, + "grad_norm": 0.6051568388938904, + "learning_rate": 1.2297154436870155e-05, + "loss": 1.2268, + "step": 14882 + }, + { + "epoch": 4.43284498966846, + "grad_norm": 0.6375366449356079, + "learning_rate": 1.2296215654411674e-05, + "loss": 1.2223, + "step": 14883 + }, + { + "epoch": 4.433142835868129, + "grad_norm": 0.29692256450653076, + "learning_rate": 1.2295276850589471e-05, + "loss": 1.2285, + "step": 14884 + }, + { + "epoch": 4.433440682067797, + "grad_norm": 0.37519916892051697, + "learning_rate": 1.229433802541228e-05, + "loss": 1.2154, + "step": 14885 + }, + { + "epoch": 4.433738528267466, + "grad_norm": 0.2912464141845703, + "learning_rate": 1.2293399178888836e-05, + "loss": 1.2194, + "step": 14886 + }, + { + "epoch": 4.4340363744671345, + "grad_norm": 0.4365774393081665, + "learning_rate": 1.2292460311027878e-05, + "loss": 1.2401, + "step": 14887 + }, + { + "epoch": 4.434334220666803, + "grad_norm": 0.29991504549980164, + "learning_rate": 1.2291521421838135e-05, + "loss": 1.2164, + "step": 14888 + }, + { + "epoch": 4.434632066866472, + "grad_norm": 0.37065500020980835, + "learning_rate": 1.229058251132835e-05, + "loss": 1.2142, + "step": 14889 + }, + { + "epoch": 4.43492991306614, + "grad_norm": 0.2551129162311554, + "learning_rate": 1.2289643579507251e-05, + "loss": 1.2062, + "step": 14890 + }, + { + "epoch": 4.435227759265809, + "grad_norm": 0.41739919781684875, + "learning_rate": 1.2288704626383576e-05, + "loss": 1.2331, + "step": 14891 + }, + { + "epoch": 4.435525605465478, + "grad_norm": 0.2787053883075714, + "learning_rate": 1.2287765651966064e-05, + "loss": 1.229, + "step": 14892 + }, + { + "epoch": 4.435823451665146, + "grad_norm": 0.38573333621025085, + "learning_rate": 1.2286826656263445e-05, + "loss": 1.2234, + "step": 14893 + }, + { + "epoch": 4.4361212978648155, + "grad_norm": 0.3470078110694885, + "learning_rate": 1.2285887639284461e-05, + "loss": 1.2266, + "step": 14894 + }, + { + "epoch": 4.436419144064484, + "grad_norm": 0.2691015601158142, + "learning_rate": 1.2284948601037847e-05, + "loss": 1.2335, + "step": 14895 + }, + { + "epoch": 4.436716990264152, + "grad_norm": 0.2952558994293213, + "learning_rate": 1.228400954153234e-05, + "loss": 1.2217, + "step": 14896 + }, + { + "epoch": 4.437014836463821, + "grad_norm": 0.3009900748729706, + "learning_rate": 1.2283070460776674e-05, + "loss": 1.2047, + "step": 14897 + }, + { + "epoch": 4.43731268266349, + "grad_norm": 0.3212072551250458, + "learning_rate": 1.228213135877959e-05, + "loss": 1.2204, + "step": 14898 + }, + { + "epoch": 4.437610528863158, + "grad_norm": 0.2819103002548218, + "learning_rate": 1.2281192235549822e-05, + "loss": 1.2143, + "step": 14899 + }, + { + "epoch": 4.437908375062827, + "grad_norm": 0.3015141189098358, + "learning_rate": 1.2280253091096108e-05, + "loss": 1.2301, + "step": 14900 + }, + { + "epoch": 4.4382062212624955, + "grad_norm": 0.33626100420951843, + "learning_rate": 1.2279313925427188e-05, + "loss": 1.2429, + "step": 14901 + }, + { + "epoch": 4.438504067462164, + "grad_norm": 0.3932854235172272, + "learning_rate": 1.22783747385518e-05, + "loss": 1.236, + "step": 14902 + }, + { + "epoch": 4.438801913661833, + "grad_norm": 0.30726158618927, + "learning_rate": 1.2277435530478679e-05, + "loss": 1.1996, + "step": 14903 + }, + { + "epoch": 4.439099759861501, + "grad_norm": 0.319837749004364, + "learning_rate": 1.2276496301216564e-05, + "loss": 1.2312, + "step": 14904 + }, + { + "epoch": 4.439397606061171, + "grad_norm": 0.28553327918052673, + "learning_rate": 1.2275557050774191e-05, + "loss": 1.2307, + "step": 14905 + }, + { + "epoch": 4.439695452260839, + "grad_norm": 0.3147313892841339, + "learning_rate": 1.227461777916031e-05, + "loss": 1.2167, + "step": 14906 + }, + { + "epoch": 4.439993298460507, + "grad_norm": 0.316043883562088, + "learning_rate": 1.2273678486383647e-05, + "loss": 1.2276, + "step": 14907 + }, + { + "epoch": 4.4402911446601765, + "grad_norm": 0.2953035831451416, + "learning_rate": 1.2272739172452947e-05, + "loss": 1.1966, + "step": 14908 + }, + { + "epoch": 4.440588990859845, + "grad_norm": 0.43210822343826294, + "learning_rate": 1.227179983737695e-05, + "loss": 1.2325, + "step": 14909 + }, + { + "epoch": 4.440886837059513, + "grad_norm": 0.4180282950401306, + "learning_rate": 1.2270860481164391e-05, + "loss": 1.2111, + "step": 14910 + }, + { + "epoch": 4.441184683259182, + "grad_norm": 0.3034069836139679, + "learning_rate": 1.2269921103824015e-05, + "loss": 1.2263, + "step": 14911 + }, + { + "epoch": 4.441482529458851, + "grad_norm": 0.6624096632003784, + "learning_rate": 1.2268981705364556e-05, + "loss": 1.2138, + "step": 14912 + }, + { + "epoch": 4.441780375658519, + "grad_norm": 0.4512910842895508, + "learning_rate": 1.2268042285794761e-05, + "loss": 1.2096, + "step": 14913 + }, + { + "epoch": 4.442078221858188, + "grad_norm": 0.4340059757232666, + "learning_rate": 1.2267102845123364e-05, + "loss": 1.228, + "step": 14914 + }, + { + "epoch": 4.442376068057857, + "grad_norm": 0.550622284412384, + "learning_rate": 1.226616338335911e-05, + "loss": 1.2339, + "step": 14915 + }, + { + "epoch": 4.442673914257525, + "grad_norm": 0.3245103657245636, + "learning_rate": 1.2265223900510734e-05, + "loss": 1.214, + "step": 14916 + }, + { + "epoch": 4.442971760457194, + "grad_norm": 0.5484495162963867, + "learning_rate": 1.2264284396586982e-05, + "loss": 1.2391, + "step": 14917 + }, + { + "epoch": 4.443269606656862, + "grad_norm": 0.2908317744731903, + "learning_rate": 1.2263344871596595e-05, + "loss": 1.2207, + "step": 14918 + }, + { + "epoch": 4.443567452856531, + "grad_norm": 0.3075171709060669, + "learning_rate": 1.2262405325548313e-05, + "loss": 1.2199, + "step": 14919 + }, + { + "epoch": 4.4438652990562, + "grad_norm": 0.4471679925918579, + "learning_rate": 1.2261465758450877e-05, + "loss": 1.226, + "step": 14920 + }, + { + "epoch": 4.444163145255868, + "grad_norm": 0.3009333908557892, + "learning_rate": 1.2260526170313027e-05, + "loss": 1.2263, + "step": 14921 + }, + { + "epoch": 4.4444609914555375, + "grad_norm": 0.4344242215156555, + "learning_rate": 1.2259586561143504e-05, + "loss": 1.22, + "step": 14922 + }, + { + "epoch": 4.444758837655206, + "grad_norm": 0.28826597332954407, + "learning_rate": 1.2258646930951058e-05, + "loss": 1.2158, + "step": 14923 + }, + { + "epoch": 4.445056683854874, + "grad_norm": 0.4402480721473694, + "learning_rate": 1.2257707279744424e-05, + "loss": 1.2161, + "step": 14924 + }, + { + "epoch": 4.445354530054543, + "grad_norm": 0.35612747073173523, + "learning_rate": 1.2256767607532344e-05, + "loss": 1.2085, + "step": 14925 + }, + { + "epoch": 4.445652376254212, + "grad_norm": 0.36830493807792664, + "learning_rate": 1.2255827914323568e-05, + "loss": 1.2092, + "step": 14926 + }, + { + "epoch": 4.44595022245388, + "grad_norm": 0.35415905714035034, + "learning_rate": 1.2254888200126829e-05, + "loss": 1.2053, + "step": 14927 + }, + { + "epoch": 4.446248068653549, + "grad_norm": 0.35066384077072144, + "learning_rate": 1.2253948464950875e-05, + "loss": 1.2161, + "step": 14928 + }, + { + "epoch": 4.446545914853218, + "grad_norm": 0.33780568838119507, + "learning_rate": 1.2253008708804451e-05, + "loss": 1.2341, + "step": 14929 + }, + { + "epoch": 4.446843761052886, + "grad_norm": 0.3455325961112976, + "learning_rate": 1.22520689316963e-05, + "loss": 1.2197, + "step": 14930 + }, + { + "epoch": 4.447141607252555, + "grad_norm": 0.2523250877857208, + "learning_rate": 1.2251129133635158e-05, + "loss": 1.2222, + "step": 14931 + }, + { + "epoch": 4.4474394534522235, + "grad_norm": 0.34670156240463257, + "learning_rate": 1.2250189314629778e-05, + "loss": 1.2193, + "step": 14932 + }, + { + "epoch": 4.447737299651893, + "grad_norm": 0.2636018693447113, + "learning_rate": 1.22492494746889e-05, + "loss": 1.2022, + "step": 14933 + }, + { + "epoch": 4.448035145851561, + "grad_norm": 0.3007007837295532, + "learning_rate": 1.2248309613821267e-05, + "loss": 1.2254, + "step": 14934 + }, + { + "epoch": 4.448332992051229, + "grad_norm": 0.2943074703216553, + "learning_rate": 1.2247369732035628e-05, + "loss": 1.2271, + "step": 14935 + }, + { + "epoch": 4.448630838250899, + "grad_norm": 0.26788967847824097, + "learning_rate": 1.2246429829340724e-05, + "loss": 1.2442, + "step": 14936 + }, + { + "epoch": 4.448928684450567, + "grad_norm": 0.3123735785484314, + "learning_rate": 1.2245489905745298e-05, + "loss": 1.2185, + "step": 14937 + }, + { + "epoch": 4.449226530650235, + "grad_norm": 0.25530245900154114, + "learning_rate": 1.2244549961258098e-05, + "loss": 1.2148, + "step": 14938 + }, + { + "epoch": 4.449524376849904, + "grad_norm": 0.2747754156589508, + "learning_rate": 1.224360999588787e-05, + "loss": 1.2215, + "step": 14939 + }, + { + "epoch": 4.449822223049573, + "grad_norm": 0.2637244462966919, + "learning_rate": 1.2242670009643357e-05, + "loss": 1.225, + "step": 14940 + }, + { + "epoch": 4.450120069249241, + "grad_norm": 0.3310074806213379, + "learning_rate": 1.2241730002533303e-05, + "loss": 1.2368, + "step": 14941 + }, + { + "epoch": 4.45041791544891, + "grad_norm": 0.30113866925239563, + "learning_rate": 1.2240789974566458e-05, + "loss": 1.21, + "step": 14942 + }, + { + "epoch": 4.450715761648579, + "grad_norm": 0.40723204612731934, + "learning_rate": 1.2239849925751568e-05, + "loss": 1.2365, + "step": 14943 + }, + { + "epoch": 4.451013607848247, + "grad_norm": 0.48997971415519714, + "learning_rate": 1.2238909856097374e-05, + "loss": 1.2199, + "step": 14944 + }, + { + "epoch": 4.451311454047916, + "grad_norm": 0.27049025893211365, + "learning_rate": 1.2237969765612622e-05, + "loss": 1.2268, + "step": 14945 + }, + { + "epoch": 4.4516093002475845, + "grad_norm": 0.3590318262577057, + "learning_rate": 1.2237029654306068e-05, + "loss": 1.2199, + "step": 14946 + }, + { + "epoch": 4.451907146447254, + "grad_norm": 0.27747493982315063, + "learning_rate": 1.2236089522186452e-05, + "loss": 1.2234, + "step": 14947 + }, + { + "epoch": 4.452204992646922, + "grad_norm": 0.295388400554657, + "learning_rate": 1.2235149369262517e-05, + "loss": 1.2089, + "step": 14948 + }, + { + "epoch": 4.45250283884659, + "grad_norm": 0.26772403717041016, + "learning_rate": 1.2234209195543016e-05, + "loss": 1.2114, + "step": 14949 + }, + { + "epoch": 4.45280068504626, + "grad_norm": 0.31864991784095764, + "learning_rate": 1.2233269001036698e-05, + "loss": 1.2265, + "step": 14950 + }, + { + "epoch": 4.453098531245928, + "grad_norm": 0.301789253950119, + "learning_rate": 1.2232328785752304e-05, + "loss": 1.2396, + "step": 14951 + }, + { + "epoch": 4.453396377445596, + "grad_norm": 0.4028361141681671, + "learning_rate": 1.2231388549698584e-05, + "loss": 1.2165, + "step": 14952 + }, + { + "epoch": 4.4536942236452655, + "grad_norm": 0.38277995586395264, + "learning_rate": 1.223044829288429e-05, + "loss": 1.2108, + "step": 14953 + }, + { + "epoch": 4.453992069844934, + "grad_norm": 0.304472953081131, + "learning_rate": 1.2229508015318163e-05, + "loss": 1.2275, + "step": 14954 + }, + { + "epoch": 4.454289916044602, + "grad_norm": 0.3237842321395874, + "learning_rate": 1.2228567717008956e-05, + "loss": 1.2159, + "step": 14955 + }, + { + "epoch": 4.454587762244271, + "grad_norm": 0.36103376746177673, + "learning_rate": 1.2227627397965417e-05, + "loss": 1.2131, + "step": 14956 + }, + { + "epoch": 4.45488560844394, + "grad_norm": 0.6664847135543823, + "learning_rate": 1.2226687058196292e-05, + "loss": 1.2336, + "step": 14957 + }, + { + "epoch": 4.455183454643608, + "grad_norm": 0.2946167290210724, + "learning_rate": 1.2225746697710334e-05, + "loss": 1.2162, + "step": 14958 + }, + { + "epoch": 4.455481300843277, + "grad_norm": 0.5487415194511414, + "learning_rate": 1.2224806316516288e-05, + "loss": 1.2071, + "step": 14959 + }, + { + "epoch": 4.4557791470429455, + "grad_norm": 0.3942016065120697, + "learning_rate": 1.2223865914622908e-05, + "loss": 1.2124, + "step": 14960 + }, + { + "epoch": 4.456076993242615, + "grad_norm": 0.33333155512809753, + "learning_rate": 1.2222925492038938e-05, + "loss": 1.2308, + "step": 14961 + }, + { + "epoch": 4.456374839442283, + "grad_norm": 0.31741824746131897, + "learning_rate": 1.2221985048773131e-05, + "loss": 1.2147, + "step": 14962 + }, + { + "epoch": 4.456672685641951, + "grad_norm": 0.34265267848968506, + "learning_rate": 1.2221044584834231e-05, + "loss": 1.2316, + "step": 14963 + }, + { + "epoch": 4.456970531841621, + "grad_norm": 0.3344602584838867, + "learning_rate": 1.2220104100230999e-05, + "loss": 1.2107, + "step": 14964 + }, + { + "epoch": 4.457268378041289, + "grad_norm": 0.40091463923454285, + "learning_rate": 1.2219163594972177e-05, + "loss": 1.2291, + "step": 14965 + }, + { + "epoch": 4.457566224240957, + "grad_norm": 0.5136224031448364, + "learning_rate": 1.2218223069066515e-05, + "loss": 1.22, + "step": 14966 + }, + { + "epoch": 4.4578640704406265, + "grad_norm": 0.3041544556617737, + "learning_rate": 1.2217282522522768e-05, + "loss": 1.2207, + "step": 14967 + }, + { + "epoch": 4.458161916640295, + "grad_norm": 0.5141249299049377, + "learning_rate": 1.2216341955349685e-05, + "loss": 1.225, + "step": 14968 + }, + { + "epoch": 4.458459762839963, + "grad_norm": 0.28203797340393066, + "learning_rate": 1.221540136755601e-05, + "loss": 1.2196, + "step": 14969 + }, + { + "epoch": 4.458757609039632, + "grad_norm": 0.4156658351421356, + "learning_rate": 1.2214460759150506e-05, + "loss": 1.2312, + "step": 14970 + }, + { + "epoch": 4.459055455239301, + "grad_norm": 0.24402017891407013, + "learning_rate": 1.221352013014192e-05, + "loss": 1.2392, + "step": 14971 + }, + { + "epoch": 4.45935330143897, + "grad_norm": 0.3900117874145508, + "learning_rate": 1.2212579480538999e-05, + "loss": 1.2063, + "step": 14972 + }, + { + "epoch": 4.459651147638638, + "grad_norm": 0.3157013952732086, + "learning_rate": 1.2211638810350499e-05, + "loss": 1.2264, + "step": 14973 + }, + { + "epoch": 4.459948993838307, + "grad_norm": 0.34973737597465515, + "learning_rate": 1.2210698119585171e-05, + "loss": 1.2093, + "step": 14974 + }, + { + "epoch": 4.460246840037976, + "grad_norm": 0.3000805974006653, + "learning_rate": 1.2209757408251768e-05, + "loss": 1.2282, + "step": 14975 + }, + { + "epoch": 4.460544686237644, + "grad_norm": 0.25990670919418335, + "learning_rate": 1.2208816676359038e-05, + "loss": 1.2203, + "step": 14976 + }, + { + "epoch": 4.460842532437312, + "grad_norm": 0.3210603594779968, + "learning_rate": 1.2207875923915741e-05, + "loss": 1.2111, + "step": 14977 + }, + { + "epoch": 4.461140378636982, + "grad_norm": 0.25940433144569397, + "learning_rate": 1.2206935150930623e-05, + "loss": 1.2071, + "step": 14978 + }, + { + "epoch": 4.46143822483665, + "grad_norm": 0.27722057700157166, + "learning_rate": 1.220599435741244e-05, + "loss": 1.2124, + "step": 14979 + }, + { + "epoch": 4.461736071036318, + "grad_norm": 0.2738329768180847, + "learning_rate": 1.2205053543369942e-05, + "loss": 1.2147, + "step": 14980 + }, + { + "epoch": 4.4620339172359875, + "grad_norm": 0.4540187120437622, + "learning_rate": 1.2204112708811888e-05, + "loss": 1.2208, + "step": 14981 + }, + { + "epoch": 4.462331763435656, + "grad_norm": 0.4237889349460602, + "learning_rate": 1.2203171853747024e-05, + "loss": 1.2141, + "step": 14982 + }, + { + "epoch": 4.462629609635324, + "grad_norm": 0.3281431198120117, + "learning_rate": 1.220223097818411e-05, + "loss": 1.2338, + "step": 14983 + }, + { + "epoch": 4.462927455834993, + "grad_norm": 0.3441547751426697, + "learning_rate": 1.2201290082131898e-05, + "loss": 1.218, + "step": 14984 + }, + { + "epoch": 4.463225302034662, + "grad_norm": 0.36121034622192383, + "learning_rate": 1.2200349165599139e-05, + "loss": 1.215, + "step": 14985 + }, + { + "epoch": 4.46352314823433, + "grad_norm": 0.4869498908519745, + "learning_rate": 1.2199408228594591e-05, + "loss": 1.2131, + "step": 14986 + }, + { + "epoch": 4.463820994433999, + "grad_norm": 0.2633175551891327, + "learning_rate": 1.2198467271127003e-05, + "loss": 1.2085, + "step": 14987 + }, + { + "epoch": 4.464118840633668, + "grad_norm": 0.34987908601760864, + "learning_rate": 1.2197526293205138e-05, + "loss": 1.2242, + "step": 14988 + }, + { + "epoch": 4.464416686833337, + "grad_norm": 0.23947428166866302, + "learning_rate": 1.2196585294837744e-05, + "loss": 1.2235, + "step": 14989 + }, + { + "epoch": 4.464714533033005, + "grad_norm": 0.4048132300376892, + "learning_rate": 1.2195644276033578e-05, + "loss": 1.223, + "step": 14990 + }, + { + "epoch": 4.4650123792326735, + "grad_norm": 0.3032701015472412, + "learning_rate": 1.2194703236801398e-05, + "loss": 1.2376, + "step": 14991 + }, + { + "epoch": 4.465310225432343, + "grad_norm": 0.30714356899261475, + "learning_rate": 1.219376217714995e-05, + "loss": 1.2159, + "step": 14992 + }, + { + "epoch": 4.465608071632011, + "grad_norm": 0.27791735529899597, + "learning_rate": 1.2192821097088e-05, + "loss": 1.2182, + "step": 14993 + }, + { + "epoch": 4.465905917831679, + "grad_norm": 0.3147997260093689, + "learning_rate": 1.21918799966243e-05, + "loss": 1.2437, + "step": 14994 + }, + { + "epoch": 4.4662037640313486, + "grad_norm": 0.31485629081726074, + "learning_rate": 1.2190938875767606e-05, + "loss": 1.2258, + "step": 14995 + }, + { + "epoch": 4.466501610231017, + "grad_norm": 0.3440761864185333, + "learning_rate": 1.2189997734526672e-05, + "loss": 1.2163, + "step": 14996 + }, + { + "epoch": 4.466799456430685, + "grad_norm": 0.3485150635242462, + "learning_rate": 1.2189056572910256e-05, + "loss": 1.2129, + "step": 14997 + }, + { + "epoch": 4.467097302630354, + "grad_norm": 0.2978845536708832, + "learning_rate": 1.2188115390927115e-05, + "loss": 1.2229, + "step": 14998 + }, + { + "epoch": 4.467395148830023, + "grad_norm": 0.2978161871433258, + "learning_rate": 1.2187174188586004e-05, + "loss": 1.2187, + "step": 14999 + }, + { + "epoch": 4.467692995029692, + "grad_norm": 0.29847341775894165, + "learning_rate": 1.2186232965895681e-05, + "loss": 1.2375, + "step": 15000 + }, + { + "epoch": 4.467692995029692, + "eval_loss": 1.328788161277771, + "eval_runtime": 21.4624, + "eval_samples_per_second": 80.792, + "eval_steps_per_second": 5.079, + "step": 15000 + }, + { + "epoch": 4.46799084122936, + "grad_norm": 0.2802181541919708, + "learning_rate": 1.2185291722864907e-05, + "loss": 1.2094, + "step": 15001 + }, + { + "epoch": 4.468288687429029, + "grad_norm": 0.5638412237167358, + "learning_rate": 1.218435045950243e-05, + "loss": 1.2276, + "step": 15002 + }, + { + "epoch": 4.468586533628698, + "grad_norm": 0.5578691959381104, + "learning_rate": 1.2183409175817013e-05, + "loss": 1.2234, + "step": 15003 + }, + { + "epoch": 4.468884379828366, + "grad_norm": 0.3073020875453949, + "learning_rate": 1.2182467871817415e-05, + "loss": 1.2274, + "step": 15004 + }, + { + "epoch": 4.4691822260280345, + "grad_norm": 0.3942149877548218, + "learning_rate": 1.218152654751239e-05, + "loss": 1.2297, + "step": 15005 + }, + { + "epoch": 4.469480072227704, + "grad_norm": 0.29925331473350525, + "learning_rate": 1.21805852029107e-05, + "loss": 1.2161, + "step": 15006 + }, + { + "epoch": 4.469777918427372, + "grad_norm": 0.3018183410167694, + "learning_rate": 1.2179643838021098e-05, + "loss": 1.2259, + "step": 15007 + }, + { + "epoch": 4.47007576462704, + "grad_norm": 0.3541475534439087, + "learning_rate": 1.217870245285235e-05, + "loss": 1.2223, + "step": 15008 + }, + { + "epoch": 4.47037361082671, + "grad_norm": 0.3656480610370636, + "learning_rate": 1.2177761047413205e-05, + "loss": 1.2288, + "step": 15009 + }, + { + "epoch": 4.470671457026378, + "grad_norm": 0.31403958797454834, + "learning_rate": 1.2176819621712428e-05, + "loss": 1.229, + "step": 15010 + }, + { + "epoch": 4.470969303226046, + "grad_norm": 0.33523741364479065, + "learning_rate": 1.2175878175758777e-05, + "loss": 1.2113, + "step": 15011 + }, + { + "epoch": 4.4712671494257155, + "grad_norm": 0.31044355034828186, + "learning_rate": 1.2174936709561012e-05, + "loss": 1.2339, + "step": 15012 + }, + { + "epoch": 4.471564995625384, + "grad_norm": 0.30819764733314514, + "learning_rate": 1.2173995223127891e-05, + "loss": 1.2239, + "step": 15013 + }, + { + "epoch": 4.471862841825053, + "grad_norm": 0.2760671377182007, + "learning_rate": 1.217305371646817e-05, + "loss": 1.2172, + "step": 15014 + }, + { + "epoch": 4.472160688024721, + "grad_norm": 0.2517237663269043, + "learning_rate": 1.2172112189590613e-05, + "loss": 1.202, + "step": 15015 + }, + { + "epoch": 4.47245853422439, + "grad_norm": 0.3907548189163208, + "learning_rate": 1.217117064250398e-05, + "loss": 1.2289, + "step": 15016 + }, + { + "epoch": 4.472756380424059, + "grad_norm": 0.35696491599082947, + "learning_rate": 1.217022907521703e-05, + "loss": 1.2165, + "step": 15017 + }, + { + "epoch": 4.473054226623727, + "grad_norm": 0.26156413555145264, + "learning_rate": 1.2169287487738524e-05, + "loss": 1.2175, + "step": 15018 + }, + { + "epoch": 4.4733520728233955, + "grad_norm": 0.25209230184555054, + "learning_rate": 1.2168345880077222e-05, + "loss": 1.2315, + "step": 15019 + }, + { + "epoch": 4.473649919023065, + "grad_norm": 0.2818114161491394, + "learning_rate": 1.216740425224188e-05, + "loss": 1.2133, + "step": 15020 + }, + { + "epoch": 4.473947765222733, + "grad_norm": 0.26874756813049316, + "learning_rate": 1.2166462604241267e-05, + "loss": 1.2302, + "step": 15021 + }, + { + "epoch": 4.474245611422401, + "grad_norm": 0.2840188443660736, + "learning_rate": 1.2165520936084139e-05, + "loss": 1.2016, + "step": 15022 + }, + { + "epoch": 4.474543457622071, + "grad_norm": 0.2546764612197876, + "learning_rate": 1.2164579247779258e-05, + "loss": 1.2161, + "step": 15023 + }, + { + "epoch": 4.474841303821739, + "grad_norm": 0.39144718647003174, + "learning_rate": 1.2163637539335384e-05, + "loss": 1.2264, + "step": 15024 + }, + { + "epoch": 4.475139150021407, + "grad_norm": 0.3180117607116699, + "learning_rate": 1.2162695810761283e-05, + "loss": 1.2164, + "step": 15025 + }, + { + "epoch": 4.4754369962210765, + "grad_norm": 0.30262383818626404, + "learning_rate": 1.2161754062065714e-05, + "loss": 1.2206, + "step": 15026 + }, + { + "epoch": 4.475734842420745, + "grad_norm": 0.25390443205833435, + "learning_rate": 1.2160812293257436e-05, + "loss": 1.2224, + "step": 15027 + }, + { + "epoch": 4.476032688620414, + "grad_norm": 0.2694064676761627, + "learning_rate": 1.2159870504345212e-05, + "loss": 1.2105, + "step": 15028 + }, + { + "epoch": 4.476330534820082, + "grad_norm": 0.2786681056022644, + "learning_rate": 1.2158928695337811e-05, + "loss": 1.2228, + "step": 15029 + }, + { + "epoch": 4.476628381019751, + "grad_norm": 0.24426937103271484, + "learning_rate": 1.2157986866243988e-05, + "loss": 1.2198, + "step": 15030 + }, + { + "epoch": 4.47692622721942, + "grad_norm": 0.2864536941051483, + "learning_rate": 1.2157045017072509e-05, + "loss": 1.2201, + "step": 15031 + }, + { + "epoch": 4.477224073419088, + "grad_norm": 0.266191303730011, + "learning_rate": 1.2156103147832137e-05, + "loss": 1.2339, + "step": 15032 + }, + { + "epoch": 4.477521919618757, + "grad_norm": 0.3499411344528198, + "learning_rate": 1.2155161258531632e-05, + "loss": 1.2225, + "step": 15033 + }, + { + "epoch": 4.477819765818426, + "grad_norm": 0.24897105991840363, + "learning_rate": 1.215421934917976e-05, + "loss": 1.2188, + "step": 15034 + }, + { + "epoch": 4.478117612018094, + "grad_norm": 0.33457404375076294, + "learning_rate": 1.2153277419785285e-05, + "loss": 1.2324, + "step": 15035 + }, + { + "epoch": 4.478415458217762, + "grad_norm": 0.2837419807910919, + "learning_rate": 1.2152335470356968e-05, + "loss": 1.2267, + "step": 15036 + }, + { + "epoch": 4.478713304417432, + "grad_norm": 0.29611486196517944, + "learning_rate": 1.2151393500903575e-05, + "loss": 1.2239, + "step": 15037 + }, + { + "epoch": 4.4790111506171, + "grad_norm": 0.3383772671222687, + "learning_rate": 1.2150451511433868e-05, + "loss": 1.222, + "step": 15038 + }, + { + "epoch": 4.479308996816769, + "grad_norm": 0.256071001291275, + "learning_rate": 1.2149509501956613e-05, + "loss": 1.1995, + "step": 15039 + }, + { + "epoch": 4.4796068430164375, + "grad_norm": 0.3517363369464874, + "learning_rate": 1.2148567472480574e-05, + "loss": 1.2244, + "step": 15040 + }, + { + "epoch": 4.479904689216106, + "grad_norm": 0.30239713191986084, + "learning_rate": 1.2147625423014516e-05, + "loss": 1.2116, + "step": 15041 + }, + { + "epoch": 4.480202535415775, + "grad_norm": 0.34286734461784363, + "learning_rate": 1.2146683353567204e-05, + "loss": 1.2291, + "step": 15042 + }, + { + "epoch": 4.480500381615443, + "grad_norm": 0.3684154152870178, + "learning_rate": 1.2145741264147397e-05, + "loss": 1.2289, + "step": 15043 + }, + { + "epoch": 4.480798227815112, + "grad_norm": 0.34026914834976196, + "learning_rate": 1.2144799154763868e-05, + "loss": 1.2172, + "step": 15044 + }, + { + "epoch": 4.481096074014781, + "grad_norm": 0.5373833775520325, + "learning_rate": 1.214385702542538e-05, + "loss": 1.2015, + "step": 15045 + }, + { + "epoch": 4.481393920214449, + "grad_norm": 0.2383013218641281, + "learning_rate": 1.2142914876140694e-05, + "loss": 1.2112, + "step": 15046 + }, + { + "epoch": 4.481691766414118, + "grad_norm": 0.7224148511886597, + "learning_rate": 1.2141972706918583e-05, + "loss": 1.2205, + "step": 15047 + }, + { + "epoch": 4.481989612613787, + "grad_norm": 0.5347703099250793, + "learning_rate": 1.2141030517767807e-05, + "loss": 1.231, + "step": 15048 + }, + { + "epoch": 4.482287458813455, + "grad_norm": 0.4365391135215759, + "learning_rate": 1.2140088308697137e-05, + "loss": 1.2163, + "step": 15049 + }, + { + "epoch": 4.4825853050131235, + "grad_norm": 0.4629792869091034, + "learning_rate": 1.2139146079715334e-05, + "loss": 1.2422, + "step": 15050 + }, + { + "epoch": 4.482883151212793, + "grad_norm": 0.426817923784256, + "learning_rate": 1.2138203830831165e-05, + "loss": 1.2169, + "step": 15051 + }, + { + "epoch": 4.483180997412461, + "grad_norm": 0.32950761914253235, + "learning_rate": 1.2137261562053401e-05, + "loss": 1.2174, + "step": 15052 + }, + { + "epoch": 4.483478843612129, + "grad_norm": 0.506644070148468, + "learning_rate": 1.2136319273390807e-05, + "loss": 1.2284, + "step": 15053 + }, + { + "epoch": 4.4837766898117986, + "grad_norm": 0.35195106267929077, + "learning_rate": 1.2135376964852145e-05, + "loss": 1.2356, + "step": 15054 + }, + { + "epoch": 4.484074536011467, + "grad_norm": 0.3815198242664337, + "learning_rate": 1.213443463644619e-05, + "loss": 1.2255, + "step": 15055 + }, + { + "epoch": 4.484372382211136, + "grad_norm": 0.26191478967666626, + "learning_rate": 1.2133492288181703e-05, + "loss": 1.2308, + "step": 15056 + }, + { + "epoch": 4.484670228410804, + "grad_norm": 0.2946739196777344, + "learning_rate": 1.2132549920067455e-05, + "loss": 1.2274, + "step": 15057 + }, + { + "epoch": 4.484968074610473, + "grad_norm": 0.30766019225120544, + "learning_rate": 1.2131607532112213e-05, + "loss": 1.2215, + "step": 15058 + }, + { + "epoch": 4.485265920810142, + "grad_norm": 0.24756793677806854, + "learning_rate": 1.2130665124324745e-05, + "loss": 1.2109, + "step": 15059 + }, + { + "epoch": 4.48556376700981, + "grad_norm": 0.2954517602920532, + "learning_rate": 1.2129722696713817e-05, + "loss": 1.2137, + "step": 15060 + }, + { + "epoch": 4.485861613209479, + "grad_norm": 0.30444443225860596, + "learning_rate": 1.2128780249288198e-05, + "loss": 1.2045, + "step": 15061 + }, + { + "epoch": 4.486159459409148, + "grad_norm": 0.3060961961746216, + "learning_rate": 1.2127837782056659e-05, + "loss": 1.2191, + "step": 15062 + }, + { + "epoch": 4.486457305608816, + "grad_norm": 0.2736704647541046, + "learning_rate": 1.2126895295027968e-05, + "loss": 1.2448, + "step": 15063 + }, + { + "epoch": 4.4867551518084845, + "grad_norm": 0.243803009390831, + "learning_rate": 1.212595278821089e-05, + "loss": 1.2097, + "step": 15064 + }, + { + "epoch": 4.487052998008154, + "grad_norm": 0.2489195019006729, + "learning_rate": 1.2125010261614197e-05, + "loss": 1.2203, + "step": 15065 + }, + { + "epoch": 4.487350844207822, + "grad_norm": 0.3543930649757385, + "learning_rate": 1.2124067715246661e-05, + "loss": 1.2078, + "step": 15066 + }, + { + "epoch": 4.487648690407491, + "grad_norm": 0.244902104139328, + "learning_rate": 1.2123125149117043e-05, + "loss": 1.2378, + "step": 15067 + }, + { + "epoch": 4.48794653660716, + "grad_norm": 0.42561474442481995, + "learning_rate": 1.2122182563234121e-05, + "loss": 1.2123, + "step": 15068 + }, + { + "epoch": 4.488244382806828, + "grad_norm": 0.2636564075946808, + "learning_rate": 1.2121239957606661e-05, + "loss": 1.2138, + "step": 15069 + }, + { + "epoch": 4.488542229006497, + "grad_norm": 0.3436940312385559, + "learning_rate": 1.2120297332243432e-05, + "loss": 1.23, + "step": 15070 + }, + { + "epoch": 4.4888400752061655, + "grad_norm": 0.2705753743648529, + "learning_rate": 1.2119354687153204e-05, + "loss": 1.2155, + "step": 15071 + }, + { + "epoch": 4.489137921405834, + "grad_norm": 0.34441936016082764, + "learning_rate": 1.211841202234475e-05, + "loss": 1.2256, + "step": 15072 + }, + { + "epoch": 4.489435767605503, + "grad_norm": 0.24522235989570618, + "learning_rate": 1.2117469337826842e-05, + "loss": 1.2273, + "step": 15073 + }, + { + "epoch": 4.489733613805171, + "grad_norm": 0.4094565510749817, + "learning_rate": 1.211652663360824e-05, + "loss": 1.2345, + "step": 15074 + }, + { + "epoch": 4.49003146000484, + "grad_norm": 0.24970418214797974, + "learning_rate": 1.2115583909697729e-05, + "loss": 1.2257, + "step": 15075 + }, + { + "epoch": 4.490329306204509, + "grad_norm": 0.37969470024108887, + "learning_rate": 1.2114641166104074e-05, + "loss": 1.2171, + "step": 15076 + }, + { + "epoch": 4.490627152404177, + "grad_norm": 0.25172027945518494, + "learning_rate": 1.211369840283604e-05, + "loss": 1.2246, + "step": 15077 + }, + { + "epoch": 4.4909249986038455, + "grad_norm": 0.5294029712677002, + "learning_rate": 1.2112755619902408e-05, + "loss": 1.2127, + "step": 15078 + }, + { + "epoch": 4.491222844803515, + "grad_norm": 0.28505566716194153, + "learning_rate": 1.2111812817311943e-05, + "loss": 1.239, + "step": 15079 + }, + { + "epoch": 4.491520691003183, + "grad_norm": 0.41262924671173096, + "learning_rate": 1.2110869995073422e-05, + "loss": 1.2166, + "step": 15080 + }, + { + "epoch": 4.491818537202852, + "grad_norm": 0.24879197776317596, + "learning_rate": 1.2109927153195612e-05, + "loss": 1.2323, + "step": 15081 + }, + { + "epoch": 4.492116383402521, + "grad_norm": 0.5299745202064514, + "learning_rate": 1.2108984291687286e-05, + "loss": 1.2162, + "step": 15082 + }, + { + "epoch": 4.492414229602189, + "grad_norm": 0.39849525690078735, + "learning_rate": 1.2108041410557222e-05, + "loss": 1.2248, + "step": 15083 + }, + { + "epoch": 4.492712075801858, + "grad_norm": 0.43437138199806213, + "learning_rate": 1.2107098509814185e-05, + "loss": 1.2331, + "step": 15084 + }, + { + "epoch": 4.4930099220015265, + "grad_norm": 0.47554242610931396, + "learning_rate": 1.210615558946695e-05, + "loss": 1.2294, + "step": 15085 + }, + { + "epoch": 4.493307768201195, + "grad_norm": 0.28594446182250977, + "learning_rate": 1.2105212649524292e-05, + "loss": 1.2188, + "step": 15086 + }, + { + "epoch": 4.493605614400864, + "grad_norm": 0.33014607429504395, + "learning_rate": 1.210426968999498e-05, + "loss": 1.2345, + "step": 15087 + }, + { + "epoch": 4.493903460600532, + "grad_norm": 0.3126441538333893, + "learning_rate": 1.2103326710887793e-05, + "loss": 1.2024, + "step": 15088 + }, + { + "epoch": 4.494201306800201, + "grad_norm": 0.2798210382461548, + "learning_rate": 1.2102383712211499e-05, + "loss": 1.2144, + "step": 15089 + }, + { + "epoch": 4.49449915299987, + "grad_norm": 0.3776017129421234, + "learning_rate": 1.2101440693974875e-05, + "loss": 1.2151, + "step": 15090 + }, + { + "epoch": 4.494796999199538, + "grad_norm": 0.2667417824268341, + "learning_rate": 1.2100497656186692e-05, + "loss": 1.2207, + "step": 15091 + }, + { + "epoch": 4.495094845399207, + "grad_norm": 0.3946310877799988, + "learning_rate": 1.209955459885572e-05, + "loss": 1.2264, + "step": 15092 + }, + { + "epoch": 4.495392691598876, + "grad_norm": 0.25927817821502686, + "learning_rate": 1.2098611521990747e-05, + "loss": 1.2327, + "step": 15093 + }, + { + "epoch": 4.495690537798544, + "grad_norm": 0.4457034766674042, + "learning_rate": 1.2097668425600532e-05, + "loss": 1.24, + "step": 15094 + }, + { + "epoch": 4.495988383998213, + "grad_norm": 0.28256189823150635, + "learning_rate": 1.209672530969386e-05, + "loss": 1.2195, + "step": 15095 + }, + { + "epoch": 4.496286230197882, + "grad_norm": 0.4875396490097046, + "learning_rate": 1.20957821742795e-05, + "loss": 1.2132, + "step": 15096 + }, + { + "epoch": 4.49658407639755, + "grad_norm": 0.2714369297027588, + "learning_rate": 1.2094839019366229e-05, + "loss": 1.2314, + "step": 15097 + }, + { + "epoch": 4.496881922597219, + "grad_norm": 0.4154144525527954, + "learning_rate": 1.209389584496282e-05, + "loss": 1.2328, + "step": 15098 + }, + { + "epoch": 4.4971797687968875, + "grad_norm": 0.2884173095226288, + "learning_rate": 1.209295265107805e-05, + "loss": 1.2117, + "step": 15099 + }, + { + "epoch": 4.497477614996556, + "grad_norm": 0.396198570728302, + "learning_rate": 1.2092009437720696e-05, + "loss": 1.2197, + "step": 15100 + }, + { + "epoch": 4.497775461196225, + "grad_norm": 0.38598132133483887, + "learning_rate": 1.209106620489953e-05, + "loss": 1.2479, + "step": 15101 + }, + { + "epoch": 4.498073307395893, + "grad_norm": 0.40717682242393494, + "learning_rate": 1.2090122952623329e-05, + "loss": 1.235, + "step": 15102 + }, + { + "epoch": 4.498371153595562, + "grad_norm": 0.3123427629470825, + "learning_rate": 1.2089179680900869e-05, + "loss": 1.2152, + "step": 15103 + }, + { + "epoch": 4.498668999795231, + "grad_norm": 0.30639252066612244, + "learning_rate": 1.2088236389740925e-05, + "loss": 1.2282, + "step": 15104 + }, + { + "epoch": 4.498966845994899, + "grad_norm": 0.3442763090133667, + "learning_rate": 1.2087293079152277e-05, + "loss": 1.2128, + "step": 15105 + }, + { + "epoch": 4.4992646921945685, + "grad_norm": 0.32975438237190247, + "learning_rate": 1.2086349749143698e-05, + "loss": 1.2176, + "step": 15106 + }, + { + "epoch": 4.499562538394237, + "grad_norm": 0.3876637816429138, + "learning_rate": 1.2085406399723966e-05, + "loss": 1.2276, + "step": 15107 + }, + { + "epoch": 4.499860384593905, + "grad_norm": 0.2568848729133606, + "learning_rate": 1.2084463030901859e-05, + "loss": 1.2295, + "step": 15108 + }, + { + "epoch": 4.500158230793574, + "grad_norm": 0.40290164947509766, + "learning_rate": 1.208351964268615e-05, + "loss": 1.2211, + "step": 15109 + }, + { + "epoch": 4.500456076993243, + "grad_norm": 0.31851232051849365, + "learning_rate": 1.2082576235085619e-05, + "loss": 1.2003, + "step": 15110 + }, + { + "epoch": 4.500753923192911, + "grad_norm": 0.40014412999153137, + "learning_rate": 1.2081632808109043e-05, + "loss": 1.2168, + "step": 15111 + }, + { + "epoch": 4.50105176939258, + "grad_norm": 0.39033442735671997, + "learning_rate": 1.20806893617652e-05, + "loss": 1.2284, + "step": 15112 + }, + { + "epoch": 4.5013496155922486, + "grad_norm": 0.34930282831192017, + "learning_rate": 1.2079745896062864e-05, + "loss": 1.2205, + "step": 15113 + }, + { + "epoch": 4.501647461791917, + "grad_norm": 0.39152130484580994, + "learning_rate": 1.207880241101082e-05, + "loss": 1.2218, + "step": 15114 + }, + { + "epoch": 4.501945307991586, + "grad_norm": 0.29718637466430664, + "learning_rate": 1.2077858906617843e-05, + "loss": 1.2193, + "step": 15115 + }, + { + "epoch": 4.502243154191254, + "grad_norm": 0.34129148721694946, + "learning_rate": 1.2076915382892705e-05, + "loss": 1.2078, + "step": 15116 + }, + { + "epoch": 4.502541000390923, + "grad_norm": 0.3030122220516205, + "learning_rate": 1.2075971839844195e-05, + "loss": 1.2281, + "step": 15117 + }, + { + "epoch": 4.502838846590592, + "grad_norm": 0.385215699672699, + "learning_rate": 1.2075028277481084e-05, + "loss": 1.2282, + "step": 15118 + }, + { + "epoch": 4.50313669279026, + "grad_norm": 0.274476557970047, + "learning_rate": 1.2074084695812153e-05, + "loss": 1.222, + "step": 15119 + }, + { + "epoch": 4.503434538989929, + "grad_norm": 0.38454511761665344, + "learning_rate": 1.207314109484618e-05, + "loss": 1.2353, + "step": 15120 + }, + { + "epoch": 4.503732385189598, + "grad_norm": 0.2698459327220917, + "learning_rate": 1.2072197474591948e-05, + "loss": 1.2289, + "step": 15121 + }, + { + "epoch": 4.504030231389266, + "grad_norm": 0.27188873291015625, + "learning_rate": 1.2071253835058235e-05, + "loss": 1.2335, + "step": 15122 + }, + { + "epoch": 4.504328077588935, + "grad_norm": 0.28518879413604736, + "learning_rate": 1.2070310176253817e-05, + "loss": 1.2084, + "step": 15123 + }, + { + "epoch": 4.504625923788604, + "grad_norm": 0.30094584822654724, + "learning_rate": 1.2069366498187478e-05, + "loss": 1.2215, + "step": 15124 + }, + { + "epoch": 4.504923769988272, + "grad_norm": 0.3028467893600464, + "learning_rate": 1.2068422800867995e-05, + "loss": 1.2167, + "step": 15125 + }, + { + "epoch": 4.505221616187941, + "grad_norm": 0.43491336703300476, + "learning_rate": 1.2067479084304146e-05, + "loss": 1.2246, + "step": 15126 + }, + { + "epoch": 4.50551946238761, + "grad_norm": 0.24532203376293182, + "learning_rate": 1.2066535348504716e-05, + "loss": 1.2061, + "step": 15127 + }, + { + "epoch": 4.505817308587278, + "grad_norm": 0.4910340905189514, + "learning_rate": 1.2065591593478483e-05, + "loss": 1.233, + "step": 15128 + }, + { + "epoch": 4.506115154786947, + "grad_norm": 0.25325271487236023, + "learning_rate": 1.2064647819234227e-05, + "loss": 1.226, + "step": 15129 + }, + { + "epoch": 4.5064130009866155, + "grad_norm": 0.4711860418319702, + "learning_rate": 1.2063704025780732e-05, + "loss": 1.2235, + "step": 15130 + }, + { + "epoch": 4.506710847186284, + "grad_norm": 0.2528174817562103, + "learning_rate": 1.2062760213126776e-05, + "loss": 1.2285, + "step": 15131 + }, + { + "epoch": 4.507008693385953, + "grad_norm": 0.5877256393432617, + "learning_rate": 1.206181638128114e-05, + "loss": 1.215, + "step": 15132 + }, + { + "epoch": 4.507306539585621, + "grad_norm": 0.43910297751426697, + "learning_rate": 1.2060872530252605e-05, + "loss": 1.2362, + "step": 15133 + }, + { + "epoch": 4.5076043857852905, + "grad_norm": 0.2653259336948395, + "learning_rate": 1.2059928660049958e-05, + "loss": 1.2033, + "step": 15134 + }, + { + "epoch": 4.507902231984959, + "grad_norm": 0.3588211238384247, + "learning_rate": 1.2058984770681974e-05, + "loss": 1.2182, + "step": 15135 + }, + { + "epoch": 4.508200078184627, + "grad_norm": 0.26585859060287476, + "learning_rate": 1.2058040862157437e-05, + "loss": 1.2212, + "step": 15136 + }, + { + "epoch": 4.508497924384296, + "grad_norm": 0.33971595764160156, + "learning_rate": 1.2057096934485126e-05, + "loss": 1.2231, + "step": 15137 + }, + { + "epoch": 4.508795770583965, + "grad_norm": 0.2848120331764221, + "learning_rate": 1.2056152987673832e-05, + "loss": 1.2117, + "step": 15138 + }, + { + "epoch": 4.509093616783633, + "grad_norm": 0.34800732135772705, + "learning_rate": 1.2055209021732328e-05, + "loss": 1.2268, + "step": 15139 + }, + { + "epoch": 4.509391462983302, + "grad_norm": 0.3181397020816803, + "learning_rate": 1.2054265036669401e-05, + "loss": 1.21, + "step": 15140 + }, + { + "epoch": 4.509689309182971, + "grad_norm": 0.45380064845085144, + "learning_rate": 1.2053321032493835e-05, + "loss": 1.2027, + "step": 15141 + }, + { + "epoch": 4.509987155382639, + "grad_norm": 0.2743885815143585, + "learning_rate": 1.205237700921441e-05, + "loss": 1.2308, + "step": 15142 + }, + { + "epoch": 4.510285001582308, + "grad_norm": 0.3314945697784424, + "learning_rate": 1.2051432966839908e-05, + "loss": 1.2297, + "step": 15143 + }, + { + "epoch": 4.5105828477819765, + "grad_norm": 0.38072890043258667, + "learning_rate": 1.2050488905379116e-05, + "loss": 1.2173, + "step": 15144 + }, + { + "epoch": 4.510880693981646, + "grad_norm": 0.28301769495010376, + "learning_rate": 1.2049544824840815e-05, + "loss": 1.2183, + "step": 15145 + }, + { + "epoch": 4.511178540181314, + "grad_norm": 0.3803479075431824, + "learning_rate": 1.2048600725233787e-05, + "loss": 1.2137, + "step": 15146 + }, + { + "epoch": 4.511476386380982, + "grad_norm": 0.27927228808403015, + "learning_rate": 1.2047656606566822e-05, + "loss": 1.2307, + "step": 15147 + }, + { + "epoch": 4.511774232580651, + "grad_norm": 0.26439303159713745, + "learning_rate": 1.20467124688487e-05, + "loss": 1.2235, + "step": 15148 + }, + { + "epoch": 4.51207207878032, + "grad_norm": 0.3017038404941559, + "learning_rate": 1.2045768312088203e-05, + "loss": 1.2292, + "step": 15149 + }, + { + "epoch": 4.512369924979988, + "grad_norm": 0.25685378909111023, + "learning_rate": 1.2044824136294118e-05, + "loss": 1.2261, + "step": 15150 + }, + { + "epoch": 4.5126677711796574, + "grad_norm": 0.3176990747451782, + "learning_rate": 1.204387994147523e-05, + "loss": 1.2246, + "step": 15151 + }, + { + "epoch": 4.512965617379326, + "grad_norm": 0.2701776623725891, + "learning_rate": 1.2042935727640321e-05, + "loss": 1.2117, + "step": 15152 + }, + { + "epoch": 4.513263463578994, + "grad_norm": 0.41709625720977783, + "learning_rate": 1.204199149479818e-05, + "loss": 1.22, + "step": 15153 + }, + { + "epoch": 4.513561309778663, + "grad_norm": 0.34297245740890503, + "learning_rate": 1.2041047242957586e-05, + "loss": 1.224, + "step": 15154 + }, + { + "epoch": 4.513859155978332, + "grad_norm": 0.2829325795173645, + "learning_rate": 1.2040102972127332e-05, + "loss": 1.2324, + "step": 15155 + }, + { + "epoch": 4.514157002178, + "grad_norm": 0.46197330951690674, + "learning_rate": 1.2039158682316198e-05, + "loss": 1.2214, + "step": 15156 + }, + { + "epoch": 4.514454848377669, + "grad_norm": 0.27032995223999023, + "learning_rate": 1.2038214373532969e-05, + "loss": 1.2286, + "step": 15157 + }, + { + "epoch": 4.5147526945773375, + "grad_norm": 0.4450484812259674, + "learning_rate": 1.2037270045786434e-05, + "loss": 1.2122, + "step": 15158 + }, + { + "epoch": 4.515050540777006, + "grad_norm": 0.24114517867565155, + "learning_rate": 1.2036325699085375e-05, + "loss": 1.2221, + "step": 15159 + }, + { + "epoch": 4.515348386976675, + "grad_norm": 0.34915485978126526, + "learning_rate": 1.2035381333438583e-05, + "loss": 1.2177, + "step": 15160 + }, + { + "epoch": 4.515646233176343, + "grad_norm": 0.3585357367992401, + "learning_rate": 1.2034436948854842e-05, + "loss": 1.218, + "step": 15161 + }, + { + "epoch": 4.515944079376013, + "grad_norm": 0.28885915875434875, + "learning_rate": 1.2033492545342934e-05, + "loss": 1.2206, + "step": 15162 + }, + { + "epoch": 4.516241925575681, + "grad_norm": 0.30171290040016174, + "learning_rate": 1.2032548122911655e-05, + "loss": 1.2069, + "step": 15163 + }, + { + "epoch": 4.516539771775349, + "grad_norm": 0.3419775366783142, + "learning_rate": 1.2031603681569784e-05, + "loss": 1.2192, + "step": 15164 + }, + { + "epoch": 4.5168376179750185, + "grad_norm": 0.4366108775138855, + "learning_rate": 1.2030659221326114e-05, + "loss": 1.2149, + "step": 15165 + }, + { + "epoch": 4.517135464174687, + "grad_norm": 0.2641226351261139, + "learning_rate": 1.2029714742189424e-05, + "loss": 1.2189, + "step": 15166 + }, + { + "epoch": 4.517433310374355, + "grad_norm": 0.3460739254951477, + "learning_rate": 1.2028770244168508e-05, + "loss": 1.2363, + "step": 15167 + }, + { + "epoch": 4.517731156574024, + "grad_norm": 0.34574875235557556, + "learning_rate": 1.2027825727272153e-05, + "loss": 1.2231, + "step": 15168 + }, + { + "epoch": 4.518029002773693, + "grad_norm": 0.26001298427581787, + "learning_rate": 1.2026881191509143e-05, + "loss": 1.2278, + "step": 15169 + }, + { + "epoch": 4.518326848973361, + "grad_norm": 0.3446212112903595, + "learning_rate": 1.2025936636888269e-05, + "loss": 1.2251, + "step": 15170 + }, + { + "epoch": 4.51862469517303, + "grad_norm": 0.3986576497554779, + "learning_rate": 1.2024992063418316e-05, + "loss": 1.2315, + "step": 15171 + }, + { + "epoch": 4.5189225413726986, + "grad_norm": 0.26384684443473816, + "learning_rate": 1.2024047471108081e-05, + "loss": 1.2131, + "step": 15172 + }, + { + "epoch": 4.519220387572368, + "grad_norm": 0.4883301854133606, + "learning_rate": 1.202310285996634e-05, + "loss": 1.2231, + "step": 15173 + }, + { + "epoch": 4.519518233772036, + "grad_norm": 0.6936948299407959, + "learning_rate": 1.2022158230001888e-05, + "loss": 1.2297, + "step": 15174 + }, + { + "epoch": 4.519816079971704, + "grad_norm": 0.44598180055618286, + "learning_rate": 1.2021213581223512e-05, + "loss": 1.23, + "step": 15175 + }, + { + "epoch": 4.520113926171374, + "grad_norm": 0.29919061064720154, + "learning_rate": 1.2020268913640003e-05, + "loss": 1.2297, + "step": 15176 + }, + { + "epoch": 4.520411772371042, + "grad_norm": 0.2921298146247864, + "learning_rate": 1.201932422726015e-05, + "loss": 1.2063, + "step": 15177 + }, + { + "epoch": 4.52070961857071, + "grad_norm": 0.32926321029663086, + "learning_rate": 1.2018379522092737e-05, + "loss": 1.2336, + "step": 15178 + }, + { + "epoch": 4.5210074647703795, + "grad_norm": 0.3749915063381195, + "learning_rate": 1.2017434798146565e-05, + "loss": 1.2278, + "step": 15179 + }, + { + "epoch": 4.521305310970048, + "grad_norm": 0.23137181997299194, + "learning_rate": 1.201649005543041e-05, + "loss": 1.2424, + "step": 15180 + }, + { + "epoch": 4.521603157169716, + "grad_norm": 0.2518247067928314, + "learning_rate": 1.2015545293953067e-05, + "loss": 1.2187, + "step": 15181 + }, + { + "epoch": 4.521901003369385, + "grad_norm": 0.2674430012702942, + "learning_rate": 1.2014600513723333e-05, + "loss": 1.217, + "step": 15182 + }, + { + "epoch": 4.522198849569054, + "grad_norm": 0.2746109664440155, + "learning_rate": 1.201365571474999e-05, + "loss": 1.2169, + "step": 15183 + }, + { + "epoch": 4.522496695768722, + "grad_norm": 0.2592957019805908, + "learning_rate": 1.2012710897041828e-05, + "loss": 1.2334, + "step": 15184 + }, + { + "epoch": 4.522794541968391, + "grad_norm": 0.34598320722579956, + "learning_rate": 1.2011766060607641e-05, + "loss": 1.2273, + "step": 15185 + }, + { + "epoch": 4.52309238816806, + "grad_norm": 0.3186018466949463, + "learning_rate": 1.2010821205456218e-05, + "loss": 1.2471, + "step": 15186 + }, + { + "epoch": 4.523390234367728, + "grad_norm": 0.2697501480579376, + "learning_rate": 1.200987633159635e-05, + "loss": 1.227, + "step": 15187 + }, + { + "epoch": 4.523688080567397, + "grad_norm": 0.29812702536582947, + "learning_rate": 1.200893143903683e-05, + "loss": 1.2237, + "step": 15188 + }, + { + "epoch": 4.5239859267670655, + "grad_norm": 0.2813689410686493, + "learning_rate": 1.2007986527786449e-05, + "loss": 1.2272, + "step": 15189 + }, + { + "epoch": 4.524283772966735, + "grad_norm": 0.3204902410507202, + "learning_rate": 1.2007041597853995e-05, + "loss": 1.2092, + "step": 15190 + }, + { + "epoch": 4.524581619166403, + "grad_norm": 0.3014407455921173, + "learning_rate": 1.200609664924826e-05, + "loss": 1.2209, + "step": 15191 + }, + { + "epoch": 4.524879465366071, + "grad_norm": 0.27222299575805664, + "learning_rate": 1.2005151681978038e-05, + "loss": 1.2063, + "step": 15192 + }, + { + "epoch": 4.5251773115657405, + "grad_norm": 0.2536955177783966, + "learning_rate": 1.2004206696052119e-05, + "loss": 1.2309, + "step": 15193 + }, + { + "epoch": 4.525475157765409, + "grad_norm": 0.27589666843414307, + "learning_rate": 1.2003261691479298e-05, + "loss": 1.2402, + "step": 15194 + }, + { + "epoch": 4.525773003965077, + "grad_norm": 0.38982024788856506, + "learning_rate": 1.2002316668268364e-05, + "loss": 1.208, + "step": 15195 + }, + { + "epoch": 4.526070850164746, + "grad_norm": 0.40475478768348694, + "learning_rate": 1.2001371626428111e-05, + "loss": 1.2263, + "step": 15196 + }, + { + "epoch": 4.526368696364415, + "grad_norm": 0.24196231365203857, + "learning_rate": 1.200042656596733e-05, + "loss": 1.2229, + "step": 15197 + }, + { + "epoch": 4.526666542564083, + "grad_norm": 0.47569191455841064, + "learning_rate": 1.1999481486894813e-05, + "loss": 1.2344, + "step": 15198 + }, + { + "epoch": 4.526964388763752, + "grad_norm": 0.5380421280860901, + "learning_rate": 1.199853638921936e-05, + "loss": 1.2235, + "step": 15199 + }, + { + "epoch": 4.527262234963421, + "grad_norm": 0.3004697561264038, + "learning_rate": 1.1997591272949754e-05, + "loss": 1.215, + "step": 15200 + }, + { + "epoch": 4.52756008116309, + "grad_norm": 0.6487541794776917, + "learning_rate": 1.1996646138094794e-05, + "loss": 1.2185, + "step": 15201 + }, + { + "epoch": 4.527857927362758, + "grad_norm": 0.2904670536518097, + "learning_rate": 1.1995700984663275e-05, + "loss": 1.2232, + "step": 15202 + }, + { + "epoch": 4.5281557735624265, + "grad_norm": 0.6748509407043457, + "learning_rate": 1.1994755812663985e-05, + "loss": 1.2239, + "step": 15203 + }, + { + "epoch": 4.528453619762096, + "grad_norm": 0.5291069746017456, + "learning_rate": 1.199381062210572e-05, + "loss": 1.2232, + "step": 15204 + }, + { + "epoch": 4.528751465961764, + "grad_norm": 0.47623565793037415, + "learning_rate": 1.1992865412997279e-05, + "loss": 1.2215, + "step": 15205 + }, + { + "epoch": 4.529049312161432, + "grad_norm": 0.38587868213653564, + "learning_rate": 1.1991920185347452e-05, + "loss": 1.2156, + "step": 15206 + }, + { + "epoch": 4.529347158361102, + "grad_norm": 0.3916699290275574, + "learning_rate": 1.199097493916503e-05, + "loss": 1.2157, + "step": 15207 + }, + { + "epoch": 4.52964500456077, + "grad_norm": 0.2876312732696533, + "learning_rate": 1.1990029674458812e-05, + "loss": 1.2226, + "step": 15208 + }, + { + "epoch": 4.529942850760438, + "grad_norm": 0.37393510341644287, + "learning_rate": 1.1989084391237591e-05, + "loss": 1.2319, + "step": 15209 + }, + { + "epoch": 4.5302406969601074, + "grad_norm": 0.301517516374588, + "learning_rate": 1.1988139089510162e-05, + "loss": 1.2364, + "step": 15210 + }, + { + "epoch": 4.530538543159776, + "grad_norm": 0.28969547152519226, + "learning_rate": 1.198719376928532e-05, + "loss": 1.22, + "step": 15211 + }, + { + "epoch": 4.530836389359445, + "grad_norm": 0.2703687250614166, + "learning_rate": 1.1986248430571858e-05, + "loss": 1.1986, + "step": 15212 + }, + { + "epoch": 4.531134235559113, + "grad_norm": 0.27715325355529785, + "learning_rate": 1.1985303073378578e-05, + "loss": 1.1986, + "step": 15213 + }, + { + "epoch": 4.531432081758782, + "grad_norm": 0.2817557752132416, + "learning_rate": 1.198435769771427e-05, + "loss": 1.2354, + "step": 15214 + }, + { + "epoch": 4.53172992795845, + "grad_norm": 0.3252423107624054, + "learning_rate": 1.1983412303587729e-05, + "loss": 1.2324, + "step": 15215 + }, + { + "epoch": 4.532027774158119, + "grad_norm": 0.26138994097709656, + "learning_rate": 1.1982466891007753e-05, + "loss": 1.2306, + "step": 15216 + }, + { + "epoch": 4.5323256203577875, + "grad_norm": 0.3477913737297058, + "learning_rate": 1.1981521459983137e-05, + "loss": 1.1917, + "step": 15217 + }, + { + "epoch": 4.532623466557457, + "grad_norm": 0.2689266800880432, + "learning_rate": 1.1980576010522678e-05, + "loss": 1.2104, + "step": 15218 + }, + { + "epoch": 4.532921312757125, + "grad_norm": 0.44335079193115234, + "learning_rate": 1.1979630542635173e-05, + "loss": 1.2249, + "step": 15219 + }, + { + "epoch": 4.533219158956793, + "grad_norm": 0.2719641625881195, + "learning_rate": 1.1978685056329417e-05, + "loss": 1.2148, + "step": 15220 + }, + { + "epoch": 4.533517005156463, + "grad_norm": 0.3772193193435669, + "learning_rate": 1.1977739551614205e-05, + "loss": 1.2141, + "step": 15221 + }, + { + "epoch": 4.533814851356131, + "grad_norm": 0.3737548291683197, + "learning_rate": 1.1976794028498338e-05, + "loss": 1.2214, + "step": 15222 + }, + { + "epoch": 4.534112697555799, + "grad_norm": 0.31302064657211304, + "learning_rate": 1.1975848486990613e-05, + "loss": 1.213, + "step": 15223 + }, + { + "epoch": 4.5344105437554685, + "grad_norm": 0.2857608497142792, + "learning_rate": 1.1974902927099826e-05, + "loss": 1.2186, + "step": 15224 + }, + { + "epoch": 4.534708389955137, + "grad_norm": 0.2496808022260666, + "learning_rate": 1.1973957348834771e-05, + "loss": 1.2274, + "step": 15225 + }, + { + "epoch": 4.535006236154805, + "grad_norm": 0.3113750219345093, + "learning_rate": 1.1973011752204248e-05, + "loss": 1.2189, + "step": 15226 + }, + { + "epoch": 4.535304082354474, + "grad_norm": 0.2819344103336334, + "learning_rate": 1.1972066137217056e-05, + "loss": 1.2182, + "step": 15227 + }, + { + "epoch": 4.535601928554143, + "grad_norm": 0.34998559951782227, + "learning_rate": 1.197112050388199e-05, + "loss": 1.2169, + "step": 15228 + }, + { + "epoch": 4.535899774753812, + "grad_norm": 0.3272688090801239, + "learning_rate": 1.1970174852207853e-05, + "loss": 1.2148, + "step": 15229 + }, + { + "epoch": 4.53619762095348, + "grad_norm": 0.28262338042259216, + "learning_rate": 1.196922918220344e-05, + "loss": 1.2253, + "step": 15230 + }, + { + "epoch": 4.5364954671531486, + "grad_norm": 0.3195551335811615, + "learning_rate": 1.1968283493877548e-05, + "loss": 1.2287, + "step": 15231 + }, + { + "epoch": 4.536793313352818, + "grad_norm": 0.34402960538864136, + "learning_rate": 1.1967337787238977e-05, + "loss": 1.2108, + "step": 15232 + }, + { + "epoch": 4.537091159552486, + "grad_norm": 0.28968119621276855, + "learning_rate": 1.1966392062296528e-05, + "loss": 1.2332, + "step": 15233 + }, + { + "epoch": 4.537389005752154, + "grad_norm": 0.3309728503227234, + "learning_rate": 1.1965446319058995e-05, + "loss": 1.2296, + "step": 15234 + }, + { + "epoch": 4.537686851951824, + "grad_norm": 0.5360700488090515, + "learning_rate": 1.196450055753518e-05, + "loss": 1.2272, + "step": 15235 + }, + { + "epoch": 4.537984698151492, + "grad_norm": 0.2983231246471405, + "learning_rate": 1.1963554777733886e-05, + "loss": 1.2045, + "step": 15236 + }, + { + "epoch": 4.53828254435116, + "grad_norm": 0.645355224609375, + "learning_rate": 1.1962608979663907e-05, + "loss": 1.2208, + "step": 15237 + }, + { + "epoch": 4.5385803905508295, + "grad_norm": 0.7613120675086975, + "learning_rate": 1.1961663163334042e-05, + "loss": 1.2237, + "step": 15238 + }, + { + "epoch": 4.538878236750498, + "grad_norm": 0.2756083607673645, + "learning_rate": 1.1960717328753093e-05, + "loss": 1.2194, + "step": 15239 + }, + { + "epoch": 4.539176082950167, + "grad_norm": 0.5473653674125671, + "learning_rate": 1.1959771475929864e-05, + "loss": 1.2209, + "step": 15240 + }, + { + "epoch": 4.539473929149835, + "grad_norm": 0.34889474511146545, + "learning_rate": 1.1958825604873148e-05, + "loss": 1.2211, + "step": 15241 + }, + { + "epoch": 4.539771775349504, + "grad_norm": 0.5132902264595032, + "learning_rate": 1.1957879715591749e-05, + "loss": 1.2254, + "step": 15242 + }, + { + "epoch": 4.540069621549173, + "grad_norm": 0.27200672030448914, + "learning_rate": 1.195693380809447e-05, + "loss": 1.2166, + "step": 15243 + }, + { + "epoch": 4.540367467748841, + "grad_norm": 0.3609423339366913, + "learning_rate": 1.1955987882390101e-05, + "loss": 1.2012, + "step": 15244 + }, + { + "epoch": 4.54066531394851, + "grad_norm": 0.3313136100769043, + "learning_rate": 1.1955041938487452e-05, + "loss": 1.2305, + "step": 15245 + }, + { + "epoch": 4.540963160148179, + "grad_norm": 0.28946390748023987, + "learning_rate": 1.1954095976395323e-05, + "loss": 1.215, + "step": 15246 + }, + { + "epoch": 4.541261006347847, + "grad_norm": 0.4484001696109772, + "learning_rate": 1.1953149996122518e-05, + "loss": 1.2168, + "step": 15247 + }, + { + "epoch": 4.5415588525475155, + "grad_norm": 0.3039822280406952, + "learning_rate": 1.1952203997677832e-05, + "loss": 1.2304, + "step": 15248 + }, + { + "epoch": 4.541856698747185, + "grad_norm": 0.3786887228488922, + "learning_rate": 1.1951257981070068e-05, + "loss": 1.2292, + "step": 15249 + }, + { + "epoch": 4.542154544946853, + "grad_norm": 0.2886325716972351, + "learning_rate": 1.1950311946308029e-05, + "loss": 1.2207, + "step": 15250 + }, + { + "epoch": 4.542452391146521, + "grad_norm": 0.3433248996734619, + "learning_rate": 1.1949365893400515e-05, + "loss": 1.2053, + "step": 15251 + }, + { + "epoch": 4.5427502373461905, + "grad_norm": 0.2520977854728699, + "learning_rate": 1.1948419822356332e-05, + "loss": 1.2064, + "step": 15252 + }, + { + "epoch": 4.543048083545859, + "grad_norm": 0.30309560894966125, + "learning_rate": 1.1947473733184275e-05, + "loss": 1.2362, + "step": 15253 + }, + { + "epoch": 4.543345929745527, + "grad_norm": 0.3005962371826172, + "learning_rate": 1.1946527625893159e-05, + "loss": 1.2167, + "step": 15254 + }, + { + "epoch": 4.543643775945196, + "grad_norm": 0.329084575176239, + "learning_rate": 1.1945581500491771e-05, + "loss": 1.2235, + "step": 15255 + }, + { + "epoch": 4.543941622144865, + "grad_norm": 0.2776544392108917, + "learning_rate": 1.1944635356988924e-05, + "loss": 1.2192, + "step": 15256 + }, + { + "epoch": 4.544239468344534, + "grad_norm": 0.2981942594051361, + "learning_rate": 1.1943689195393415e-05, + "loss": 1.2048, + "step": 15257 + }, + { + "epoch": 4.544537314544202, + "grad_norm": 0.33692529797554016, + "learning_rate": 1.1942743015714051e-05, + "loss": 1.235, + "step": 15258 + }, + { + "epoch": 4.544835160743871, + "grad_norm": 0.2756461203098297, + "learning_rate": 1.1941796817959634e-05, + "loss": 1.215, + "step": 15259 + }, + { + "epoch": 4.54513300694354, + "grad_norm": 0.42082613706588745, + "learning_rate": 1.1940850602138967e-05, + "loss": 1.213, + "step": 15260 + }, + { + "epoch": 4.545430853143208, + "grad_norm": 0.48097655177116394, + "learning_rate": 1.1939904368260855e-05, + "loss": 1.2113, + "step": 15261 + }, + { + "epoch": 4.5457286993428765, + "grad_norm": 0.3191584348678589, + "learning_rate": 1.1938958116334099e-05, + "loss": 1.2366, + "step": 15262 + }, + { + "epoch": 4.546026545542546, + "grad_norm": 0.620833694934845, + "learning_rate": 1.1938011846367503e-05, + "loss": 1.2237, + "step": 15263 + }, + { + "epoch": 4.546324391742214, + "grad_norm": 0.3137511909008026, + "learning_rate": 1.1937065558369876e-05, + "loss": 1.2284, + "step": 15264 + }, + { + "epoch": 4.546622237941882, + "grad_norm": 0.9560343027114868, + "learning_rate": 1.1936119252350013e-05, + "loss": 1.218, + "step": 15265 + }, + { + "epoch": 4.546920084141552, + "grad_norm": 0.6752145886421204, + "learning_rate": 1.1935172928316727e-05, + "loss": 1.2355, + "step": 15266 + }, + { + "epoch": 4.54721793034122, + "grad_norm": 0.6256542205810547, + "learning_rate": 1.193422658627882e-05, + "loss": 1.2112, + "step": 15267 + }, + { + "epoch": 4.547515776540889, + "grad_norm": 0.38072991371154785, + "learning_rate": 1.193328022624509e-05, + "loss": 1.2138, + "step": 15268 + }, + { + "epoch": 4.5478136227405574, + "grad_norm": 0.6567838191986084, + "learning_rate": 1.1932333848224351e-05, + "loss": 1.2129, + "step": 15269 + }, + { + "epoch": 4.548111468940226, + "grad_norm": 0.266481876373291, + "learning_rate": 1.1931387452225405e-05, + "loss": 1.2063, + "step": 15270 + }, + { + "epoch": 4.548409315139895, + "grad_norm": 0.38249945640563965, + "learning_rate": 1.1930441038257057e-05, + "loss": 1.2198, + "step": 15271 + }, + { + "epoch": 4.548707161339563, + "grad_norm": 0.3684477210044861, + "learning_rate": 1.1929494606328112e-05, + "loss": 1.2238, + "step": 15272 + }, + { + "epoch": 4.549005007539232, + "grad_norm": 0.2897947430610657, + "learning_rate": 1.1928548156447372e-05, + "loss": 1.2036, + "step": 15273 + }, + { + "epoch": 4.549302853738901, + "grad_norm": 0.3501741290092468, + "learning_rate": 1.192760168862365e-05, + "loss": 1.2306, + "step": 15274 + }, + { + "epoch": 4.549600699938569, + "grad_norm": 0.3471895456314087, + "learning_rate": 1.1926655202865747e-05, + "loss": 1.2336, + "step": 15275 + }, + { + "epoch": 4.5498985461382375, + "grad_norm": 0.3095743656158447, + "learning_rate": 1.1925708699182467e-05, + "loss": 1.2175, + "step": 15276 + }, + { + "epoch": 4.550196392337907, + "grad_norm": 0.28557077050209045, + "learning_rate": 1.1924762177582623e-05, + "loss": 1.2158, + "step": 15277 + }, + { + "epoch": 4.550494238537575, + "grad_norm": 0.2939230501651764, + "learning_rate": 1.1923815638075019e-05, + "loss": 1.2278, + "step": 15278 + }, + { + "epoch": 4.550792084737244, + "grad_norm": 0.322860985994339, + "learning_rate": 1.1922869080668456e-05, + "loss": 1.2269, + "step": 15279 + }, + { + "epoch": 4.551089930936913, + "grad_norm": 0.2795043885707855, + "learning_rate": 1.1921922505371744e-05, + "loss": 1.218, + "step": 15280 + }, + { + "epoch": 4.551387777136581, + "grad_norm": 0.2841345965862274, + "learning_rate": 1.1920975912193692e-05, + "loss": 1.2361, + "step": 15281 + }, + { + "epoch": 4.551685623336249, + "grad_norm": 0.27874767780303955, + "learning_rate": 1.1920029301143105e-05, + "loss": 1.216, + "step": 15282 + }, + { + "epoch": 4.5519834695359185, + "grad_norm": 0.285792738199234, + "learning_rate": 1.191908267222879e-05, + "loss": 1.2222, + "step": 15283 + }, + { + "epoch": 4.552281315735587, + "grad_norm": 0.26173120737075806, + "learning_rate": 1.191813602545956e-05, + "loss": 1.2155, + "step": 15284 + }, + { + "epoch": 4.552579161935256, + "grad_norm": 0.2594122290611267, + "learning_rate": 1.1917189360844211e-05, + "loss": 1.233, + "step": 15285 + }, + { + "epoch": 4.552877008134924, + "grad_norm": 0.3487972319126129, + "learning_rate": 1.1916242678391557e-05, + "loss": 1.2162, + "step": 15286 + }, + { + "epoch": 4.553174854334593, + "grad_norm": 0.30598223209381104, + "learning_rate": 1.1915295978110409e-05, + "loss": 1.2151, + "step": 15287 + }, + { + "epoch": 4.553472700534262, + "grad_norm": 0.3238547444343567, + "learning_rate": 1.191434926000957e-05, + "loss": 1.2077, + "step": 15288 + }, + { + "epoch": 4.55377054673393, + "grad_norm": 0.282140851020813, + "learning_rate": 1.191340252409785e-05, + "loss": 1.2222, + "step": 15289 + }, + { + "epoch": 4.5540683929335986, + "grad_norm": 0.37073850631713867, + "learning_rate": 1.191245577038406e-05, + "loss": 1.2125, + "step": 15290 + }, + { + "epoch": 4.554366239133268, + "grad_norm": 0.30525949597358704, + "learning_rate": 1.1911508998877001e-05, + "loss": 1.2221, + "step": 15291 + }, + { + "epoch": 4.554664085332936, + "grad_norm": 0.34771493077278137, + "learning_rate": 1.191056220958549e-05, + "loss": 1.2098, + "step": 15292 + }, + { + "epoch": 4.554961931532604, + "grad_norm": 0.33122164011001587, + "learning_rate": 1.190961540251833e-05, + "loss": 1.2249, + "step": 15293 + }, + { + "epoch": 4.555259777732274, + "grad_norm": 0.2992601692676544, + "learning_rate": 1.1908668577684335e-05, + "loss": 1.2267, + "step": 15294 + }, + { + "epoch": 4.555557623931942, + "grad_norm": 0.28742265701293945, + "learning_rate": 1.190772173509231e-05, + "loss": 1.216, + "step": 15295 + }, + { + "epoch": 4.555855470131611, + "grad_norm": 0.29658302664756775, + "learning_rate": 1.1906774874751065e-05, + "loss": 1.2213, + "step": 15296 + }, + { + "epoch": 4.5561533163312795, + "grad_norm": 0.27233144640922546, + "learning_rate": 1.1905827996669413e-05, + "loss": 1.2107, + "step": 15297 + }, + { + "epoch": 4.556451162530948, + "grad_norm": 0.3032150864601135, + "learning_rate": 1.1904881100856159e-05, + "loss": 1.2444, + "step": 15298 + }, + { + "epoch": 4.556749008730617, + "grad_norm": 0.304830402135849, + "learning_rate": 1.1903934187320112e-05, + "loss": 1.2176, + "step": 15299 + }, + { + "epoch": 4.557046854930285, + "grad_norm": 0.3461590111255646, + "learning_rate": 1.1902987256070087e-05, + "loss": 1.2157, + "step": 15300 + }, + { + "epoch": 4.557344701129954, + "grad_norm": 0.2693634033203125, + "learning_rate": 1.1902040307114892e-05, + "loss": 1.2154, + "step": 15301 + }, + { + "epoch": 4.557642547329623, + "grad_norm": 0.4794714152812958, + "learning_rate": 1.1901093340463339e-05, + "loss": 1.2279, + "step": 15302 + }, + { + "epoch": 4.557940393529291, + "grad_norm": 0.3167456388473511, + "learning_rate": 1.1900146356124234e-05, + "loss": 1.2193, + "step": 15303 + }, + { + "epoch": 4.55823823972896, + "grad_norm": 0.3777005672454834, + "learning_rate": 1.1899199354106386e-05, + "loss": 1.2167, + "step": 15304 + }, + { + "epoch": 4.558536085928629, + "grad_norm": 0.4013977646827698, + "learning_rate": 1.1898252334418616e-05, + "loss": 1.207, + "step": 15305 + }, + { + "epoch": 4.558833932128297, + "grad_norm": 0.2669907212257385, + "learning_rate": 1.1897305297069727e-05, + "loss": 1.2219, + "step": 15306 + }, + { + "epoch": 4.559131778327966, + "grad_norm": 0.33641159534454346, + "learning_rate": 1.189635824206853e-05, + "loss": 1.2193, + "step": 15307 + }, + { + "epoch": 4.559429624527635, + "grad_norm": 0.2593099772930145, + "learning_rate": 1.1895411169423845e-05, + "loss": 1.2167, + "step": 15308 + }, + { + "epoch": 4.559727470727303, + "grad_norm": 0.2861332595348358, + "learning_rate": 1.1894464079144467e-05, + "loss": 1.2154, + "step": 15309 + }, + { + "epoch": 4.560025316926972, + "grad_norm": 0.25847122073173523, + "learning_rate": 1.1893516971239224e-05, + "loss": 1.2236, + "step": 15310 + }, + { + "epoch": 4.5603231631266405, + "grad_norm": 0.3084351122379303, + "learning_rate": 1.189256984571692e-05, + "loss": 1.2022, + "step": 15311 + }, + { + "epoch": 4.560621009326309, + "grad_norm": 0.2561310827732086, + "learning_rate": 1.1891622702586369e-05, + "loss": 1.2329, + "step": 15312 + }, + { + "epoch": 4.560918855525978, + "grad_norm": 0.3563559353351593, + "learning_rate": 1.189067554185638e-05, + "loss": 1.2152, + "step": 15313 + }, + { + "epoch": 4.561216701725646, + "grad_norm": 0.3996835947036743, + "learning_rate": 1.1889728363535769e-05, + "loss": 1.2264, + "step": 15314 + }, + { + "epoch": 4.561514547925315, + "grad_norm": 0.2627658247947693, + "learning_rate": 1.1888781167633345e-05, + "loss": 1.2226, + "step": 15315 + }, + { + "epoch": 4.561812394124984, + "grad_norm": 0.27959635853767395, + "learning_rate": 1.1887833954157924e-05, + "loss": 1.2228, + "step": 15316 + }, + { + "epoch": 4.562110240324652, + "grad_norm": 0.263959139585495, + "learning_rate": 1.1886886723118317e-05, + "loss": 1.2208, + "step": 15317 + }, + { + "epoch": 4.562408086524321, + "grad_norm": 0.3678414523601532, + "learning_rate": 1.1885939474523337e-05, + "loss": 1.2181, + "step": 15318 + }, + { + "epoch": 4.56270593272399, + "grad_norm": 0.33446651697158813, + "learning_rate": 1.1884992208381798e-05, + "loss": 1.2123, + "step": 15319 + }, + { + "epoch": 4.563003778923658, + "grad_norm": 0.41127288341522217, + "learning_rate": 1.1884044924702511e-05, + "loss": 1.2209, + "step": 15320 + }, + { + "epoch": 4.5633016251233265, + "grad_norm": 0.5332489609718323, + "learning_rate": 1.1883097623494293e-05, + "loss": 1.2251, + "step": 15321 + }, + { + "epoch": 4.563599471322996, + "grad_norm": 0.2750576138496399, + "learning_rate": 1.1882150304765954e-05, + "loss": 1.2172, + "step": 15322 + }, + { + "epoch": 4.563897317522664, + "grad_norm": 0.3738904297351837, + "learning_rate": 1.1881202968526311e-05, + "loss": 1.2118, + "step": 15323 + }, + { + "epoch": 4.564195163722333, + "grad_norm": 0.2758297324180603, + "learning_rate": 1.1880255614784173e-05, + "loss": 1.2184, + "step": 15324 + }, + { + "epoch": 4.564493009922002, + "grad_norm": 0.4255402386188507, + "learning_rate": 1.1879308243548363e-05, + "loss": 1.2258, + "step": 15325 + }, + { + "epoch": 4.56479085612167, + "grad_norm": 0.28178808093070984, + "learning_rate": 1.1878360854827683e-05, + "loss": 1.2376, + "step": 15326 + }, + { + "epoch": 4.565088702321339, + "grad_norm": 0.29741108417510986, + "learning_rate": 1.1877413448630954e-05, + "loss": 1.2073, + "step": 15327 + }, + { + "epoch": 4.5653865485210074, + "grad_norm": 0.3053363561630249, + "learning_rate": 1.1876466024966993e-05, + "loss": 1.204, + "step": 15328 + }, + { + "epoch": 4.565684394720676, + "grad_norm": 0.2684979736804962, + "learning_rate": 1.1875518583844614e-05, + "loss": 1.2124, + "step": 15329 + }, + { + "epoch": 4.565982240920345, + "grad_norm": 0.25791966915130615, + "learning_rate": 1.187457112527263e-05, + "loss": 1.2185, + "step": 15330 + }, + { + "epoch": 4.566280087120013, + "grad_norm": 0.298246830701828, + "learning_rate": 1.1873623649259853e-05, + "loss": 1.2241, + "step": 15331 + }, + { + "epoch": 4.566577933319682, + "grad_norm": 0.26753464341163635, + "learning_rate": 1.1872676155815102e-05, + "loss": 1.2097, + "step": 15332 + }, + { + "epoch": 4.566875779519351, + "grad_norm": 0.32827845215797424, + "learning_rate": 1.1871728644947192e-05, + "loss": 1.2149, + "step": 15333 + }, + { + "epoch": 4.567173625719019, + "grad_norm": 0.5097824931144714, + "learning_rate": 1.1870781116664937e-05, + "loss": 1.196, + "step": 15334 + }, + { + "epoch": 4.567471471918688, + "grad_norm": 0.2686941623687744, + "learning_rate": 1.1869833570977156e-05, + "loss": 1.1954, + "step": 15335 + }, + { + "epoch": 4.567769318118357, + "grad_norm": 0.4071314334869385, + "learning_rate": 1.1868886007892664e-05, + "loss": 1.2166, + "step": 15336 + }, + { + "epoch": 4.568067164318025, + "grad_norm": 0.26355648040771484, + "learning_rate": 1.1867938427420273e-05, + "loss": 1.2242, + "step": 15337 + }, + { + "epoch": 4.568365010517694, + "grad_norm": 0.355571448802948, + "learning_rate": 1.1866990829568803e-05, + "loss": 1.2085, + "step": 15338 + }, + { + "epoch": 4.568662856717363, + "grad_norm": 0.2709072530269623, + "learning_rate": 1.1866043214347066e-05, + "loss": 1.2248, + "step": 15339 + }, + { + "epoch": 4.568960702917031, + "grad_norm": 0.44759824872016907, + "learning_rate": 1.1865095581763886e-05, + "loss": 1.2345, + "step": 15340 + }, + { + "epoch": 4.5692585491167, + "grad_norm": 0.29392242431640625, + "learning_rate": 1.1864147931828075e-05, + "loss": 1.2249, + "step": 15341 + }, + { + "epoch": 4.5695563953163685, + "grad_norm": 0.3723401725292206, + "learning_rate": 1.1863200264548449e-05, + "loss": 1.2275, + "step": 15342 + }, + { + "epoch": 4.569854241516037, + "grad_norm": 0.2689083218574524, + "learning_rate": 1.1862252579933827e-05, + "loss": 1.2123, + "step": 15343 + }, + { + "epoch": 4.570152087715706, + "grad_norm": 0.3089952766895294, + "learning_rate": 1.1861304877993025e-05, + "loss": 1.2261, + "step": 15344 + }, + { + "epoch": 4.570449933915374, + "grad_norm": 0.27428343892097473, + "learning_rate": 1.1860357158734858e-05, + "loss": 1.2239, + "step": 15345 + }, + { + "epoch": 4.570747780115044, + "grad_norm": 0.3443639576435089, + "learning_rate": 1.1859409422168151e-05, + "loss": 1.2121, + "step": 15346 + }, + { + "epoch": 4.571045626314712, + "grad_norm": 0.2563531696796417, + "learning_rate": 1.1858461668301713e-05, + "loss": 1.2194, + "step": 15347 + }, + { + "epoch": 4.57134347251438, + "grad_norm": 0.48409304022789, + "learning_rate": 1.1857513897144369e-05, + "loss": 1.229, + "step": 15348 + }, + { + "epoch": 4.5716413187140486, + "grad_norm": 0.31136196851730347, + "learning_rate": 1.1856566108704932e-05, + "loss": 1.2239, + "step": 15349 + }, + { + "epoch": 4.571939164913718, + "grad_norm": 0.35811367630958557, + "learning_rate": 1.1855618302992218e-05, + "loss": 1.2027, + "step": 15350 + }, + { + "epoch": 4.572237011113386, + "grad_norm": 0.2767625153064728, + "learning_rate": 1.1854670480015053e-05, + "loss": 1.2293, + "step": 15351 + }, + { + "epoch": 4.572534857313055, + "grad_norm": 0.3490542471408844, + "learning_rate": 1.185372263978225e-05, + "loss": 1.2288, + "step": 15352 + }, + { + "epoch": 4.572832703512724, + "grad_norm": 0.28721633553504944, + "learning_rate": 1.1852774782302632e-05, + "loss": 1.2203, + "step": 15353 + }, + { + "epoch": 4.573130549712392, + "grad_norm": 0.308515727519989, + "learning_rate": 1.1851826907585012e-05, + "loss": 1.2232, + "step": 15354 + }, + { + "epoch": 4.573428395912061, + "grad_norm": 0.304751992225647, + "learning_rate": 1.185087901563821e-05, + "loss": 1.2392, + "step": 15355 + }, + { + "epoch": 4.5737262421117295, + "grad_norm": 0.2567261755466461, + "learning_rate": 1.1849931106471048e-05, + "loss": 1.2009, + "step": 15356 + }, + { + "epoch": 4.574024088311398, + "grad_norm": 0.2771204113960266, + "learning_rate": 1.1848983180092343e-05, + "loss": 1.2224, + "step": 15357 + }, + { + "epoch": 4.574321934511067, + "grad_norm": 0.2839631736278534, + "learning_rate": 1.1848035236510918e-05, + "loss": 1.2296, + "step": 15358 + }, + { + "epoch": 4.574619780710735, + "grad_norm": 0.264902800321579, + "learning_rate": 1.1847087275735589e-05, + "loss": 1.2084, + "step": 15359 + }, + { + "epoch": 4.574917626910404, + "grad_norm": 0.28315335512161255, + "learning_rate": 1.1846139297775176e-05, + "loss": 1.2227, + "step": 15360 + }, + { + "epoch": 4.575215473110073, + "grad_norm": 0.3367712199687958, + "learning_rate": 1.18451913026385e-05, + "loss": 1.2345, + "step": 15361 + }, + { + "epoch": 4.575513319309741, + "grad_norm": 0.24982547760009766, + "learning_rate": 1.184424329033438e-05, + "loss": 1.2131, + "step": 15362 + }, + { + "epoch": 4.5758111655094105, + "grad_norm": 0.42404788732528687, + "learning_rate": 1.1843295260871637e-05, + "loss": 1.2382, + "step": 15363 + }, + { + "epoch": 4.576109011709079, + "grad_norm": 0.3827584385871887, + "learning_rate": 1.1842347214259092e-05, + "loss": 1.2157, + "step": 15364 + }, + { + "epoch": 4.576406857908747, + "grad_norm": 0.388635516166687, + "learning_rate": 1.1841399150505561e-05, + "loss": 1.2306, + "step": 15365 + }, + { + "epoch": 4.576704704108416, + "grad_norm": 0.7678462266921997, + "learning_rate": 1.1840451069619871e-05, + "loss": 1.2196, + "step": 15366 + }, + { + "epoch": 4.577002550308085, + "grad_norm": 0.3733859062194824, + "learning_rate": 1.1839502971610841e-05, + "loss": 1.2138, + "step": 15367 + }, + { + "epoch": 4.577300396507753, + "grad_norm": 0.5718566179275513, + "learning_rate": 1.1838554856487288e-05, + "loss": 1.2281, + "step": 15368 + }, + { + "epoch": 4.577598242707422, + "grad_norm": 0.4892123341560364, + "learning_rate": 1.1837606724258037e-05, + "loss": 1.2249, + "step": 15369 + }, + { + "epoch": 4.5778960889070905, + "grad_norm": 0.4518187940120697, + "learning_rate": 1.1836658574931912e-05, + "loss": 1.2234, + "step": 15370 + }, + { + "epoch": 4.578193935106759, + "grad_norm": 0.4065011143684387, + "learning_rate": 1.1835710408517729e-05, + "loss": 1.2182, + "step": 15371 + }, + { + "epoch": 4.578491781306428, + "grad_norm": 0.4031160771846771, + "learning_rate": 1.183476222502431e-05, + "loss": 1.2178, + "step": 15372 + }, + { + "epoch": 4.578789627506096, + "grad_norm": 0.3682171106338501, + "learning_rate": 1.1833814024460479e-05, + "loss": 1.2186, + "step": 15373 + }, + { + "epoch": 4.579087473705766, + "grad_norm": 0.38047561049461365, + "learning_rate": 1.1832865806835056e-05, + "loss": 1.2237, + "step": 15374 + }, + { + "epoch": 4.579385319905434, + "grad_norm": 0.36890560388565063, + "learning_rate": 1.1831917572156869e-05, + "loss": 1.226, + "step": 15375 + }, + { + "epoch": 4.579683166105102, + "grad_norm": 0.29262301325798035, + "learning_rate": 1.1830969320434732e-05, + "loss": 1.225, + "step": 15376 + }, + { + "epoch": 4.5799810123047715, + "grad_norm": 0.3057573437690735, + "learning_rate": 1.1830021051677474e-05, + "loss": 1.2198, + "step": 15377 + }, + { + "epoch": 4.58027885850444, + "grad_norm": 0.31514081358909607, + "learning_rate": 1.1829072765893911e-05, + "loss": 1.2226, + "step": 15378 + }, + { + "epoch": 4.580576704704108, + "grad_norm": 0.3018852770328522, + "learning_rate": 1.1828124463092872e-05, + "loss": 1.2041, + "step": 15379 + }, + { + "epoch": 4.580874550903777, + "grad_norm": 0.365410715341568, + "learning_rate": 1.1827176143283177e-05, + "loss": 1.2233, + "step": 15380 + }, + { + "epoch": 4.581172397103446, + "grad_norm": 0.28774645924568176, + "learning_rate": 1.182622780647365e-05, + "loss": 1.2243, + "step": 15381 + }, + { + "epoch": 4.581470243303114, + "grad_norm": 0.36222878098487854, + "learning_rate": 1.182527945267311e-05, + "loss": 1.214, + "step": 15382 + }, + { + "epoch": 4.581768089502783, + "grad_norm": 0.268823504447937, + "learning_rate": 1.182433108189039e-05, + "loss": 1.2044, + "step": 15383 + }, + { + "epoch": 4.582065935702452, + "grad_norm": 0.3325064182281494, + "learning_rate": 1.1823382694134305e-05, + "loss": 1.2306, + "step": 15384 + }, + { + "epoch": 4.582363781902121, + "grad_norm": 0.3218379616737366, + "learning_rate": 1.182243428941368e-05, + "loss": 1.2235, + "step": 15385 + }, + { + "epoch": 4.582661628101789, + "grad_norm": 0.25131484866142273, + "learning_rate": 1.1821485867737337e-05, + "loss": 1.2322, + "step": 15386 + }, + { + "epoch": 4.5829594743014574, + "grad_norm": 0.3831733763217926, + "learning_rate": 1.1820537429114109e-05, + "loss": 1.2131, + "step": 15387 + }, + { + "epoch": 4.583257320501126, + "grad_norm": 0.318324476480484, + "learning_rate": 1.1819588973552812e-05, + "loss": 1.2138, + "step": 15388 + }, + { + "epoch": 4.583555166700795, + "grad_norm": 0.3682376444339752, + "learning_rate": 1.1818640501062272e-05, + "loss": 1.2151, + "step": 15389 + }, + { + "epoch": 4.583853012900463, + "grad_norm": 0.4829396605491638, + "learning_rate": 1.1817692011651318e-05, + "loss": 1.219, + "step": 15390 + }, + { + "epoch": 4.5841508591001325, + "grad_norm": 0.6712361574172974, + "learning_rate": 1.1816743505328764e-05, + "loss": 1.239, + "step": 15391 + }, + { + "epoch": 4.584448705299801, + "grad_norm": 0.388353168964386, + "learning_rate": 1.1815794982103443e-05, + "loss": 1.215, + "step": 15392 + }, + { + "epoch": 4.584746551499469, + "grad_norm": 0.354200541973114, + "learning_rate": 1.1814846441984182e-05, + "loss": 1.2278, + "step": 15393 + }, + { + "epoch": 4.585044397699138, + "grad_norm": 0.2890750467777252, + "learning_rate": 1.1813897884979802e-05, + "loss": 1.2172, + "step": 15394 + }, + { + "epoch": 4.585342243898807, + "grad_norm": 0.3579188883304596, + "learning_rate": 1.1812949311099127e-05, + "loss": 1.206, + "step": 15395 + }, + { + "epoch": 4.585640090098475, + "grad_norm": 0.41335612535476685, + "learning_rate": 1.1812000720350984e-05, + "loss": 1.2205, + "step": 15396 + }, + { + "epoch": 4.585937936298144, + "grad_norm": 0.3208654820919037, + "learning_rate": 1.1811052112744198e-05, + "loss": 1.2112, + "step": 15397 + }, + { + "epoch": 4.586235782497813, + "grad_norm": 0.5678019523620605, + "learning_rate": 1.1810103488287598e-05, + "loss": 1.209, + "step": 15398 + }, + { + "epoch": 4.586533628697481, + "grad_norm": 0.26002275943756104, + "learning_rate": 1.1809154846990007e-05, + "loss": 1.2132, + "step": 15399 + }, + { + "epoch": 4.58683147489715, + "grad_norm": 0.48814716935157776, + "learning_rate": 1.1808206188860254e-05, + "loss": 1.2104, + "step": 15400 + }, + { + "epoch": 4.5871293210968185, + "grad_norm": 0.2996443212032318, + "learning_rate": 1.1807257513907158e-05, + "loss": 1.2302, + "step": 15401 + }, + { + "epoch": 4.587427167296488, + "grad_norm": 0.508868396282196, + "learning_rate": 1.1806308822139551e-05, + "loss": 1.219, + "step": 15402 + }, + { + "epoch": 4.587725013496156, + "grad_norm": 0.383939266204834, + "learning_rate": 1.180536011356626e-05, + "loss": 1.2166, + "step": 15403 + }, + { + "epoch": 4.588022859695824, + "grad_norm": 0.41564667224884033, + "learning_rate": 1.1804411388196108e-05, + "loss": 1.227, + "step": 15404 + }, + { + "epoch": 4.588320705895494, + "grad_norm": 0.2985652685165405, + "learning_rate": 1.1803462646037927e-05, + "loss": 1.2273, + "step": 15405 + }, + { + "epoch": 4.588618552095162, + "grad_norm": 0.5704119205474854, + "learning_rate": 1.1802513887100538e-05, + "loss": 1.2222, + "step": 15406 + }, + { + "epoch": 4.58891639829483, + "grad_norm": 0.4303019642829895, + "learning_rate": 1.1801565111392774e-05, + "loss": 1.239, + "step": 15407 + }, + { + "epoch": 4.589214244494499, + "grad_norm": 0.38110148906707764, + "learning_rate": 1.1800616318923458e-05, + "loss": 1.2265, + "step": 15408 + }, + { + "epoch": 4.589512090694168, + "grad_norm": 0.3087179362773895, + "learning_rate": 1.1799667509701415e-05, + "loss": 1.2204, + "step": 15409 + }, + { + "epoch": 4.589809936893836, + "grad_norm": 0.45208391547203064, + "learning_rate": 1.1798718683735482e-05, + "loss": 1.2287, + "step": 15410 + }, + { + "epoch": 4.590107783093505, + "grad_norm": 0.26270031929016113, + "learning_rate": 1.1797769841034481e-05, + "loss": 1.2302, + "step": 15411 + }, + { + "epoch": 4.590405629293174, + "grad_norm": 0.5112342834472656, + "learning_rate": 1.179682098160724e-05, + "loss": 1.2397, + "step": 15412 + }, + { + "epoch": 4.590703475492843, + "grad_norm": 0.2571510076522827, + "learning_rate": 1.1795872105462585e-05, + "loss": 1.2139, + "step": 15413 + }, + { + "epoch": 4.591001321692511, + "grad_norm": 0.4395637810230255, + "learning_rate": 1.179492321260935e-05, + "loss": 1.2152, + "step": 15414 + }, + { + "epoch": 4.5912991678921795, + "grad_norm": 0.2952497601509094, + "learning_rate": 1.1793974303056355e-05, + "loss": 1.209, + "step": 15415 + }, + { + "epoch": 4.591597014091848, + "grad_norm": 0.28428253531455994, + "learning_rate": 1.1793025376812434e-05, + "loss": 1.2315, + "step": 15416 + }, + { + "epoch": 4.591894860291517, + "grad_norm": 0.282982736825943, + "learning_rate": 1.1792076433886421e-05, + "loss": 1.2078, + "step": 15417 + }, + { + "epoch": 4.592192706491185, + "grad_norm": 0.2425025850534439, + "learning_rate": 1.1791127474287136e-05, + "loss": 1.2169, + "step": 15418 + }, + { + "epoch": 4.592490552690855, + "grad_norm": 0.27073776721954346, + "learning_rate": 1.179017849802341e-05, + "loss": 1.2076, + "step": 15419 + }, + { + "epoch": 4.592788398890523, + "grad_norm": 0.28246089816093445, + "learning_rate": 1.1789229505104074e-05, + "loss": 1.2157, + "step": 15420 + }, + { + "epoch": 4.593086245090191, + "grad_norm": 0.25326624512672424, + "learning_rate": 1.1788280495537954e-05, + "loss": 1.2222, + "step": 15421 + }, + { + "epoch": 4.5933840912898605, + "grad_norm": 0.2939671576023102, + "learning_rate": 1.1787331469333885e-05, + "loss": 1.2278, + "step": 15422 + }, + { + "epoch": 4.593681937489529, + "grad_norm": 0.4320828914642334, + "learning_rate": 1.1786382426500691e-05, + "loss": 1.223, + "step": 15423 + }, + { + "epoch": 4.593979783689197, + "grad_norm": 0.4275114834308624, + "learning_rate": 1.1785433367047207e-05, + "loss": 1.2146, + "step": 15424 + }, + { + "epoch": 4.594277629888866, + "grad_norm": 0.32131412625312805, + "learning_rate": 1.1784484290982262e-05, + "loss": 1.2204, + "step": 15425 + }, + { + "epoch": 4.594575476088535, + "grad_norm": 0.5534677505493164, + "learning_rate": 1.178353519831468e-05, + "loss": 1.2217, + "step": 15426 + }, + { + "epoch": 4.594873322288203, + "grad_norm": 0.27633920311927795, + "learning_rate": 1.1782586089053293e-05, + "loss": 1.2145, + "step": 15427 + }, + { + "epoch": 4.595171168487872, + "grad_norm": 0.5427194833755493, + "learning_rate": 1.1781636963206942e-05, + "loss": 1.2186, + "step": 15428 + }, + { + "epoch": 4.5954690146875405, + "grad_norm": 0.2924329340457916, + "learning_rate": 1.1780687820784447e-05, + "loss": 1.2183, + "step": 15429 + }, + { + "epoch": 4.59576686088721, + "grad_norm": 0.7174497842788696, + "learning_rate": 1.177973866179464e-05, + "loss": 1.2177, + "step": 15430 + }, + { + "epoch": 4.596064707086878, + "grad_norm": 0.25184351205825806, + "learning_rate": 1.1778789486246356e-05, + "loss": 1.2162, + "step": 15431 + }, + { + "epoch": 4.596362553286546, + "grad_norm": 0.5384388566017151, + "learning_rate": 1.1777840294148419e-05, + "loss": 1.2356, + "step": 15432 + }, + { + "epoch": 4.596660399486216, + "grad_norm": 0.26802632212638855, + "learning_rate": 1.1776891085509664e-05, + "loss": 1.2403, + "step": 15433 + }, + { + "epoch": 4.596958245685884, + "grad_norm": 0.527765154838562, + "learning_rate": 1.1775941860338924e-05, + "loss": 1.1969, + "step": 15434 + }, + { + "epoch": 4.597256091885552, + "grad_norm": 0.2646426558494568, + "learning_rate": 1.1774992618645034e-05, + "loss": 1.2053, + "step": 15435 + }, + { + "epoch": 4.5975539380852215, + "grad_norm": 0.4056178033351898, + "learning_rate": 1.1774043360436816e-05, + "loss": 1.2151, + "step": 15436 + }, + { + "epoch": 4.59785178428489, + "grad_norm": 0.3224065899848938, + "learning_rate": 1.1773094085723107e-05, + "loss": 1.2314, + "step": 15437 + }, + { + "epoch": 4.598149630484558, + "grad_norm": 0.26547330617904663, + "learning_rate": 1.177214479451274e-05, + "loss": 1.2156, + "step": 15438 + }, + { + "epoch": 4.598447476684227, + "grad_norm": 0.31536105275154114, + "learning_rate": 1.1771195486814544e-05, + "loss": 1.2187, + "step": 15439 + }, + { + "epoch": 4.598745322883896, + "grad_norm": 0.2992311716079712, + "learning_rate": 1.1770246162637353e-05, + "loss": 1.2318, + "step": 15440 + }, + { + "epoch": 4.599043169083565, + "grad_norm": 0.3289506435394287, + "learning_rate": 1.1769296821990001e-05, + "loss": 1.2394, + "step": 15441 + }, + { + "epoch": 4.599341015283233, + "grad_norm": 0.31637370586395264, + "learning_rate": 1.1768347464881318e-05, + "loss": 1.2197, + "step": 15442 + }, + { + "epoch": 4.599638861482902, + "grad_norm": 0.3021043539047241, + "learning_rate": 1.1767398091320136e-05, + "loss": 1.2324, + "step": 15443 + }, + { + "epoch": 4.599936707682571, + "grad_norm": 0.31690776348114014, + "learning_rate": 1.1766448701315292e-05, + "loss": 1.2231, + "step": 15444 + }, + { + "epoch": 4.600234553882239, + "grad_norm": 0.3074594736099243, + "learning_rate": 1.1765499294875613e-05, + "loss": 1.2424, + "step": 15445 + }, + { + "epoch": 4.6005324000819074, + "grad_norm": 0.28958991169929504, + "learning_rate": 1.1764549872009938e-05, + "loss": 1.2234, + "step": 15446 + }, + { + "epoch": 4.600830246281577, + "grad_norm": 0.27834829688072205, + "learning_rate": 1.1763600432727098e-05, + "loss": 1.225, + "step": 15447 + }, + { + "epoch": 4.601128092481245, + "grad_norm": 0.3386121392250061, + "learning_rate": 1.1762650977035928e-05, + "loss": 1.2376, + "step": 15448 + }, + { + "epoch": 4.601425938680913, + "grad_norm": 0.3685445785522461, + "learning_rate": 1.1761701504945258e-05, + "loss": 1.2089, + "step": 15449 + }, + { + "epoch": 4.6017237848805825, + "grad_norm": 0.29060253500938416, + "learning_rate": 1.1760752016463923e-05, + "loss": 1.2152, + "step": 15450 + }, + { + "epoch": 4.602021631080251, + "grad_norm": 0.4046565592288971, + "learning_rate": 1.1759802511600756e-05, + "loss": 1.2211, + "step": 15451 + }, + { + "epoch": 4.60231947727992, + "grad_norm": 0.25819021463394165, + "learning_rate": 1.1758852990364597e-05, + "loss": 1.2136, + "step": 15452 + }, + { + "epoch": 4.602617323479588, + "grad_norm": 0.3166137635707855, + "learning_rate": 1.1757903452764274e-05, + "loss": 1.2142, + "step": 15453 + }, + { + "epoch": 4.602915169679257, + "grad_norm": 0.2821340560913086, + "learning_rate": 1.1756953898808625e-05, + "loss": 1.2203, + "step": 15454 + }, + { + "epoch": 4.603213015878925, + "grad_norm": 0.32758796215057373, + "learning_rate": 1.1756004328506484e-05, + "loss": 1.2209, + "step": 15455 + }, + { + "epoch": 4.603510862078594, + "grad_norm": 0.41818171739578247, + "learning_rate": 1.1755054741866678e-05, + "loss": 1.2165, + "step": 15456 + }, + { + "epoch": 4.603808708278263, + "grad_norm": 0.5267404317855835, + "learning_rate": 1.1754105138898054e-05, + "loss": 1.2176, + "step": 15457 + }, + { + "epoch": 4.604106554477932, + "grad_norm": 0.2890204191207886, + "learning_rate": 1.1753155519609442e-05, + "loss": 1.2276, + "step": 15458 + }, + { + "epoch": 4.6044044006776, + "grad_norm": 0.4943948984146118, + "learning_rate": 1.1752205884009675e-05, + "loss": 1.2368, + "step": 15459 + }, + { + "epoch": 4.6047022468772685, + "grad_norm": 0.29402250051498413, + "learning_rate": 1.1751256232107589e-05, + "loss": 1.215, + "step": 15460 + }, + { + "epoch": 4.605000093076938, + "grad_norm": 0.5472093820571899, + "learning_rate": 1.1750306563912022e-05, + "loss": 1.2082, + "step": 15461 + }, + { + "epoch": 4.605297939276606, + "grad_norm": 0.3013838529586792, + "learning_rate": 1.1749356879431806e-05, + "loss": 1.2057, + "step": 15462 + }, + { + "epoch": 4.605595785476274, + "grad_norm": 0.45778924226760864, + "learning_rate": 1.1748407178675781e-05, + "loss": 1.2119, + "step": 15463 + }, + { + "epoch": 4.605893631675944, + "grad_norm": 0.4487936496734619, + "learning_rate": 1.1747457461652778e-05, + "loss": 1.2164, + "step": 15464 + }, + { + "epoch": 4.606191477875612, + "grad_norm": 0.4020189642906189, + "learning_rate": 1.1746507728371638e-05, + "loss": 1.2293, + "step": 15465 + }, + { + "epoch": 4.60648932407528, + "grad_norm": 0.683911919593811, + "learning_rate": 1.1745557978841195e-05, + "loss": 1.225, + "step": 15466 + }, + { + "epoch": 4.606787170274949, + "grad_norm": 0.25149503350257874, + "learning_rate": 1.1744608213070285e-05, + "loss": 1.2149, + "step": 15467 + }, + { + "epoch": 4.607085016474618, + "grad_norm": 0.35296499729156494, + "learning_rate": 1.1743658431067741e-05, + "loss": 1.227, + "step": 15468 + }, + { + "epoch": 4.607382862674287, + "grad_norm": 0.3388794958591461, + "learning_rate": 1.174270863284241e-05, + "loss": 1.2063, + "step": 15469 + }, + { + "epoch": 4.607680708873955, + "grad_norm": 0.46058595180511475, + "learning_rate": 1.1741758818403117e-05, + "loss": 1.2285, + "step": 15470 + }, + { + "epoch": 4.607978555073624, + "grad_norm": 0.2816953957080841, + "learning_rate": 1.1740808987758708e-05, + "loss": 1.2243, + "step": 15471 + }, + { + "epoch": 4.608276401273293, + "grad_norm": 0.349196195602417, + "learning_rate": 1.1739859140918016e-05, + "loss": 1.2253, + "step": 15472 + }, + { + "epoch": 4.608574247472961, + "grad_norm": 0.27341097593307495, + "learning_rate": 1.1738909277889877e-05, + "loss": 1.2106, + "step": 15473 + }, + { + "epoch": 4.6088720936726295, + "grad_norm": 0.3639007806777954, + "learning_rate": 1.1737959398683127e-05, + "loss": 1.2246, + "step": 15474 + }, + { + "epoch": 4.609169939872299, + "grad_norm": 0.26420873403549194, + "learning_rate": 1.173700950330661e-05, + "loss": 1.2064, + "step": 15475 + }, + { + "epoch": 4.609467786071967, + "grad_norm": 0.3559967875480652, + "learning_rate": 1.173605959176916e-05, + "loss": 1.2236, + "step": 15476 + }, + { + "epoch": 4.609765632271635, + "grad_norm": 0.39635011553764343, + "learning_rate": 1.1735109664079616e-05, + "loss": 1.2199, + "step": 15477 + }, + { + "epoch": 4.610063478471305, + "grad_norm": 0.3912164270877838, + "learning_rate": 1.1734159720246814e-05, + "loss": 1.2112, + "step": 15478 + }, + { + "epoch": 4.610361324670973, + "grad_norm": 0.28523966670036316, + "learning_rate": 1.1733209760279593e-05, + "loss": 1.2078, + "step": 15479 + }, + { + "epoch": 4.610659170870642, + "grad_norm": 0.2978664040565491, + "learning_rate": 1.1732259784186793e-05, + "loss": 1.2102, + "step": 15480 + }, + { + "epoch": 4.6109570170703105, + "grad_norm": 0.2843220829963684, + "learning_rate": 1.1731309791977248e-05, + "loss": 1.215, + "step": 15481 + }, + { + "epoch": 4.611254863269979, + "grad_norm": 0.32204434275627136, + "learning_rate": 1.1730359783659804e-05, + "loss": 1.2173, + "step": 15482 + }, + { + "epoch": 4.611552709469647, + "grad_norm": 0.3042689561843872, + "learning_rate": 1.1729409759243293e-05, + "loss": 1.2232, + "step": 15483 + }, + { + "epoch": 4.611850555669316, + "grad_norm": 0.3689742088317871, + "learning_rate": 1.1728459718736557e-05, + "loss": 1.2217, + "step": 15484 + }, + { + "epoch": 4.612148401868985, + "grad_norm": 0.545839250087738, + "learning_rate": 1.1727509662148435e-05, + "loss": 1.234, + "step": 15485 + }, + { + "epoch": 4.612446248068654, + "grad_norm": 0.2659914791584015, + "learning_rate": 1.1726559589487763e-05, + "loss": 1.2318, + "step": 15486 + }, + { + "epoch": 4.612744094268322, + "grad_norm": 0.40329739451408386, + "learning_rate": 1.1725609500763386e-05, + "loss": 1.2322, + "step": 15487 + }, + { + "epoch": 4.6130419404679905, + "grad_norm": 0.27586081624031067, + "learning_rate": 1.172465939598414e-05, + "loss": 1.2176, + "step": 15488 + }, + { + "epoch": 4.61333978666766, + "grad_norm": 0.5300125479698181, + "learning_rate": 1.1723709275158865e-05, + "loss": 1.2125, + "step": 15489 + }, + { + "epoch": 4.613637632867328, + "grad_norm": 0.4316484034061432, + "learning_rate": 1.17227591382964e-05, + "loss": 1.2147, + "step": 15490 + }, + { + "epoch": 4.613935479066996, + "grad_norm": 0.4001738131046295, + "learning_rate": 1.1721808985405587e-05, + "loss": 1.21, + "step": 15491 + }, + { + "epoch": 4.614233325266666, + "grad_norm": 0.6518998146057129, + "learning_rate": 1.1720858816495265e-05, + "loss": 1.1933, + "step": 15492 + }, + { + "epoch": 4.614531171466334, + "grad_norm": 0.3006100356578827, + "learning_rate": 1.1719908631574276e-05, + "loss": 1.2403, + "step": 15493 + }, + { + "epoch": 4.614829017666002, + "grad_norm": 0.374083936214447, + "learning_rate": 1.1718958430651455e-05, + "loss": 1.2228, + "step": 15494 + }, + { + "epoch": 4.6151268638656715, + "grad_norm": 0.2627086639404297, + "learning_rate": 1.1718008213735648e-05, + "loss": 1.2205, + "step": 15495 + }, + { + "epoch": 4.61542471006534, + "grad_norm": 0.2966965436935425, + "learning_rate": 1.1717057980835698e-05, + "loss": 1.2198, + "step": 15496 + }, + { + "epoch": 4.615722556265009, + "grad_norm": 0.30310168862342834, + "learning_rate": 1.1716107731960435e-05, + "loss": 1.2076, + "step": 15497 + }, + { + "epoch": 4.616020402464677, + "grad_norm": 0.6953670382499695, + "learning_rate": 1.171515746711871e-05, + "loss": 1.2296, + "step": 15498 + }, + { + "epoch": 4.616318248664346, + "grad_norm": 0.5464868545532227, + "learning_rate": 1.1714207186319364e-05, + "loss": 1.1972, + "step": 15499 + }, + { + "epoch": 4.616616094864015, + "grad_norm": 0.6416321992874146, + "learning_rate": 1.1713256889571231e-05, + "loss": 1.2038, + "step": 15500 + }, + { + "epoch": 4.616616094864015, + "eval_loss": 1.3223536014556885, + "eval_runtime": 107.9226, + "eval_samples_per_second": 16.067, + "eval_steps_per_second": 1.01, + "step": 15500 + }, + { + "epoch": 4.616913941063683, + "grad_norm": 0.5187292098999023, + "learning_rate": 1.1712306576883159e-05, + "loss": 1.237, + "step": 15501 + }, + { + "epoch": 4.617211787263352, + "grad_norm": 0.5971845388412476, + "learning_rate": 1.1711356248263987e-05, + "loss": 1.2101, + "step": 15502 + }, + { + "epoch": 4.617509633463021, + "grad_norm": 0.3387489914894104, + "learning_rate": 1.1710405903722558e-05, + "loss": 1.2173, + "step": 15503 + }, + { + "epoch": 4.617807479662689, + "grad_norm": 0.3731728792190552, + "learning_rate": 1.1709455543267709e-05, + "loss": 1.231, + "step": 15504 + }, + { + "epoch": 4.6181053258623574, + "grad_norm": 0.28528162837028503, + "learning_rate": 1.1708505166908291e-05, + "loss": 1.2146, + "step": 15505 + }, + { + "epoch": 4.618403172062027, + "grad_norm": 0.31289324164390564, + "learning_rate": 1.170755477465314e-05, + "loss": 1.2259, + "step": 15506 + }, + { + "epoch": 4.618701018261695, + "grad_norm": 0.25294843316078186, + "learning_rate": 1.1706604366511099e-05, + "loss": 1.2331, + "step": 15507 + }, + { + "epoch": 4.618998864461364, + "grad_norm": 0.29154565930366516, + "learning_rate": 1.1705653942491011e-05, + "loss": 1.2306, + "step": 15508 + }, + { + "epoch": 4.6192967106610325, + "grad_norm": 0.2750505208969116, + "learning_rate": 1.170470350260172e-05, + "loss": 1.2082, + "step": 15509 + }, + { + "epoch": 4.619594556860701, + "grad_norm": 0.2772367596626282, + "learning_rate": 1.1703753046852065e-05, + "loss": 1.2395, + "step": 15510 + }, + { + "epoch": 4.61989240306037, + "grad_norm": 0.2575388550758362, + "learning_rate": 1.1702802575250893e-05, + "loss": 1.1939, + "step": 15511 + }, + { + "epoch": 4.620190249260038, + "grad_norm": 0.2751673460006714, + "learning_rate": 1.1701852087807045e-05, + "loss": 1.2306, + "step": 15512 + }, + { + "epoch": 4.620488095459707, + "grad_norm": 0.3043292462825775, + "learning_rate": 1.1700901584529368e-05, + "loss": 1.2152, + "step": 15513 + }, + { + "epoch": 4.620785941659376, + "grad_norm": 0.24301272630691528, + "learning_rate": 1.1699951065426698e-05, + "loss": 1.2026, + "step": 15514 + }, + { + "epoch": 4.621083787859044, + "grad_norm": 0.3393288254737854, + "learning_rate": 1.169900053050788e-05, + "loss": 1.2104, + "step": 15515 + }, + { + "epoch": 4.621381634058713, + "grad_norm": 0.2605547308921814, + "learning_rate": 1.1698049979781766e-05, + "loss": 1.2306, + "step": 15516 + }, + { + "epoch": 4.621679480258382, + "grad_norm": 0.2591628432273865, + "learning_rate": 1.1697099413257193e-05, + "loss": 1.2205, + "step": 15517 + }, + { + "epoch": 4.62197732645805, + "grad_norm": 0.3370744287967682, + "learning_rate": 1.1696148830943005e-05, + "loss": 1.2036, + "step": 15518 + }, + { + "epoch": 4.622275172657719, + "grad_norm": 0.26255717873573303, + "learning_rate": 1.1695198232848047e-05, + "loss": 1.2229, + "step": 15519 + }, + { + "epoch": 4.622573018857388, + "grad_norm": 0.2808741331100464, + "learning_rate": 1.1694247618981165e-05, + "loss": 1.2335, + "step": 15520 + }, + { + "epoch": 4.622870865057056, + "grad_norm": 0.24790742993354797, + "learning_rate": 1.1693296989351199e-05, + "loss": 1.2138, + "step": 15521 + }, + { + "epoch": 4.623168711256724, + "grad_norm": 0.24825873970985413, + "learning_rate": 1.1692346343966999e-05, + "loss": 1.2269, + "step": 15522 + }, + { + "epoch": 4.623466557456394, + "grad_norm": 0.24359354376792908, + "learning_rate": 1.1691395682837407e-05, + "loss": 1.2123, + "step": 15523 + }, + { + "epoch": 4.623764403656062, + "grad_norm": 0.3074105679988861, + "learning_rate": 1.1690445005971268e-05, + "loss": 1.2262, + "step": 15524 + }, + { + "epoch": 4.624062249855731, + "grad_norm": 0.3166869878768921, + "learning_rate": 1.1689494313377426e-05, + "loss": 1.2116, + "step": 15525 + }, + { + "epoch": 4.624360096055399, + "grad_norm": 0.3546818792819977, + "learning_rate": 1.1688543605064726e-05, + "loss": 1.2141, + "step": 15526 + }, + { + "epoch": 4.624657942255068, + "grad_norm": 0.3457008898258209, + "learning_rate": 1.1687592881042016e-05, + "loss": 1.2204, + "step": 15527 + }, + { + "epoch": 4.624955788454737, + "grad_norm": 0.3059236407279968, + "learning_rate": 1.1686642141318139e-05, + "loss": 1.2192, + "step": 15528 + }, + { + "epoch": 4.625253634654405, + "grad_norm": 0.3162040710449219, + "learning_rate": 1.168569138590194e-05, + "loss": 1.2169, + "step": 15529 + }, + { + "epoch": 4.625551480854074, + "grad_norm": 0.40984293818473816, + "learning_rate": 1.1684740614802271e-05, + "loss": 1.2261, + "step": 15530 + }, + { + "epoch": 4.625849327053743, + "grad_norm": 0.3095340430736542, + "learning_rate": 1.1683789828027967e-05, + "loss": 1.2285, + "step": 15531 + }, + { + "epoch": 4.626147173253411, + "grad_norm": 0.48104336857795715, + "learning_rate": 1.1682839025587882e-05, + "loss": 1.2322, + "step": 15532 + }, + { + "epoch": 4.6264450194530795, + "grad_norm": 0.4585086405277252, + "learning_rate": 1.168188820749086e-05, + "loss": 1.2205, + "step": 15533 + }, + { + "epoch": 4.626742865652749, + "grad_norm": 0.2756181061267853, + "learning_rate": 1.168093737374575e-05, + "loss": 1.2185, + "step": 15534 + }, + { + "epoch": 4.627040711852417, + "grad_norm": 0.32118865847587585, + "learning_rate": 1.167998652436139e-05, + "loss": 1.2071, + "step": 15535 + }, + { + "epoch": 4.627338558052086, + "grad_norm": 0.37187910079956055, + "learning_rate": 1.1679035659346637e-05, + "loss": 1.2074, + "step": 15536 + }, + { + "epoch": 4.627636404251755, + "grad_norm": 0.4208984971046448, + "learning_rate": 1.1678084778710335e-05, + "loss": 1.2189, + "step": 15537 + }, + { + "epoch": 4.627934250451423, + "grad_norm": 0.2950340509414673, + "learning_rate": 1.1677133882461322e-05, + "loss": 1.2394, + "step": 15538 + }, + { + "epoch": 4.628232096651092, + "grad_norm": 0.3526626229286194, + "learning_rate": 1.1676182970608457e-05, + "loss": 1.2013, + "step": 15539 + }, + { + "epoch": 4.6285299428507605, + "grad_norm": 0.31066569685935974, + "learning_rate": 1.1675232043160583e-05, + "loss": 1.2214, + "step": 15540 + }, + { + "epoch": 4.628827789050429, + "grad_norm": 0.31988176703453064, + "learning_rate": 1.1674281100126544e-05, + "loss": 1.2112, + "step": 15541 + }, + { + "epoch": 4.629125635250098, + "grad_norm": 0.3039456903934479, + "learning_rate": 1.167333014151519e-05, + "loss": 1.2352, + "step": 15542 + }, + { + "epoch": 4.629423481449766, + "grad_norm": 0.3332209885120392, + "learning_rate": 1.1672379167335368e-05, + "loss": 1.2292, + "step": 15543 + }, + { + "epoch": 4.629721327649435, + "grad_norm": 0.3084508180618286, + "learning_rate": 1.1671428177595927e-05, + "loss": 1.2331, + "step": 15544 + }, + { + "epoch": 4.630019173849104, + "grad_norm": 0.4446774423122406, + "learning_rate": 1.1670477172305714e-05, + "loss": 1.2072, + "step": 15545 + }, + { + "epoch": 4.630317020048772, + "grad_norm": 0.2686777412891388, + "learning_rate": 1.1669526151473577e-05, + "loss": 1.2094, + "step": 15546 + }, + { + "epoch": 4.630614866248441, + "grad_norm": 0.3131929039955139, + "learning_rate": 1.166857511510837e-05, + "loss": 1.2167, + "step": 15547 + }, + { + "epoch": 4.63091271244811, + "grad_norm": 0.3872765302658081, + "learning_rate": 1.166762406321893e-05, + "loss": 1.2354, + "step": 15548 + }, + { + "epoch": 4.631210558647778, + "grad_norm": 0.3278713524341583, + "learning_rate": 1.1666672995814113e-05, + "loss": 1.2301, + "step": 15549 + }, + { + "epoch": 4.631508404847446, + "grad_norm": 0.36869022250175476, + "learning_rate": 1.1665721912902766e-05, + "loss": 1.2016, + "step": 15550 + }, + { + "epoch": 4.631806251047116, + "grad_norm": 0.4042108356952667, + "learning_rate": 1.1664770814493736e-05, + "loss": 1.2217, + "step": 15551 + }, + { + "epoch": 4.632104097246784, + "grad_norm": 0.3759974241256714, + "learning_rate": 1.1663819700595872e-05, + "loss": 1.2249, + "step": 15552 + }, + { + "epoch": 4.632401943446453, + "grad_norm": 0.2642388343811035, + "learning_rate": 1.1662868571218029e-05, + "loss": 1.2101, + "step": 15553 + }, + { + "epoch": 4.6326997896461215, + "grad_norm": 0.30808690190315247, + "learning_rate": 1.1661917426369051e-05, + "loss": 1.2191, + "step": 15554 + }, + { + "epoch": 4.63299763584579, + "grad_norm": 0.28365305066108704, + "learning_rate": 1.166096626605779e-05, + "loss": 1.2182, + "step": 15555 + }, + { + "epoch": 4.633295482045459, + "grad_norm": 0.26377856731414795, + "learning_rate": 1.1660015090293087e-05, + "loss": 1.2223, + "step": 15556 + }, + { + "epoch": 4.633593328245127, + "grad_norm": 0.29196006059646606, + "learning_rate": 1.1659063899083805e-05, + "loss": 1.2169, + "step": 15557 + }, + { + "epoch": 4.633891174444796, + "grad_norm": 0.2571171224117279, + "learning_rate": 1.1658112692438783e-05, + "loss": 1.2134, + "step": 15558 + }, + { + "epoch": 4.634189020644465, + "grad_norm": 0.2785705029964447, + "learning_rate": 1.1657161470366876e-05, + "loss": 1.2092, + "step": 15559 + }, + { + "epoch": 4.634486866844133, + "grad_norm": 0.4112585186958313, + "learning_rate": 1.1656210232876933e-05, + "loss": 1.2163, + "step": 15560 + }, + { + "epoch": 4.634784713043802, + "grad_norm": 0.31251394748687744, + "learning_rate": 1.1655258979977804e-05, + "loss": 1.2175, + "step": 15561 + }, + { + "epoch": 4.635082559243471, + "grad_norm": 0.3197955787181854, + "learning_rate": 1.1654307711678338e-05, + "loss": 1.2283, + "step": 15562 + }, + { + "epoch": 4.635380405443139, + "grad_norm": 0.38307496905326843, + "learning_rate": 1.165335642798739e-05, + "loss": 1.2165, + "step": 15563 + }, + { + "epoch": 4.635678251642808, + "grad_norm": 0.36435335874557495, + "learning_rate": 1.165240512891381e-05, + "loss": 1.2254, + "step": 15564 + }, + { + "epoch": 4.635976097842477, + "grad_norm": 0.3051356077194214, + "learning_rate": 1.1651453814466443e-05, + "loss": 1.2249, + "step": 15565 + }, + { + "epoch": 4.636273944042145, + "grad_norm": 0.36144301295280457, + "learning_rate": 1.1650502484654143e-05, + "loss": 1.2159, + "step": 15566 + }, + { + "epoch": 4.636571790241814, + "grad_norm": 0.3717021048069, + "learning_rate": 1.1649551139485759e-05, + "loss": 1.2144, + "step": 15567 + }, + { + "epoch": 4.6368696364414825, + "grad_norm": 0.3736661672592163, + "learning_rate": 1.164859977897015e-05, + "loss": 1.2143, + "step": 15568 + }, + { + "epoch": 4.637167482641151, + "grad_norm": 0.35798725485801697, + "learning_rate": 1.1647648403116159e-05, + "loss": 1.2218, + "step": 15569 + }, + { + "epoch": 4.63746532884082, + "grad_norm": 0.28397032618522644, + "learning_rate": 1.164669701193264e-05, + "loss": 1.2176, + "step": 15570 + }, + { + "epoch": 4.637763175040488, + "grad_norm": 0.33508992195129395, + "learning_rate": 1.1645745605428447e-05, + "loss": 1.2243, + "step": 15571 + }, + { + "epoch": 4.638061021240157, + "grad_norm": 0.3435896039009094, + "learning_rate": 1.1644794183612429e-05, + "loss": 1.2253, + "step": 15572 + }, + { + "epoch": 4.638358867439826, + "grad_norm": 0.24336761236190796, + "learning_rate": 1.1643842746493438e-05, + "loss": 1.2001, + "step": 15573 + }, + { + "epoch": 4.638656713639494, + "grad_norm": 0.30765971541404724, + "learning_rate": 1.1642891294080329e-05, + "loss": 1.2055, + "step": 15574 + }, + { + "epoch": 4.6389545598391635, + "grad_norm": 0.31240686774253845, + "learning_rate": 1.164193982638195e-05, + "loss": 1.2058, + "step": 15575 + }, + { + "epoch": 4.639252406038832, + "grad_norm": 0.2881528437137604, + "learning_rate": 1.1640988343407155e-05, + "loss": 1.2254, + "step": 15576 + }, + { + "epoch": 4.6395502522385, + "grad_norm": 0.5720462203025818, + "learning_rate": 1.1640036845164798e-05, + "loss": 1.2156, + "step": 15577 + }, + { + "epoch": 4.639848098438169, + "grad_norm": 0.50917649269104, + "learning_rate": 1.1639085331663734e-05, + "loss": 1.2173, + "step": 15578 + }, + { + "epoch": 4.640145944637838, + "grad_norm": 0.2722722589969635, + "learning_rate": 1.1638133802912805e-05, + "loss": 1.2153, + "step": 15579 + }, + { + "epoch": 4.640443790837506, + "grad_norm": 0.3898542523384094, + "learning_rate": 1.1637182258920874e-05, + "loss": 1.2224, + "step": 15580 + }, + { + "epoch": 4.640741637037175, + "grad_norm": 0.4063347280025482, + "learning_rate": 1.1636230699696795e-05, + "loss": 1.2026, + "step": 15581 + }, + { + "epoch": 4.641039483236844, + "grad_norm": 0.2960827350616455, + "learning_rate": 1.1635279125249415e-05, + "loss": 1.2266, + "step": 15582 + }, + { + "epoch": 4.641337329436512, + "grad_norm": 0.43661361932754517, + "learning_rate": 1.1634327535587588e-05, + "loss": 1.2301, + "step": 15583 + }, + { + "epoch": 4.641635175636181, + "grad_norm": 0.2536190152168274, + "learning_rate": 1.163337593072017e-05, + "loss": 1.2171, + "step": 15584 + }, + { + "epoch": 4.641933021835849, + "grad_norm": 0.31220099329948425, + "learning_rate": 1.1632424310656014e-05, + "loss": 1.2195, + "step": 15585 + }, + { + "epoch": 4.642230868035519, + "grad_norm": 0.2687770128250122, + "learning_rate": 1.1631472675403974e-05, + "loss": 1.2258, + "step": 15586 + }, + { + "epoch": 4.642528714235187, + "grad_norm": 0.3222101330757141, + "learning_rate": 1.1630521024972905e-05, + "loss": 1.2145, + "step": 15587 + }, + { + "epoch": 4.642826560434855, + "grad_norm": 0.2599417567253113, + "learning_rate": 1.162956935937166e-05, + "loss": 1.224, + "step": 15588 + }, + { + "epoch": 4.643124406634524, + "grad_norm": 0.2568964958190918, + "learning_rate": 1.1628617678609091e-05, + "loss": 1.2298, + "step": 15589 + }, + { + "epoch": 4.643422252834193, + "grad_norm": 0.3904465138912201, + "learning_rate": 1.1627665982694054e-05, + "loss": 1.2271, + "step": 15590 + }, + { + "epoch": 4.643720099033861, + "grad_norm": 0.6142756938934326, + "learning_rate": 1.1626714271635402e-05, + "loss": 1.2276, + "step": 15591 + }, + { + "epoch": 4.64401794523353, + "grad_norm": 0.2873811721801758, + "learning_rate": 1.1625762545441995e-05, + "loss": 1.2198, + "step": 15592 + }, + { + "epoch": 4.644315791433199, + "grad_norm": 0.6036465167999268, + "learning_rate": 1.1624810804122682e-05, + "loss": 1.207, + "step": 15593 + }, + { + "epoch": 4.644613637632867, + "grad_norm": 0.4465596079826355, + "learning_rate": 1.1623859047686317e-05, + "loss": 1.2128, + "step": 15594 + }, + { + "epoch": 4.644911483832536, + "grad_norm": 0.42341628670692444, + "learning_rate": 1.1622907276141764e-05, + "loss": 1.211, + "step": 15595 + }, + { + "epoch": 4.645209330032205, + "grad_norm": 0.581132709980011, + "learning_rate": 1.1621955489497867e-05, + "loss": 1.2178, + "step": 15596 + }, + { + "epoch": 4.645507176231873, + "grad_norm": 0.4056399166584015, + "learning_rate": 1.1621003687763486e-05, + "loss": 1.224, + "step": 15597 + }, + { + "epoch": 4.645805022431542, + "grad_norm": 0.4697325527667999, + "learning_rate": 1.162005187094748e-05, + "loss": 1.2134, + "step": 15598 + }, + { + "epoch": 4.6461028686312105, + "grad_norm": 0.28784817457199097, + "learning_rate": 1.1619100039058699e-05, + "loss": 1.2096, + "step": 15599 + }, + { + "epoch": 4.646400714830879, + "grad_norm": 0.45402616262435913, + "learning_rate": 1.1618148192106003e-05, + "loss": 1.2219, + "step": 15600 + }, + { + "epoch": 4.646698561030548, + "grad_norm": 0.3455040752887726, + "learning_rate": 1.1617196330098243e-05, + "loss": 1.2126, + "step": 15601 + }, + { + "epoch": 4.646996407230216, + "grad_norm": 0.4865645170211792, + "learning_rate": 1.1616244453044284e-05, + "loss": 1.2138, + "step": 15602 + }, + { + "epoch": 4.647294253429886, + "grad_norm": 0.3852697014808655, + "learning_rate": 1.1615292560952968e-05, + "loss": 1.1991, + "step": 15603 + }, + { + "epoch": 4.647592099629554, + "grad_norm": 0.2834094166755676, + "learning_rate": 1.1614340653833164e-05, + "loss": 1.2119, + "step": 15604 + }, + { + "epoch": 4.647889945829222, + "grad_norm": 0.4348202645778656, + "learning_rate": 1.1613388731693726e-05, + "loss": 1.1972, + "step": 15605 + }, + { + "epoch": 4.648187792028891, + "grad_norm": 0.26572486758232117, + "learning_rate": 1.1612436794543507e-05, + "loss": 1.2151, + "step": 15606 + }, + { + "epoch": 4.64848563822856, + "grad_norm": 0.42063283920288086, + "learning_rate": 1.1611484842391363e-05, + "loss": 1.2419, + "step": 15607 + }, + { + "epoch": 4.648783484428228, + "grad_norm": 0.2530670464038849, + "learning_rate": 1.1610532875246154e-05, + "loss": 1.2285, + "step": 15608 + }, + { + "epoch": 4.649081330627897, + "grad_norm": 0.3255590498447418, + "learning_rate": 1.1609580893116737e-05, + "loss": 1.2145, + "step": 15609 + }, + { + "epoch": 4.649379176827566, + "grad_norm": 0.3504859209060669, + "learning_rate": 1.1608628896011966e-05, + "loss": 1.2429, + "step": 15610 + }, + { + "epoch": 4.649677023027234, + "grad_norm": 0.3600333333015442, + "learning_rate": 1.1607676883940702e-05, + "loss": 1.2089, + "step": 15611 + }, + { + "epoch": 4.649974869226903, + "grad_norm": 0.5154030919075012, + "learning_rate": 1.1606724856911802e-05, + "loss": 1.2253, + "step": 15612 + }, + { + "epoch": 4.6502727154265715, + "grad_norm": 0.4052983224391937, + "learning_rate": 1.160577281493412e-05, + "loss": 1.214, + "step": 15613 + }, + { + "epoch": 4.650570561626241, + "grad_norm": 0.46548226475715637, + "learning_rate": 1.1604820758016518e-05, + "loss": 1.2133, + "step": 15614 + }, + { + "epoch": 4.650868407825909, + "grad_norm": 0.327823281288147, + "learning_rate": 1.160386868616785e-05, + "loss": 1.2108, + "step": 15615 + }, + { + "epoch": 4.651166254025577, + "grad_norm": 0.6578736305236816, + "learning_rate": 1.1602916599396974e-05, + "loss": 1.2076, + "step": 15616 + }, + { + "epoch": 4.651464100225246, + "grad_norm": 0.49334853887557983, + "learning_rate": 1.1601964497712751e-05, + "loss": 1.2234, + "step": 15617 + }, + { + "epoch": 4.651761946424915, + "grad_norm": 0.48380619287490845, + "learning_rate": 1.1601012381124042e-05, + "loss": 1.2258, + "step": 15618 + }, + { + "epoch": 4.652059792624583, + "grad_norm": 0.326084166765213, + "learning_rate": 1.16000602496397e-05, + "loss": 1.2231, + "step": 15619 + }, + { + "epoch": 4.6523576388242525, + "grad_norm": 0.5676078796386719, + "learning_rate": 1.1599108103268584e-05, + "loss": 1.2054, + "step": 15620 + }, + { + "epoch": 4.652655485023921, + "grad_norm": 0.4441809058189392, + "learning_rate": 1.1598155942019551e-05, + "loss": 1.2146, + "step": 15621 + }, + { + "epoch": 4.652953331223589, + "grad_norm": 0.33972957730293274, + "learning_rate": 1.1597203765901468e-05, + "loss": 1.2131, + "step": 15622 + }, + { + "epoch": 4.653251177423258, + "grad_norm": 0.3543725907802582, + "learning_rate": 1.1596251574923185e-05, + "loss": 1.2248, + "step": 15623 + }, + { + "epoch": 4.653549023622927, + "grad_norm": 0.25015169382095337, + "learning_rate": 1.1595299369093564e-05, + "loss": 1.2306, + "step": 15624 + }, + { + "epoch": 4.653846869822595, + "grad_norm": 0.3255843222141266, + "learning_rate": 1.1594347148421467e-05, + "loss": 1.2265, + "step": 15625 + }, + { + "epoch": 4.654144716022264, + "grad_norm": 0.32699039578437805, + "learning_rate": 1.159339491291575e-05, + "loss": 1.2045, + "step": 15626 + }, + { + "epoch": 4.6544425622219325, + "grad_norm": 0.24702998995780945, + "learning_rate": 1.1592442662585274e-05, + "loss": 1.2147, + "step": 15627 + }, + { + "epoch": 4.654740408421601, + "grad_norm": 0.28171196579933167, + "learning_rate": 1.1591490397438897e-05, + "loss": 1.2268, + "step": 15628 + }, + { + "epoch": 4.65503825462127, + "grad_norm": 0.3784802258014679, + "learning_rate": 1.1590538117485483e-05, + "loss": 1.2253, + "step": 15629 + }, + { + "epoch": 4.655336100820938, + "grad_norm": 0.26567766070365906, + "learning_rate": 1.1589585822733887e-05, + "loss": 1.2374, + "step": 15630 + }, + { + "epoch": 4.655633947020608, + "grad_norm": 0.4190118610858917, + "learning_rate": 1.1588633513192971e-05, + "loss": 1.2281, + "step": 15631 + }, + { + "epoch": 4.655931793220276, + "grad_norm": 0.4046405553817749, + "learning_rate": 1.1587681188871595e-05, + "loss": 1.2219, + "step": 15632 + }, + { + "epoch": 4.656229639419944, + "grad_norm": 0.26123669743537903, + "learning_rate": 1.1586728849778618e-05, + "loss": 1.2152, + "step": 15633 + }, + { + "epoch": 4.6565274856196135, + "grad_norm": 0.3636496663093567, + "learning_rate": 1.1585776495922903e-05, + "loss": 1.2154, + "step": 15634 + }, + { + "epoch": 4.656825331819282, + "grad_norm": 0.31194934248924255, + "learning_rate": 1.158482412731331e-05, + "loss": 1.2098, + "step": 15635 + }, + { + "epoch": 4.65712317801895, + "grad_norm": 0.5256511569023132, + "learning_rate": 1.15838717439587e-05, + "loss": 1.219, + "step": 15636 + }, + { + "epoch": 4.657421024218619, + "grad_norm": 0.4009208381175995, + "learning_rate": 1.158291934586793e-05, + "loss": 1.2093, + "step": 15637 + }, + { + "epoch": 4.657718870418288, + "grad_norm": 0.3432953953742981, + "learning_rate": 1.1581966933049867e-05, + "loss": 1.2026, + "step": 15638 + }, + { + "epoch": 4.658016716617956, + "grad_norm": 0.4191701114177704, + "learning_rate": 1.1581014505513369e-05, + "loss": 1.2281, + "step": 15639 + }, + { + "epoch": 4.658314562817625, + "grad_norm": 0.32237496972084045, + "learning_rate": 1.1580062063267295e-05, + "loss": 1.2165, + "step": 15640 + }, + { + "epoch": 4.658612409017294, + "grad_norm": 0.30885910987854004, + "learning_rate": 1.157910960632051e-05, + "loss": 1.2253, + "step": 15641 + }, + { + "epoch": 4.658910255216963, + "grad_norm": 0.41791924834251404, + "learning_rate": 1.1578157134681875e-05, + "loss": 1.2386, + "step": 15642 + }, + { + "epoch": 4.659208101416631, + "grad_norm": 0.41621527075767517, + "learning_rate": 1.1577204648360252e-05, + "loss": 1.232, + "step": 15643 + }, + { + "epoch": 4.659505947616299, + "grad_norm": 0.25124916434288025, + "learning_rate": 1.1576252147364497e-05, + "loss": 1.2167, + "step": 15644 + }, + { + "epoch": 4.659803793815969, + "grad_norm": 0.5308575630187988, + "learning_rate": 1.1575299631703482e-05, + "loss": 1.2142, + "step": 15645 + }, + { + "epoch": 4.660101640015637, + "grad_norm": 0.3115309476852417, + "learning_rate": 1.1574347101386063e-05, + "loss": 1.2208, + "step": 15646 + }, + { + "epoch": 4.660399486215305, + "grad_norm": 0.4527014195919037, + "learning_rate": 1.1573394556421102e-05, + "loss": 1.2275, + "step": 15647 + }, + { + "epoch": 4.6606973324149745, + "grad_norm": 0.5402430891990662, + "learning_rate": 1.1572441996817465e-05, + "loss": 1.2105, + "step": 15648 + }, + { + "epoch": 4.660995178614643, + "grad_norm": 0.26349201798439026, + "learning_rate": 1.157148942258401e-05, + "loss": 1.2164, + "step": 15649 + }, + { + "epoch": 4.661293024814311, + "grad_norm": 0.39200934767723083, + "learning_rate": 1.1570536833729599e-05, + "loss": 1.2355, + "step": 15650 + }, + { + "epoch": 4.66159087101398, + "grad_norm": 0.39736855030059814, + "learning_rate": 1.1569584230263103e-05, + "loss": 1.2239, + "step": 15651 + }, + { + "epoch": 4.661888717213649, + "grad_norm": 0.24578113853931427, + "learning_rate": 1.1568631612193375e-05, + "loss": 1.2292, + "step": 15652 + }, + { + "epoch": 4.662186563413318, + "grad_norm": 0.35816240310668945, + "learning_rate": 1.1567678979529286e-05, + "loss": 1.2441, + "step": 15653 + }, + { + "epoch": 4.662484409612986, + "grad_norm": 0.24438230693340302, + "learning_rate": 1.1566726332279694e-05, + "loss": 1.2246, + "step": 15654 + }, + { + "epoch": 4.662782255812655, + "grad_norm": 0.31994426250457764, + "learning_rate": 1.1565773670453464e-05, + "loss": 1.2219, + "step": 15655 + }, + { + "epoch": 4.663080102012323, + "grad_norm": 0.285806268453598, + "learning_rate": 1.1564820994059459e-05, + "loss": 1.2228, + "step": 15656 + }, + { + "epoch": 4.663377948211992, + "grad_norm": 0.3526848554611206, + "learning_rate": 1.1563868303106543e-05, + "loss": 1.2205, + "step": 15657 + }, + { + "epoch": 4.6636757944116605, + "grad_norm": 0.2998829483985901, + "learning_rate": 1.156291559760358e-05, + "loss": 1.226, + "step": 15658 + }, + { + "epoch": 4.66397364061133, + "grad_norm": 0.629315972328186, + "learning_rate": 1.1561962877559432e-05, + "loss": 1.2078, + "step": 15659 + }, + { + "epoch": 4.664271486810998, + "grad_norm": 0.3076554536819458, + "learning_rate": 1.1561010142982967e-05, + "loss": 1.2076, + "step": 15660 + }, + { + "epoch": 4.664569333010666, + "grad_norm": 0.4892565608024597, + "learning_rate": 1.1560057393883047e-05, + "loss": 1.2164, + "step": 15661 + }, + { + "epoch": 4.664867179210336, + "grad_norm": 0.3082030117511749, + "learning_rate": 1.1559104630268532e-05, + "loss": 1.222, + "step": 15662 + }, + { + "epoch": 4.665165025410004, + "grad_norm": 0.41707074642181396, + "learning_rate": 1.1558151852148294e-05, + "loss": 1.2243, + "step": 15663 + }, + { + "epoch": 4.665462871609672, + "grad_norm": 0.2680579125881195, + "learning_rate": 1.1557199059531192e-05, + "loss": 1.2329, + "step": 15664 + }, + { + "epoch": 4.665760717809341, + "grad_norm": 0.4275538921356201, + "learning_rate": 1.1556246252426093e-05, + "loss": 1.2209, + "step": 15665 + }, + { + "epoch": 4.66605856400901, + "grad_norm": 0.254695326089859, + "learning_rate": 1.155529343084186e-05, + "loss": 1.2214, + "step": 15666 + }, + { + "epoch": 4.666356410208678, + "grad_norm": 0.6112091541290283, + "learning_rate": 1.1554340594787364e-05, + "loss": 1.2276, + "step": 15667 + }, + { + "epoch": 4.666654256408347, + "grad_norm": 0.26346415281295776, + "learning_rate": 1.155338774427146e-05, + "loss": 1.2141, + "step": 15668 + }, + { + "epoch": 4.666952102608016, + "grad_norm": 1.103255033493042, + "learning_rate": 1.155243487930302e-05, + "loss": 1.2162, + "step": 15669 + }, + { + "epoch": 4.667249948807685, + "grad_norm": 0.44788405299186707, + "learning_rate": 1.1551481999890913e-05, + "loss": 1.2262, + "step": 15670 + }, + { + "epoch": 4.667547795007353, + "grad_norm": 0.7941567897796631, + "learning_rate": 1.1550529106043995e-05, + "loss": 1.2119, + "step": 15671 + }, + { + "epoch": 4.6678456412070215, + "grad_norm": 0.3029055595397949, + "learning_rate": 1.1549576197771136e-05, + "loss": 1.2307, + "step": 15672 + }, + { + "epoch": 4.668143487406691, + "grad_norm": 1.1536701917648315, + "learning_rate": 1.1548623275081201e-05, + "loss": 1.2099, + "step": 15673 + }, + { + "epoch": 4.668441333606359, + "grad_norm": 0.34991881251335144, + "learning_rate": 1.1547670337983057e-05, + "loss": 1.2208, + "step": 15674 + }, + { + "epoch": 4.668739179806027, + "grad_norm": 0.7014442086219788, + "learning_rate": 1.1546717386485571e-05, + "loss": 1.2341, + "step": 15675 + }, + { + "epoch": 4.669037026005697, + "grad_norm": 0.25845497846603394, + "learning_rate": 1.1545764420597607e-05, + "loss": 1.217, + "step": 15676 + }, + { + "epoch": 4.669334872205365, + "grad_norm": 0.35486188530921936, + "learning_rate": 1.1544811440328035e-05, + "loss": 1.2291, + "step": 15677 + }, + { + "epoch": 4.669632718405033, + "grad_norm": 0.543127179145813, + "learning_rate": 1.1543858445685714e-05, + "loss": 1.21, + "step": 15678 + }, + { + "epoch": 4.6699305646047025, + "grad_norm": 0.25546911358833313, + "learning_rate": 1.1542905436679516e-05, + "loss": 1.2173, + "step": 15679 + }, + { + "epoch": 4.670228410804371, + "grad_norm": 0.47984954714775085, + "learning_rate": 1.154195241331831e-05, + "loss": 1.2244, + "step": 15680 + }, + { + "epoch": 4.67052625700404, + "grad_norm": 0.3667922616004944, + "learning_rate": 1.1540999375610956e-05, + "loss": 1.222, + "step": 15681 + }, + { + "epoch": 4.670824103203708, + "grad_norm": 0.2718745172023773, + "learning_rate": 1.1540046323566325e-05, + "loss": 1.2127, + "step": 15682 + }, + { + "epoch": 4.671121949403377, + "grad_norm": 0.38106366991996765, + "learning_rate": 1.1539093257193284e-05, + "loss": 1.2356, + "step": 15683 + }, + { + "epoch": 4.671419795603045, + "grad_norm": 0.3056412935256958, + "learning_rate": 1.15381401765007e-05, + "loss": 1.2291, + "step": 15684 + }, + { + "epoch": 4.671717641802714, + "grad_norm": 0.276956707239151, + "learning_rate": 1.1537187081497439e-05, + "loss": 1.2094, + "step": 15685 + }, + { + "epoch": 4.6720154880023825, + "grad_norm": 0.28864794969558716, + "learning_rate": 1.1536233972192372e-05, + "loss": 1.219, + "step": 15686 + }, + { + "epoch": 4.672313334202052, + "grad_norm": 0.34191474318504333, + "learning_rate": 1.1535280848594364e-05, + "loss": 1.2166, + "step": 15687 + }, + { + "epoch": 4.67261118040172, + "grad_norm": 0.2732122242450714, + "learning_rate": 1.1534327710712282e-05, + "loss": 1.218, + "step": 15688 + }, + { + "epoch": 4.672909026601388, + "grad_norm": 0.2825454771518707, + "learning_rate": 1.1533374558554995e-05, + "loss": 1.2258, + "step": 15689 + }, + { + "epoch": 4.673206872801058, + "grad_norm": 0.32758834958076477, + "learning_rate": 1.1532421392131373e-05, + "loss": 1.208, + "step": 15690 + }, + { + "epoch": 4.673504719000726, + "grad_norm": 0.2849428057670593, + "learning_rate": 1.1531468211450278e-05, + "loss": 1.23, + "step": 15691 + }, + { + "epoch": 4.673802565200394, + "grad_norm": 0.30653116106987, + "learning_rate": 1.1530515016520585e-05, + "loss": 1.2249, + "step": 15692 + }, + { + "epoch": 4.6741004114000635, + "grad_norm": 0.24566933512687683, + "learning_rate": 1.1529561807351157e-05, + "loss": 1.2158, + "step": 15693 + }, + { + "epoch": 4.674398257599732, + "grad_norm": 0.28762638568878174, + "learning_rate": 1.152860858395087e-05, + "loss": 1.2131, + "step": 15694 + }, + { + "epoch": 4.6746961037994, + "grad_norm": 0.25978901982307434, + "learning_rate": 1.1527655346328585e-05, + "loss": 1.2144, + "step": 15695 + }, + { + "epoch": 4.674993949999069, + "grad_norm": 0.30036869645118713, + "learning_rate": 1.1526702094493174e-05, + "loss": 1.2294, + "step": 15696 + }, + { + "epoch": 4.675291796198738, + "grad_norm": 0.3539499044418335, + "learning_rate": 1.1525748828453507e-05, + "loss": 1.2286, + "step": 15697 + }, + { + "epoch": 4.675589642398407, + "grad_norm": 0.26067811250686646, + "learning_rate": 1.1524795548218451e-05, + "loss": 1.2278, + "step": 15698 + }, + { + "epoch": 4.675887488598075, + "grad_norm": 0.3095061182975769, + "learning_rate": 1.1523842253796875e-05, + "loss": 1.2093, + "step": 15699 + }, + { + "epoch": 4.676185334797744, + "grad_norm": 0.3263936936855316, + "learning_rate": 1.152288894519765e-05, + "loss": 1.2134, + "step": 15700 + }, + { + "epoch": 4.676483180997413, + "grad_norm": 0.2679992914199829, + "learning_rate": 1.1521935622429647e-05, + "loss": 1.2094, + "step": 15701 + }, + { + "epoch": 4.676781027197081, + "grad_norm": 0.39775457978248596, + "learning_rate": 1.1520982285501731e-05, + "loss": 1.195, + "step": 15702 + }, + { + "epoch": 4.677078873396749, + "grad_norm": 0.283600389957428, + "learning_rate": 1.1520028934422773e-05, + "loss": 1.2343, + "step": 15703 + }, + { + "epoch": 4.677376719596419, + "grad_norm": 0.4227513074874878, + "learning_rate": 1.1519075569201647e-05, + "loss": 1.2288, + "step": 15704 + }, + { + "epoch": 4.677674565796087, + "grad_norm": 0.3514833450317383, + "learning_rate": 1.1518122189847218e-05, + "loss": 1.2192, + "step": 15705 + }, + { + "epoch": 4.677972411995755, + "grad_norm": 0.40376874804496765, + "learning_rate": 1.1517168796368359e-05, + "loss": 1.2108, + "step": 15706 + }, + { + "epoch": 4.6782702581954245, + "grad_norm": 0.5061954259872437, + "learning_rate": 1.1516215388773941e-05, + "loss": 1.2175, + "step": 15707 + }, + { + "epoch": 4.678568104395093, + "grad_norm": 0.24786929786205292, + "learning_rate": 1.151526196707283e-05, + "loss": 1.2167, + "step": 15708 + }, + { + "epoch": 4.678865950594762, + "grad_norm": 0.4331750273704529, + "learning_rate": 1.15143085312739e-05, + "loss": 1.2259, + "step": 15709 + }, + { + "epoch": 4.67916379679443, + "grad_norm": 0.2860633432865143, + "learning_rate": 1.1513355081386021e-05, + "loss": 1.2337, + "step": 15710 + }, + { + "epoch": 4.679461642994099, + "grad_norm": 0.44512543082237244, + "learning_rate": 1.1512401617418068e-05, + "loss": 1.1948, + "step": 15711 + }, + { + "epoch": 4.679759489193768, + "grad_norm": 0.2672065794467926, + "learning_rate": 1.1511448139378903e-05, + "loss": 1.2183, + "step": 15712 + }, + { + "epoch": 4.680057335393436, + "grad_norm": 0.4276815950870514, + "learning_rate": 1.1510494647277402e-05, + "loss": 1.2133, + "step": 15713 + }, + { + "epoch": 4.680355181593105, + "grad_norm": 0.29092276096343994, + "learning_rate": 1.1509541141122437e-05, + "loss": 1.2037, + "step": 15714 + }, + { + "epoch": 4.680653027792774, + "grad_norm": 0.4415130615234375, + "learning_rate": 1.1508587620922877e-05, + "loss": 1.2332, + "step": 15715 + }, + { + "epoch": 4.680950873992442, + "grad_norm": 0.2688218355178833, + "learning_rate": 1.1507634086687596e-05, + "loss": 1.2288, + "step": 15716 + }, + { + "epoch": 4.6812487201921105, + "grad_norm": 0.41821667551994324, + "learning_rate": 1.1506680538425463e-05, + "loss": 1.2013, + "step": 15717 + }, + { + "epoch": 4.68154656639178, + "grad_norm": 0.2578604519367218, + "learning_rate": 1.1505726976145354e-05, + "loss": 1.2101, + "step": 15718 + }, + { + "epoch": 4.681844412591448, + "grad_norm": 0.4427260756492615, + "learning_rate": 1.1504773399856134e-05, + "loss": 1.2122, + "step": 15719 + }, + { + "epoch": 4.682142258791117, + "grad_norm": 0.2817981243133545, + "learning_rate": 1.1503819809566683e-05, + "loss": 1.2086, + "step": 15720 + }, + { + "epoch": 4.682440104990786, + "grad_norm": 0.43863993883132935, + "learning_rate": 1.1502866205285865e-05, + "loss": 1.2096, + "step": 15721 + }, + { + "epoch": 4.682737951190454, + "grad_norm": 0.42356306314468384, + "learning_rate": 1.1501912587022557e-05, + "loss": 1.2054, + "step": 15722 + }, + { + "epoch": 4.683035797390122, + "grad_norm": 0.2597607374191284, + "learning_rate": 1.150095895478563e-05, + "loss": 1.2026, + "step": 15723 + }, + { + "epoch": 4.683333643589791, + "grad_norm": 0.34694647789001465, + "learning_rate": 1.1500005308583957e-05, + "loss": 1.2197, + "step": 15724 + }, + { + "epoch": 4.68363148978946, + "grad_norm": 0.2810973525047302, + "learning_rate": 1.1499051648426414e-05, + "loss": 1.2243, + "step": 15725 + }, + { + "epoch": 4.683929335989129, + "grad_norm": 0.2500973045825958, + "learning_rate": 1.1498097974321865e-05, + "loss": 1.2234, + "step": 15726 + }, + { + "epoch": 4.684227182188797, + "grad_norm": 0.27204346656799316, + "learning_rate": 1.149714428627919e-05, + "loss": 1.2381, + "step": 15727 + }, + { + "epoch": 4.684525028388466, + "grad_norm": 0.2890789210796356, + "learning_rate": 1.1496190584307264e-05, + "loss": 1.2443, + "step": 15728 + }, + { + "epoch": 4.684822874588135, + "grad_norm": 0.24557743966579437, + "learning_rate": 1.1495236868414952e-05, + "loss": 1.2253, + "step": 15729 + }, + { + "epoch": 4.685120720787803, + "grad_norm": 0.32808101177215576, + "learning_rate": 1.149428313861113e-05, + "loss": 1.2181, + "step": 15730 + }, + { + "epoch": 4.6854185669874715, + "grad_norm": 0.24894008040428162, + "learning_rate": 1.149332939490468e-05, + "loss": 1.2236, + "step": 15731 + }, + { + "epoch": 4.685716413187141, + "grad_norm": 0.32874298095703125, + "learning_rate": 1.149237563730446e-05, + "loss": 1.2165, + "step": 15732 + }, + { + "epoch": 4.686014259386809, + "grad_norm": 0.25478601455688477, + "learning_rate": 1.1491421865819356e-05, + "loss": 1.2106, + "step": 15733 + }, + { + "epoch": 4.686312105586477, + "grad_norm": 0.3377542793750763, + "learning_rate": 1.1490468080458237e-05, + "loss": 1.2164, + "step": 15734 + }, + { + "epoch": 4.686609951786147, + "grad_norm": 0.32631832361221313, + "learning_rate": 1.148951428122998e-05, + "loss": 1.2128, + "step": 15735 + }, + { + "epoch": 4.686907797985815, + "grad_norm": 0.27623340487480164, + "learning_rate": 1.1488560468143454e-05, + "loss": 1.2091, + "step": 15736 + }, + { + "epoch": 4.687205644185484, + "grad_norm": 0.2862810790538788, + "learning_rate": 1.1487606641207539e-05, + "loss": 1.2217, + "step": 15737 + }, + { + "epoch": 4.6875034903851525, + "grad_norm": 0.2557696998119354, + "learning_rate": 1.1486652800431104e-05, + "loss": 1.2383, + "step": 15738 + }, + { + "epoch": 4.687801336584821, + "grad_norm": 0.28746888041496277, + "learning_rate": 1.1485698945823025e-05, + "loss": 1.2188, + "step": 15739 + }, + { + "epoch": 4.68809918278449, + "grad_norm": 0.2647130489349365, + "learning_rate": 1.1484745077392179e-05, + "loss": 1.2295, + "step": 15740 + }, + { + "epoch": 4.688397028984158, + "grad_norm": 0.33756572008132935, + "learning_rate": 1.1483791195147438e-05, + "loss": 1.219, + "step": 15741 + }, + { + "epoch": 4.688694875183827, + "grad_norm": 0.404854953289032, + "learning_rate": 1.148283729909768e-05, + "loss": 1.2314, + "step": 15742 + }, + { + "epoch": 4.688992721383496, + "grad_norm": 0.277214378118515, + "learning_rate": 1.1481883389251776e-05, + "loss": 1.2478, + "step": 15743 + }, + { + "epoch": 4.689290567583164, + "grad_norm": 0.37850046157836914, + "learning_rate": 1.1480929465618598e-05, + "loss": 1.2108, + "step": 15744 + }, + { + "epoch": 4.6895884137828325, + "grad_norm": 0.2664380669593811, + "learning_rate": 1.1479975528207032e-05, + "loss": 1.2316, + "step": 15745 + }, + { + "epoch": 4.689886259982502, + "grad_norm": 0.4216008186340332, + "learning_rate": 1.1479021577025946e-05, + "loss": 1.2283, + "step": 15746 + }, + { + "epoch": 4.69018410618217, + "grad_norm": 0.3628690838813782, + "learning_rate": 1.1478067612084216e-05, + "loss": 1.2043, + "step": 15747 + }, + { + "epoch": 4.690481952381839, + "grad_norm": 0.27909860014915466, + "learning_rate": 1.1477113633390719e-05, + "loss": 1.2121, + "step": 15748 + }, + { + "epoch": 4.690779798581508, + "grad_norm": 0.39102903008461, + "learning_rate": 1.147615964095433e-05, + "loss": 1.2097, + "step": 15749 + }, + { + "epoch": 4.691077644781176, + "grad_norm": 0.25147396326065063, + "learning_rate": 1.1475205634783921e-05, + "loss": 1.2292, + "step": 15750 + }, + { + "epoch": 4.691375490980844, + "grad_norm": 0.4025093913078308, + "learning_rate": 1.1474251614888376e-05, + "loss": 1.2214, + "step": 15751 + }, + { + "epoch": 4.6916733371805135, + "grad_norm": 0.276917964220047, + "learning_rate": 1.1473297581276568e-05, + "loss": 1.2299, + "step": 15752 + }, + { + "epoch": 4.691971183380182, + "grad_norm": 0.2868637442588806, + "learning_rate": 1.1472343533957369e-05, + "loss": 1.2316, + "step": 15753 + }, + { + "epoch": 4.692269029579851, + "grad_norm": 0.3295955955982208, + "learning_rate": 1.147138947293966e-05, + "loss": 1.2281, + "step": 15754 + }, + { + "epoch": 4.692566875779519, + "grad_norm": 0.2683313488960266, + "learning_rate": 1.1470435398232313e-05, + "loss": 1.2393, + "step": 15755 + }, + { + "epoch": 4.692864721979188, + "grad_norm": 0.33063018321990967, + "learning_rate": 1.146948130984421e-05, + "loss": 1.2136, + "step": 15756 + }, + { + "epoch": 4.693162568178857, + "grad_norm": 0.36789897084236145, + "learning_rate": 1.1468527207784225e-05, + "loss": 1.2171, + "step": 15757 + }, + { + "epoch": 4.693460414378525, + "grad_norm": 0.26819494366645813, + "learning_rate": 1.1467573092061236e-05, + "loss": 1.2147, + "step": 15758 + }, + { + "epoch": 4.693758260578194, + "grad_norm": 0.28111323714256287, + "learning_rate": 1.146661896268412e-05, + "loss": 1.2262, + "step": 15759 + }, + { + "epoch": 4.694056106777863, + "grad_norm": 0.2832450568675995, + "learning_rate": 1.1465664819661751e-05, + "loss": 1.2254, + "step": 15760 + }, + { + "epoch": 4.694353952977531, + "grad_norm": 0.2826347351074219, + "learning_rate": 1.1464710663003008e-05, + "loss": 1.2127, + "step": 15761 + }, + { + "epoch": 4.694651799177199, + "grad_norm": 0.2850990891456604, + "learning_rate": 1.1463756492716768e-05, + "loss": 1.2125, + "step": 15762 + }, + { + "epoch": 4.694949645376869, + "grad_norm": 0.2550585865974426, + "learning_rate": 1.146280230881191e-05, + "loss": 1.2314, + "step": 15763 + }, + { + "epoch": 4.695247491576537, + "grad_norm": 0.267622172832489, + "learning_rate": 1.1461848111297311e-05, + "loss": 1.2232, + "step": 15764 + }, + { + "epoch": 4.695545337776206, + "grad_norm": 0.27849382162094116, + "learning_rate": 1.1460893900181854e-05, + "loss": 1.2188, + "step": 15765 + }, + { + "epoch": 4.6958431839758745, + "grad_norm": 0.31942984461784363, + "learning_rate": 1.1459939675474405e-05, + "loss": 1.2188, + "step": 15766 + }, + { + "epoch": 4.696141030175543, + "grad_norm": 0.3307408094406128, + "learning_rate": 1.1458985437183849e-05, + "loss": 1.2147, + "step": 15767 + }, + { + "epoch": 4.696438876375212, + "grad_norm": 0.3231342136859894, + "learning_rate": 1.1458031185319063e-05, + "loss": 1.2172, + "step": 15768 + }, + { + "epoch": 4.69673672257488, + "grad_norm": 0.3401965796947479, + "learning_rate": 1.1457076919888928e-05, + "loss": 1.2267, + "step": 15769 + }, + { + "epoch": 4.697034568774549, + "grad_norm": 0.27146321535110474, + "learning_rate": 1.1456122640902318e-05, + "loss": 1.2127, + "step": 15770 + }, + { + "epoch": 4.697332414974218, + "grad_norm": 0.3082990050315857, + "learning_rate": 1.1455168348368118e-05, + "loss": 1.2389, + "step": 15771 + }, + { + "epoch": 4.697630261173886, + "grad_norm": 0.3296731412410736, + "learning_rate": 1.1454214042295199e-05, + "loss": 1.2269, + "step": 15772 + }, + { + "epoch": 4.697928107373555, + "grad_norm": 0.278409481048584, + "learning_rate": 1.1453259722692443e-05, + "loss": 1.2224, + "step": 15773 + }, + { + "epoch": 4.698225953573224, + "grad_norm": 0.24612689018249512, + "learning_rate": 1.145230538956873e-05, + "loss": 1.225, + "step": 15774 + }, + { + "epoch": 4.698523799772892, + "grad_norm": 0.4055539667606354, + "learning_rate": 1.1451351042932937e-05, + "loss": 1.2179, + "step": 15775 + }, + { + "epoch": 4.698821645972561, + "grad_norm": 0.6009336113929749, + "learning_rate": 1.1450396682793945e-05, + "loss": 1.2277, + "step": 15776 + }, + { + "epoch": 4.69911949217223, + "grad_norm": 0.3844189941883087, + "learning_rate": 1.1449442309160631e-05, + "loss": 1.2087, + "step": 15777 + }, + { + "epoch": 4.699417338371898, + "grad_norm": 0.5920946002006531, + "learning_rate": 1.1448487922041877e-05, + "loss": 1.2111, + "step": 15778 + }, + { + "epoch": 4.699715184571567, + "grad_norm": 0.47467926144599915, + "learning_rate": 1.1447533521446561e-05, + "loss": 1.2314, + "step": 15779 + }, + { + "epoch": 4.700013030771236, + "grad_norm": 0.36605679988861084, + "learning_rate": 1.1446579107383565e-05, + "loss": 1.2204, + "step": 15780 + }, + { + "epoch": 4.700310876970904, + "grad_norm": 0.42435649037361145, + "learning_rate": 1.1445624679861762e-05, + "loss": 1.2109, + "step": 15781 + }, + { + "epoch": 4.700608723170573, + "grad_norm": 0.30126717686653137, + "learning_rate": 1.1444670238890039e-05, + "loss": 1.227, + "step": 15782 + }, + { + "epoch": 4.700906569370241, + "grad_norm": 0.4542222023010254, + "learning_rate": 1.1443715784477277e-05, + "loss": 1.2303, + "step": 15783 + }, + { + "epoch": 4.70120441556991, + "grad_norm": 0.2698976397514343, + "learning_rate": 1.1442761316632347e-05, + "loss": 1.2262, + "step": 15784 + }, + { + "epoch": 4.701502261769579, + "grad_norm": 0.388721227645874, + "learning_rate": 1.1441806835364136e-05, + "loss": 1.2394, + "step": 15785 + }, + { + "epoch": 4.701800107969247, + "grad_norm": 0.28787553310394287, + "learning_rate": 1.1440852340681525e-05, + "loss": 1.2178, + "step": 15786 + }, + { + "epoch": 4.7020979541689165, + "grad_norm": 0.35869595408439636, + "learning_rate": 1.1439897832593392e-05, + "loss": 1.2229, + "step": 15787 + }, + { + "epoch": 4.702395800368585, + "grad_norm": 0.35007795691490173, + "learning_rate": 1.143894331110862e-05, + "loss": 1.2134, + "step": 15788 + }, + { + "epoch": 4.702693646568253, + "grad_norm": 0.3028625547885895, + "learning_rate": 1.1437988776236087e-05, + "loss": 1.2358, + "step": 15789 + }, + { + "epoch": 4.7029914927679215, + "grad_norm": 0.3037794530391693, + "learning_rate": 1.1437034227984673e-05, + "loss": 1.2311, + "step": 15790 + }, + { + "epoch": 4.703289338967591, + "grad_norm": 0.32174941897392273, + "learning_rate": 1.1436079666363262e-05, + "loss": 1.2003, + "step": 15791 + }, + { + "epoch": 4.703587185167259, + "grad_norm": 0.3050640821456909, + "learning_rate": 1.1435125091380735e-05, + "loss": 1.2256, + "step": 15792 + }, + { + "epoch": 4.703885031366928, + "grad_norm": 0.25783175230026245, + "learning_rate": 1.1434170503045974e-05, + "loss": 1.2155, + "step": 15793 + }, + { + "epoch": 4.704182877566597, + "grad_norm": 0.35210901498794556, + "learning_rate": 1.1433215901367856e-05, + "loss": 1.2261, + "step": 15794 + }, + { + "epoch": 4.704480723766265, + "grad_norm": 0.2773214876651764, + "learning_rate": 1.1432261286355267e-05, + "loss": 1.2206, + "step": 15795 + }, + { + "epoch": 4.704778569965934, + "grad_norm": 0.2532184422016144, + "learning_rate": 1.1431306658017088e-05, + "loss": 1.2132, + "step": 15796 + }, + { + "epoch": 4.7050764161656025, + "grad_norm": 0.2917252779006958, + "learning_rate": 1.1430352016362197e-05, + "loss": 1.2128, + "step": 15797 + }, + { + "epoch": 4.705374262365271, + "grad_norm": 0.38615626096725464, + "learning_rate": 1.142939736139948e-05, + "loss": 1.2231, + "step": 15798 + }, + { + "epoch": 4.70567210856494, + "grad_norm": 0.27403759956359863, + "learning_rate": 1.1428442693137815e-05, + "loss": 1.2149, + "step": 15799 + }, + { + "epoch": 4.705969954764608, + "grad_norm": 0.4835969805717468, + "learning_rate": 1.1427488011586092e-05, + "loss": 1.2234, + "step": 15800 + }, + { + "epoch": 4.706267800964277, + "grad_norm": 0.3025042712688446, + "learning_rate": 1.1426533316753185e-05, + "loss": 1.2154, + "step": 15801 + }, + { + "epoch": 4.706565647163946, + "grad_norm": 0.4113025367259979, + "learning_rate": 1.1425578608647978e-05, + "loss": 1.218, + "step": 15802 + }, + { + "epoch": 4.706863493363614, + "grad_norm": 0.32159093022346497, + "learning_rate": 1.1424623887279355e-05, + "loss": 1.2219, + "step": 15803 + }, + { + "epoch": 4.707161339563283, + "grad_norm": 0.5616856217384338, + "learning_rate": 1.14236691526562e-05, + "loss": 1.2046, + "step": 15804 + }, + { + "epoch": 4.707459185762952, + "grad_norm": 0.5068147778511047, + "learning_rate": 1.1422714404787394e-05, + "loss": 1.2154, + "step": 15805 + }, + { + "epoch": 4.70775703196262, + "grad_norm": 0.32729411125183105, + "learning_rate": 1.1421759643681822e-05, + "loss": 1.2385, + "step": 15806 + }, + { + "epoch": 4.708054878162289, + "grad_norm": 0.30482223629951477, + "learning_rate": 1.142080486934836e-05, + "loss": 1.2254, + "step": 15807 + }, + { + "epoch": 4.708352724361958, + "grad_norm": 0.36513110995292664, + "learning_rate": 1.14198500817959e-05, + "loss": 1.2025, + "step": 15808 + }, + { + "epoch": 4.708650570561626, + "grad_norm": 0.27403631806373596, + "learning_rate": 1.1418895281033317e-05, + "loss": 1.2352, + "step": 15809 + }, + { + "epoch": 4.708948416761295, + "grad_norm": 0.4531899690628052, + "learning_rate": 1.1417940467069504e-05, + "loss": 1.2156, + "step": 15810 + }, + { + "epoch": 4.7092462629609635, + "grad_norm": 0.3138730823993683, + "learning_rate": 1.1416985639913339e-05, + "loss": 1.209, + "step": 15811 + }, + { + "epoch": 4.709544109160632, + "grad_norm": 0.5142284631729126, + "learning_rate": 1.14160307995737e-05, + "loss": 1.2099, + "step": 15812 + }, + { + "epoch": 4.709841955360301, + "grad_norm": 0.283234566450119, + "learning_rate": 1.1415075946059485e-05, + "loss": 1.2294, + "step": 15813 + }, + { + "epoch": 4.710139801559969, + "grad_norm": 0.584631621837616, + "learning_rate": 1.1414121079379561e-05, + "loss": 1.2097, + "step": 15814 + }, + { + "epoch": 4.710437647759639, + "grad_norm": 0.3865150213241577, + "learning_rate": 1.1413166199542824e-05, + "loss": 1.2085, + "step": 15815 + }, + { + "epoch": 4.710735493959307, + "grad_norm": 0.42615455389022827, + "learning_rate": 1.1412211306558155e-05, + "loss": 1.2273, + "step": 15816 + }, + { + "epoch": 4.711033340158975, + "grad_norm": 0.2718888223171234, + "learning_rate": 1.1411256400434438e-05, + "loss": 1.2106, + "step": 15817 + }, + { + "epoch": 4.711331186358644, + "grad_norm": 0.3436574637889862, + "learning_rate": 1.1410301481180555e-05, + "loss": 1.2204, + "step": 15818 + }, + { + "epoch": 4.711629032558313, + "grad_norm": 0.3228500187397003, + "learning_rate": 1.1409346548805393e-05, + "loss": 1.2071, + "step": 15819 + }, + { + "epoch": 4.711926878757981, + "grad_norm": 0.25016388297080994, + "learning_rate": 1.1408391603317837e-05, + "loss": 1.222, + "step": 15820 + }, + { + "epoch": 4.71222472495765, + "grad_norm": 0.3156696856021881, + "learning_rate": 1.1407436644726768e-05, + "loss": 1.2224, + "step": 15821 + }, + { + "epoch": 4.712522571157319, + "grad_norm": 0.3078279197216034, + "learning_rate": 1.1406481673041077e-05, + "loss": 1.2098, + "step": 15822 + }, + { + "epoch": 4.712820417356987, + "grad_norm": 0.3350658416748047, + "learning_rate": 1.140552668826964e-05, + "loss": 1.2199, + "step": 15823 + }, + { + "epoch": 4.713118263556656, + "grad_norm": 0.4933941662311554, + "learning_rate": 1.1404571690421356e-05, + "loss": 1.2483, + "step": 15824 + }, + { + "epoch": 4.7134161097563245, + "grad_norm": 0.2801974415779114, + "learning_rate": 1.1403616679505096e-05, + "loss": 1.2146, + "step": 15825 + }, + { + "epoch": 4.713713955955993, + "grad_norm": 0.42451563477516174, + "learning_rate": 1.1402661655529752e-05, + "loss": 1.2001, + "step": 15826 + }, + { + "epoch": 4.714011802155662, + "grad_norm": 0.2859726548194885, + "learning_rate": 1.1401706618504206e-05, + "loss": 1.2368, + "step": 15827 + }, + { + "epoch": 4.71430964835533, + "grad_norm": 0.26921480894088745, + "learning_rate": 1.140075156843735e-05, + "loss": 1.2174, + "step": 15828 + }, + { + "epoch": 4.714607494554999, + "grad_norm": 0.2621622383594513, + "learning_rate": 1.1399796505338064e-05, + "loss": 1.239, + "step": 15829 + }, + { + "epoch": 4.714905340754668, + "grad_norm": 0.2795064449310303, + "learning_rate": 1.1398841429215237e-05, + "loss": 1.2322, + "step": 15830 + }, + { + "epoch": 4.715203186954336, + "grad_norm": 0.27517372369766235, + "learning_rate": 1.1397886340077753e-05, + "loss": 1.2293, + "step": 15831 + }, + { + "epoch": 4.7155010331540055, + "grad_norm": 0.29767560958862305, + "learning_rate": 1.1396931237934495e-05, + "loss": 1.2222, + "step": 15832 + }, + { + "epoch": 4.715798879353674, + "grad_norm": 0.42175325751304626, + "learning_rate": 1.1395976122794355e-05, + "loss": 1.2152, + "step": 15833 + }, + { + "epoch": 4.716096725553342, + "grad_norm": 0.3241307735443115, + "learning_rate": 1.1395020994666218e-05, + "loss": 1.2149, + "step": 15834 + }, + { + "epoch": 4.716394571753011, + "grad_norm": 0.3327314853668213, + "learning_rate": 1.139406585355897e-05, + "loss": 1.2229, + "step": 15835 + }, + { + "epoch": 4.71669241795268, + "grad_norm": 0.5165314674377441, + "learning_rate": 1.1393110699481494e-05, + "loss": 1.2099, + "step": 15836 + }, + { + "epoch": 4.716990264152348, + "grad_norm": 0.33652955293655396, + "learning_rate": 1.1392155532442679e-05, + "loss": 1.225, + "step": 15837 + }, + { + "epoch": 4.717288110352017, + "grad_norm": 0.4319644570350647, + "learning_rate": 1.1391200352451413e-05, + "loss": 1.2029, + "step": 15838 + }, + { + "epoch": 4.7175859565516856, + "grad_norm": 0.8324863314628601, + "learning_rate": 1.1390245159516582e-05, + "loss": 1.2222, + "step": 15839 + }, + { + "epoch": 4.717883802751354, + "grad_norm": 0.5616923570632935, + "learning_rate": 1.1389289953647073e-05, + "loss": 1.216, + "step": 15840 + }, + { + "epoch": 4.718181648951023, + "grad_norm": 0.3713075816631317, + "learning_rate": 1.1388334734851775e-05, + "loss": 1.2309, + "step": 15841 + }, + { + "epoch": 4.718479495150691, + "grad_norm": 0.34850919246673584, + "learning_rate": 1.1387379503139573e-05, + "loss": 1.2133, + "step": 15842 + }, + { + "epoch": 4.718777341350361, + "grad_norm": 0.5037635564804077, + "learning_rate": 1.1386424258519354e-05, + "loss": 1.2172, + "step": 15843 + }, + { + "epoch": 4.719075187550029, + "grad_norm": 0.31124255061149597, + "learning_rate": 1.1385469001000005e-05, + "loss": 1.2141, + "step": 15844 + }, + { + "epoch": 4.719373033749697, + "grad_norm": 0.6399654150009155, + "learning_rate": 1.1384513730590416e-05, + "loss": 1.2255, + "step": 15845 + }, + { + "epoch": 4.7196708799493665, + "grad_norm": 0.34755223989486694, + "learning_rate": 1.1383558447299474e-05, + "loss": 1.2336, + "step": 15846 + }, + { + "epoch": 4.719968726149035, + "grad_norm": 0.4550135135650635, + "learning_rate": 1.1382603151136067e-05, + "loss": 1.2276, + "step": 15847 + }, + { + "epoch": 4.720266572348703, + "grad_norm": 0.3026994466781616, + "learning_rate": 1.1381647842109082e-05, + "loss": 1.2257, + "step": 15848 + }, + { + "epoch": 4.720564418548372, + "grad_norm": 0.3064768314361572, + "learning_rate": 1.1380692520227408e-05, + "loss": 1.2092, + "step": 15849 + }, + { + "epoch": 4.720862264748041, + "grad_norm": 0.6107318997383118, + "learning_rate": 1.1379737185499927e-05, + "loss": 1.2017, + "step": 15850 + }, + { + "epoch": 4.721160110947709, + "grad_norm": 0.31734713912010193, + "learning_rate": 1.1378781837935542e-05, + "loss": 1.2048, + "step": 15851 + }, + { + "epoch": 4.721457957147378, + "grad_norm": 0.6801714897155762, + "learning_rate": 1.1377826477543128e-05, + "loss": 1.2282, + "step": 15852 + }, + { + "epoch": 4.721755803347047, + "grad_norm": 0.3529517948627472, + "learning_rate": 1.1376871104331577e-05, + "loss": 1.2178, + "step": 15853 + }, + { + "epoch": 4.722053649546716, + "grad_norm": 0.425773561000824, + "learning_rate": 1.1375915718309782e-05, + "loss": 1.2188, + "step": 15854 + }, + { + "epoch": 4.722351495746384, + "grad_norm": 0.26280325651168823, + "learning_rate": 1.1374960319486626e-05, + "loss": 1.2048, + "step": 15855 + }, + { + "epoch": 4.7226493419460525, + "grad_norm": 0.39089539647102356, + "learning_rate": 1.1374004907871e-05, + "loss": 1.2157, + "step": 15856 + }, + { + "epoch": 4.722947188145721, + "grad_norm": 0.28381043672561646, + "learning_rate": 1.1373049483471793e-05, + "loss": 1.2278, + "step": 15857 + }, + { + "epoch": 4.72324503434539, + "grad_norm": 0.48841750621795654, + "learning_rate": 1.1372094046297897e-05, + "loss": 1.2168, + "step": 15858 + }, + { + "epoch": 4.723542880545058, + "grad_norm": 0.29121631383895874, + "learning_rate": 1.1371138596358198e-05, + "loss": 1.2291, + "step": 15859 + }, + { + "epoch": 4.7238407267447275, + "grad_norm": 0.348071426153183, + "learning_rate": 1.1370183133661587e-05, + "loss": 1.222, + "step": 15860 + }, + { + "epoch": 4.724138572944396, + "grad_norm": 0.29713866114616394, + "learning_rate": 1.1369227658216952e-05, + "loss": 1.2278, + "step": 15861 + }, + { + "epoch": 4.724436419144064, + "grad_norm": 0.3051190674304962, + "learning_rate": 1.1368272170033183e-05, + "loss": 1.2236, + "step": 15862 + }, + { + "epoch": 4.724734265343733, + "grad_norm": 0.33109524846076965, + "learning_rate": 1.136731666911917e-05, + "loss": 1.2142, + "step": 15863 + }, + { + "epoch": 4.725032111543402, + "grad_norm": 0.27014556527137756, + "learning_rate": 1.1366361155483806e-05, + "loss": 1.2037, + "step": 15864 + }, + { + "epoch": 4.72532995774307, + "grad_norm": 0.41980963945388794, + "learning_rate": 1.1365405629135975e-05, + "loss": 1.2186, + "step": 15865 + }, + { + "epoch": 4.725627803942739, + "grad_norm": 0.25167006254196167, + "learning_rate": 1.136445009008457e-05, + "loss": 1.2284, + "step": 15866 + }, + { + "epoch": 4.725925650142408, + "grad_norm": 0.37986552715301514, + "learning_rate": 1.1363494538338482e-05, + "loss": 1.2269, + "step": 15867 + }, + { + "epoch": 4.726223496342076, + "grad_norm": 0.3030323386192322, + "learning_rate": 1.1362538973906601e-05, + "loss": 1.2292, + "step": 15868 + }, + { + "epoch": 4.726521342541745, + "grad_norm": 0.33262065052986145, + "learning_rate": 1.1361583396797817e-05, + "loss": 1.2191, + "step": 15869 + }, + { + "epoch": 4.7268191887414135, + "grad_norm": 0.4100240170955658, + "learning_rate": 1.1360627807021022e-05, + "loss": 1.2095, + "step": 15870 + }, + { + "epoch": 4.727117034941083, + "grad_norm": 0.2690527141094208, + "learning_rate": 1.1359672204585105e-05, + "loss": 1.2212, + "step": 15871 + }, + { + "epoch": 4.727414881140751, + "grad_norm": 0.3116254210472107, + "learning_rate": 1.1358716589498955e-05, + "loss": 1.2231, + "step": 15872 + }, + { + "epoch": 4.727712727340419, + "grad_norm": 0.28574854135513306, + "learning_rate": 1.1357760961771465e-05, + "loss": 1.2164, + "step": 15873 + }, + { + "epoch": 4.728010573540089, + "grad_norm": 0.262829065322876, + "learning_rate": 1.1356805321411529e-05, + "loss": 1.2111, + "step": 15874 + }, + { + "epoch": 4.728308419739757, + "grad_norm": 0.2841876447200775, + "learning_rate": 1.1355849668428035e-05, + "loss": 1.2276, + "step": 15875 + }, + { + "epoch": 4.728606265939425, + "grad_norm": 0.31921133399009705, + "learning_rate": 1.1354894002829875e-05, + "loss": 1.2222, + "step": 15876 + }, + { + "epoch": 4.7289041121390945, + "grad_norm": 0.260716050863266, + "learning_rate": 1.1353938324625937e-05, + "loss": 1.2122, + "step": 15877 + }, + { + "epoch": 4.729201958338763, + "grad_norm": 0.30550864338874817, + "learning_rate": 1.135298263382512e-05, + "loss": 1.2096, + "step": 15878 + }, + { + "epoch": 4.729499804538431, + "grad_norm": 0.28598499298095703, + "learning_rate": 1.1352026930436306e-05, + "loss": 1.2129, + "step": 15879 + }, + { + "epoch": 4.7297976507381, + "grad_norm": 0.249919131398201, + "learning_rate": 1.1351071214468394e-05, + "loss": 1.2048, + "step": 15880 + }, + { + "epoch": 4.730095496937769, + "grad_norm": 0.25255289673805237, + "learning_rate": 1.1350115485930275e-05, + "loss": 1.2145, + "step": 15881 + }, + { + "epoch": 4.730393343137438, + "grad_norm": 0.266684889793396, + "learning_rate": 1.1349159744830842e-05, + "loss": 1.2358, + "step": 15882 + }, + { + "epoch": 4.730691189337106, + "grad_norm": 0.29002997279167175, + "learning_rate": 1.1348203991178984e-05, + "loss": 1.2279, + "step": 15883 + }, + { + "epoch": 4.7309890355367745, + "grad_norm": 0.2965460419654846, + "learning_rate": 1.134724822498359e-05, + "loss": 1.2204, + "step": 15884 + }, + { + "epoch": 4.731286881736443, + "grad_norm": 0.3081950545310974, + "learning_rate": 1.134629244625356e-05, + "loss": 1.2087, + "step": 15885 + }, + { + "epoch": 4.731584727936112, + "grad_norm": 0.5313135981559753, + "learning_rate": 1.1345336654997783e-05, + "loss": 1.2089, + "step": 15886 + }, + { + "epoch": 4.73188257413578, + "grad_norm": 0.4403473436832428, + "learning_rate": 1.134438085122515e-05, + "loss": 1.228, + "step": 15887 + }, + { + "epoch": 4.73218042033545, + "grad_norm": 0.3934048116207123, + "learning_rate": 1.1343425034944557e-05, + "loss": 1.2068, + "step": 15888 + }, + { + "epoch": 4.732478266535118, + "grad_norm": 0.7846232056617737, + "learning_rate": 1.1342469206164894e-05, + "loss": 1.2095, + "step": 15889 + }, + { + "epoch": 4.732776112734786, + "grad_norm": 0.3105809986591339, + "learning_rate": 1.1341513364895053e-05, + "loss": 1.2137, + "step": 15890 + }, + { + "epoch": 4.7330739589344555, + "grad_norm": 0.42994362115859985, + "learning_rate": 1.134055751114393e-05, + "loss": 1.2437, + "step": 15891 + }, + { + "epoch": 4.733371805134124, + "grad_norm": 0.2574695348739624, + "learning_rate": 1.133960164492042e-05, + "loss": 1.2182, + "step": 15892 + }, + { + "epoch": 4.733669651333792, + "grad_norm": 0.5125569701194763, + "learning_rate": 1.1338645766233412e-05, + "loss": 1.2131, + "step": 15893 + }, + { + "epoch": 4.733967497533461, + "grad_norm": 0.2991676926612854, + "learning_rate": 1.13376898750918e-05, + "loss": 1.2177, + "step": 15894 + }, + { + "epoch": 4.73426534373313, + "grad_norm": 0.39358091354370117, + "learning_rate": 1.133673397150448e-05, + "loss": 1.2391, + "step": 15895 + }, + { + "epoch": 4.734563189932798, + "grad_norm": 0.3041570484638214, + "learning_rate": 1.133577805548034e-05, + "loss": 1.2223, + "step": 15896 + }, + { + "epoch": 4.734861036132467, + "grad_norm": 0.31268954277038574, + "learning_rate": 1.1334822127028278e-05, + "loss": 1.2178, + "step": 15897 + }, + { + "epoch": 4.7351588823321356, + "grad_norm": 0.25059276819229126, + "learning_rate": 1.133386618615719e-05, + "loss": 1.2267, + "step": 15898 + }, + { + "epoch": 4.735456728531805, + "grad_norm": 0.34685003757476807, + "learning_rate": 1.133291023287597e-05, + "loss": 1.225, + "step": 15899 + }, + { + "epoch": 4.735754574731473, + "grad_norm": 0.3498765230178833, + "learning_rate": 1.1331954267193504e-05, + "loss": 1.2026, + "step": 15900 + }, + { + "epoch": 4.736052420931141, + "grad_norm": 0.25785017013549805, + "learning_rate": 1.1330998289118693e-05, + "loss": 1.2059, + "step": 15901 + }, + { + "epoch": 4.736350267130811, + "grad_norm": 0.29561716318130493, + "learning_rate": 1.133004229866043e-05, + "loss": 1.2168, + "step": 15902 + }, + { + "epoch": 4.736648113330479, + "grad_norm": 0.2916998565196991, + "learning_rate": 1.1329086295827612e-05, + "loss": 1.2183, + "step": 15903 + }, + { + "epoch": 4.736945959530147, + "grad_norm": 0.335693359375, + "learning_rate": 1.1328130280629128e-05, + "loss": 1.2368, + "step": 15904 + }, + { + "epoch": 4.7372438057298165, + "grad_norm": 0.3438325524330139, + "learning_rate": 1.1327174253073881e-05, + "loss": 1.2136, + "step": 15905 + }, + { + "epoch": 4.737541651929485, + "grad_norm": 0.43850234150886536, + "learning_rate": 1.1326218213170755e-05, + "loss": 1.2196, + "step": 15906 + }, + { + "epoch": 4.737839498129153, + "grad_norm": 0.2512917220592499, + "learning_rate": 1.1325262160928651e-05, + "loss": 1.2065, + "step": 15907 + }, + { + "epoch": 4.738137344328822, + "grad_norm": 0.36331161856651306, + "learning_rate": 1.1324306096356463e-05, + "loss": 1.2303, + "step": 15908 + }, + { + "epoch": 4.738435190528491, + "grad_norm": 0.29328441619873047, + "learning_rate": 1.1323350019463088e-05, + "loss": 1.2322, + "step": 15909 + }, + { + "epoch": 4.73873303672816, + "grad_norm": 0.30946558713912964, + "learning_rate": 1.1322393930257418e-05, + "loss": 1.2259, + "step": 15910 + }, + { + "epoch": 4.739030882927828, + "grad_norm": 0.4157577157020569, + "learning_rate": 1.132143782874835e-05, + "loss": 1.2237, + "step": 15911 + }, + { + "epoch": 4.739328729127497, + "grad_norm": 0.4143553376197815, + "learning_rate": 1.1320481714944782e-05, + "loss": 1.2318, + "step": 15912 + }, + { + "epoch": 4.739626575327166, + "grad_norm": 0.29865482449531555, + "learning_rate": 1.1319525588855605e-05, + "loss": 1.2082, + "step": 15913 + }, + { + "epoch": 4.739924421526834, + "grad_norm": 0.5328629612922668, + "learning_rate": 1.1318569450489713e-05, + "loss": 1.2158, + "step": 15914 + }, + { + "epoch": 4.7402222677265025, + "grad_norm": 0.39911073446273804, + "learning_rate": 1.1317613299856011e-05, + "loss": 1.2343, + "step": 15915 + }, + { + "epoch": 4.740520113926172, + "grad_norm": 0.41975584626197815, + "learning_rate": 1.131665713696339e-05, + "loss": 1.2061, + "step": 15916 + }, + { + "epoch": 4.74081796012584, + "grad_norm": 0.3446354269981384, + "learning_rate": 1.1315700961820742e-05, + "loss": 1.2084, + "step": 15917 + }, + { + "epoch": 4.741115806325508, + "grad_norm": 0.48269596695899963, + "learning_rate": 1.1314744774436968e-05, + "loss": 1.2237, + "step": 15918 + }, + { + "epoch": 4.7414136525251775, + "grad_norm": 0.31334182620048523, + "learning_rate": 1.1313788574820963e-05, + "loss": 1.2164, + "step": 15919 + }, + { + "epoch": 4.741711498724846, + "grad_norm": 0.4656231999397278, + "learning_rate": 1.1312832362981621e-05, + "loss": 1.2142, + "step": 15920 + }, + { + "epoch": 4.742009344924515, + "grad_norm": 0.2596440315246582, + "learning_rate": 1.1311876138927842e-05, + "loss": 1.2227, + "step": 15921 + }, + { + "epoch": 4.742307191124183, + "grad_norm": 0.4730317294597626, + "learning_rate": 1.1310919902668522e-05, + "loss": 1.2415, + "step": 15922 + }, + { + "epoch": 4.742605037323852, + "grad_norm": 0.28711432218551636, + "learning_rate": 1.1309963654212557e-05, + "loss": 1.2153, + "step": 15923 + }, + { + "epoch": 4.74290288352352, + "grad_norm": 0.5630531311035156, + "learning_rate": 1.1309007393568843e-05, + "loss": 1.2178, + "step": 15924 + }, + { + "epoch": 4.743200729723189, + "grad_norm": 0.3179137706756592, + "learning_rate": 1.1308051120746277e-05, + "loss": 1.2418, + "step": 15925 + }, + { + "epoch": 4.743498575922858, + "grad_norm": 0.6113380789756775, + "learning_rate": 1.1307094835753757e-05, + "loss": 1.2045, + "step": 15926 + }, + { + "epoch": 4.743796422122527, + "grad_norm": 0.46305787563323975, + "learning_rate": 1.130613853860018e-05, + "loss": 1.2153, + "step": 15927 + }, + { + "epoch": 4.744094268322195, + "grad_norm": 0.5362480282783508, + "learning_rate": 1.1305182229294445e-05, + "loss": 1.2194, + "step": 15928 + }, + { + "epoch": 4.7443921145218635, + "grad_norm": 0.26356229186058044, + "learning_rate": 1.1304225907845448e-05, + "loss": 1.2298, + "step": 15929 + }, + { + "epoch": 4.744689960721533, + "grad_norm": 0.5824549794197083, + "learning_rate": 1.1303269574262083e-05, + "loss": 1.2277, + "step": 15930 + }, + { + "epoch": 4.744987806921201, + "grad_norm": 0.32579800486564636, + "learning_rate": 1.1302313228553253e-05, + "loss": 1.222, + "step": 15931 + }, + { + "epoch": 4.745285653120869, + "grad_norm": 0.3042381703853607, + "learning_rate": 1.1301356870727848e-05, + "loss": 1.2179, + "step": 15932 + }, + { + "epoch": 4.745583499320539, + "grad_norm": 0.5567289590835571, + "learning_rate": 1.1300400500794779e-05, + "loss": 1.2273, + "step": 15933 + }, + { + "epoch": 4.745881345520207, + "grad_norm": 0.2972211241722107, + "learning_rate": 1.1299444118762933e-05, + "loss": 1.2292, + "step": 15934 + }, + { + "epoch": 4.746179191719875, + "grad_norm": 0.3997952342033386, + "learning_rate": 1.1298487724641211e-05, + "loss": 1.2067, + "step": 15935 + }, + { + "epoch": 4.7464770379195445, + "grad_norm": 0.3564496338367462, + "learning_rate": 1.1297531318438514e-05, + "loss": 1.2253, + "step": 15936 + }, + { + "epoch": 4.746774884119213, + "grad_norm": 0.29910174012184143, + "learning_rate": 1.1296574900163734e-05, + "loss": 1.226, + "step": 15937 + }, + { + "epoch": 4.747072730318882, + "grad_norm": 0.2811172902584076, + "learning_rate": 1.1295618469825774e-05, + "loss": 1.2055, + "step": 15938 + }, + { + "epoch": 4.74737057651855, + "grad_norm": 0.31470999121665955, + "learning_rate": 1.1294662027433532e-05, + "loss": 1.2299, + "step": 15939 + }, + { + "epoch": 4.747668422718219, + "grad_norm": 0.3347685635089874, + "learning_rate": 1.1293705572995909e-05, + "loss": 1.2147, + "step": 15940 + }, + { + "epoch": 4.747966268917888, + "grad_norm": 0.3421890139579773, + "learning_rate": 1.1292749106521798e-05, + "loss": 1.2212, + "step": 15941 + }, + { + "epoch": 4.748264115117556, + "grad_norm": 0.2701142430305481, + "learning_rate": 1.1291792628020103e-05, + "loss": 1.2244, + "step": 15942 + }, + { + "epoch": 4.7485619613172245, + "grad_norm": 0.2640717327594757, + "learning_rate": 1.129083613749972e-05, + "loss": 1.2176, + "step": 15943 + }, + { + "epoch": 4.748859807516894, + "grad_norm": 0.29700079560279846, + "learning_rate": 1.1289879634969548e-05, + "loss": 1.2121, + "step": 15944 + }, + { + "epoch": 4.749157653716562, + "grad_norm": 0.2528539001941681, + "learning_rate": 1.1288923120438486e-05, + "loss": 1.2327, + "step": 15945 + }, + { + "epoch": 4.74945549991623, + "grad_norm": 0.4150124490261078, + "learning_rate": 1.1287966593915438e-05, + "loss": 1.2154, + "step": 15946 + }, + { + "epoch": 4.7497533461159, + "grad_norm": 0.38475698232650757, + "learning_rate": 1.1287010055409298e-05, + "loss": 1.2144, + "step": 15947 + }, + { + "epoch": 4.750051192315568, + "grad_norm": 0.3754880130290985, + "learning_rate": 1.1286053504928966e-05, + "loss": 1.2459, + "step": 15948 + }, + { + "epoch": 4.750349038515237, + "grad_norm": 0.3564351201057434, + "learning_rate": 1.1285096942483344e-05, + "loss": 1.2186, + "step": 15949 + }, + { + "epoch": 4.7506468847149055, + "grad_norm": 0.38486358523368835, + "learning_rate": 1.128414036808133e-05, + "loss": 1.2052, + "step": 15950 + }, + { + "epoch": 4.750944730914574, + "grad_norm": 0.32017916440963745, + "learning_rate": 1.1283183781731827e-05, + "loss": 1.2082, + "step": 15951 + }, + { + "epoch": 4.751242577114242, + "grad_norm": 0.3087635636329651, + "learning_rate": 1.128222718344373e-05, + "loss": 1.2183, + "step": 15952 + }, + { + "epoch": 4.751540423313911, + "grad_norm": 0.3792348802089691, + "learning_rate": 1.1281270573225947e-05, + "loss": 1.2285, + "step": 15953 + }, + { + "epoch": 4.75183826951358, + "grad_norm": 0.392402321100235, + "learning_rate": 1.1280313951087367e-05, + "loss": 1.2139, + "step": 15954 + }, + { + "epoch": 4.752136115713249, + "grad_norm": 0.2829458713531494, + "learning_rate": 1.1279357317036897e-05, + "loss": 1.2198, + "step": 15955 + }, + { + "epoch": 4.752433961912917, + "grad_norm": 0.2721094489097595, + "learning_rate": 1.1278400671083435e-05, + "loss": 1.2092, + "step": 15956 + }, + { + "epoch": 4.7527318081125856, + "grad_norm": 0.2691056728363037, + "learning_rate": 1.1277444013235888e-05, + "loss": 1.2215, + "step": 15957 + }, + { + "epoch": 4.753029654312255, + "grad_norm": 0.25471600890159607, + "learning_rate": 1.1276487343503148e-05, + "loss": 1.2296, + "step": 15958 + }, + { + "epoch": 4.753327500511923, + "grad_norm": 0.2551353871822357, + "learning_rate": 1.1275530661894121e-05, + "loss": 1.231, + "step": 15959 + }, + { + "epoch": 4.753625346711591, + "grad_norm": 0.3175426721572876, + "learning_rate": 1.1274573968417708e-05, + "loss": 1.217, + "step": 15960 + }, + { + "epoch": 4.753923192911261, + "grad_norm": 0.4047291874885559, + "learning_rate": 1.1273617263082804e-05, + "loss": 1.1934, + "step": 15961 + }, + { + "epoch": 4.754221039110929, + "grad_norm": 0.2636595070362091, + "learning_rate": 1.1272660545898315e-05, + "loss": 1.2184, + "step": 15962 + }, + { + "epoch": 4.754518885310597, + "grad_norm": 0.49120742082595825, + "learning_rate": 1.1271703816873147e-05, + "loss": 1.2203, + "step": 15963 + }, + { + "epoch": 4.7548167315102665, + "grad_norm": 0.3593755066394806, + "learning_rate": 1.1270747076016191e-05, + "loss": 1.2417, + "step": 15964 + }, + { + "epoch": 4.755114577709935, + "grad_norm": 0.32109713554382324, + "learning_rate": 1.1269790323336353e-05, + "loss": 1.224, + "step": 15965 + }, + { + "epoch": 4.755412423909604, + "grad_norm": 0.3326317369937897, + "learning_rate": 1.1268833558842536e-05, + "loss": 1.2134, + "step": 15966 + }, + { + "epoch": 4.755710270109272, + "grad_norm": 0.24378105998039246, + "learning_rate": 1.126787678254364e-05, + "loss": 1.2122, + "step": 15967 + }, + { + "epoch": 4.756008116308941, + "grad_norm": 0.33965176343917847, + "learning_rate": 1.1266919994448567e-05, + "loss": 1.2063, + "step": 15968 + }, + { + "epoch": 4.75630596250861, + "grad_norm": 0.261719286441803, + "learning_rate": 1.1265963194566218e-05, + "loss": 1.2366, + "step": 15969 + }, + { + "epoch": 4.756603808708278, + "grad_norm": 0.31250137090682983, + "learning_rate": 1.1265006382905499e-05, + "loss": 1.2277, + "step": 15970 + }, + { + "epoch": 4.756901654907947, + "grad_norm": 0.2600827217102051, + "learning_rate": 1.1264049559475307e-05, + "loss": 1.2337, + "step": 15971 + }, + { + "epoch": 4.757199501107616, + "grad_norm": 0.2892378568649292, + "learning_rate": 1.1263092724284548e-05, + "loss": 1.208, + "step": 15972 + }, + { + "epoch": 4.757497347307284, + "grad_norm": 0.3316152095794678, + "learning_rate": 1.126213587734212e-05, + "loss": 1.219, + "step": 15973 + }, + { + "epoch": 4.7577951935069525, + "grad_norm": 0.2919826805591583, + "learning_rate": 1.1261179018656926e-05, + "loss": 1.2263, + "step": 15974 + }, + { + "epoch": 4.758093039706622, + "grad_norm": 0.39483243227005005, + "learning_rate": 1.1260222148237874e-05, + "loss": 1.2107, + "step": 15975 + }, + { + "epoch": 4.75839088590629, + "grad_norm": 0.27996349334716797, + "learning_rate": 1.1259265266093862e-05, + "loss": 1.2147, + "step": 15976 + }, + { + "epoch": 4.758688732105959, + "grad_norm": 0.5439602732658386, + "learning_rate": 1.1258308372233794e-05, + "loss": 1.2165, + "step": 15977 + }, + { + "epoch": 4.7589865783056275, + "grad_norm": 0.4502294957637787, + "learning_rate": 1.1257351466666572e-05, + "loss": 1.2155, + "step": 15978 + }, + { + "epoch": 4.759284424505296, + "grad_norm": 0.31706276535987854, + "learning_rate": 1.1256394549401097e-05, + "loss": 1.2151, + "step": 15979 + }, + { + "epoch": 4.759582270704965, + "grad_norm": 0.5445091724395752, + "learning_rate": 1.1255437620446279e-05, + "loss": 1.2202, + "step": 15980 + }, + { + "epoch": 4.759880116904633, + "grad_norm": 0.2917754054069519, + "learning_rate": 1.1254480679811012e-05, + "loss": 1.2174, + "step": 15981 + }, + { + "epoch": 4.760177963104302, + "grad_norm": 0.3354133367538452, + "learning_rate": 1.1253523727504207e-05, + "loss": 1.2177, + "step": 15982 + }, + { + "epoch": 4.760475809303971, + "grad_norm": 0.303833931684494, + "learning_rate": 1.1252566763534762e-05, + "loss": 1.2203, + "step": 15983 + }, + { + "epoch": 4.760773655503639, + "grad_norm": 0.35081231594085693, + "learning_rate": 1.1251609787911584e-05, + "loss": 1.2156, + "step": 15984 + }, + { + "epoch": 4.761071501703308, + "grad_norm": 0.29519397020339966, + "learning_rate": 1.1250652800643576e-05, + "loss": 1.2056, + "step": 15985 + }, + { + "epoch": 4.761369347902977, + "grad_norm": 0.27962592244148254, + "learning_rate": 1.1249695801739639e-05, + "loss": 1.2172, + "step": 15986 + }, + { + "epoch": 4.761667194102645, + "grad_norm": 0.2990504503250122, + "learning_rate": 1.1248738791208682e-05, + "loss": 1.2227, + "step": 15987 + }, + { + "epoch": 4.761965040302314, + "grad_norm": 0.3045368790626526, + "learning_rate": 1.1247781769059604e-05, + "loss": 1.2197, + "step": 15988 + }, + { + "epoch": 4.762262886501983, + "grad_norm": 0.2627740800380707, + "learning_rate": 1.124682473530131e-05, + "loss": 1.212, + "step": 15989 + }, + { + "epoch": 4.762560732701651, + "grad_norm": 0.44368502497673035, + "learning_rate": 1.1245867689942705e-05, + "loss": 1.2142, + "step": 15990 + }, + { + "epoch": 4.762858578901319, + "grad_norm": 0.2825852632522583, + "learning_rate": 1.1244910632992694e-05, + "loss": 1.2104, + "step": 15991 + }, + { + "epoch": 4.763156425100989, + "grad_norm": 0.4482436776161194, + "learning_rate": 1.1243953564460179e-05, + "loss": 1.2277, + "step": 15992 + }, + { + "epoch": 4.763454271300657, + "grad_norm": 0.28264114260673523, + "learning_rate": 1.1242996484354068e-05, + "loss": 1.2239, + "step": 15993 + }, + { + "epoch": 4.763752117500326, + "grad_norm": 0.3293389081954956, + "learning_rate": 1.1242039392683263e-05, + "loss": 1.2199, + "step": 15994 + }, + { + "epoch": 4.7640499636999944, + "grad_norm": 0.3082025945186615, + "learning_rate": 1.1241082289456668e-05, + "loss": 1.2353, + "step": 15995 + }, + { + "epoch": 4.764347809899663, + "grad_norm": 0.24573594331741333, + "learning_rate": 1.1240125174683189e-05, + "loss": 1.2156, + "step": 15996 + }, + { + "epoch": 4.764645656099332, + "grad_norm": 0.31183791160583496, + "learning_rate": 1.1239168048371729e-05, + "loss": 1.2313, + "step": 15997 + }, + { + "epoch": 4.764943502299, + "grad_norm": 0.3353278338909149, + "learning_rate": 1.12382109105312e-05, + "loss": 1.2171, + "step": 15998 + }, + { + "epoch": 4.765241348498669, + "grad_norm": 0.29151424765586853, + "learning_rate": 1.12372537611705e-05, + "loss": 1.2206, + "step": 15999 + }, + { + "epoch": 4.765539194698338, + "grad_norm": 0.37144967913627625, + "learning_rate": 1.1236296600298533e-05, + "loss": 1.2322, + "step": 16000 + }, + { + "epoch": 4.765539194698338, + "eval_loss": 1.321445107460022, + "eval_runtime": 22.1535, + "eval_samples_per_second": 78.272, + "eval_steps_per_second": 4.92, + "step": 16000 + }, + { + "epoch": 4.765837040898006, + "grad_norm": 0.24773110449314117, + "learning_rate": 1.1235339427924212e-05, + "loss": 1.2145, + "step": 16001 + }, + { + "epoch": 4.7661348870976745, + "grad_norm": 0.31285032629966736, + "learning_rate": 1.1234382244056433e-05, + "loss": 1.2346, + "step": 16002 + }, + { + "epoch": 4.766432733297344, + "grad_norm": 0.27079176902770996, + "learning_rate": 1.1233425048704108e-05, + "loss": 1.194, + "step": 16003 + }, + { + "epoch": 4.766730579497012, + "grad_norm": 0.27916616201400757, + "learning_rate": 1.1232467841876142e-05, + "loss": 1.2209, + "step": 16004 + }, + { + "epoch": 4.767028425696681, + "grad_norm": 0.2608424425125122, + "learning_rate": 1.123151062358144e-05, + "loss": 1.2204, + "step": 16005 + }, + { + "epoch": 4.76732627189635, + "grad_norm": 0.2661018371582031, + "learning_rate": 1.1230553393828908e-05, + "loss": 1.2101, + "step": 16006 + }, + { + "epoch": 4.767624118096018, + "grad_norm": 0.3621608316898346, + "learning_rate": 1.1229596152627449e-05, + "loss": 1.2297, + "step": 16007 + }, + { + "epoch": 4.767921964295687, + "grad_norm": 0.3018997609615326, + "learning_rate": 1.1228638899985973e-05, + "loss": 1.2241, + "step": 16008 + }, + { + "epoch": 4.7682198104953555, + "grad_norm": 0.4123643934726715, + "learning_rate": 1.1227681635913384e-05, + "loss": 1.2085, + "step": 16009 + }, + { + "epoch": 4.768517656695024, + "grad_norm": 0.5294069051742554, + "learning_rate": 1.1226724360418588e-05, + "loss": 1.2186, + "step": 16010 + }, + { + "epoch": 4.768815502894693, + "grad_norm": 0.3427787721157074, + "learning_rate": 1.1225767073510496e-05, + "loss": 1.2175, + "step": 16011 + }, + { + "epoch": 4.769113349094361, + "grad_norm": 0.6941683292388916, + "learning_rate": 1.1224809775198012e-05, + "loss": 1.216, + "step": 16012 + }, + { + "epoch": 4.76941119529403, + "grad_norm": 0.2954423427581787, + "learning_rate": 1.1223852465490037e-05, + "loss": 1.2246, + "step": 16013 + }, + { + "epoch": 4.769709041493699, + "grad_norm": 0.3511301279067993, + "learning_rate": 1.1222895144395482e-05, + "loss": 1.2324, + "step": 16014 + }, + { + "epoch": 4.770006887693367, + "grad_norm": 0.3313235342502594, + "learning_rate": 1.1221937811923255e-05, + "loss": 1.2298, + "step": 16015 + }, + { + "epoch": 4.770304733893036, + "grad_norm": 0.29304108023643494, + "learning_rate": 1.1220980468082264e-05, + "loss": 1.2182, + "step": 16016 + }, + { + "epoch": 4.770602580092705, + "grad_norm": 0.40669217705726624, + "learning_rate": 1.1220023112881412e-05, + "loss": 1.2177, + "step": 16017 + }, + { + "epoch": 4.770900426292373, + "grad_norm": 0.3019462525844574, + "learning_rate": 1.121906574632961e-05, + "loss": 1.2168, + "step": 16018 + }, + { + "epoch": 4.771198272492041, + "grad_norm": 0.3180030584335327, + "learning_rate": 1.1218108368435761e-05, + "loss": 1.208, + "step": 16019 + }, + { + "epoch": 4.771496118691711, + "grad_norm": 0.290020227432251, + "learning_rate": 1.1217150979208773e-05, + "loss": 1.2181, + "step": 16020 + }, + { + "epoch": 4.771793964891379, + "grad_norm": 0.28905245661735535, + "learning_rate": 1.121619357865756e-05, + "loss": 1.2102, + "step": 16021 + }, + { + "epoch": 4.772091811091048, + "grad_norm": 0.3011939823627472, + "learning_rate": 1.1215236166791022e-05, + "loss": 1.2265, + "step": 16022 + }, + { + "epoch": 4.7723896572907165, + "grad_norm": 0.29782941937446594, + "learning_rate": 1.1214278743618068e-05, + "loss": 1.2055, + "step": 16023 + }, + { + "epoch": 4.772687503490385, + "grad_norm": 0.32444489002227783, + "learning_rate": 1.1213321309147609e-05, + "loss": 1.2217, + "step": 16024 + }, + { + "epoch": 4.772985349690054, + "grad_norm": 0.41676148772239685, + "learning_rate": 1.1212363863388549e-05, + "loss": 1.215, + "step": 16025 + }, + { + "epoch": 4.773283195889722, + "grad_norm": 0.27466535568237305, + "learning_rate": 1.1211406406349798e-05, + "loss": 1.2332, + "step": 16026 + }, + { + "epoch": 4.773581042089391, + "grad_norm": 0.3036925494670868, + "learning_rate": 1.1210448938040264e-05, + "loss": 1.2242, + "step": 16027 + }, + { + "epoch": 4.77387888828906, + "grad_norm": 0.41110652685165405, + "learning_rate": 1.1209491458468858e-05, + "loss": 1.2247, + "step": 16028 + }, + { + "epoch": 4.774176734488728, + "grad_norm": 0.2643027901649475, + "learning_rate": 1.1208533967644482e-05, + "loss": 1.2155, + "step": 16029 + }, + { + "epoch": 4.774474580688397, + "grad_norm": 0.37844428420066833, + "learning_rate": 1.1207576465576049e-05, + "loss": 1.2205, + "step": 16030 + }, + { + "epoch": 4.774772426888066, + "grad_norm": 0.30798405408859253, + "learning_rate": 1.1206618952272466e-05, + "loss": 1.22, + "step": 16031 + }, + { + "epoch": 4.775070273087734, + "grad_norm": 0.5551376938819885, + "learning_rate": 1.120566142774264e-05, + "loss": 1.217, + "step": 16032 + }, + { + "epoch": 4.775368119287403, + "grad_norm": 0.4104938805103302, + "learning_rate": 1.1204703891995483e-05, + "loss": 1.2127, + "step": 16033 + }, + { + "epoch": 4.775665965487072, + "grad_norm": 0.36480623483657837, + "learning_rate": 1.1203746345039903e-05, + "loss": 1.2151, + "step": 16034 + }, + { + "epoch": 4.77596381168674, + "grad_norm": 0.39886218309402466, + "learning_rate": 1.1202788786884808e-05, + "loss": 1.2292, + "step": 16035 + }, + { + "epoch": 4.776261657886409, + "grad_norm": 0.3694417178630829, + "learning_rate": 1.1201831217539108e-05, + "loss": 1.2249, + "step": 16036 + }, + { + "epoch": 4.7765595040860775, + "grad_norm": 0.42429476976394653, + "learning_rate": 1.1200873637011708e-05, + "loss": 1.2301, + "step": 16037 + }, + { + "epoch": 4.776857350285746, + "grad_norm": 0.3157731294631958, + "learning_rate": 1.1199916045311522e-05, + "loss": 1.2182, + "step": 16038 + }, + { + "epoch": 4.777155196485415, + "grad_norm": 0.5065358877182007, + "learning_rate": 1.119895844244746e-05, + "loss": 1.2333, + "step": 16039 + }, + { + "epoch": 4.777453042685083, + "grad_norm": 0.2756097614765167, + "learning_rate": 1.1198000828428426e-05, + "loss": 1.2055, + "step": 16040 + }, + { + "epoch": 4.777750888884752, + "grad_norm": 0.8761605620384216, + "learning_rate": 1.1197043203263334e-05, + "loss": 1.2283, + "step": 16041 + }, + { + "epoch": 4.778048735084421, + "grad_norm": 0.4738672077655792, + "learning_rate": 1.1196085566961095e-05, + "loss": 1.2189, + "step": 16042 + }, + { + "epoch": 4.778346581284089, + "grad_norm": 0.5599620342254639, + "learning_rate": 1.119512791953061e-05, + "loss": 1.2101, + "step": 16043 + }, + { + "epoch": 4.7786444274837585, + "grad_norm": 0.3681389093399048, + "learning_rate": 1.1194170260980799e-05, + "loss": 1.211, + "step": 16044 + }, + { + "epoch": 4.778942273683427, + "grad_norm": 0.5590227246284485, + "learning_rate": 1.1193212591320571e-05, + "loss": 1.2064, + "step": 16045 + }, + { + "epoch": 4.779240119883095, + "grad_norm": 0.30715882778167725, + "learning_rate": 1.1192254910558828e-05, + "loss": 1.2135, + "step": 16046 + }, + { + "epoch": 4.779537966082764, + "grad_norm": 0.6409532427787781, + "learning_rate": 1.1191297218704487e-05, + "loss": 1.2243, + "step": 16047 + }, + { + "epoch": 4.779835812282433, + "grad_norm": 0.3171904981136322, + "learning_rate": 1.1190339515766454e-05, + "loss": 1.204, + "step": 16048 + }, + { + "epoch": 4.780133658482101, + "grad_norm": 0.5026242733001709, + "learning_rate": 1.1189381801753645e-05, + "loss": 1.2313, + "step": 16049 + }, + { + "epoch": 4.78043150468177, + "grad_norm": 0.2544401288032532, + "learning_rate": 1.1188424076674966e-05, + "loss": 1.2163, + "step": 16050 + }, + { + "epoch": 4.780729350881439, + "grad_norm": 0.3725774884223938, + "learning_rate": 1.1187466340539328e-05, + "loss": 1.2069, + "step": 16051 + }, + { + "epoch": 4.781027197081107, + "grad_norm": 0.2977886497974396, + "learning_rate": 1.1186508593355645e-05, + "loss": 1.2289, + "step": 16052 + }, + { + "epoch": 4.781325043280776, + "grad_norm": 0.3489167094230652, + "learning_rate": 1.1185550835132824e-05, + "loss": 1.2218, + "step": 16053 + }, + { + "epoch": 4.7816228894804444, + "grad_norm": 0.31889593601226807, + "learning_rate": 1.1184593065879773e-05, + "loss": 1.2256, + "step": 16054 + }, + { + "epoch": 4.781920735680114, + "grad_norm": 0.27601557970046997, + "learning_rate": 1.1183635285605412e-05, + "loss": 1.2222, + "step": 16055 + }, + { + "epoch": 4.782218581879782, + "grad_norm": 0.449756920337677, + "learning_rate": 1.1182677494318646e-05, + "loss": 1.2291, + "step": 16056 + }, + { + "epoch": 4.78251642807945, + "grad_norm": 0.2948785424232483, + "learning_rate": 1.1181719692028385e-05, + "loss": 1.2347, + "step": 16057 + }, + { + "epoch": 4.782814274279119, + "grad_norm": 0.3799836337566376, + "learning_rate": 1.1180761878743545e-05, + "loss": 1.2174, + "step": 16058 + }, + { + "epoch": 4.783112120478788, + "grad_norm": 0.254565954208374, + "learning_rate": 1.1179804054473036e-05, + "loss": 1.2159, + "step": 16059 + }, + { + "epoch": 4.783409966678456, + "grad_norm": 0.35955533385276794, + "learning_rate": 1.1178846219225767e-05, + "loss": 1.2216, + "step": 16060 + }, + { + "epoch": 4.783707812878125, + "grad_norm": 0.2900918126106262, + "learning_rate": 1.1177888373010647e-05, + "loss": 1.2218, + "step": 16061 + }, + { + "epoch": 4.784005659077794, + "grad_norm": 0.3033221960067749, + "learning_rate": 1.1176930515836599e-05, + "loss": 1.2252, + "step": 16062 + }, + { + "epoch": 4.784303505277462, + "grad_norm": 0.32291966676712036, + "learning_rate": 1.1175972647712523e-05, + "loss": 1.2185, + "step": 16063 + }, + { + "epoch": 4.784601351477131, + "grad_norm": 0.40774357318878174, + "learning_rate": 1.1175014768647336e-05, + "loss": 1.2159, + "step": 16064 + }, + { + "epoch": 4.7848991976768, + "grad_norm": 0.25964364409446716, + "learning_rate": 1.1174056878649951e-05, + "loss": 1.249, + "step": 16065 + }, + { + "epoch": 4.785197043876468, + "grad_norm": 0.44589051604270935, + "learning_rate": 1.117309897772928e-05, + "loss": 1.2218, + "step": 16066 + }, + { + "epoch": 4.785494890076137, + "grad_norm": 0.362463116645813, + "learning_rate": 1.1172141065894227e-05, + "loss": 1.2192, + "step": 16067 + }, + { + "epoch": 4.7857927362758055, + "grad_norm": 0.3140842914581299, + "learning_rate": 1.1171183143153714e-05, + "loss": 1.229, + "step": 16068 + }, + { + "epoch": 4.786090582475474, + "grad_norm": 0.5171372890472412, + "learning_rate": 1.1170225209516654e-05, + "loss": 1.2359, + "step": 16069 + }, + { + "epoch": 4.786388428675143, + "grad_norm": 0.35875049233436584, + "learning_rate": 1.1169267264991952e-05, + "loss": 1.2267, + "step": 16070 + }, + { + "epoch": 4.786686274874811, + "grad_norm": 0.29174157977104187, + "learning_rate": 1.1168309309588525e-05, + "loss": 1.2158, + "step": 16071 + }, + { + "epoch": 4.786984121074481, + "grad_norm": 0.2554702162742615, + "learning_rate": 1.1167351343315286e-05, + "loss": 1.2288, + "step": 16072 + }, + { + "epoch": 4.787281967274149, + "grad_norm": 0.35019010305404663, + "learning_rate": 1.1166393366181147e-05, + "loss": 1.2252, + "step": 16073 + }, + { + "epoch": 4.787579813473817, + "grad_norm": 0.2655666470527649, + "learning_rate": 1.116543537819502e-05, + "loss": 1.2226, + "step": 16074 + }, + { + "epoch": 4.787877659673486, + "grad_norm": 0.32722175121307373, + "learning_rate": 1.1164477379365821e-05, + "loss": 1.2015, + "step": 16075 + }, + { + "epoch": 4.788175505873155, + "grad_norm": 0.33848732709884644, + "learning_rate": 1.116351936970246e-05, + "loss": 1.222, + "step": 16076 + }, + { + "epoch": 4.788473352072823, + "grad_norm": 0.2932543158531189, + "learning_rate": 1.1162561349213851e-05, + "loss": 1.2187, + "step": 16077 + }, + { + "epoch": 4.788771198272492, + "grad_norm": 0.450366348028183, + "learning_rate": 1.1161603317908907e-05, + "loss": 1.2189, + "step": 16078 + }, + { + "epoch": 4.789069044472161, + "grad_norm": 0.3034180700778961, + "learning_rate": 1.1160645275796543e-05, + "loss": 1.2144, + "step": 16079 + }, + { + "epoch": 4.789366890671829, + "grad_norm": 0.3472573459148407, + "learning_rate": 1.1159687222885672e-05, + "loss": 1.2085, + "step": 16080 + }, + { + "epoch": 4.789664736871498, + "grad_norm": 0.3495369851589203, + "learning_rate": 1.1158729159185204e-05, + "loss": 1.2258, + "step": 16081 + }, + { + "epoch": 4.7899625830711665, + "grad_norm": 0.2551051676273346, + "learning_rate": 1.1157771084704056e-05, + "loss": 1.2269, + "step": 16082 + }, + { + "epoch": 4.790260429270836, + "grad_norm": 0.30192965269088745, + "learning_rate": 1.1156812999451145e-05, + "loss": 1.2, + "step": 16083 + }, + { + "epoch": 4.790558275470504, + "grad_norm": 0.2525578439235687, + "learning_rate": 1.115585490343538e-05, + "loss": 1.2162, + "step": 16084 + }, + { + "epoch": 4.790856121670172, + "grad_norm": 0.26708728075027466, + "learning_rate": 1.1154896796665676e-05, + "loss": 1.2413, + "step": 16085 + }, + { + "epoch": 4.791153967869841, + "grad_norm": 0.28091707825660706, + "learning_rate": 1.1153938679150948e-05, + "loss": 1.2199, + "step": 16086 + }, + { + "epoch": 4.79145181406951, + "grad_norm": 0.2770179212093353, + "learning_rate": 1.115298055090011e-05, + "loss": 1.2501, + "step": 16087 + }, + { + "epoch": 4.791749660269178, + "grad_norm": 0.2677095830440521, + "learning_rate": 1.1152022411922076e-05, + "loss": 1.2258, + "step": 16088 + }, + { + "epoch": 4.7920475064688475, + "grad_norm": 0.25663822889328003, + "learning_rate": 1.115106426222576e-05, + "loss": 1.2029, + "step": 16089 + }, + { + "epoch": 4.792345352668516, + "grad_norm": 0.2832064628601074, + "learning_rate": 1.1150106101820077e-05, + "loss": 1.2244, + "step": 16090 + }, + { + "epoch": 4.792643198868184, + "grad_norm": 0.2537904977798462, + "learning_rate": 1.1149147930713941e-05, + "loss": 1.2245, + "step": 16091 + }, + { + "epoch": 4.792941045067853, + "grad_norm": 0.28968295454978943, + "learning_rate": 1.114818974891627e-05, + "loss": 1.2259, + "step": 16092 + }, + { + "epoch": 4.793238891267522, + "grad_norm": 0.24708563089370728, + "learning_rate": 1.1147231556435976e-05, + "loss": 1.2031, + "step": 16093 + }, + { + "epoch": 4.79353673746719, + "grad_norm": 0.4056083559989929, + "learning_rate": 1.1146273353281974e-05, + "loss": 1.2181, + "step": 16094 + }, + { + "epoch": 4.793834583666859, + "grad_norm": 0.498331218957901, + "learning_rate": 1.1145315139463178e-05, + "loss": 1.2073, + "step": 16095 + }, + { + "epoch": 4.7941324298665275, + "grad_norm": 0.4491882920265198, + "learning_rate": 1.1144356914988504e-05, + "loss": 1.2151, + "step": 16096 + }, + { + "epoch": 4.794430276066196, + "grad_norm": 0.3053653836250305, + "learning_rate": 1.1143398679866869e-05, + "loss": 1.2184, + "step": 16097 + }, + { + "epoch": 4.794728122265865, + "grad_norm": 0.3851979076862335, + "learning_rate": 1.1142440434107182e-05, + "loss": 1.2135, + "step": 16098 + }, + { + "epoch": 4.795025968465533, + "grad_norm": 0.4008645713329315, + "learning_rate": 1.1141482177718369e-05, + "loss": 1.2086, + "step": 16099 + }, + { + "epoch": 4.795323814665203, + "grad_norm": 0.28745028376579285, + "learning_rate": 1.1140523910709338e-05, + "loss": 1.2355, + "step": 16100 + }, + { + "epoch": 4.795621660864871, + "grad_norm": 0.3781936466693878, + "learning_rate": 1.1139565633089006e-05, + "loss": 1.2294, + "step": 16101 + }, + { + "epoch": 4.795919507064539, + "grad_norm": 0.2987149655818939, + "learning_rate": 1.1138607344866286e-05, + "loss": 1.2111, + "step": 16102 + }, + { + "epoch": 4.7962173532642085, + "grad_norm": 0.27935880422592163, + "learning_rate": 1.11376490460501e-05, + "loss": 1.2187, + "step": 16103 + }, + { + "epoch": 4.796515199463877, + "grad_norm": 0.36832916736602783, + "learning_rate": 1.1136690736649362e-05, + "loss": 1.2422, + "step": 16104 + }, + { + "epoch": 4.796813045663545, + "grad_norm": 0.30461856722831726, + "learning_rate": 1.1135732416672984e-05, + "loss": 1.2087, + "step": 16105 + }, + { + "epoch": 4.797110891863214, + "grad_norm": 0.3361176550388336, + "learning_rate": 1.1134774086129884e-05, + "loss": 1.2143, + "step": 16106 + }, + { + "epoch": 4.797408738062883, + "grad_norm": 0.3742274343967438, + "learning_rate": 1.1133815745028984e-05, + "loss": 1.2165, + "step": 16107 + }, + { + "epoch": 4.797706584262551, + "grad_norm": 0.2652067542076111, + "learning_rate": 1.1132857393379191e-05, + "loss": 1.226, + "step": 16108 + }, + { + "epoch": 4.79800443046222, + "grad_norm": 0.3102246820926666, + "learning_rate": 1.1131899031189427e-05, + "loss": 1.2358, + "step": 16109 + }, + { + "epoch": 4.798302276661889, + "grad_norm": 0.6317980885505676, + "learning_rate": 1.1130940658468607e-05, + "loss": 1.2431, + "step": 16110 + }, + { + "epoch": 4.798600122861558, + "grad_norm": 0.6439951658248901, + "learning_rate": 1.1129982275225646e-05, + "loss": 1.2208, + "step": 16111 + }, + { + "epoch": 4.798897969061226, + "grad_norm": 0.3831596374511719, + "learning_rate": 1.1129023881469466e-05, + "loss": 1.2113, + "step": 16112 + }, + { + "epoch": 4.7991958152608944, + "grad_norm": 0.37428316473960876, + "learning_rate": 1.1128065477208976e-05, + "loss": 1.222, + "step": 16113 + }, + { + "epoch": 4.799493661460564, + "grad_norm": 0.31380048394203186, + "learning_rate": 1.11271070624531e-05, + "loss": 1.2214, + "step": 16114 + }, + { + "epoch": 4.799791507660232, + "grad_norm": 0.43900665640830994, + "learning_rate": 1.112614863721075e-05, + "loss": 1.2324, + "step": 16115 + }, + { + "epoch": 4.8000893538599, + "grad_norm": 0.4185546338558197, + "learning_rate": 1.1125190201490845e-05, + "loss": 1.2354, + "step": 16116 + }, + { + "epoch": 4.8003872000595695, + "grad_norm": 0.2961720824241638, + "learning_rate": 1.1124231755302305e-05, + "loss": 1.2194, + "step": 16117 + }, + { + "epoch": 4.800685046259238, + "grad_norm": 0.4965033531188965, + "learning_rate": 1.1123273298654044e-05, + "loss": 1.2112, + "step": 16118 + }, + { + "epoch": 4.800982892458906, + "grad_norm": 0.2967652678489685, + "learning_rate": 1.1122314831554979e-05, + "loss": 1.2163, + "step": 16119 + }, + { + "epoch": 4.801280738658575, + "grad_norm": 0.4445110559463501, + "learning_rate": 1.1121356354014028e-05, + "loss": 1.2164, + "step": 16120 + }, + { + "epoch": 4.801578584858244, + "grad_norm": 0.27836817502975464, + "learning_rate": 1.1120397866040112e-05, + "loss": 1.2389, + "step": 16121 + }, + { + "epoch": 4.801876431057913, + "grad_norm": 0.4156196117401123, + "learning_rate": 1.1119439367642142e-05, + "loss": 1.2135, + "step": 16122 + }, + { + "epoch": 4.802174277257581, + "grad_norm": 0.25994083285331726, + "learning_rate": 1.1118480858829039e-05, + "loss": 1.2088, + "step": 16123 + }, + { + "epoch": 4.80247212345725, + "grad_norm": 0.37323346734046936, + "learning_rate": 1.1117522339609725e-05, + "loss": 1.2081, + "step": 16124 + }, + { + "epoch": 4.802769969656918, + "grad_norm": 0.2517227828502655, + "learning_rate": 1.1116563809993113e-05, + "loss": 1.2275, + "step": 16125 + }, + { + "epoch": 4.803067815856587, + "grad_norm": 0.28889697790145874, + "learning_rate": 1.111560526998812e-05, + "loss": 1.2276, + "step": 16126 + }, + { + "epoch": 4.8033656620562555, + "grad_norm": 0.2838118374347687, + "learning_rate": 1.1114646719603669e-05, + "loss": 1.2244, + "step": 16127 + }, + { + "epoch": 4.803663508255925, + "grad_norm": 0.26490119099617004, + "learning_rate": 1.1113688158848672e-05, + "loss": 1.2281, + "step": 16128 + }, + { + "epoch": 4.803961354455593, + "grad_norm": 0.3336222469806671, + "learning_rate": 1.1112729587732054e-05, + "loss": 1.2238, + "step": 16129 + }, + { + "epoch": 4.804259200655261, + "grad_norm": 0.285384863615036, + "learning_rate": 1.1111771006262728e-05, + "loss": 1.2293, + "step": 16130 + }, + { + "epoch": 4.804557046854931, + "grad_norm": 0.32861578464508057, + "learning_rate": 1.111081241444962e-05, + "loss": 1.2262, + "step": 16131 + }, + { + "epoch": 4.804854893054599, + "grad_norm": 0.24425429105758667, + "learning_rate": 1.1109853812301638e-05, + "loss": 1.2266, + "step": 16132 + }, + { + "epoch": 4.805152739254267, + "grad_norm": 0.4054061770439148, + "learning_rate": 1.1108895199827708e-05, + "loss": 1.2304, + "step": 16133 + }, + { + "epoch": 4.805450585453936, + "grad_norm": 0.4688212275505066, + "learning_rate": 1.110793657703675e-05, + "loss": 1.223, + "step": 16134 + }, + { + "epoch": 4.805748431653605, + "grad_norm": 0.27641046047210693, + "learning_rate": 1.1106977943937676e-05, + "loss": 1.2241, + "step": 16135 + }, + { + "epoch": 4.806046277853273, + "grad_norm": 0.6944801211357117, + "learning_rate": 1.110601930053941e-05, + "loss": 1.2068, + "step": 16136 + }, + { + "epoch": 4.806344124052942, + "grad_norm": 0.402495801448822, + "learning_rate": 1.1105060646850871e-05, + "loss": 1.2159, + "step": 16137 + }, + { + "epoch": 4.806641970252611, + "grad_norm": 0.457445353269577, + "learning_rate": 1.1104101982880978e-05, + "loss": 1.2148, + "step": 16138 + }, + { + "epoch": 4.80693981645228, + "grad_norm": 0.3986983001232147, + "learning_rate": 1.1103143308638648e-05, + "loss": 1.2056, + "step": 16139 + }, + { + "epoch": 4.807237662651948, + "grad_norm": 0.3038369119167328, + "learning_rate": 1.1102184624132802e-05, + "loss": 1.2168, + "step": 16140 + }, + { + "epoch": 4.8075355088516165, + "grad_norm": 0.44019803404808044, + "learning_rate": 1.1101225929372364e-05, + "loss": 1.2097, + "step": 16141 + }, + { + "epoch": 4.807833355051286, + "grad_norm": 0.2620181739330292, + "learning_rate": 1.1100267224366243e-05, + "loss": 1.223, + "step": 16142 + }, + { + "epoch": 4.808131201250954, + "grad_norm": 0.3152141273021698, + "learning_rate": 1.1099308509123367e-05, + "loss": 1.2292, + "step": 16143 + }, + { + "epoch": 4.808429047450622, + "grad_norm": 0.27323970198631287, + "learning_rate": 1.1098349783652654e-05, + "loss": 1.2153, + "step": 16144 + }, + { + "epoch": 4.808726893650292, + "grad_norm": 0.2811146378517151, + "learning_rate": 1.1097391047963022e-05, + "loss": 1.2279, + "step": 16145 + }, + { + "epoch": 4.80902473984996, + "grad_norm": 0.31005099415779114, + "learning_rate": 1.1096432302063394e-05, + "loss": 1.217, + "step": 16146 + }, + { + "epoch": 4.809322586049628, + "grad_norm": 0.2507877051830292, + "learning_rate": 1.1095473545962688e-05, + "loss": 1.2057, + "step": 16147 + }, + { + "epoch": 4.8096204322492975, + "grad_norm": 0.37099072337150574, + "learning_rate": 1.1094514779669825e-05, + "loss": 1.2328, + "step": 16148 + }, + { + "epoch": 4.809918278448966, + "grad_norm": 0.27322760224342346, + "learning_rate": 1.1093556003193722e-05, + "loss": 1.2211, + "step": 16149 + }, + { + "epoch": 4.810216124648635, + "grad_norm": 0.3571750521659851, + "learning_rate": 1.1092597216543303e-05, + "loss": 1.2116, + "step": 16150 + }, + { + "epoch": 4.810513970848303, + "grad_norm": 0.3114352524280548, + "learning_rate": 1.1091638419727493e-05, + "loss": 1.2199, + "step": 16151 + }, + { + "epoch": 4.810811817047972, + "grad_norm": 0.32958173751831055, + "learning_rate": 1.1090679612755202e-05, + "loss": 1.2013, + "step": 16152 + }, + { + "epoch": 4.81110966324764, + "grad_norm": 0.3899846374988556, + "learning_rate": 1.1089720795635357e-05, + "loss": 1.2131, + "step": 16153 + }, + { + "epoch": 4.811407509447309, + "grad_norm": 0.3302745223045349, + "learning_rate": 1.1088761968376878e-05, + "loss": 1.2019, + "step": 16154 + }, + { + "epoch": 4.8117053556469775, + "grad_norm": 0.5378304719924927, + "learning_rate": 1.1087803130988684e-05, + "loss": 1.2104, + "step": 16155 + }, + { + "epoch": 4.812003201846647, + "grad_norm": 0.2669019401073456, + "learning_rate": 1.1086844283479699e-05, + "loss": 1.1935, + "step": 16156 + }, + { + "epoch": 4.812301048046315, + "grad_norm": 0.8667817711830139, + "learning_rate": 1.108588542585884e-05, + "loss": 1.219, + "step": 16157 + }, + { + "epoch": 4.812598894245983, + "grad_norm": 0.5090213418006897, + "learning_rate": 1.1084926558135034e-05, + "loss": 1.2206, + "step": 16158 + }, + { + "epoch": 4.812896740445653, + "grad_norm": 0.5283071994781494, + "learning_rate": 1.1083967680317196e-05, + "loss": 1.2225, + "step": 16159 + }, + { + "epoch": 4.813194586645321, + "grad_norm": 0.3246292471885681, + "learning_rate": 1.108300879241425e-05, + "loss": 1.2135, + "step": 16160 + }, + { + "epoch": 4.813492432844989, + "grad_norm": 0.4943784177303314, + "learning_rate": 1.1082049894435116e-05, + "loss": 1.2283, + "step": 16161 + }, + { + "epoch": 4.8137902790446585, + "grad_norm": 0.3359585702419281, + "learning_rate": 1.1081090986388718e-05, + "loss": 1.2248, + "step": 16162 + }, + { + "epoch": 4.814088125244327, + "grad_norm": 0.4059714078903198, + "learning_rate": 1.1080132068283974e-05, + "loss": 1.2143, + "step": 16163 + }, + { + "epoch": 4.814385971443995, + "grad_norm": 0.45377418398857117, + "learning_rate": 1.107917314012981e-05, + "loss": 1.2078, + "step": 16164 + }, + { + "epoch": 4.814683817643664, + "grad_norm": 0.34101006388664246, + "learning_rate": 1.1078214201935148e-05, + "loss": 1.2144, + "step": 16165 + }, + { + "epoch": 4.814981663843333, + "grad_norm": 0.4971599280834198, + "learning_rate": 1.1077255253708906e-05, + "loss": 1.2322, + "step": 16166 + }, + { + "epoch": 4.815279510043002, + "grad_norm": 0.26410800218582153, + "learning_rate": 1.1076296295460003e-05, + "loss": 1.2216, + "step": 16167 + }, + { + "epoch": 4.81557735624267, + "grad_norm": 0.44251903891563416, + "learning_rate": 1.107533732719737e-05, + "loss": 1.2223, + "step": 16168 + }, + { + "epoch": 4.815875202442339, + "grad_norm": 0.3852265179157257, + "learning_rate": 1.1074378348929924e-05, + "loss": 1.2287, + "step": 16169 + }, + { + "epoch": 4.816173048642008, + "grad_norm": 0.3347865641117096, + "learning_rate": 1.1073419360666588e-05, + "loss": 1.222, + "step": 16170 + }, + { + "epoch": 4.816470894841676, + "grad_norm": 0.3753680884838104, + "learning_rate": 1.1072460362416284e-05, + "loss": 1.2304, + "step": 16171 + }, + { + "epoch": 4.8167687410413444, + "grad_norm": 0.2690870761871338, + "learning_rate": 1.1071501354187933e-05, + "loss": 1.2207, + "step": 16172 + }, + { + "epoch": 4.817066587241014, + "grad_norm": 0.34019342064857483, + "learning_rate": 1.107054233599046e-05, + "loss": 1.2251, + "step": 16173 + }, + { + "epoch": 4.817364433440682, + "grad_norm": 0.24536536633968353, + "learning_rate": 1.1069583307832788e-05, + "loss": 1.2086, + "step": 16174 + }, + { + "epoch": 4.81766227964035, + "grad_norm": 0.5281050801277161, + "learning_rate": 1.1068624269723837e-05, + "loss": 1.2158, + "step": 16175 + }, + { + "epoch": 4.8179601258400195, + "grad_norm": 0.45852169394493103, + "learning_rate": 1.1067665221672532e-05, + "loss": 1.2217, + "step": 16176 + }, + { + "epoch": 4.818257972039688, + "grad_norm": 0.3672395944595337, + "learning_rate": 1.1066706163687794e-05, + "loss": 1.2332, + "step": 16177 + }, + { + "epoch": 4.818555818239357, + "grad_norm": 0.582095205783844, + "learning_rate": 1.1065747095778547e-05, + "loss": 1.2274, + "step": 16178 + }, + { + "epoch": 4.818853664439025, + "grad_norm": 0.2974826693534851, + "learning_rate": 1.1064788017953714e-05, + "loss": 1.2259, + "step": 16179 + }, + { + "epoch": 4.819151510638694, + "grad_norm": 0.5012524724006653, + "learning_rate": 1.1063828930222219e-05, + "loss": 1.2325, + "step": 16180 + }, + { + "epoch": 4.819449356838363, + "grad_norm": 0.3937076926231384, + "learning_rate": 1.1062869832592981e-05, + "loss": 1.2267, + "step": 16181 + }, + { + "epoch": 4.819747203038031, + "grad_norm": 0.520210862159729, + "learning_rate": 1.1061910725074933e-05, + "loss": 1.219, + "step": 16182 + }, + { + "epoch": 4.8200450492377, + "grad_norm": 0.39134225249290466, + "learning_rate": 1.1060951607676989e-05, + "loss": 1.2076, + "step": 16183 + }, + { + "epoch": 4.820342895437369, + "grad_norm": 0.35149261355400085, + "learning_rate": 1.1059992480408076e-05, + "loss": 1.2023, + "step": 16184 + }, + { + "epoch": 4.820640741637037, + "grad_norm": 0.33472779393196106, + "learning_rate": 1.1059033343277114e-05, + "loss": 1.2119, + "step": 16185 + }, + { + "epoch": 4.8209385878367055, + "grad_norm": 0.3002585172653198, + "learning_rate": 1.1058074196293034e-05, + "loss": 1.2218, + "step": 16186 + }, + { + "epoch": 4.821236434036375, + "grad_norm": 0.5443071126937866, + "learning_rate": 1.1057115039464751e-05, + "loss": 1.2247, + "step": 16187 + }, + { + "epoch": 4.821534280236043, + "grad_norm": 0.28928321599960327, + "learning_rate": 1.10561558728012e-05, + "loss": 1.2367, + "step": 16188 + }, + { + "epoch": 4.821832126435712, + "grad_norm": 0.41158461570739746, + "learning_rate": 1.1055196696311296e-05, + "loss": 1.2314, + "step": 16189 + }, + { + "epoch": 4.822129972635381, + "grad_norm": 0.34556692838668823, + "learning_rate": 1.1054237510003961e-05, + "loss": 1.225, + "step": 16190 + }, + { + "epoch": 4.822427818835049, + "grad_norm": 0.5000821948051453, + "learning_rate": 1.105327831388813e-05, + "loss": 1.2268, + "step": 16191 + }, + { + "epoch": 4.822725665034717, + "grad_norm": 0.323465496301651, + "learning_rate": 1.1052319107972719e-05, + "loss": 1.2025, + "step": 16192 + }, + { + "epoch": 4.823023511234386, + "grad_norm": 0.3453395962715149, + "learning_rate": 1.1051359892266655e-05, + "loss": 1.1967, + "step": 16193 + }, + { + "epoch": 4.823321357434055, + "grad_norm": 0.34367984533309937, + "learning_rate": 1.1050400666778863e-05, + "loss": 1.2116, + "step": 16194 + }, + { + "epoch": 4.823619203633724, + "grad_norm": 0.3218033015727997, + "learning_rate": 1.1049441431518267e-05, + "loss": 1.2118, + "step": 16195 + }, + { + "epoch": 4.823917049833392, + "grad_norm": 0.269066721200943, + "learning_rate": 1.1048482186493785e-05, + "loss": 1.2078, + "step": 16196 + }, + { + "epoch": 4.824214896033061, + "grad_norm": 0.28650161623954773, + "learning_rate": 1.1047522931714352e-05, + "loss": 1.2108, + "step": 16197 + }, + { + "epoch": 4.82451274223273, + "grad_norm": 0.3001042306423187, + "learning_rate": 1.1046563667188888e-05, + "loss": 1.2141, + "step": 16198 + }, + { + "epoch": 4.824810588432398, + "grad_norm": 0.4155952036380768, + "learning_rate": 1.1045604392926318e-05, + "loss": 1.2293, + "step": 16199 + }, + { + "epoch": 4.8251084346320665, + "grad_norm": 0.29334381222724915, + "learning_rate": 1.1044645108935568e-05, + "loss": 1.2134, + "step": 16200 + }, + { + "epoch": 4.825406280831736, + "grad_norm": 0.32471799850463867, + "learning_rate": 1.1043685815225562e-05, + "loss": 1.223, + "step": 16201 + }, + { + "epoch": 4.825704127031404, + "grad_norm": 0.2967764437198639, + "learning_rate": 1.1042726511805226e-05, + "loss": 1.2231, + "step": 16202 + }, + { + "epoch": 4.826001973231072, + "grad_norm": 0.3156512379646301, + "learning_rate": 1.1041767198683483e-05, + "loss": 1.2272, + "step": 16203 + }, + { + "epoch": 4.826299819430742, + "grad_norm": 0.27915698289871216, + "learning_rate": 1.1040807875869261e-05, + "loss": 1.2229, + "step": 16204 + }, + { + "epoch": 4.82659766563041, + "grad_norm": 0.3005918562412262, + "learning_rate": 1.1039848543371485e-05, + "loss": 1.2285, + "step": 16205 + }, + { + "epoch": 4.826895511830079, + "grad_norm": 0.2813635468482971, + "learning_rate": 1.1038889201199084e-05, + "loss": 1.212, + "step": 16206 + }, + { + "epoch": 4.8271933580297475, + "grad_norm": 0.45619508624076843, + "learning_rate": 1.1037929849360976e-05, + "loss": 1.2311, + "step": 16207 + }, + { + "epoch": 4.827491204229416, + "grad_norm": 0.3799247145652771, + "learning_rate": 1.1036970487866086e-05, + "loss": 1.2272, + "step": 16208 + }, + { + "epoch": 4.827789050429085, + "grad_norm": 0.30754658579826355, + "learning_rate": 1.103601111672335e-05, + "loss": 1.2273, + "step": 16209 + }, + { + "epoch": 4.828086896628753, + "grad_norm": 0.33981654047966003, + "learning_rate": 1.1035051735941686e-05, + "loss": 1.2129, + "step": 16210 + }, + { + "epoch": 4.828384742828422, + "grad_norm": 0.2944939136505127, + "learning_rate": 1.1034092345530023e-05, + "loss": 1.216, + "step": 16211 + }, + { + "epoch": 4.828682589028091, + "grad_norm": 0.5903461575508118, + "learning_rate": 1.1033132945497287e-05, + "loss": 1.1992, + "step": 16212 + }, + { + "epoch": 4.828980435227759, + "grad_norm": 0.46102413535118103, + "learning_rate": 1.10321735358524e-05, + "loss": 1.2253, + "step": 16213 + }, + { + "epoch": 4.8292782814274275, + "grad_norm": 0.31985360383987427, + "learning_rate": 1.1031214116604294e-05, + "loss": 1.2162, + "step": 16214 + }, + { + "epoch": 4.829576127627097, + "grad_norm": 0.31031545996665955, + "learning_rate": 1.1030254687761892e-05, + "loss": 1.2283, + "step": 16215 + }, + { + "epoch": 4.829873973826765, + "grad_norm": 0.3633049726486206, + "learning_rate": 1.1029295249334122e-05, + "loss": 1.231, + "step": 16216 + }, + { + "epoch": 4.830171820026434, + "grad_norm": 0.2707124352455139, + "learning_rate": 1.1028335801329907e-05, + "loss": 1.2105, + "step": 16217 + }, + { + "epoch": 4.830469666226103, + "grad_norm": 0.2790050804615021, + "learning_rate": 1.102737634375818e-05, + "loss": 1.199, + "step": 16218 + }, + { + "epoch": 4.830767512425771, + "grad_norm": 0.294877290725708, + "learning_rate": 1.102641687662786e-05, + "loss": 1.224, + "step": 16219 + }, + { + "epoch": 4.83106535862544, + "grad_norm": 0.4410344064235687, + "learning_rate": 1.102545739994788e-05, + "loss": 1.215, + "step": 16220 + }, + { + "epoch": 4.8313632048251085, + "grad_norm": 0.25842905044555664, + "learning_rate": 1.1024497913727166e-05, + "loss": 1.2171, + "step": 16221 + }, + { + "epoch": 4.831661051024777, + "grad_norm": 0.44133102893829346, + "learning_rate": 1.1023538417974641e-05, + "loss": 1.2074, + "step": 16222 + }, + { + "epoch": 4.831958897224446, + "grad_norm": 0.2675754427909851, + "learning_rate": 1.1022578912699237e-05, + "loss": 1.2259, + "step": 16223 + }, + { + "epoch": 4.832256743424114, + "grad_norm": 0.5445632934570312, + "learning_rate": 1.102161939790988e-05, + "loss": 1.2173, + "step": 16224 + }, + { + "epoch": 4.832554589623783, + "grad_norm": 0.3402521014213562, + "learning_rate": 1.1020659873615491e-05, + "loss": 1.2052, + "step": 16225 + }, + { + "epoch": 4.832852435823452, + "grad_norm": 0.3629879653453827, + "learning_rate": 1.1019700339825006e-05, + "loss": 1.2254, + "step": 16226 + }, + { + "epoch": 4.83315028202312, + "grad_norm": 0.27562153339385986, + "learning_rate": 1.1018740796547347e-05, + "loss": 1.2034, + "step": 16227 + }, + { + "epoch": 4.833448128222789, + "grad_norm": 0.46772968769073486, + "learning_rate": 1.1017781243791443e-05, + "loss": 1.223, + "step": 16228 + }, + { + "epoch": 4.833745974422458, + "grad_norm": 0.3419756591320038, + "learning_rate": 1.1016821681566222e-05, + "loss": 1.2179, + "step": 16229 + }, + { + "epoch": 4.834043820622126, + "grad_norm": 0.35203564167022705, + "learning_rate": 1.1015862109880612e-05, + "loss": 1.2265, + "step": 16230 + }, + { + "epoch": 4.8343416668217944, + "grad_norm": 0.2615616023540497, + "learning_rate": 1.1014902528743537e-05, + "loss": 1.2261, + "step": 16231 + }, + { + "epoch": 4.834639513021464, + "grad_norm": 0.32637733221054077, + "learning_rate": 1.101394293816393e-05, + "loss": 1.2284, + "step": 16232 + }, + { + "epoch": 4.834937359221132, + "grad_norm": 0.25350940227508545, + "learning_rate": 1.101298333815072e-05, + "loss": 1.219, + "step": 16233 + }, + { + "epoch": 4.835235205420801, + "grad_norm": 0.3241324722766876, + "learning_rate": 1.1012023728712828e-05, + "loss": 1.2224, + "step": 16234 + }, + { + "epoch": 4.8355330516204695, + "grad_norm": 0.2580425441265106, + "learning_rate": 1.1011064109859186e-05, + "loss": 1.212, + "step": 16235 + }, + { + "epoch": 4.835830897820138, + "grad_norm": 0.26354455947875977, + "learning_rate": 1.1010104481598725e-05, + "loss": 1.2234, + "step": 16236 + }, + { + "epoch": 4.836128744019807, + "grad_norm": 0.26966482400894165, + "learning_rate": 1.1009144843940365e-05, + "loss": 1.2114, + "step": 16237 + }, + { + "epoch": 4.836426590219475, + "grad_norm": 0.2623552680015564, + "learning_rate": 1.1008185196893044e-05, + "loss": 1.2185, + "step": 16238 + }, + { + "epoch": 4.836724436419144, + "grad_norm": 0.3894292712211609, + "learning_rate": 1.1007225540465684e-05, + "loss": 1.2181, + "step": 16239 + }, + { + "epoch": 4.837022282618813, + "grad_norm": 0.27377384901046753, + "learning_rate": 1.1006265874667219e-05, + "loss": 1.2047, + "step": 16240 + }, + { + "epoch": 4.837320128818481, + "grad_norm": 0.5420881509780884, + "learning_rate": 1.1005306199506572e-05, + "loss": 1.2085, + "step": 16241 + }, + { + "epoch": 4.83761797501815, + "grad_norm": 0.4181584119796753, + "learning_rate": 1.1004346514992674e-05, + "loss": 1.2085, + "step": 16242 + }, + { + "epoch": 4.837915821217819, + "grad_norm": 0.42847493290901184, + "learning_rate": 1.1003386821134451e-05, + "loss": 1.2315, + "step": 16243 + }, + { + "epoch": 4.838213667417487, + "grad_norm": 0.505787193775177, + "learning_rate": 1.100242711794084e-05, + "loss": 1.223, + "step": 16244 + }, + { + "epoch": 4.838511513617156, + "grad_norm": 0.2728482484817505, + "learning_rate": 1.1001467405420761e-05, + "loss": 1.1958, + "step": 16245 + }, + { + "epoch": 4.838809359816825, + "grad_norm": 0.36392879486083984, + "learning_rate": 1.1000507683583147e-05, + "loss": 1.2007, + "step": 16246 + }, + { + "epoch": 4.839107206016493, + "grad_norm": 0.32511842250823975, + "learning_rate": 1.0999547952436932e-05, + "loss": 1.2105, + "step": 16247 + }, + { + "epoch": 4.839405052216162, + "grad_norm": 0.4247780442237854, + "learning_rate": 1.0998588211991034e-05, + "loss": 1.2135, + "step": 16248 + }, + { + "epoch": 4.839702898415831, + "grad_norm": 0.29443636536598206, + "learning_rate": 1.0997628462254388e-05, + "loss": 1.2139, + "step": 16249 + }, + { + "epoch": 4.840000744615499, + "grad_norm": 0.32411524653434753, + "learning_rate": 1.0996668703235926e-05, + "loss": 1.2366, + "step": 16250 + }, + { + "epoch": 4.840298590815168, + "grad_norm": 0.3381761908531189, + "learning_rate": 1.0995708934944575e-05, + "loss": 1.2161, + "step": 16251 + }, + { + "epoch": 4.840596437014836, + "grad_norm": 0.24481159448623657, + "learning_rate": 1.0994749157389266e-05, + "loss": 1.2224, + "step": 16252 + }, + { + "epoch": 4.840894283214505, + "grad_norm": 0.253580778837204, + "learning_rate": 1.0993789370578928e-05, + "loss": 1.212, + "step": 16253 + }, + { + "epoch": 4.841192129414174, + "grad_norm": 0.27224865555763245, + "learning_rate": 1.099282957452249e-05, + "loss": 1.2058, + "step": 16254 + }, + { + "epoch": 4.841489975613842, + "grad_norm": 0.2752995193004608, + "learning_rate": 1.0991869769228877e-05, + "loss": 1.22, + "step": 16255 + }, + { + "epoch": 4.8417878218135115, + "grad_norm": 0.30049604177474976, + "learning_rate": 1.0990909954707028e-05, + "loss": 1.2195, + "step": 16256 + }, + { + "epoch": 4.84208566801318, + "grad_norm": 0.2794896364212036, + "learning_rate": 1.0989950130965869e-05, + "loss": 1.2089, + "step": 16257 + }, + { + "epoch": 4.842383514212848, + "grad_norm": 0.31548571586608887, + "learning_rate": 1.098899029801433e-05, + "loss": 1.227, + "step": 16258 + }, + { + "epoch": 4.8426813604125165, + "grad_norm": 0.2630378305912018, + "learning_rate": 1.0988030455861342e-05, + "loss": 1.217, + "step": 16259 + }, + { + "epoch": 4.842979206612186, + "grad_norm": 0.32502326369285583, + "learning_rate": 1.0987070604515832e-05, + "loss": 1.2218, + "step": 16260 + }, + { + "epoch": 4.843277052811854, + "grad_norm": 0.2506658732891083, + "learning_rate": 1.0986110743986736e-05, + "loss": 1.233, + "step": 16261 + }, + { + "epoch": 4.843574899011523, + "grad_norm": 0.38177919387817383, + "learning_rate": 1.0985150874282979e-05, + "loss": 1.2318, + "step": 16262 + }, + { + "epoch": 4.843872745211192, + "grad_norm": 0.3221963942050934, + "learning_rate": 1.0984190995413495e-05, + "loss": 1.2031, + "step": 16263 + }, + { + "epoch": 4.84417059141086, + "grad_norm": 0.30547401309013367, + "learning_rate": 1.0983231107387213e-05, + "loss": 1.218, + "step": 16264 + }, + { + "epoch": 4.844468437610529, + "grad_norm": 0.39576584100723267, + "learning_rate": 1.0982271210213065e-05, + "loss": 1.2191, + "step": 16265 + }, + { + "epoch": 4.8447662838101975, + "grad_norm": 0.2645990252494812, + "learning_rate": 1.098131130389998e-05, + "loss": 1.2242, + "step": 16266 + }, + { + "epoch": 4.845064130009866, + "grad_norm": 0.3115106225013733, + "learning_rate": 1.0980351388456889e-05, + "loss": 1.2205, + "step": 16267 + }, + { + "epoch": 4.845361976209535, + "grad_norm": 0.25013476610183716, + "learning_rate": 1.0979391463892724e-05, + "loss": 1.2219, + "step": 16268 + }, + { + "epoch": 4.845659822409203, + "grad_norm": 0.3354990482330322, + "learning_rate": 1.0978431530216416e-05, + "loss": 1.2341, + "step": 16269 + }, + { + "epoch": 4.845957668608872, + "grad_norm": 0.3058474361896515, + "learning_rate": 1.0977471587436897e-05, + "loss": 1.2157, + "step": 16270 + }, + { + "epoch": 4.846255514808541, + "grad_norm": 0.2506557106971741, + "learning_rate": 1.0976511635563094e-05, + "loss": 1.2137, + "step": 16271 + }, + { + "epoch": 4.846553361008209, + "grad_norm": 0.23520879447460175, + "learning_rate": 1.0975551674603944e-05, + "loss": 1.2154, + "step": 16272 + }, + { + "epoch": 4.846851207207878, + "grad_norm": 0.24506239593029022, + "learning_rate": 1.0974591704568372e-05, + "loss": 1.2026, + "step": 16273 + }, + { + "epoch": 4.847149053407547, + "grad_norm": 0.2806796133518219, + "learning_rate": 1.0973631725465318e-05, + "loss": 1.2286, + "step": 16274 + }, + { + "epoch": 4.847446899607215, + "grad_norm": 0.27367648482322693, + "learning_rate": 1.0972671737303705e-05, + "loss": 1.2304, + "step": 16275 + }, + { + "epoch": 4.847744745806884, + "grad_norm": 0.28956472873687744, + "learning_rate": 1.0971711740092468e-05, + "loss": 1.2177, + "step": 16276 + }, + { + "epoch": 4.848042592006553, + "grad_norm": 0.4239387512207031, + "learning_rate": 1.0970751733840544e-05, + "loss": 1.2135, + "step": 16277 + }, + { + "epoch": 4.848340438206221, + "grad_norm": 0.2668099105358124, + "learning_rate": 1.0969791718556853e-05, + "loss": 1.1985, + "step": 16278 + }, + { + "epoch": 4.84863828440589, + "grad_norm": 0.2952024042606354, + "learning_rate": 1.0968831694250335e-05, + "loss": 1.2088, + "step": 16279 + }, + { + "epoch": 4.8489361306055585, + "grad_norm": 0.2664113938808441, + "learning_rate": 1.0967871660929923e-05, + "loss": 1.2339, + "step": 16280 + }, + { + "epoch": 4.849233976805227, + "grad_norm": 0.3338851034641266, + "learning_rate": 1.0966911618604548e-05, + "loss": 1.2059, + "step": 16281 + }, + { + "epoch": 4.849531823004896, + "grad_norm": 0.3878779411315918, + "learning_rate": 1.0965951567283138e-05, + "loss": 1.2103, + "step": 16282 + }, + { + "epoch": 4.849829669204564, + "grad_norm": 0.3286162316799164, + "learning_rate": 1.0964991506974628e-05, + "loss": 1.2155, + "step": 16283 + }, + { + "epoch": 4.850127515404234, + "grad_norm": 0.2817932665348053, + "learning_rate": 1.096403143768795e-05, + "loss": 1.2107, + "step": 16284 + }, + { + "epoch": 4.850425361603902, + "grad_norm": 0.2845592200756073, + "learning_rate": 1.0963071359432035e-05, + "loss": 1.2231, + "step": 16285 + }, + { + "epoch": 4.85072320780357, + "grad_norm": 0.46321550011634827, + "learning_rate": 1.0962111272215818e-05, + "loss": 1.2061, + "step": 16286 + }, + { + "epoch": 4.8510210540032395, + "grad_norm": 0.5695003867149353, + "learning_rate": 1.0961151176048233e-05, + "loss": 1.2365, + "step": 16287 + }, + { + "epoch": 4.851318900202908, + "grad_norm": 0.24942146241664886, + "learning_rate": 1.096019107093821e-05, + "loss": 1.2143, + "step": 16288 + }, + { + "epoch": 4.851616746402576, + "grad_norm": 0.4089578092098236, + "learning_rate": 1.095923095689468e-05, + "loss": 1.2036, + "step": 16289 + }, + { + "epoch": 4.851914592602245, + "grad_norm": 0.2610035836696625, + "learning_rate": 1.0958270833926579e-05, + "loss": 1.2214, + "step": 16290 + }, + { + "epoch": 4.852212438801914, + "grad_norm": 0.3350369930267334, + "learning_rate": 1.0957310702042836e-05, + "loss": 1.2054, + "step": 16291 + }, + { + "epoch": 4.852510285001582, + "grad_norm": 0.3857544958591461, + "learning_rate": 1.0956350561252388e-05, + "loss": 1.2236, + "step": 16292 + }, + { + "epoch": 4.852808131201251, + "grad_norm": 0.2987971603870392, + "learning_rate": 1.0955390411564166e-05, + "loss": 1.2152, + "step": 16293 + }, + { + "epoch": 4.8531059774009195, + "grad_norm": 0.2751263678073883, + "learning_rate": 1.0954430252987107e-05, + "loss": 1.2262, + "step": 16294 + }, + { + "epoch": 4.853403823600588, + "grad_norm": 0.29420581459999084, + "learning_rate": 1.0953470085530138e-05, + "loss": 1.2135, + "step": 16295 + }, + { + "epoch": 4.853701669800257, + "grad_norm": 0.25773486495018005, + "learning_rate": 1.0952509909202194e-05, + "loss": 1.2047, + "step": 16296 + }, + { + "epoch": 4.853999515999925, + "grad_norm": 0.2723999321460724, + "learning_rate": 1.0951549724012209e-05, + "loss": 1.2172, + "step": 16297 + }, + { + "epoch": 4.854297362199594, + "grad_norm": 0.2643665075302124, + "learning_rate": 1.0950589529969123e-05, + "loss": 1.2179, + "step": 16298 + }, + { + "epoch": 4.854595208399263, + "grad_norm": 0.4759957790374756, + "learning_rate": 1.0949629327081856e-05, + "loss": 1.2171, + "step": 16299 + }, + { + "epoch": 4.854893054598931, + "grad_norm": 0.40444934368133545, + "learning_rate": 1.0948669115359354e-05, + "loss": 1.2137, + "step": 16300 + }, + { + "epoch": 4.8551909007986005, + "grad_norm": 0.3371957540512085, + "learning_rate": 1.0947708894810542e-05, + "loss": 1.2096, + "step": 16301 + }, + { + "epoch": 4.855488746998269, + "grad_norm": 0.4310877323150635, + "learning_rate": 1.0946748665444362e-05, + "loss": 1.2196, + "step": 16302 + }, + { + "epoch": 4.855786593197937, + "grad_norm": 0.28807327151298523, + "learning_rate": 1.094578842726974e-05, + "loss": 1.2155, + "step": 16303 + }, + { + "epoch": 4.856084439397606, + "grad_norm": 0.3602162301540375, + "learning_rate": 1.0944828180295615e-05, + "loss": 1.248, + "step": 16304 + }, + { + "epoch": 4.856382285597275, + "grad_norm": 0.4068543314933777, + "learning_rate": 1.0943867924530919e-05, + "loss": 1.2225, + "step": 16305 + }, + { + "epoch": 4.856680131796943, + "grad_norm": 0.6764812469482422, + "learning_rate": 1.0942907659984587e-05, + "loss": 1.2324, + "step": 16306 + }, + { + "epoch": 4.856977977996612, + "grad_norm": 0.26010850071907043, + "learning_rate": 1.0941947386665553e-05, + "loss": 1.2259, + "step": 16307 + }, + { + "epoch": 4.857275824196281, + "grad_norm": 0.6575981974601746, + "learning_rate": 1.094098710458275e-05, + "loss": 1.2418, + "step": 16308 + }, + { + "epoch": 4.857573670395949, + "grad_norm": 0.26086241006851196, + "learning_rate": 1.0940026813745113e-05, + "loss": 1.2003, + "step": 16309 + }, + { + "epoch": 4.857871516595618, + "grad_norm": 0.6123746633529663, + "learning_rate": 1.0939066514161576e-05, + "loss": 1.2099, + "step": 16310 + }, + { + "epoch": 4.858169362795286, + "grad_norm": 0.3198632597923279, + "learning_rate": 1.0938106205841077e-05, + "loss": 1.2182, + "step": 16311 + }, + { + "epoch": 4.858467208994956, + "grad_norm": 0.5020037889480591, + "learning_rate": 1.0937145888792546e-05, + "loss": 1.2073, + "step": 16312 + }, + { + "epoch": 4.858765055194624, + "grad_norm": 0.2920586168766022, + "learning_rate": 1.0936185563024922e-05, + "loss": 1.2143, + "step": 16313 + }, + { + "epoch": 4.859062901394292, + "grad_norm": 0.4286530613899231, + "learning_rate": 1.0935225228547133e-05, + "loss": 1.2121, + "step": 16314 + }, + { + "epoch": 4.8593607475939615, + "grad_norm": 0.35088905692100525, + "learning_rate": 1.093426488536812e-05, + "loss": 1.2192, + "step": 16315 + }, + { + "epoch": 4.85965859379363, + "grad_norm": 0.38091519474983215, + "learning_rate": 1.0933304533496817e-05, + "loss": 1.2157, + "step": 16316 + }, + { + "epoch": 4.859956439993298, + "grad_norm": 0.39246076345443726, + "learning_rate": 1.0932344172942158e-05, + "loss": 1.2127, + "step": 16317 + }, + { + "epoch": 4.860254286192967, + "grad_norm": 0.43078768253326416, + "learning_rate": 1.0931383803713079e-05, + "loss": 1.2069, + "step": 16318 + }, + { + "epoch": 4.860552132392636, + "grad_norm": 0.32488927245140076, + "learning_rate": 1.093042342581851e-05, + "loss": 1.2297, + "step": 16319 + }, + { + "epoch": 4.860849978592304, + "grad_norm": 0.35250064730644226, + "learning_rate": 1.0929463039267393e-05, + "loss": 1.2218, + "step": 16320 + }, + { + "epoch": 4.861147824791973, + "grad_norm": 0.27092909812927246, + "learning_rate": 1.0928502644068662e-05, + "loss": 1.226, + "step": 16321 + }, + { + "epoch": 4.861445670991642, + "grad_norm": 0.43906137347221375, + "learning_rate": 1.0927542240231253e-05, + "loss": 1.2118, + "step": 16322 + }, + { + "epoch": 4.861743517191311, + "grad_norm": 0.3613179922103882, + "learning_rate": 1.0926581827764097e-05, + "loss": 1.2143, + "step": 16323 + }, + { + "epoch": 4.862041363390979, + "grad_norm": 0.3018326163291931, + "learning_rate": 1.0925621406676132e-05, + "loss": 1.2093, + "step": 16324 + }, + { + "epoch": 4.8623392095906475, + "grad_norm": 0.30221664905548096, + "learning_rate": 1.0924660976976295e-05, + "loss": 1.2193, + "step": 16325 + }, + { + "epoch": 4.862637055790316, + "grad_norm": 0.40009406208992004, + "learning_rate": 1.092370053867352e-05, + "loss": 1.201, + "step": 16326 + }, + { + "epoch": 4.862934901989985, + "grad_norm": 0.4788067936897278, + "learning_rate": 1.0922740091776744e-05, + "loss": 1.2159, + "step": 16327 + }, + { + "epoch": 4.863232748189653, + "grad_norm": 0.3238835632801056, + "learning_rate": 1.0921779636294904e-05, + "loss": 1.2291, + "step": 16328 + }, + { + "epoch": 4.863530594389323, + "grad_norm": 0.3570665419101715, + "learning_rate": 1.0920819172236932e-05, + "loss": 1.2074, + "step": 16329 + }, + { + "epoch": 4.863828440588991, + "grad_norm": 0.3871474266052246, + "learning_rate": 1.0919858699611767e-05, + "loss": 1.2091, + "step": 16330 + }, + { + "epoch": 4.864126286788659, + "grad_norm": 0.4075256884098053, + "learning_rate": 1.0918898218428344e-05, + "loss": 1.233, + "step": 16331 + }, + { + "epoch": 4.864424132988328, + "grad_norm": 0.2669723927974701, + "learning_rate": 1.09179377286956e-05, + "loss": 1.2266, + "step": 16332 + }, + { + "epoch": 4.864721979187997, + "grad_norm": 0.2555624842643738, + "learning_rate": 1.0916977230422472e-05, + "loss": 1.2367, + "step": 16333 + }, + { + "epoch": 4.865019825387665, + "grad_norm": 0.3796085715293884, + "learning_rate": 1.0916016723617894e-05, + "loss": 1.22, + "step": 16334 + }, + { + "epoch": 4.865317671587334, + "grad_norm": 0.26811715960502625, + "learning_rate": 1.0915056208290807e-05, + "loss": 1.2075, + "step": 16335 + }, + { + "epoch": 4.865615517787003, + "grad_norm": 0.32306674122810364, + "learning_rate": 1.091409568445014e-05, + "loss": 1.2161, + "step": 16336 + }, + { + "epoch": 4.865913363986671, + "grad_norm": 0.30818474292755127, + "learning_rate": 1.0913135152104835e-05, + "loss": 1.2176, + "step": 16337 + }, + { + "epoch": 4.86621121018634, + "grad_norm": 0.6810904145240784, + "learning_rate": 1.0912174611263828e-05, + "loss": 1.2138, + "step": 16338 + }, + { + "epoch": 4.8665090563860085, + "grad_norm": 0.5148798227310181, + "learning_rate": 1.0911214061936057e-05, + "loss": 1.225, + "step": 16339 + }, + { + "epoch": 4.866806902585678, + "grad_norm": 0.41127029061317444, + "learning_rate": 1.0910253504130457e-05, + "loss": 1.2184, + "step": 16340 + }, + { + "epoch": 4.867104748785346, + "grad_norm": 0.2449532151222229, + "learning_rate": 1.0909292937855964e-05, + "loss": 1.2153, + "step": 16341 + }, + { + "epoch": 4.867402594985014, + "grad_norm": 0.5934796929359436, + "learning_rate": 1.0908332363121516e-05, + "loss": 1.2326, + "step": 16342 + }, + { + "epoch": 4.867700441184684, + "grad_norm": 0.2860567569732666, + "learning_rate": 1.0907371779936051e-05, + "loss": 1.2216, + "step": 16343 + }, + { + "epoch": 4.867998287384352, + "grad_norm": 0.558010995388031, + "learning_rate": 1.0906411188308504e-05, + "loss": 1.2263, + "step": 16344 + }, + { + "epoch": 4.86829613358402, + "grad_norm": 0.2574203312397003, + "learning_rate": 1.0905450588247815e-05, + "loss": 1.2233, + "step": 16345 + }, + { + "epoch": 4.8685939797836895, + "grad_norm": 0.424591600894928, + "learning_rate": 1.0904489979762922e-05, + "loss": 1.2141, + "step": 16346 + }, + { + "epoch": 4.868891825983358, + "grad_norm": 0.2902675271034241, + "learning_rate": 1.0903529362862758e-05, + "loss": 1.2281, + "step": 16347 + }, + { + "epoch": 4.869189672183026, + "grad_norm": 0.28373557329177856, + "learning_rate": 1.090256873755626e-05, + "loss": 1.206, + "step": 16348 + }, + { + "epoch": 4.869487518382695, + "grad_norm": 0.3348288834095001, + "learning_rate": 1.0901608103852373e-05, + "loss": 1.2062, + "step": 16349 + }, + { + "epoch": 4.869785364582364, + "grad_norm": 0.34329766035079956, + "learning_rate": 1.0900647461760025e-05, + "loss": 1.2268, + "step": 16350 + }, + { + "epoch": 4.870083210782033, + "grad_norm": 0.36737751960754395, + "learning_rate": 1.0899686811288162e-05, + "loss": 1.2409, + "step": 16351 + }, + { + "epoch": 4.870381056981701, + "grad_norm": 0.42342904210090637, + "learning_rate": 1.089872615244572e-05, + "loss": 1.2172, + "step": 16352 + }, + { + "epoch": 4.8706789031813695, + "grad_norm": 0.3289167284965515, + "learning_rate": 1.0897765485241632e-05, + "loss": 1.2026, + "step": 16353 + }, + { + "epoch": 4.870976749381039, + "grad_norm": 0.33885636925697327, + "learning_rate": 1.089680480968484e-05, + "loss": 1.2022, + "step": 16354 + }, + { + "epoch": 4.871274595580707, + "grad_norm": 0.2676118016242981, + "learning_rate": 1.0895844125784278e-05, + "loss": 1.2043, + "step": 16355 + }, + { + "epoch": 4.871572441780375, + "grad_norm": 0.3302132785320282, + "learning_rate": 1.0894883433548894e-05, + "loss": 1.2302, + "step": 16356 + }, + { + "epoch": 4.871870287980045, + "grad_norm": 0.26079061627388, + "learning_rate": 1.0893922732987616e-05, + "loss": 1.2163, + "step": 16357 + }, + { + "epoch": 4.872168134179713, + "grad_norm": 0.3585273325443268, + "learning_rate": 1.0892962024109385e-05, + "loss": 1.2191, + "step": 16358 + }, + { + "epoch": 4.872465980379381, + "grad_norm": 0.3633761405944824, + "learning_rate": 1.0892001306923143e-05, + "loss": 1.2327, + "step": 16359 + }, + { + "epoch": 4.8727638265790505, + "grad_norm": 0.2803542912006378, + "learning_rate": 1.0891040581437822e-05, + "loss": 1.2075, + "step": 16360 + }, + { + "epoch": 4.873061672778719, + "grad_norm": 0.3831990659236908, + "learning_rate": 1.0890079847662364e-05, + "loss": 1.2139, + "step": 16361 + }, + { + "epoch": 4.873359518978387, + "grad_norm": 0.2607085108757019, + "learning_rate": 1.088911910560571e-05, + "loss": 1.2367, + "step": 16362 + }, + { + "epoch": 4.873657365178056, + "grad_norm": 0.302704781293869, + "learning_rate": 1.0888158355276796e-05, + "loss": 1.2179, + "step": 16363 + }, + { + "epoch": 4.873955211377725, + "grad_norm": 0.2732965648174286, + "learning_rate": 1.088719759668456e-05, + "loss": 1.2133, + "step": 16364 + }, + { + "epoch": 4.874253057577393, + "grad_norm": 0.2925439774990082, + "learning_rate": 1.0886236829837942e-05, + "loss": 1.2175, + "step": 16365 + }, + { + "epoch": 4.874550903777062, + "grad_norm": 0.27101626992225647, + "learning_rate": 1.0885276054745879e-05, + "loss": 1.2269, + "step": 16366 + }, + { + "epoch": 4.874848749976731, + "grad_norm": 0.3258912265300751, + "learning_rate": 1.0884315271417311e-05, + "loss": 1.2213, + "step": 16367 + }, + { + "epoch": 4.8751465961764, + "grad_norm": 0.2819480895996094, + "learning_rate": 1.0883354479861179e-05, + "loss": 1.2209, + "step": 16368 + }, + { + "epoch": 4.875444442376068, + "grad_norm": 0.28845056891441345, + "learning_rate": 1.0882393680086423e-05, + "loss": 1.2409, + "step": 16369 + }, + { + "epoch": 4.875742288575736, + "grad_norm": 0.3165302574634552, + "learning_rate": 1.0881432872101976e-05, + "loss": 1.2406, + "step": 16370 + }, + { + "epoch": 4.876040134775406, + "grad_norm": 0.3963184952735901, + "learning_rate": 1.0880472055916782e-05, + "loss": 1.2119, + "step": 16371 + }, + { + "epoch": 4.876337980975074, + "grad_norm": 0.2877182364463806, + "learning_rate": 1.0879511231539778e-05, + "loss": 1.2265, + "step": 16372 + }, + { + "epoch": 4.876635827174742, + "grad_norm": 0.3519115447998047, + "learning_rate": 1.0878550398979905e-05, + "loss": 1.2153, + "step": 16373 + }, + { + "epoch": 4.8769336733744115, + "grad_norm": 0.25945916771888733, + "learning_rate": 1.0877589558246102e-05, + "loss": 1.2227, + "step": 16374 + }, + { + "epoch": 4.87723151957408, + "grad_norm": 0.6392548680305481, + "learning_rate": 1.087662870934731e-05, + "loss": 1.2284, + "step": 16375 + }, + { + "epoch": 4.877529365773748, + "grad_norm": 0.5318585634231567, + "learning_rate": 1.087566785229247e-05, + "loss": 1.2234, + "step": 16376 + }, + { + "epoch": 4.877827211973417, + "grad_norm": 0.3100576400756836, + "learning_rate": 1.0874706987090513e-05, + "loss": 1.2355, + "step": 16377 + }, + { + "epoch": 4.878125058173086, + "grad_norm": 0.30222323536872864, + "learning_rate": 1.0873746113750385e-05, + "loss": 1.2178, + "step": 16378 + }, + { + "epoch": 4.878422904372755, + "grad_norm": 0.260503351688385, + "learning_rate": 1.087278523228103e-05, + "loss": 1.2172, + "step": 16379 + }, + { + "epoch": 4.878720750572423, + "grad_norm": 0.29023849964141846, + "learning_rate": 1.0871824342691382e-05, + "loss": 1.2371, + "step": 16380 + }, + { + "epoch": 4.879018596772092, + "grad_norm": 0.28010281920433044, + "learning_rate": 1.0870863444990383e-05, + "loss": 1.217, + "step": 16381 + }, + { + "epoch": 4.879316442971761, + "grad_norm": 0.2529861330986023, + "learning_rate": 1.0869902539186972e-05, + "loss": 1.2127, + "step": 16382 + }, + { + "epoch": 4.879614289171429, + "grad_norm": 0.2620205879211426, + "learning_rate": 1.086894162529009e-05, + "loss": 1.2276, + "step": 16383 + }, + { + "epoch": 4.8799121353710975, + "grad_norm": 0.27573156356811523, + "learning_rate": 1.0867980703308673e-05, + "loss": 1.2225, + "step": 16384 + }, + { + "epoch": 4.880209981570767, + "grad_norm": 0.2804858684539795, + "learning_rate": 1.086701977325167e-05, + "loss": 1.2205, + "step": 16385 + }, + { + "epoch": 4.880507827770435, + "grad_norm": 0.2714120149612427, + "learning_rate": 1.0866058835128017e-05, + "loss": 1.2305, + "step": 16386 + }, + { + "epoch": 4.880805673970103, + "grad_norm": 0.34036484360694885, + "learning_rate": 1.0865097888946654e-05, + "loss": 1.2178, + "step": 16387 + }, + { + "epoch": 4.881103520169773, + "grad_norm": 0.2547363340854645, + "learning_rate": 1.086413693471652e-05, + "loss": 1.2093, + "step": 16388 + }, + { + "epoch": 4.881401366369441, + "grad_norm": 0.29458507895469666, + "learning_rate": 1.0863175972446556e-05, + "loss": 1.222, + "step": 16389 + }, + { + "epoch": 4.88169921256911, + "grad_norm": 0.41369563341140747, + "learning_rate": 1.0862215002145706e-05, + "loss": 1.2143, + "step": 16390 + }, + { + "epoch": 4.881997058768778, + "grad_norm": 0.3324941098690033, + "learning_rate": 1.086125402382291e-05, + "loss": 1.2142, + "step": 16391 + }, + { + "epoch": 4.882294904968447, + "grad_norm": 0.25271379947662354, + "learning_rate": 1.0860293037487104e-05, + "loss": 1.2111, + "step": 16392 + }, + { + "epoch": 4.882592751168115, + "grad_norm": 0.27145299315452576, + "learning_rate": 1.0859332043147237e-05, + "loss": 1.2071, + "step": 16393 + }, + { + "epoch": 4.882890597367784, + "grad_norm": 0.2988566756248474, + "learning_rate": 1.0858371040812243e-05, + "loss": 1.2026, + "step": 16394 + }, + { + "epoch": 4.883188443567453, + "grad_norm": 0.31120866537094116, + "learning_rate": 1.0857410030491065e-05, + "loss": 1.2241, + "step": 16395 + }, + { + "epoch": 4.883486289767122, + "grad_norm": 0.37695109844207764, + "learning_rate": 1.0856449012192642e-05, + "loss": 1.2189, + "step": 16396 + }, + { + "epoch": 4.88378413596679, + "grad_norm": 0.6909080147743225, + "learning_rate": 1.0855487985925923e-05, + "loss": 1.214, + "step": 16397 + }, + { + "epoch": 4.8840819821664585, + "grad_norm": 0.44229087233543396, + "learning_rate": 1.0854526951699842e-05, + "loss": 1.2097, + "step": 16398 + }, + { + "epoch": 4.884379828366128, + "grad_norm": 0.39378392696380615, + "learning_rate": 1.0853565909523343e-05, + "loss": 1.2238, + "step": 16399 + }, + { + "epoch": 4.884677674565796, + "grad_norm": 0.48348134756088257, + "learning_rate": 1.0852604859405367e-05, + "loss": 1.227, + "step": 16400 + }, + { + "epoch": 4.884975520765464, + "grad_norm": 0.27815037965774536, + "learning_rate": 1.0851643801354855e-05, + "loss": 1.2222, + "step": 16401 + }, + { + "epoch": 4.885273366965134, + "grad_norm": 0.40838098526000977, + "learning_rate": 1.0850682735380744e-05, + "loss": 1.2166, + "step": 16402 + }, + { + "epoch": 4.885571213164802, + "grad_norm": 0.3574656546115875, + "learning_rate": 1.0849721661491986e-05, + "loss": 1.2394, + "step": 16403 + }, + { + "epoch": 4.88586905936447, + "grad_norm": 0.653743326663971, + "learning_rate": 1.0848760579697519e-05, + "loss": 1.2254, + "step": 16404 + }, + { + "epoch": 4.8861669055641395, + "grad_norm": 0.26477012038230896, + "learning_rate": 1.0847799490006278e-05, + "loss": 1.2147, + "step": 16405 + }, + { + "epoch": 4.886464751763808, + "grad_norm": 0.3457360863685608, + "learning_rate": 1.0846838392427215e-05, + "loss": 1.2195, + "step": 16406 + }, + { + "epoch": 4.886762597963477, + "grad_norm": 0.2645303010940552, + "learning_rate": 1.0845877286969265e-05, + "loss": 1.236, + "step": 16407 + }, + { + "epoch": 4.887060444163145, + "grad_norm": 0.28887856006622314, + "learning_rate": 1.0844916173641369e-05, + "loss": 1.2138, + "step": 16408 + }, + { + "epoch": 4.887358290362814, + "grad_norm": 0.32342734932899475, + "learning_rate": 1.0843955052452475e-05, + "loss": 1.2403, + "step": 16409 + }, + { + "epoch": 4.887656136562483, + "grad_norm": 0.3168574273586273, + "learning_rate": 1.0842993923411523e-05, + "loss": 1.2267, + "step": 16410 + }, + { + "epoch": 4.887953982762151, + "grad_norm": 0.3962790369987488, + "learning_rate": 1.0842032786527452e-05, + "loss": 1.2319, + "step": 16411 + }, + { + "epoch": 4.8882518289618195, + "grad_norm": 0.42341771721839905, + "learning_rate": 1.0841071641809208e-05, + "loss": 1.2311, + "step": 16412 + }, + { + "epoch": 4.888549675161489, + "grad_norm": 0.26488226652145386, + "learning_rate": 1.0840110489265731e-05, + "loss": 1.2183, + "step": 16413 + }, + { + "epoch": 4.888847521361157, + "grad_norm": 0.3394005298614502, + "learning_rate": 1.0839149328905965e-05, + "loss": 1.2168, + "step": 16414 + }, + { + "epoch": 4.889145367560825, + "grad_norm": 0.29203420877456665, + "learning_rate": 1.0838188160738852e-05, + "loss": 1.2121, + "step": 16415 + }, + { + "epoch": 4.889443213760495, + "grad_norm": 0.5804629325866699, + "learning_rate": 1.0837226984773335e-05, + "loss": 1.2104, + "step": 16416 + }, + { + "epoch": 4.889741059960163, + "grad_norm": 0.5409330129623413, + "learning_rate": 1.0836265801018358e-05, + "loss": 1.219, + "step": 16417 + }, + { + "epoch": 4.890038906159832, + "grad_norm": 0.2707678973674774, + "learning_rate": 1.0835304609482859e-05, + "loss": 1.2225, + "step": 16418 + }, + { + "epoch": 4.8903367523595005, + "grad_norm": 0.46385636925697327, + "learning_rate": 1.0834343410175784e-05, + "loss": 1.2179, + "step": 16419 + }, + { + "epoch": 4.890634598559169, + "grad_norm": 0.3915536105632782, + "learning_rate": 1.0833382203106076e-05, + "loss": 1.2159, + "step": 16420 + }, + { + "epoch": 4.890932444758838, + "grad_norm": 0.2822267711162567, + "learning_rate": 1.0832420988282682e-05, + "loss": 1.2154, + "step": 16421 + }, + { + "epoch": 4.891230290958506, + "grad_norm": 0.441423237323761, + "learning_rate": 1.0831459765714536e-05, + "loss": 1.2016, + "step": 16422 + }, + { + "epoch": 4.891528137158175, + "grad_norm": 0.2737753987312317, + "learning_rate": 1.0830498535410587e-05, + "loss": 1.2177, + "step": 16423 + }, + { + "epoch": 4.891825983357844, + "grad_norm": 0.625218391418457, + "learning_rate": 1.082953729737978e-05, + "loss": 1.2245, + "step": 16424 + }, + { + "epoch": 4.892123829557512, + "grad_norm": 0.38089168071746826, + "learning_rate": 1.0828576051631048e-05, + "loss": 1.2164, + "step": 16425 + }, + { + "epoch": 4.892421675757181, + "grad_norm": 0.2916334569454193, + "learning_rate": 1.0827614798173347e-05, + "loss": 1.223, + "step": 16426 + }, + { + "epoch": 4.89271952195685, + "grad_norm": 0.32069745659828186, + "learning_rate": 1.0826653537015615e-05, + "loss": 1.2186, + "step": 16427 + }, + { + "epoch": 4.893017368156518, + "grad_norm": 0.28550082445144653, + "learning_rate": 1.0825692268166794e-05, + "loss": 1.2155, + "step": 16428 + }, + { + "epoch": 4.893315214356186, + "grad_norm": 0.3154747784137726, + "learning_rate": 1.082473099163583e-05, + "loss": 1.2046, + "step": 16429 + }, + { + "epoch": 4.893613060555856, + "grad_norm": 0.35802021622657776, + "learning_rate": 1.0823769707431663e-05, + "loss": 1.2329, + "step": 16430 + }, + { + "epoch": 4.893910906755524, + "grad_norm": 0.3719880282878876, + "learning_rate": 1.0822808415563242e-05, + "loss": 1.2279, + "step": 16431 + }, + { + "epoch": 4.894208752955192, + "grad_norm": 0.2961256206035614, + "learning_rate": 1.0821847116039508e-05, + "loss": 1.2018, + "step": 16432 + }, + { + "epoch": 4.8945065991548615, + "grad_norm": 0.352282851934433, + "learning_rate": 1.0820885808869404e-05, + "loss": 1.2299, + "step": 16433 + }, + { + "epoch": 4.89480444535453, + "grad_norm": 0.28779590129852295, + "learning_rate": 1.0819924494061875e-05, + "loss": 1.2112, + "step": 16434 + }, + { + "epoch": 4.895102291554199, + "grad_norm": 0.2735520899295807, + "learning_rate": 1.0818963171625865e-05, + "loss": 1.2135, + "step": 16435 + }, + { + "epoch": 4.895400137753867, + "grad_norm": 0.335479199886322, + "learning_rate": 1.0818001841570316e-05, + "loss": 1.2188, + "step": 16436 + }, + { + "epoch": 4.895697983953536, + "grad_norm": 0.2797480523586273, + "learning_rate": 1.0817040503904172e-05, + "loss": 1.2282, + "step": 16437 + }, + { + "epoch": 4.895995830153205, + "grad_norm": 0.2558453679084778, + "learning_rate": 1.0816079158636383e-05, + "loss": 1.2277, + "step": 16438 + }, + { + "epoch": 4.896293676352873, + "grad_norm": 0.32490307092666626, + "learning_rate": 1.0815117805775889e-05, + "loss": 1.2111, + "step": 16439 + }, + { + "epoch": 4.896591522552542, + "grad_norm": 0.25975948572158813, + "learning_rate": 1.0814156445331635e-05, + "loss": 1.2087, + "step": 16440 + }, + { + "epoch": 4.896889368752211, + "grad_norm": 0.3289744555950165, + "learning_rate": 1.0813195077312564e-05, + "loss": 1.206, + "step": 16441 + }, + { + "epoch": 4.897187214951879, + "grad_norm": 0.2599387466907501, + "learning_rate": 1.081223370172762e-05, + "loss": 1.2208, + "step": 16442 + }, + { + "epoch": 4.8974850611515475, + "grad_norm": 0.49145135283470154, + "learning_rate": 1.0811272318585745e-05, + "loss": 1.2194, + "step": 16443 + }, + { + "epoch": 4.897782907351217, + "grad_norm": 0.28819048404693604, + "learning_rate": 1.0810310927895897e-05, + "loss": 1.1981, + "step": 16444 + }, + { + "epoch": 4.898080753550885, + "grad_norm": 0.40600836277008057, + "learning_rate": 1.0809349529667006e-05, + "loss": 1.2087, + "step": 16445 + }, + { + "epoch": 4.898378599750554, + "grad_norm": 0.4209963083267212, + "learning_rate": 1.080838812390802e-05, + "loss": 1.2194, + "step": 16446 + }, + { + "epoch": 4.898676445950223, + "grad_norm": 0.2710931599140167, + "learning_rate": 1.0807426710627888e-05, + "loss": 1.2169, + "step": 16447 + }, + { + "epoch": 4.898974292149891, + "grad_norm": 0.3259083926677704, + "learning_rate": 1.0806465289835552e-05, + "loss": 1.2131, + "step": 16448 + }, + { + "epoch": 4.89927213834956, + "grad_norm": 0.3095962107181549, + "learning_rate": 1.0805503861539959e-05, + "loss": 1.2287, + "step": 16449 + }, + { + "epoch": 4.899569984549228, + "grad_norm": 0.32337522506713867, + "learning_rate": 1.0804542425750052e-05, + "loss": 1.2082, + "step": 16450 + }, + { + "epoch": 4.899867830748897, + "grad_norm": 0.4136713147163391, + "learning_rate": 1.0803580982474777e-05, + "loss": 1.2041, + "step": 16451 + }, + { + "epoch": 4.900165676948566, + "grad_norm": 0.28453126549720764, + "learning_rate": 1.0802619531723077e-05, + "loss": 1.2072, + "step": 16452 + }, + { + "epoch": 4.900463523148234, + "grad_norm": 0.46619874238967896, + "learning_rate": 1.08016580735039e-05, + "loss": 1.2068, + "step": 16453 + }, + { + "epoch": 4.900761369347903, + "grad_norm": 0.4028174877166748, + "learning_rate": 1.0800696607826188e-05, + "loss": 1.2168, + "step": 16454 + }, + { + "epoch": 4.901059215547572, + "grad_norm": 0.3680400252342224, + "learning_rate": 1.0799735134698893e-05, + "loss": 1.2184, + "step": 16455 + }, + { + "epoch": 4.90135706174724, + "grad_norm": 0.613753080368042, + "learning_rate": 1.0798773654130951e-05, + "loss": 1.227, + "step": 16456 + }, + { + "epoch": 4.901654907946909, + "grad_norm": 0.32443124055862427, + "learning_rate": 1.0797812166131316e-05, + "loss": 1.2075, + "step": 16457 + }, + { + "epoch": 4.901952754146578, + "grad_norm": 0.409507155418396, + "learning_rate": 1.0796850670708934e-05, + "loss": 1.2139, + "step": 16458 + }, + { + "epoch": 4.902250600346246, + "grad_norm": 0.2848486602306366, + "learning_rate": 1.0795889167872743e-05, + "loss": 1.22, + "step": 16459 + }, + { + "epoch": 4.902548446545914, + "grad_norm": 0.2892007529735565, + "learning_rate": 1.0794927657631689e-05, + "loss": 1.1935, + "step": 16460 + }, + { + "epoch": 4.902846292745584, + "grad_norm": 0.3312188684940338, + "learning_rate": 1.0793966139994723e-05, + "loss": 1.226, + "step": 16461 + }, + { + "epoch": 4.903144138945252, + "grad_norm": 0.4444887936115265, + "learning_rate": 1.0793004614970793e-05, + "loss": 1.2277, + "step": 16462 + }, + { + "epoch": 4.903441985144921, + "grad_norm": 0.2989336848258972, + "learning_rate": 1.0792043082568836e-05, + "loss": 1.2244, + "step": 16463 + }, + { + "epoch": 4.9037398313445895, + "grad_norm": 0.3596751093864441, + "learning_rate": 1.0791081542797805e-05, + "loss": 1.2229, + "step": 16464 + }, + { + "epoch": 4.904037677544258, + "grad_norm": 0.39726847410202026, + "learning_rate": 1.0790119995666649e-05, + "loss": 1.2193, + "step": 16465 + }, + { + "epoch": 4.904335523743927, + "grad_norm": 0.37603044509887695, + "learning_rate": 1.0789158441184302e-05, + "loss": 1.2189, + "step": 16466 + }, + { + "epoch": 4.904633369943595, + "grad_norm": 0.4337458312511444, + "learning_rate": 1.0788196879359718e-05, + "loss": 1.2165, + "step": 16467 + }, + { + "epoch": 4.904931216143264, + "grad_norm": 0.5279236435890198, + "learning_rate": 1.0787235310201847e-05, + "loss": 1.2166, + "step": 16468 + }, + { + "epoch": 4.905229062342933, + "grad_norm": 0.25897216796875, + "learning_rate": 1.078627373371963e-05, + "loss": 1.2118, + "step": 16469 + }, + { + "epoch": 4.905526908542601, + "grad_norm": 0.27286821603775024, + "learning_rate": 1.0785312149922012e-05, + "loss": 1.2259, + "step": 16470 + }, + { + "epoch": 4.9058247547422695, + "grad_norm": 0.28314799070358276, + "learning_rate": 1.0784350558817942e-05, + "loss": 1.2219, + "step": 16471 + }, + { + "epoch": 4.906122600941939, + "grad_norm": 0.30134689807891846, + "learning_rate": 1.0783388960416367e-05, + "loss": 1.2227, + "step": 16472 + }, + { + "epoch": 4.906420447141607, + "grad_norm": 0.3107009828090668, + "learning_rate": 1.0782427354726235e-05, + "loss": 1.2111, + "step": 16473 + }, + { + "epoch": 4.906718293341276, + "grad_norm": 0.45756450295448303, + "learning_rate": 1.0781465741756487e-05, + "loss": 1.2211, + "step": 16474 + }, + { + "epoch": 4.907016139540945, + "grad_norm": 0.2705390453338623, + "learning_rate": 1.0780504121516077e-05, + "loss": 1.2275, + "step": 16475 + }, + { + "epoch": 4.907313985740613, + "grad_norm": 0.6505392789840698, + "learning_rate": 1.0779542494013947e-05, + "loss": 1.2343, + "step": 16476 + }, + { + "epoch": 4.907611831940282, + "grad_norm": 0.32612910866737366, + "learning_rate": 1.0778580859259045e-05, + "loss": 1.2083, + "step": 16477 + }, + { + "epoch": 4.9079096781399505, + "grad_norm": 0.4365836977958679, + "learning_rate": 1.0777619217260317e-05, + "loss": 1.2202, + "step": 16478 + }, + { + "epoch": 4.908207524339619, + "grad_norm": 0.2678183615207672, + "learning_rate": 1.0776657568026712e-05, + "loss": 1.2287, + "step": 16479 + }, + { + "epoch": 4.908505370539288, + "grad_norm": 0.5058385133743286, + "learning_rate": 1.0775695911567175e-05, + "loss": 1.2089, + "step": 16480 + }, + { + "epoch": 4.908803216738956, + "grad_norm": 0.3062038719654083, + "learning_rate": 1.0774734247890655e-05, + "loss": 1.2147, + "step": 16481 + }, + { + "epoch": 4.909101062938625, + "grad_norm": 0.3651920258998871, + "learning_rate": 1.0773772577006101e-05, + "loss": 1.2263, + "step": 16482 + }, + { + "epoch": 4.909398909138294, + "grad_norm": 0.2616761028766632, + "learning_rate": 1.0772810898922455e-05, + "loss": 1.2088, + "step": 16483 + }, + { + "epoch": 4.909696755337962, + "grad_norm": 0.3940376043319702, + "learning_rate": 1.0771849213648663e-05, + "loss": 1.217, + "step": 16484 + }, + { + "epoch": 4.9099946015376315, + "grad_norm": 0.24980327486991882, + "learning_rate": 1.0770887521193682e-05, + "loss": 1.2212, + "step": 16485 + }, + { + "epoch": 4.9102924477373, + "grad_norm": 0.4014640748500824, + "learning_rate": 1.0769925821566455e-05, + "loss": 1.2278, + "step": 16486 + }, + { + "epoch": 4.910590293936968, + "grad_norm": 0.3518194556236267, + "learning_rate": 1.0768964114775923e-05, + "loss": 1.2264, + "step": 16487 + }, + { + "epoch": 4.910888140136637, + "grad_norm": 0.3278154730796814, + "learning_rate": 1.0768002400831042e-05, + "loss": 1.2092, + "step": 16488 + }, + { + "epoch": 4.911185986336306, + "grad_norm": 0.2689017653465271, + "learning_rate": 1.0767040679740757e-05, + "loss": 1.2237, + "step": 16489 + }, + { + "epoch": 4.911483832535974, + "grad_norm": 0.39677876234054565, + "learning_rate": 1.0766078951514014e-05, + "loss": 1.2147, + "step": 16490 + }, + { + "epoch": 4.911781678735643, + "grad_norm": 0.4278595745563507, + "learning_rate": 1.0765117216159763e-05, + "loss": 1.2331, + "step": 16491 + }, + { + "epoch": 4.9120795249353115, + "grad_norm": 0.30160701274871826, + "learning_rate": 1.0764155473686955e-05, + "loss": 1.2353, + "step": 16492 + }, + { + "epoch": 4.91237737113498, + "grad_norm": 0.33897310495376587, + "learning_rate": 1.0763193724104531e-05, + "loss": 1.2116, + "step": 16493 + }, + { + "epoch": 4.912675217334649, + "grad_norm": 0.2941698133945465, + "learning_rate": 1.076223196742144e-05, + "loss": 1.2156, + "step": 16494 + }, + { + "epoch": 4.912973063534317, + "grad_norm": 0.37687936425209045, + "learning_rate": 1.0761270203646632e-05, + "loss": 1.2231, + "step": 16495 + }, + { + "epoch": 4.913270909733986, + "grad_norm": 0.2899411916732788, + "learning_rate": 1.0760308432789058e-05, + "loss": 1.2426, + "step": 16496 + }, + { + "epoch": 4.913568755933655, + "grad_norm": 0.33309483528137207, + "learning_rate": 1.0759346654857663e-05, + "loss": 1.2249, + "step": 16497 + }, + { + "epoch": 4.913866602133323, + "grad_norm": 0.27459484338760376, + "learning_rate": 1.0758384869861393e-05, + "loss": 1.2103, + "step": 16498 + }, + { + "epoch": 4.914164448332992, + "grad_norm": 0.41233590245246887, + "learning_rate": 1.0757423077809203e-05, + "loss": 1.2136, + "step": 16499 + }, + { + "epoch": 4.914462294532661, + "grad_norm": 0.2641564905643463, + "learning_rate": 1.0756461278710038e-05, + "loss": 1.2015, + "step": 16500 + }, + { + "epoch": 4.914462294532661, + "eval_loss": 1.3246026039123535, + "eval_runtime": 21.3127, + "eval_samples_per_second": 81.36, + "eval_steps_per_second": 5.114, + "step": 16500 + }, + { + "epoch": 4.914760140732329, + "grad_norm": 0.4468182921409607, + "learning_rate": 1.0755499472572844e-05, + "loss": 1.2283, + "step": 16501 + }, + { + "epoch": 4.915057986931998, + "grad_norm": 0.3703464865684509, + "learning_rate": 1.075453765940657e-05, + "loss": 1.2123, + "step": 16502 + }, + { + "epoch": 4.915355833131667, + "grad_norm": 0.34836113452911377, + "learning_rate": 1.0753575839220168e-05, + "loss": 1.1944, + "step": 16503 + }, + { + "epoch": 4.915653679331335, + "grad_norm": 0.4428093731403351, + "learning_rate": 1.0752614012022583e-05, + "loss": 1.2353, + "step": 16504 + }, + { + "epoch": 4.915951525531004, + "grad_norm": 0.3108443319797516, + "learning_rate": 1.0751652177822768e-05, + "loss": 1.228, + "step": 16505 + }, + { + "epoch": 4.916249371730673, + "grad_norm": 0.5079678297042847, + "learning_rate": 1.075069033662967e-05, + "loss": 1.2157, + "step": 16506 + }, + { + "epoch": 4.916547217930341, + "grad_norm": 0.2944551706314087, + "learning_rate": 1.0749728488452231e-05, + "loss": 1.2403, + "step": 16507 + }, + { + "epoch": 4.91684506413001, + "grad_norm": 0.4437999725341797, + "learning_rate": 1.074876663329941e-05, + "loss": 1.2133, + "step": 16508 + }, + { + "epoch": 4.917142910329678, + "grad_norm": 0.3734782636165619, + "learning_rate": 1.0747804771180154e-05, + "loss": 1.2097, + "step": 16509 + }, + { + "epoch": 4.917440756529347, + "grad_norm": 0.5850622057914734, + "learning_rate": 1.074684290210341e-05, + "loss": 1.2154, + "step": 16510 + }, + { + "epoch": 4.917738602729016, + "grad_norm": 0.2878182530403137, + "learning_rate": 1.0745881026078125e-05, + "loss": 1.217, + "step": 16511 + }, + { + "epoch": 4.918036448928684, + "grad_norm": 0.5440872311592102, + "learning_rate": 1.074491914311325e-05, + "loss": 1.2169, + "step": 16512 + }, + { + "epoch": 4.9183342951283535, + "grad_norm": 0.2729608714580536, + "learning_rate": 1.0743957253217736e-05, + "loss": 1.2265, + "step": 16513 + }, + { + "epoch": 4.918632141328022, + "grad_norm": 0.45563188195228577, + "learning_rate": 1.0742995356400529e-05, + "loss": 1.2315, + "step": 16514 + }, + { + "epoch": 4.91892998752769, + "grad_norm": 0.2730143666267395, + "learning_rate": 1.0742033452670582e-05, + "loss": 1.2226, + "step": 16515 + }, + { + "epoch": 4.919227833727359, + "grad_norm": 0.3658234179019928, + "learning_rate": 1.0741071542036846e-05, + "loss": 1.2184, + "step": 16516 + }, + { + "epoch": 4.919525679927028, + "grad_norm": 0.3419719338417053, + "learning_rate": 1.0740109624508263e-05, + "loss": 1.2156, + "step": 16517 + }, + { + "epoch": 4.919823526126696, + "grad_norm": 0.4722875654697418, + "learning_rate": 1.0739147700093789e-05, + "loss": 1.2074, + "step": 16518 + }, + { + "epoch": 4.920121372326365, + "grad_norm": 0.27891281247138977, + "learning_rate": 1.0738185768802369e-05, + "loss": 1.2062, + "step": 16519 + }, + { + "epoch": 4.920419218526034, + "grad_norm": 0.34261152148246765, + "learning_rate": 1.0737223830642955e-05, + "loss": 1.2229, + "step": 16520 + }, + { + "epoch": 4.920717064725702, + "grad_norm": 0.2668758034706116, + "learning_rate": 1.07362618856245e-05, + "loss": 1.2226, + "step": 16521 + }, + { + "epoch": 4.921014910925371, + "grad_norm": 0.45938822627067566, + "learning_rate": 1.0735299933755947e-05, + "loss": 1.2157, + "step": 16522 + }, + { + "epoch": 4.9213127571250395, + "grad_norm": 0.2678782343864441, + "learning_rate": 1.0734337975046254e-05, + "loss": 1.2348, + "step": 16523 + }, + { + "epoch": 4.921610603324709, + "grad_norm": 0.3673868179321289, + "learning_rate": 1.0733376009504364e-05, + "loss": 1.217, + "step": 16524 + }, + { + "epoch": 4.921908449524377, + "grad_norm": 0.27473393082618713, + "learning_rate": 1.0732414037139228e-05, + "loss": 1.2059, + "step": 16525 + }, + { + "epoch": 4.922206295724045, + "grad_norm": 0.4587046504020691, + "learning_rate": 1.07314520579598e-05, + "loss": 1.2276, + "step": 16526 + }, + { + "epoch": 4.922504141923714, + "grad_norm": 0.37563133239746094, + "learning_rate": 1.0730490071975028e-05, + "loss": 1.2076, + "step": 16527 + }, + { + "epoch": 4.922801988123383, + "grad_norm": 0.4534524977207184, + "learning_rate": 1.0729528079193863e-05, + "loss": 1.2489, + "step": 16528 + }, + { + "epoch": 4.923099834323051, + "grad_norm": 0.5472278594970703, + "learning_rate": 1.072856607962525e-05, + "loss": 1.2077, + "step": 16529 + }, + { + "epoch": 4.92339768052272, + "grad_norm": 0.2616935074329376, + "learning_rate": 1.0727604073278148e-05, + "loss": 1.2255, + "step": 16530 + }, + { + "epoch": 4.923695526722389, + "grad_norm": 0.31283023953437805, + "learning_rate": 1.07266420601615e-05, + "loss": 1.2174, + "step": 16531 + }, + { + "epoch": 4.923993372922057, + "grad_norm": 0.2985352575778961, + "learning_rate": 1.0725680040284263e-05, + "loss": 1.2149, + "step": 16532 + }, + { + "epoch": 4.924291219121726, + "grad_norm": 0.3567928671836853, + "learning_rate": 1.0724718013655384e-05, + "loss": 1.2367, + "step": 16533 + }, + { + "epoch": 4.924589065321395, + "grad_norm": 0.2781184911727905, + "learning_rate": 1.0723755980283809e-05, + "loss": 1.2157, + "step": 16534 + }, + { + "epoch": 4.924886911521063, + "grad_norm": 0.3470194339752197, + "learning_rate": 1.0722793940178498e-05, + "loss": 1.2342, + "step": 16535 + }, + { + "epoch": 4.925184757720732, + "grad_norm": 0.2665918469429016, + "learning_rate": 1.0721831893348393e-05, + "loss": 1.2242, + "step": 16536 + }, + { + "epoch": 4.9254826039204005, + "grad_norm": 0.30841633677482605, + "learning_rate": 1.072086983980245e-05, + "loss": 1.2195, + "step": 16537 + }, + { + "epoch": 4.925780450120069, + "grad_norm": 0.3293060064315796, + "learning_rate": 1.0719907779549619e-05, + "loss": 1.2148, + "step": 16538 + }, + { + "epoch": 4.926078296319738, + "grad_norm": 0.30947989225387573, + "learning_rate": 1.0718945712598852e-05, + "loss": 1.2124, + "step": 16539 + }, + { + "epoch": 4.926376142519406, + "grad_norm": 0.3894166052341461, + "learning_rate": 1.0717983638959097e-05, + "loss": 1.2194, + "step": 16540 + }, + { + "epoch": 4.926673988719076, + "grad_norm": 0.32728278636932373, + "learning_rate": 1.0717021558639306e-05, + "loss": 1.2194, + "step": 16541 + }, + { + "epoch": 4.926971834918744, + "grad_norm": 0.3462470471858978, + "learning_rate": 1.071605947164843e-05, + "loss": 1.2083, + "step": 16542 + }, + { + "epoch": 4.927269681118412, + "grad_norm": 0.26606592535972595, + "learning_rate": 1.0715097377995422e-05, + "loss": 1.2075, + "step": 16543 + }, + { + "epoch": 4.9275675273180815, + "grad_norm": 0.34016546607017517, + "learning_rate": 1.071413527768923e-05, + "loss": 1.2253, + "step": 16544 + }, + { + "epoch": 4.92786537351775, + "grad_norm": 0.2924135625362396, + "learning_rate": 1.0713173170738808e-05, + "loss": 1.2282, + "step": 16545 + }, + { + "epoch": 4.928163219717418, + "grad_norm": 0.31891173124313354, + "learning_rate": 1.0712211057153108e-05, + "loss": 1.2275, + "step": 16546 + }, + { + "epoch": 4.928461065917087, + "grad_norm": 0.3394494354724884, + "learning_rate": 1.0711248936941081e-05, + "loss": 1.2095, + "step": 16547 + }, + { + "epoch": 4.928758912116756, + "grad_norm": 0.2725721299648285, + "learning_rate": 1.0710286810111672e-05, + "loss": 1.2046, + "step": 16548 + }, + { + "epoch": 4.929056758316424, + "grad_norm": 0.2920966148376465, + "learning_rate": 1.070932467667384e-05, + "loss": 1.2413, + "step": 16549 + }, + { + "epoch": 4.929354604516093, + "grad_norm": 0.4335156977176666, + "learning_rate": 1.0708362536636538e-05, + "loss": 1.2211, + "step": 16550 + }, + { + "epoch": 4.9296524507157615, + "grad_norm": 0.46687009930610657, + "learning_rate": 1.0707400390008711e-05, + "loss": 1.225, + "step": 16551 + }, + { + "epoch": 4.929950296915431, + "grad_norm": 0.3849765658378601, + "learning_rate": 1.0706438236799313e-05, + "loss": 1.2245, + "step": 16552 + }, + { + "epoch": 4.930248143115099, + "grad_norm": 0.3573441207408905, + "learning_rate": 1.0705476077017298e-05, + "loss": 1.2113, + "step": 16553 + }, + { + "epoch": 4.930545989314767, + "grad_norm": 0.4444035291671753, + "learning_rate": 1.0704513910671615e-05, + "loss": 1.2131, + "step": 16554 + }, + { + "epoch": 4.930843835514437, + "grad_norm": 0.33163416385650635, + "learning_rate": 1.0703551737771218e-05, + "loss": 1.2255, + "step": 16555 + }, + { + "epoch": 4.931141681714105, + "grad_norm": 0.4915383458137512, + "learning_rate": 1.0702589558325057e-05, + "loss": 1.2123, + "step": 16556 + }, + { + "epoch": 4.931439527913773, + "grad_norm": 0.29612472653388977, + "learning_rate": 1.0701627372342089e-05, + "loss": 1.2274, + "step": 16557 + }, + { + "epoch": 4.9317373741134425, + "grad_norm": 0.539953887462616, + "learning_rate": 1.0700665179831258e-05, + "loss": 1.2099, + "step": 16558 + }, + { + "epoch": 4.932035220313111, + "grad_norm": 0.35313206911087036, + "learning_rate": 1.0699702980801522e-05, + "loss": 1.2357, + "step": 16559 + }, + { + "epoch": 4.932333066512779, + "grad_norm": 0.43425267934799194, + "learning_rate": 1.069874077526183e-05, + "loss": 1.2249, + "step": 16560 + }, + { + "epoch": 4.932630912712448, + "grad_norm": 0.2519639730453491, + "learning_rate": 1.0697778563221137e-05, + "loss": 1.2166, + "step": 16561 + }, + { + "epoch": 4.932928758912117, + "grad_norm": 0.7543262839317322, + "learning_rate": 1.0696816344688394e-05, + "loss": 1.213, + "step": 16562 + }, + { + "epoch": 4.933226605111786, + "grad_norm": 0.37575799226760864, + "learning_rate": 1.0695854119672553e-05, + "loss": 1.2345, + "step": 16563 + }, + { + "epoch": 4.933524451311454, + "grad_norm": 0.36105549335479736, + "learning_rate": 1.069489188818257e-05, + "loss": 1.2265, + "step": 16564 + }, + { + "epoch": 4.933822297511123, + "grad_norm": 0.32097697257995605, + "learning_rate": 1.0693929650227392e-05, + "loss": 1.2115, + "step": 16565 + }, + { + "epoch": 4.934120143710791, + "grad_norm": 0.3280133008956909, + "learning_rate": 1.069296740581597e-05, + "loss": 1.2306, + "step": 16566 + }, + { + "epoch": 4.93441798991046, + "grad_norm": 0.38967204093933105, + "learning_rate": 1.0692005154957265e-05, + "loss": 1.2378, + "step": 16567 + }, + { + "epoch": 4.934715836110128, + "grad_norm": 0.26550593972206116, + "learning_rate": 1.0691042897660226e-05, + "loss": 1.2188, + "step": 16568 + }, + { + "epoch": 4.935013682309798, + "grad_norm": 0.34446680545806885, + "learning_rate": 1.0690080633933803e-05, + "loss": 1.2203, + "step": 16569 + }, + { + "epoch": 4.935311528509466, + "grad_norm": 0.3465847969055176, + "learning_rate": 1.068911836378695e-05, + "loss": 1.2089, + "step": 16570 + }, + { + "epoch": 4.935609374709134, + "grad_norm": 0.3076886236667633, + "learning_rate": 1.0688156087228625e-05, + "loss": 1.2128, + "step": 16571 + }, + { + "epoch": 4.9359072209088035, + "grad_norm": 0.43502846360206604, + "learning_rate": 1.068719380426777e-05, + "loss": 1.2222, + "step": 16572 + }, + { + "epoch": 4.936205067108472, + "grad_norm": 0.37854209542274475, + "learning_rate": 1.0686231514913347e-05, + "loss": 1.2309, + "step": 16573 + }, + { + "epoch": 4.93650291330814, + "grad_norm": 0.49233517050743103, + "learning_rate": 1.0685269219174308e-05, + "loss": 1.2329, + "step": 16574 + }, + { + "epoch": 4.936800759507809, + "grad_norm": 0.4589947462081909, + "learning_rate": 1.0684306917059604e-05, + "loss": 1.2298, + "step": 16575 + }, + { + "epoch": 4.937098605707478, + "grad_norm": 0.41953331232070923, + "learning_rate": 1.0683344608578187e-05, + "loss": 1.2264, + "step": 16576 + }, + { + "epoch": 4.937396451907146, + "grad_norm": 0.31615251302719116, + "learning_rate": 1.0682382293739014e-05, + "loss": 1.2115, + "step": 16577 + }, + { + "epoch": 4.937694298106815, + "grad_norm": 0.37679243087768555, + "learning_rate": 1.0681419972551036e-05, + "loss": 1.2176, + "step": 16578 + }, + { + "epoch": 4.937992144306484, + "grad_norm": 0.4259312152862549, + "learning_rate": 1.0680457645023204e-05, + "loss": 1.2218, + "step": 16579 + }, + { + "epoch": 4.938289990506153, + "grad_norm": 0.29420703649520874, + "learning_rate": 1.0679495311164473e-05, + "loss": 1.2267, + "step": 16580 + }, + { + "epoch": 4.938587836705821, + "grad_norm": 0.2868013083934784, + "learning_rate": 1.0678532970983804e-05, + "loss": 1.2311, + "step": 16581 + }, + { + "epoch": 4.9388856829054895, + "grad_norm": 0.3442378044128418, + "learning_rate": 1.0677570624490138e-05, + "loss": 1.2353, + "step": 16582 + }, + { + "epoch": 4.939183529105159, + "grad_norm": 0.2716892659664154, + "learning_rate": 1.0676608271692437e-05, + "loss": 1.2102, + "step": 16583 + }, + { + "epoch": 4.939481375304827, + "grad_norm": 0.2793993055820465, + "learning_rate": 1.067564591259965e-05, + "loss": 1.2019, + "step": 16584 + }, + { + "epoch": 4.939779221504495, + "grad_norm": 0.33249780535697937, + "learning_rate": 1.0674683547220734e-05, + "loss": 1.2233, + "step": 16585 + }, + { + "epoch": 4.9400770677041645, + "grad_norm": 0.28556743264198303, + "learning_rate": 1.067372117556464e-05, + "loss": 1.2233, + "step": 16586 + }, + { + "epoch": 4.940374913903833, + "grad_norm": 0.2517443299293518, + "learning_rate": 1.0672758797640324e-05, + "loss": 1.2298, + "step": 16587 + }, + { + "epoch": 4.940672760103501, + "grad_norm": 0.2680200934410095, + "learning_rate": 1.067179641345674e-05, + "loss": 1.2179, + "step": 16588 + }, + { + "epoch": 4.94097060630317, + "grad_norm": 0.26771560311317444, + "learning_rate": 1.0670834023022843e-05, + "loss": 1.2148, + "step": 16589 + }, + { + "epoch": 4.941268452502839, + "grad_norm": 0.3182801902294159, + "learning_rate": 1.066987162634758e-05, + "loss": 1.2379, + "step": 16590 + }, + { + "epoch": 4.941566298702508, + "grad_norm": 0.29841336607933044, + "learning_rate": 1.0668909223439912e-05, + "loss": 1.2207, + "step": 16591 + }, + { + "epoch": 4.941864144902176, + "grad_norm": 0.2742636203765869, + "learning_rate": 1.0667946814308792e-05, + "loss": 1.2226, + "step": 16592 + }, + { + "epoch": 4.942161991101845, + "grad_norm": 0.2781110107898712, + "learning_rate": 1.0666984398963171e-05, + "loss": 1.2145, + "step": 16593 + }, + { + "epoch": 4.942459837301513, + "grad_norm": 0.36052560806274414, + "learning_rate": 1.0666021977412007e-05, + "loss": 1.2371, + "step": 16594 + }, + { + "epoch": 4.942757683501182, + "grad_norm": 0.41813093423843384, + "learning_rate": 1.0665059549664252e-05, + "loss": 1.2129, + "step": 16595 + }, + { + "epoch": 4.9430555297008505, + "grad_norm": 0.2921593487262726, + "learning_rate": 1.066409711572886e-05, + "loss": 1.2238, + "step": 16596 + }, + { + "epoch": 4.94335337590052, + "grad_norm": 0.5107043981552124, + "learning_rate": 1.0663134675614788e-05, + "loss": 1.2128, + "step": 16597 + }, + { + "epoch": 4.943651222100188, + "grad_norm": 0.7092805504798889, + "learning_rate": 1.066217222933099e-05, + "loss": 1.2299, + "step": 16598 + }, + { + "epoch": 4.943949068299856, + "grad_norm": 0.3690491318702698, + "learning_rate": 1.0661209776886417e-05, + "loss": 1.2131, + "step": 16599 + }, + { + "epoch": 4.944246914499526, + "grad_norm": 0.40929409861564636, + "learning_rate": 1.0660247318290027e-05, + "loss": 1.2132, + "step": 16600 + }, + { + "epoch": 4.944544760699194, + "grad_norm": 0.25750359892845154, + "learning_rate": 1.065928485355077e-05, + "loss": 1.2123, + "step": 16601 + }, + { + "epoch": 4.944842606898862, + "grad_norm": 0.3441201150417328, + "learning_rate": 1.0658322382677608e-05, + "loss": 1.1992, + "step": 16602 + }, + { + "epoch": 4.9451404530985315, + "grad_norm": 0.28091517090797424, + "learning_rate": 1.065735990567949e-05, + "loss": 1.2086, + "step": 16603 + }, + { + "epoch": 4.9454382992982, + "grad_norm": 0.32239842414855957, + "learning_rate": 1.0656397422565373e-05, + "loss": 1.2272, + "step": 16604 + }, + { + "epoch": 4.945736145497868, + "grad_norm": 0.34925195574760437, + "learning_rate": 1.0655434933344213e-05, + "loss": 1.2248, + "step": 16605 + }, + { + "epoch": 4.946033991697537, + "grad_norm": 0.3011159896850586, + "learning_rate": 1.0654472438024962e-05, + "loss": 1.2232, + "step": 16606 + }, + { + "epoch": 4.946331837897206, + "grad_norm": 0.4502594470977783, + "learning_rate": 1.0653509936616575e-05, + "loss": 1.2162, + "step": 16607 + }, + { + "epoch": 4.946629684096875, + "grad_norm": 0.3334161639213562, + "learning_rate": 1.0652547429128008e-05, + "loss": 1.2277, + "step": 16608 + }, + { + "epoch": 4.946927530296543, + "grad_norm": 0.2528218924999237, + "learning_rate": 1.0651584915568215e-05, + "loss": 1.2299, + "step": 16609 + }, + { + "epoch": 4.9472253764962115, + "grad_norm": 0.31391286849975586, + "learning_rate": 1.0650622395946155e-05, + "loss": 1.2095, + "step": 16610 + }, + { + "epoch": 4.947523222695881, + "grad_norm": 0.3338140845298767, + "learning_rate": 1.0649659870270781e-05, + "loss": 1.2225, + "step": 16611 + }, + { + "epoch": 4.947821068895549, + "grad_norm": 0.390227735042572, + "learning_rate": 1.0648697338551048e-05, + "loss": 1.2077, + "step": 16612 + }, + { + "epoch": 4.948118915095217, + "grad_norm": 0.3159486949443817, + "learning_rate": 1.0647734800795908e-05, + "loss": 1.2334, + "step": 16613 + }, + { + "epoch": 4.948416761294887, + "grad_norm": 0.4039250910282135, + "learning_rate": 1.0646772257014319e-05, + "loss": 1.2294, + "step": 16614 + }, + { + "epoch": 4.948714607494555, + "grad_norm": 0.2981310784816742, + "learning_rate": 1.0645809707215242e-05, + "loss": 1.2164, + "step": 16615 + }, + { + "epoch": 4.949012453694223, + "grad_norm": 0.2539900541305542, + "learning_rate": 1.0644847151407624e-05, + "loss": 1.2048, + "step": 16616 + }, + { + "epoch": 4.9493102998938925, + "grad_norm": 0.257242888212204, + "learning_rate": 1.0643884589600423e-05, + "loss": 1.2153, + "step": 16617 + }, + { + "epoch": 4.949608146093561, + "grad_norm": 0.33412307500839233, + "learning_rate": 1.0642922021802597e-05, + "loss": 1.205, + "step": 16618 + }, + { + "epoch": 4.94990599229323, + "grad_norm": 0.3598838150501251, + "learning_rate": 1.0641959448023099e-05, + "loss": 1.2251, + "step": 16619 + }, + { + "epoch": 4.950203838492898, + "grad_norm": 0.23968324065208435, + "learning_rate": 1.0640996868270885e-05, + "loss": 1.2164, + "step": 16620 + }, + { + "epoch": 4.950501684692567, + "grad_norm": 0.3048483431339264, + "learning_rate": 1.0640034282554912e-05, + "loss": 1.2091, + "step": 16621 + }, + { + "epoch": 4.950799530892236, + "grad_norm": 0.2794269323348999, + "learning_rate": 1.0639071690884138e-05, + "loss": 1.2181, + "step": 16622 + }, + { + "epoch": 4.951097377091904, + "grad_norm": 0.26282474398612976, + "learning_rate": 1.063810909326751e-05, + "loss": 1.2193, + "step": 16623 + }, + { + "epoch": 4.951395223291573, + "grad_norm": 0.3358785808086395, + "learning_rate": 1.0637146489713993e-05, + "loss": 1.2326, + "step": 16624 + }, + { + "epoch": 4.951693069491242, + "grad_norm": 0.28418686985969543, + "learning_rate": 1.0636183880232541e-05, + "loss": 1.2107, + "step": 16625 + }, + { + "epoch": 4.95199091569091, + "grad_norm": 0.28371039032936096, + "learning_rate": 1.0635221264832105e-05, + "loss": 1.213, + "step": 16626 + }, + { + "epoch": 4.952288761890578, + "grad_norm": 0.26671627163887024, + "learning_rate": 1.063425864352165e-05, + "loss": 1.1987, + "step": 16627 + }, + { + "epoch": 4.952586608090248, + "grad_norm": 0.284598708152771, + "learning_rate": 1.0633296016310122e-05, + "loss": 1.2128, + "step": 16628 + }, + { + "epoch": 4.952884454289916, + "grad_norm": 0.47250860929489136, + "learning_rate": 1.0632333383206486e-05, + "loss": 1.2312, + "step": 16629 + }, + { + "epoch": 4.953182300489585, + "grad_norm": 0.4156145751476288, + "learning_rate": 1.0631370744219694e-05, + "loss": 1.2304, + "step": 16630 + }, + { + "epoch": 4.9534801466892535, + "grad_norm": 0.3234199285507202, + "learning_rate": 1.0630408099358697e-05, + "loss": 1.218, + "step": 16631 + }, + { + "epoch": 4.953777992888922, + "grad_norm": 0.4637068510055542, + "learning_rate": 1.0629445448632462e-05, + "loss": 1.2133, + "step": 16632 + }, + { + "epoch": 4.95407583908859, + "grad_norm": 0.2520059049129486, + "learning_rate": 1.062848279204994e-05, + "loss": 1.211, + "step": 16633 + }, + { + "epoch": 4.954373685288259, + "grad_norm": 0.3065948486328125, + "learning_rate": 1.0627520129620087e-05, + "loss": 1.2251, + "step": 16634 + }, + { + "epoch": 4.954671531487928, + "grad_norm": 0.2537427544593811, + "learning_rate": 1.0626557461351862e-05, + "loss": 1.204, + "step": 16635 + }, + { + "epoch": 4.954969377687597, + "grad_norm": 0.2744297385215759, + "learning_rate": 1.0625594787254216e-05, + "loss": 1.2239, + "step": 16636 + }, + { + "epoch": 4.955267223887265, + "grad_norm": 0.26712867617607117, + "learning_rate": 1.0624632107336112e-05, + "loss": 1.2185, + "step": 16637 + }, + { + "epoch": 4.955565070086934, + "grad_norm": 0.28278082609176636, + "learning_rate": 1.0623669421606504e-05, + "loss": 1.2089, + "step": 16638 + }, + { + "epoch": 4.955862916286603, + "grad_norm": 0.2572941780090332, + "learning_rate": 1.0622706730074352e-05, + "loss": 1.221, + "step": 16639 + }, + { + "epoch": 4.956160762486271, + "grad_norm": 0.2989979684352875, + "learning_rate": 1.0621744032748607e-05, + "loss": 1.2103, + "step": 16640 + }, + { + "epoch": 4.9564586086859395, + "grad_norm": 0.2684837281703949, + "learning_rate": 1.0620781329638228e-05, + "loss": 1.2171, + "step": 16641 + }, + { + "epoch": 4.956756454885609, + "grad_norm": 0.40119245648384094, + "learning_rate": 1.0619818620752172e-05, + "loss": 1.2245, + "step": 16642 + }, + { + "epoch": 4.957054301085277, + "grad_norm": 0.37163183093070984, + "learning_rate": 1.0618855906099395e-05, + "loss": 1.2182, + "step": 16643 + }, + { + "epoch": 4.957352147284945, + "grad_norm": 0.26145032048225403, + "learning_rate": 1.0617893185688856e-05, + "loss": 1.2386, + "step": 16644 + }, + { + "epoch": 4.9576499934846145, + "grad_norm": 0.45826685428619385, + "learning_rate": 1.0616930459529513e-05, + "loss": 1.2294, + "step": 16645 + }, + { + "epoch": 4.957947839684283, + "grad_norm": 0.5881915092468262, + "learning_rate": 1.061596772763032e-05, + "loss": 1.2201, + "step": 16646 + }, + { + "epoch": 4.958245685883952, + "grad_norm": 0.4020099639892578, + "learning_rate": 1.0615004990000237e-05, + "loss": 1.2128, + "step": 16647 + }, + { + "epoch": 4.95854353208362, + "grad_norm": 0.2901490330696106, + "learning_rate": 1.0614042246648217e-05, + "loss": 1.2284, + "step": 16648 + }, + { + "epoch": 4.958841378283289, + "grad_norm": 0.33594927191734314, + "learning_rate": 1.0613079497583223e-05, + "loss": 1.2197, + "step": 16649 + }, + { + "epoch": 4.959139224482958, + "grad_norm": 0.2905789613723755, + "learning_rate": 1.0612116742814207e-05, + "loss": 1.2082, + "step": 16650 + }, + { + "epoch": 4.959437070682626, + "grad_norm": 0.3242633044719696, + "learning_rate": 1.0611153982350129e-05, + "loss": 1.2226, + "step": 16651 + }, + { + "epoch": 4.959734916882295, + "grad_norm": 0.30701521039009094, + "learning_rate": 1.0610191216199945e-05, + "loss": 1.217, + "step": 16652 + }, + { + "epoch": 4.960032763081964, + "grad_norm": 0.5019434690475464, + "learning_rate": 1.0609228444372617e-05, + "loss": 1.2233, + "step": 16653 + }, + { + "epoch": 4.960330609281632, + "grad_norm": 0.688578188419342, + "learning_rate": 1.0608265666877095e-05, + "loss": 1.2137, + "step": 16654 + }, + { + "epoch": 4.9606284554813005, + "grad_norm": 0.32056665420532227, + "learning_rate": 1.0607302883722342e-05, + "loss": 1.2392, + "step": 16655 + }, + { + "epoch": 4.96092630168097, + "grad_norm": 0.7792702317237854, + "learning_rate": 1.0606340094917318e-05, + "loss": 1.2381, + "step": 16656 + }, + { + "epoch": 4.961224147880638, + "grad_norm": 0.3579455316066742, + "learning_rate": 1.0605377300470971e-05, + "loss": 1.219, + "step": 16657 + }, + { + "epoch": 4.961521994080307, + "grad_norm": 0.9020829200744629, + "learning_rate": 1.0604414500392269e-05, + "loss": 1.2164, + "step": 16658 + }, + { + "epoch": 4.961819840279976, + "grad_norm": 0.6582271456718445, + "learning_rate": 1.060345169469016e-05, + "loss": 1.2373, + "step": 16659 + }, + { + "epoch": 4.962117686479644, + "grad_norm": 0.5560967922210693, + "learning_rate": 1.0602488883373611e-05, + "loss": 1.2232, + "step": 16660 + }, + { + "epoch": 4.962415532679312, + "grad_norm": 0.37864935398101807, + "learning_rate": 1.0601526066451577e-05, + "loss": 1.2243, + "step": 16661 + }, + { + "epoch": 4.9627133788789815, + "grad_norm": 0.4254647195339203, + "learning_rate": 1.0600563243933012e-05, + "loss": 1.2283, + "step": 16662 + }, + { + "epoch": 4.96301122507865, + "grad_norm": 0.3214024603366852, + "learning_rate": 1.059960041582688e-05, + "loss": 1.2108, + "step": 16663 + }, + { + "epoch": 4.963309071278319, + "grad_norm": 0.37460124492645264, + "learning_rate": 1.0598637582142134e-05, + "loss": 1.2047, + "step": 16664 + }, + { + "epoch": 4.963606917477987, + "grad_norm": 0.5097197890281677, + "learning_rate": 1.0597674742887737e-05, + "loss": 1.234, + "step": 16665 + }, + { + "epoch": 4.963904763677656, + "grad_norm": 0.29781848192214966, + "learning_rate": 1.0596711898072642e-05, + "loss": 1.2104, + "step": 16666 + }, + { + "epoch": 4.964202609877325, + "grad_norm": 0.4486392140388489, + "learning_rate": 1.0595749047705809e-05, + "loss": 1.2232, + "step": 16667 + }, + { + "epoch": 4.964500456076993, + "grad_norm": 0.2860359251499176, + "learning_rate": 1.0594786191796195e-05, + "loss": 1.2169, + "step": 16668 + }, + { + "epoch": 4.9647983022766615, + "grad_norm": 0.36968132853507996, + "learning_rate": 1.0593823330352764e-05, + "loss": 1.2283, + "step": 16669 + }, + { + "epoch": 4.965096148476331, + "grad_norm": 0.3152136504650116, + "learning_rate": 1.0592860463384472e-05, + "loss": 1.2331, + "step": 16670 + }, + { + "epoch": 4.965393994675999, + "grad_norm": 0.24299803376197815, + "learning_rate": 1.0591897590900272e-05, + "loss": 1.2054, + "step": 16671 + }, + { + "epoch": 4.965691840875667, + "grad_norm": 0.3697117269039154, + "learning_rate": 1.0590934712909122e-05, + "loss": 1.2058, + "step": 16672 + }, + { + "epoch": 4.965989687075337, + "grad_norm": 0.3239505887031555, + "learning_rate": 1.0589971829419992e-05, + "loss": 1.2104, + "step": 16673 + }, + { + "epoch": 4.966287533275005, + "grad_norm": 0.3296203911304474, + "learning_rate": 1.0589008940441831e-05, + "loss": 1.2199, + "step": 16674 + }, + { + "epoch": 4.966585379474674, + "grad_norm": 0.3282039761543274, + "learning_rate": 1.05880460459836e-05, + "loss": 1.2145, + "step": 16675 + }, + { + "epoch": 4.9668832256743425, + "grad_norm": 0.2871517539024353, + "learning_rate": 1.0587083146054258e-05, + "loss": 1.2284, + "step": 16676 + }, + { + "epoch": 4.967181071874011, + "grad_norm": 0.2773374915122986, + "learning_rate": 1.058612024066276e-05, + "loss": 1.2132, + "step": 16677 + }, + { + "epoch": 4.96747891807368, + "grad_norm": 0.264411985874176, + "learning_rate": 1.0585157329818069e-05, + "loss": 1.2254, + "step": 16678 + }, + { + "epoch": 4.967776764273348, + "grad_norm": 0.2909695506095886, + "learning_rate": 1.0584194413529145e-05, + "loss": 1.2263, + "step": 16679 + }, + { + "epoch": 4.968074610473017, + "grad_norm": 0.2745228707790375, + "learning_rate": 1.0583231491804946e-05, + "loss": 1.2186, + "step": 16680 + }, + { + "epoch": 4.968372456672686, + "grad_norm": 0.27479180693626404, + "learning_rate": 1.0582268564654427e-05, + "loss": 1.2119, + "step": 16681 + }, + { + "epoch": 4.968670302872354, + "grad_norm": 0.2838924825191498, + "learning_rate": 1.058130563208655e-05, + "loss": 1.2305, + "step": 16682 + }, + { + "epoch": 4.9689681490720226, + "grad_norm": 0.2745339572429657, + "learning_rate": 1.0580342694110272e-05, + "loss": 1.2148, + "step": 16683 + }, + { + "epoch": 4.969265995271692, + "grad_norm": 0.3092835545539856, + "learning_rate": 1.0579379750734557e-05, + "loss": 1.2076, + "step": 16684 + }, + { + "epoch": 4.96956384147136, + "grad_norm": 0.29313743114471436, + "learning_rate": 1.0578416801968359e-05, + "loss": 1.2418, + "step": 16685 + }, + { + "epoch": 4.969861687671029, + "grad_norm": 0.260153204202652, + "learning_rate": 1.0577453847820638e-05, + "loss": 1.2173, + "step": 16686 + }, + { + "epoch": 4.970159533870698, + "grad_norm": 0.28208646178245544, + "learning_rate": 1.0576490888300357e-05, + "loss": 1.2278, + "step": 16687 + }, + { + "epoch": 4.970457380070366, + "grad_norm": 0.27743834257125854, + "learning_rate": 1.057552792341647e-05, + "loss": 1.2268, + "step": 16688 + }, + { + "epoch": 4.970755226270035, + "grad_norm": 0.278525710105896, + "learning_rate": 1.057456495317794e-05, + "loss": 1.2212, + "step": 16689 + }, + { + "epoch": 4.9710530724697035, + "grad_norm": 0.3726790249347687, + "learning_rate": 1.0573601977593724e-05, + "loss": 1.221, + "step": 16690 + }, + { + "epoch": 4.971350918669372, + "grad_norm": 0.41416075825691223, + "learning_rate": 1.0572638996672781e-05, + "loss": 1.2207, + "step": 16691 + }, + { + "epoch": 4.971648764869041, + "grad_norm": 0.2695554494857788, + "learning_rate": 1.0571676010424072e-05, + "loss": 1.221, + "step": 16692 + }, + { + "epoch": 4.971946611068709, + "grad_norm": 0.36792466044425964, + "learning_rate": 1.057071301885656e-05, + "loss": 1.2063, + "step": 16693 + }, + { + "epoch": 4.972244457268378, + "grad_norm": 0.28067547082901, + "learning_rate": 1.0569750021979199e-05, + "loss": 1.2353, + "step": 16694 + }, + { + "epoch": 4.972542303468047, + "grad_norm": 0.4062131643295288, + "learning_rate": 1.0568787019800951e-05, + "loss": 1.2273, + "step": 16695 + }, + { + "epoch": 4.972840149667715, + "grad_norm": 0.31503891944885254, + "learning_rate": 1.0567824012330772e-05, + "loss": 1.2109, + "step": 16696 + }, + { + "epoch": 4.9731379958673845, + "grad_norm": 0.2861665189266205, + "learning_rate": 1.056686099957763e-05, + "loss": 1.2136, + "step": 16697 + }, + { + "epoch": 4.973435842067053, + "grad_norm": 0.2808992266654968, + "learning_rate": 1.0565897981550477e-05, + "loss": 1.2074, + "step": 16698 + }, + { + "epoch": 4.973733688266721, + "grad_norm": 0.3202911615371704, + "learning_rate": 1.0564934958258278e-05, + "loss": 1.2055, + "step": 16699 + }, + { + "epoch": 4.9740315344663895, + "grad_norm": 0.5615058541297913, + "learning_rate": 1.0563971929709988e-05, + "loss": 1.2232, + "step": 16700 + }, + { + "epoch": 4.974329380666059, + "grad_norm": 0.5817986130714417, + "learning_rate": 1.0563008895914569e-05, + "loss": 1.2207, + "step": 16701 + }, + { + "epoch": 4.974627226865727, + "grad_norm": 0.30600810050964355, + "learning_rate": 1.056204585688098e-05, + "loss": 1.1981, + "step": 16702 + }, + { + "epoch": 4.974925073065396, + "grad_norm": 0.6330814361572266, + "learning_rate": 1.0561082812618184e-05, + "loss": 1.2219, + "step": 16703 + }, + { + "epoch": 4.9752229192650645, + "grad_norm": 0.3917084038257599, + "learning_rate": 1.0560119763135143e-05, + "loss": 1.2194, + "step": 16704 + }, + { + "epoch": 4.975520765464733, + "grad_norm": 0.5173817276954651, + "learning_rate": 1.055915670844081e-05, + "loss": 1.201, + "step": 16705 + }, + { + "epoch": 4.975818611664402, + "grad_norm": 0.4674241244792938, + "learning_rate": 1.0558193648544148e-05, + "loss": 1.2212, + "step": 16706 + }, + { + "epoch": 4.97611645786407, + "grad_norm": 0.48664671182632446, + "learning_rate": 1.0557230583454119e-05, + "loss": 1.2251, + "step": 16707 + }, + { + "epoch": 4.976414304063739, + "grad_norm": 0.4714708924293518, + "learning_rate": 1.055626751317968e-05, + "loss": 1.2109, + "step": 16708 + }, + { + "epoch": 4.976712150263408, + "grad_norm": 0.5601451396942139, + "learning_rate": 1.0555304437729795e-05, + "loss": 1.2094, + "step": 16709 + }, + { + "epoch": 4.977009996463076, + "grad_norm": 0.6209285259246826, + "learning_rate": 1.0554341357113423e-05, + "loss": 1.2182, + "step": 16710 + }, + { + "epoch": 4.977307842662745, + "grad_norm": 0.3429107367992401, + "learning_rate": 1.0553378271339523e-05, + "loss": 1.2225, + "step": 16711 + }, + { + "epoch": 4.977605688862414, + "grad_norm": 0.32052913308143616, + "learning_rate": 1.0552415180417057e-05, + "loss": 1.2083, + "step": 16712 + }, + { + "epoch": 4.977903535062082, + "grad_norm": 0.3625233471393585, + "learning_rate": 1.0551452084354982e-05, + "loss": 1.23, + "step": 16713 + }, + { + "epoch": 4.978201381261751, + "grad_norm": 0.2513951361179352, + "learning_rate": 1.0550488983162266e-05, + "loss": 1.2109, + "step": 16714 + }, + { + "epoch": 4.97849922746142, + "grad_norm": 0.3679635226726532, + "learning_rate": 1.0549525876847863e-05, + "loss": 1.222, + "step": 16715 + }, + { + "epoch": 4.978797073661088, + "grad_norm": 0.2886575758457184, + "learning_rate": 1.0548562765420735e-05, + "loss": 1.1996, + "step": 16716 + }, + { + "epoch": 4.979094919860757, + "grad_norm": 0.2606308162212372, + "learning_rate": 1.0547599648889846e-05, + "loss": 1.2123, + "step": 16717 + }, + { + "epoch": 4.979392766060426, + "grad_norm": 0.38203656673431396, + "learning_rate": 1.0546636527264154e-05, + "loss": 1.2061, + "step": 16718 + }, + { + "epoch": 4.979690612260094, + "grad_norm": 0.26509690284729004, + "learning_rate": 1.0545673400552613e-05, + "loss": 1.2082, + "step": 16719 + }, + { + "epoch": 4.979988458459763, + "grad_norm": 0.4086620807647705, + "learning_rate": 1.0544710268764198e-05, + "loss": 1.2126, + "step": 16720 + }, + { + "epoch": 4.9802863046594315, + "grad_norm": 0.3342956006526947, + "learning_rate": 1.0543747131907862e-05, + "loss": 1.2151, + "step": 16721 + }, + { + "epoch": 4.9805841508591, + "grad_norm": 0.303300678730011, + "learning_rate": 1.0542783989992565e-05, + "loss": 1.2292, + "step": 16722 + }, + { + "epoch": 4.980881997058769, + "grad_norm": 0.322784423828125, + "learning_rate": 1.0541820843027268e-05, + "loss": 1.2103, + "step": 16723 + }, + { + "epoch": 4.981179843258437, + "grad_norm": 0.30369865894317627, + "learning_rate": 1.0540857691020934e-05, + "loss": 1.2106, + "step": 16724 + }, + { + "epoch": 4.9814776894581065, + "grad_norm": 0.2760009765625, + "learning_rate": 1.0539894533982524e-05, + "loss": 1.213, + "step": 16725 + }, + { + "epoch": 4.981775535657775, + "grad_norm": 0.2748645842075348, + "learning_rate": 1.0538931371921e-05, + "loss": 1.1896, + "step": 16726 + }, + { + "epoch": 4.982073381857443, + "grad_norm": 0.29580578207969666, + "learning_rate": 1.0537968204845319e-05, + "loss": 1.2122, + "step": 16727 + }, + { + "epoch": 4.9823712280571115, + "grad_norm": 0.266201376914978, + "learning_rate": 1.0537005032764447e-05, + "loss": 1.2032, + "step": 16728 + }, + { + "epoch": 4.982669074256781, + "grad_norm": 0.3600304126739502, + "learning_rate": 1.0536041855687343e-05, + "loss": 1.2158, + "step": 16729 + }, + { + "epoch": 4.982966920456449, + "grad_norm": 0.41808775067329407, + "learning_rate": 1.0535078673622967e-05, + "loss": 1.2106, + "step": 16730 + }, + { + "epoch": 4.983264766656118, + "grad_norm": 0.3674350380897522, + "learning_rate": 1.0534115486580283e-05, + "loss": 1.22, + "step": 16731 + }, + { + "epoch": 4.983562612855787, + "grad_norm": 0.496549129486084, + "learning_rate": 1.053315229456825e-05, + "loss": 1.2233, + "step": 16732 + }, + { + "epoch": 4.983860459055455, + "grad_norm": 0.4117461144924164, + "learning_rate": 1.0532189097595831e-05, + "loss": 1.2182, + "step": 16733 + }, + { + "epoch": 4.984158305255124, + "grad_norm": 0.5589625239372253, + "learning_rate": 1.053122589567199e-05, + "loss": 1.2178, + "step": 16734 + }, + { + "epoch": 4.9844561514547925, + "grad_norm": 0.4036214351654053, + "learning_rate": 1.0530262688805684e-05, + "loss": 1.2201, + "step": 16735 + }, + { + "epoch": 4.984753997654461, + "grad_norm": 0.38442564010620117, + "learning_rate": 1.0529299477005874e-05, + "loss": 1.2195, + "step": 16736 + }, + { + "epoch": 4.98505184385413, + "grad_norm": 0.4841512143611908, + "learning_rate": 1.0528336260281523e-05, + "loss": 1.2209, + "step": 16737 + }, + { + "epoch": 4.985349690053798, + "grad_norm": 0.2693743109703064, + "learning_rate": 1.0527373038641595e-05, + "loss": 1.2218, + "step": 16738 + }, + { + "epoch": 4.985647536253467, + "grad_norm": 0.5425184965133667, + "learning_rate": 1.0526409812095052e-05, + "loss": 1.2191, + "step": 16739 + }, + { + "epoch": 4.985945382453136, + "grad_norm": 0.33673861622810364, + "learning_rate": 1.0525446580650852e-05, + "loss": 1.2281, + "step": 16740 + }, + { + "epoch": 4.986243228652804, + "grad_norm": 0.4996735453605652, + "learning_rate": 1.0524483344317959e-05, + "loss": 1.2295, + "step": 16741 + }, + { + "epoch": 4.986541074852473, + "grad_norm": 0.3482604920864105, + "learning_rate": 1.0523520103105331e-05, + "loss": 1.2104, + "step": 16742 + }, + { + "epoch": 4.986838921052142, + "grad_norm": 0.26863017678260803, + "learning_rate": 1.0522556857021937e-05, + "loss": 1.2201, + "step": 16743 + }, + { + "epoch": 4.98713676725181, + "grad_norm": 0.31027543544769287, + "learning_rate": 1.0521593606076734e-05, + "loss": 1.2266, + "step": 16744 + }, + { + "epoch": 4.987434613451479, + "grad_norm": 0.26575297117233276, + "learning_rate": 1.0520630350278689e-05, + "loss": 1.2221, + "step": 16745 + }, + { + "epoch": 4.987732459651148, + "grad_norm": 0.27818453311920166, + "learning_rate": 1.0519667089636758e-05, + "loss": 1.2135, + "step": 16746 + }, + { + "epoch": 4.988030305850816, + "grad_norm": 0.3446889817714691, + "learning_rate": 1.0518703824159903e-05, + "loss": 1.2133, + "step": 16747 + }, + { + "epoch": 4.988328152050485, + "grad_norm": 0.2743852734565735, + "learning_rate": 1.0517740553857089e-05, + "loss": 1.2231, + "step": 16748 + }, + { + "epoch": 4.9886259982501535, + "grad_norm": 0.3398355543613434, + "learning_rate": 1.051677727873728e-05, + "loss": 1.2129, + "step": 16749 + }, + { + "epoch": 4.988923844449822, + "grad_norm": 0.3796647787094116, + "learning_rate": 1.0515813998809434e-05, + "loss": 1.2016, + "step": 16750 + }, + { + "epoch": 4.989221690649491, + "grad_norm": 0.30225494503974915, + "learning_rate": 1.0514850714082517e-05, + "loss": 1.219, + "step": 16751 + }, + { + "epoch": 4.989519536849159, + "grad_norm": 0.8262394070625305, + "learning_rate": 1.051388742456549e-05, + "loss": 1.239, + "step": 16752 + }, + { + "epoch": 4.989817383048829, + "grad_norm": 0.515708327293396, + "learning_rate": 1.051292413026731e-05, + "loss": 1.2234, + "step": 16753 + }, + { + "epoch": 4.990115229248497, + "grad_norm": 0.45032769441604614, + "learning_rate": 1.0511960831196946e-05, + "loss": 1.2183, + "step": 16754 + }, + { + "epoch": 4.990413075448165, + "grad_norm": 0.4615631103515625, + "learning_rate": 1.051099752736336e-05, + "loss": 1.2119, + "step": 16755 + }, + { + "epoch": 4.9907109216478345, + "grad_norm": 0.4467100203037262, + "learning_rate": 1.0510034218775514e-05, + "loss": 1.2253, + "step": 16756 + }, + { + "epoch": 4.991008767847503, + "grad_norm": 0.3474428951740265, + "learning_rate": 1.0509070905442364e-05, + "loss": 1.2064, + "step": 16757 + }, + { + "epoch": 4.991306614047171, + "grad_norm": 0.40014272928237915, + "learning_rate": 1.0508107587372884e-05, + "loss": 1.2208, + "step": 16758 + }, + { + "epoch": 4.99160446024684, + "grad_norm": 0.321129709482193, + "learning_rate": 1.0507144264576028e-05, + "loss": 1.2146, + "step": 16759 + }, + { + "epoch": 4.991902306446509, + "grad_norm": 0.2836153209209442, + "learning_rate": 1.0506180937060759e-05, + "loss": 1.1964, + "step": 16760 + }, + { + "epoch": 4.992200152646177, + "grad_norm": 0.44101935625076294, + "learning_rate": 1.0505217604836045e-05, + "loss": 1.2227, + "step": 16761 + }, + { + "epoch": 4.992497998845846, + "grad_norm": 0.2814334034919739, + "learning_rate": 1.0504254267910847e-05, + "loss": 1.2179, + "step": 16762 + }, + { + "epoch": 4.9927958450455145, + "grad_norm": 0.3615776300430298, + "learning_rate": 1.0503290926294122e-05, + "loss": 1.2043, + "step": 16763 + }, + { + "epoch": 4.993093691245184, + "grad_norm": 0.30053389072418213, + "learning_rate": 1.050232757999484e-05, + "loss": 1.2132, + "step": 16764 + }, + { + "epoch": 4.993391537444852, + "grad_norm": 0.3011837303638458, + "learning_rate": 1.0501364229021962e-05, + "loss": 1.202, + "step": 16765 + }, + { + "epoch": 4.99368938364452, + "grad_norm": 0.26689302921295166, + "learning_rate": 1.0500400873384446e-05, + "loss": 1.2103, + "step": 16766 + }, + { + "epoch": 4.993987229844189, + "grad_norm": 0.3297526240348816, + "learning_rate": 1.0499437513091263e-05, + "loss": 1.2247, + "step": 16767 + }, + { + "epoch": 4.994285076043858, + "grad_norm": 0.30646270513534546, + "learning_rate": 1.049847414815137e-05, + "loss": 1.2096, + "step": 16768 + }, + { + "epoch": 4.994582922243526, + "grad_norm": 0.31208336353302, + "learning_rate": 1.0497510778573733e-05, + "loss": 1.2111, + "step": 16769 + }, + { + "epoch": 4.9948807684431955, + "grad_norm": 0.2566390335559845, + "learning_rate": 1.0496547404367314e-05, + "loss": 1.2269, + "step": 16770 + }, + { + "epoch": 4.995178614642864, + "grad_norm": 0.3259469270706177, + "learning_rate": 1.0495584025541077e-05, + "loss": 1.2038, + "step": 16771 + }, + { + "epoch": 4.995476460842532, + "grad_norm": 0.2688966393470764, + "learning_rate": 1.049462064210398e-05, + "loss": 1.2179, + "step": 16772 + }, + { + "epoch": 4.995774307042201, + "grad_norm": 0.24849647283554077, + "learning_rate": 1.0493657254064995e-05, + "loss": 1.215, + "step": 16773 + }, + { + "epoch": 4.99607215324187, + "grad_norm": 0.2939499616622925, + "learning_rate": 1.049269386143308e-05, + "loss": 1.1907, + "step": 16774 + }, + { + "epoch": 4.996369999441538, + "grad_norm": 0.2791326642036438, + "learning_rate": 1.04917304642172e-05, + "loss": 1.2196, + "step": 16775 + }, + { + "epoch": 4.996667845641207, + "grad_norm": 0.28479069471359253, + "learning_rate": 1.0490767062426314e-05, + "loss": 1.2281, + "step": 16776 + }, + { + "epoch": 4.996965691840876, + "grad_norm": 0.5685864686965942, + "learning_rate": 1.0489803656069392e-05, + "loss": 1.2241, + "step": 16777 + }, + { + "epoch": 4.997263538040544, + "grad_norm": 0.43502071499824524, + "learning_rate": 1.0488840245155392e-05, + "loss": 1.2231, + "step": 16778 + }, + { + "epoch": 4.997561384240213, + "grad_norm": 0.420467346906662, + "learning_rate": 1.0487876829693283e-05, + "loss": 1.2031, + "step": 16779 + }, + { + "epoch": 4.9978592304398815, + "grad_norm": 0.4691842198371887, + "learning_rate": 1.0486913409692022e-05, + "loss": 1.2187, + "step": 16780 + }, + { + "epoch": 4.998157076639551, + "grad_norm": 0.3853599727153778, + "learning_rate": 1.0485949985160575e-05, + "loss": 1.221, + "step": 16781 + }, + { + "epoch": 4.998454922839219, + "grad_norm": 0.5169789791107178, + "learning_rate": 1.048498655610791e-05, + "loss": 1.2123, + "step": 16782 + }, + { + "epoch": 4.998752769038887, + "grad_norm": 0.30979102849960327, + "learning_rate": 1.0484023122542983e-05, + "loss": 1.2158, + "step": 16783 + }, + { + "epoch": 4.9990506152385565, + "grad_norm": 0.35916584730148315, + "learning_rate": 1.0483059684474764e-05, + "loss": 1.1891, + "step": 16784 + }, + { + "epoch": 4.999348461438225, + "grad_norm": 0.3863771855831146, + "learning_rate": 1.0482096241912211e-05, + "loss": 1.2279, + "step": 16785 + }, + { + "epoch": 4.999646307637893, + "grad_norm": 0.32903555035591125, + "learning_rate": 1.0481132794864298e-05, + "loss": 1.2017, + "step": 16786 + }, + { + "epoch": 4.999944153837562, + "grad_norm": 0.3094794452190399, + "learning_rate": 1.0480169343339976e-05, + "loss": 1.2045, + "step": 16787 + }, + { + "epoch": 5.000242000037231, + "grad_norm": 0.2456616908311844, + "learning_rate": 1.0479205887348216e-05, + "loss": 1.2252, + "step": 16788 + }, + { + "epoch": 5.000539846236899, + "grad_norm": 0.3059477210044861, + "learning_rate": 1.0478242426897982e-05, + "loss": 1.2404, + "step": 16789 + }, + { + "epoch": 5.000837692436568, + "grad_norm": 0.2795867621898651, + "learning_rate": 1.0477278961998236e-05, + "loss": 1.197, + "step": 16790 + }, + { + "epoch": 5.001135538636237, + "grad_norm": 0.35036084055900574, + "learning_rate": 1.047631549265794e-05, + "loss": 1.2122, + "step": 16791 + }, + { + "epoch": 5.001433384835905, + "grad_norm": 0.24780143797397614, + "learning_rate": 1.0475352018886067e-05, + "loss": 1.2192, + "step": 16792 + }, + { + "epoch": 5.001731231035574, + "grad_norm": 0.2544531524181366, + "learning_rate": 1.0474388540691569e-05, + "loss": 1.2197, + "step": 16793 + }, + { + "epoch": 5.0020290772352425, + "grad_norm": 0.35282015800476074, + "learning_rate": 1.0473425058083418e-05, + "loss": 1.2164, + "step": 16794 + }, + { + "epoch": 5.002326923434912, + "grad_norm": 0.33501848578453064, + "learning_rate": 1.0472461571070574e-05, + "loss": 1.2349, + "step": 16795 + }, + { + "epoch": 5.00262476963458, + "grad_norm": 0.24779658019542694, + "learning_rate": 1.0471498079662001e-05, + "loss": 1.2159, + "step": 16796 + }, + { + "epoch": 5.002922615834248, + "grad_norm": 0.3270301818847656, + "learning_rate": 1.0470534583866669e-05, + "loss": 1.2143, + "step": 16797 + }, + { + "epoch": 5.003220462033918, + "grad_norm": 0.3353484570980072, + "learning_rate": 1.0469571083693538e-05, + "loss": 1.2297, + "step": 16798 + }, + { + "epoch": 5.003518308233586, + "grad_norm": 0.3318248689174652, + "learning_rate": 1.0468607579151574e-05, + "loss": 1.213, + "step": 16799 + }, + { + "epoch": 5.003816154433254, + "grad_norm": 0.5154165625572205, + "learning_rate": 1.0467644070249736e-05, + "loss": 1.2114, + "step": 16800 + }, + { + "epoch": 5.004114000632923, + "grad_norm": 0.2916905879974365, + "learning_rate": 1.0466680556996994e-05, + "loss": 1.2323, + "step": 16801 + }, + { + "epoch": 5.004411846832592, + "grad_norm": 0.7062249183654785, + "learning_rate": 1.046571703940231e-05, + "loss": 1.2052, + "step": 16802 + }, + { + "epoch": 5.00470969303226, + "grad_norm": 0.28323349356651306, + "learning_rate": 1.046475351747465e-05, + "loss": 1.2076, + "step": 16803 + }, + { + "epoch": 5.005007539231929, + "grad_norm": 0.6860963702201843, + "learning_rate": 1.046378999122298e-05, + "loss": 1.2258, + "step": 16804 + }, + { + "epoch": 5.005305385431598, + "grad_norm": 0.45981213450431824, + "learning_rate": 1.046282646065626e-05, + "loss": 1.2151, + "step": 16805 + }, + { + "epoch": 5.005603231631266, + "grad_norm": 0.4969659149646759, + "learning_rate": 1.0461862925783457e-05, + "loss": 1.2304, + "step": 16806 + }, + { + "epoch": 5.005901077830935, + "grad_norm": 0.5255463123321533, + "learning_rate": 1.0460899386613537e-05, + "loss": 1.2139, + "step": 16807 + }, + { + "epoch": 5.0061989240306035, + "grad_norm": 0.31206822395324707, + "learning_rate": 1.045993584315546e-05, + "loss": 1.2146, + "step": 16808 + }, + { + "epoch": 5.006496770230273, + "grad_norm": 0.36263447999954224, + "learning_rate": 1.0458972295418196e-05, + "loss": 1.2387, + "step": 16809 + }, + { + "epoch": 5.006794616429941, + "grad_norm": 0.31523358821868896, + "learning_rate": 1.045800874341071e-05, + "loss": 1.2408, + "step": 16810 + }, + { + "epoch": 5.007092462629609, + "grad_norm": 0.26450997591018677, + "learning_rate": 1.0457045187141962e-05, + "loss": 1.2104, + "step": 16811 + }, + { + "epoch": 5.007390308829279, + "grad_norm": 0.33075958490371704, + "learning_rate": 1.0456081626620918e-05, + "loss": 1.2293, + "step": 16812 + }, + { + "epoch": 5.007688155028947, + "grad_norm": 0.34704312682151794, + "learning_rate": 1.0455118061856546e-05, + "loss": 1.2166, + "step": 16813 + }, + { + "epoch": 5.007986001228615, + "grad_norm": 0.40252599120140076, + "learning_rate": 1.0454154492857806e-05, + "loss": 1.22, + "step": 16814 + }, + { + "epoch": 5.0082838474282845, + "grad_norm": 0.31370943784713745, + "learning_rate": 1.0453190919633669e-05, + "loss": 1.2253, + "step": 16815 + }, + { + "epoch": 5.008581693627953, + "grad_norm": 0.28903695940971375, + "learning_rate": 1.0452227342193098e-05, + "loss": 1.2191, + "step": 16816 + }, + { + "epoch": 5.008879539827621, + "grad_norm": 0.3101347088813782, + "learning_rate": 1.0451263760545054e-05, + "loss": 1.2165, + "step": 16817 + }, + { + "epoch": 5.00917738602729, + "grad_norm": 0.3071835935115814, + "learning_rate": 1.0450300174698505e-05, + "loss": 1.1955, + "step": 16818 + }, + { + "epoch": 5.009475232226959, + "grad_norm": 0.38228222727775574, + "learning_rate": 1.0449336584662413e-05, + "loss": 1.2072, + "step": 16819 + }, + { + "epoch": 5.009773078426628, + "grad_norm": 0.4122900664806366, + "learning_rate": 1.0448372990445752e-05, + "loss": 1.1978, + "step": 16820 + }, + { + "epoch": 5.010070924626296, + "grad_norm": 0.3079754412174225, + "learning_rate": 1.0447409392057479e-05, + "loss": 1.2248, + "step": 16821 + }, + { + "epoch": 5.0103687708259645, + "grad_norm": 0.3712056577205658, + "learning_rate": 1.0446445789506561e-05, + "loss": 1.213, + "step": 16822 + }, + { + "epoch": 5.010666617025634, + "grad_norm": 0.26759687066078186, + "learning_rate": 1.0445482182801964e-05, + "loss": 1.2028, + "step": 16823 + }, + { + "epoch": 5.010964463225302, + "grad_norm": 0.5693494081497192, + "learning_rate": 1.044451857195265e-05, + "loss": 1.2246, + "step": 16824 + }, + { + "epoch": 5.01126230942497, + "grad_norm": 0.2638775706291199, + "learning_rate": 1.0443554956967592e-05, + "loss": 1.2216, + "step": 16825 + }, + { + "epoch": 5.01156015562464, + "grad_norm": 0.4996988773345947, + "learning_rate": 1.044259133785575e-05, + "loss": 1.2302, + "step": 16826 + }, + { + "epoch": 5.011858001824308, + "grad_norm": 0.3001219928264618, + "learning_rate": 1.044162771462609e-05, + "loss": 1.2087, + "step": 16827 + }, + { + "epoch": 5.012155848023976, + "grad_norm": 0.3501885235309601, + "learning_rate": 1.0440664087287575e-05, + "loss": 1.2154, + "step": 16828 + }, + { + "epoch": 5.0124536942236455, + "grad_norm": 0.2479260414838791, + "learning_rate": 1.0439700455849176e-05, + "loss": 1.2149, + "step": 16829 + }, + { + "epoch": 5.012751540423314, + "grad_norm": 0.3943127691745758, + "learning_rate": 1.0438736820319855e-05, + "loss": 1.2112, + "step": 16830 + }, + { + "epoch": 5.013049386622982, + "grad_norm": 0.3294520676136017, + "learning_rate": 1.0437773180708575e-05, + "loss": 1.215, + "step": 16831 + }, + { + "epoch": 5.013347232822651, + "grad_norm": 0.25596147775650024, + "learning_rate": 1.0436809537024309e-05, + "loss": 1.2236, + "step": 16832 + }, + { + "epoch": 5.01364507902232, + "grad_norm": 0.2845492362976074, + "learning_rate": 1.0435845889276018e-05, + "loss": 1.2239, + "step": 16833 + }, + { + "epoch": 5.013942925221989, + "grad_norm": 0.2554742097854614, + "learning_rate": 1.0434882237472666e-05, + "loss": 1.219, + "step": 16834 + }, + { + "epoch": 5.014240771421657, + "grad_norm": 0.28055235743522644, + "learning_rate": 1.043391858162322e-05, + "loss": 1.2077, + "step": 16835 + }, + { + "epoch": 5.014538617621326, + "grad_norm": 0.290095716714859, + "learning_rate": 1.0432954921736646e-05, + "loss": 1.2185, + "step": 16836 + }, + { + "epoch": 5.014836463820995, + "grad_norm": 0.26479220390319824, + "learning_rate": 1.0431991257821911e-05, + "loss": 1.2041, + "step": 16837 + }, + { + "epoch": 5.015134310020663, + "grad_norm": 0.27861976623535156, + "learning_rate": 1.043102758988798e-05, + "loss": 1.2362, + "step": 16838 + }, + { + "epoch": 5.0154321562203314, + "grad_norm": 0.29868316650390625, + "learning_rate": 1.043006391794382e-05, + "loss": 1.2242, + "step": 16839 + }, + { + "epoch": 5.015730002420001, + "grad_norm": 0.28946223855018616, + "learning_rate": 1.0429100241998395e-05, + "loss": 1.2185, + "step": 16840 + }, + { + "epoch": 5.016027848619669, + "grad_norm": 0.3949662148952484, + "learning_rate": 1.0428136562060673e-05, + "loss": 1.2255, + "step": 16841 + }, + { + "epoch": 5.016325694819337, + "grad_norm": 0.32382822036743164, + "learning_rate": 1.0427172878139615e-05, + "loss": 1.2142, + "step": 16842 + }, + { + "epoch": 5.0166235410190065, + "grad_norm": 0.2649022936820984, + "learning_rate": 1.0426209190244193e-05, + "loss": 1.2183, + "step": 16843 + }, + { + "epoch": 5.016921387218675, + "grad_norm": 0.2537255585193634, + "learning_rate": 1.0425245498383372e-05, + "loss": 1.2207, + "step": 16844 + }, + { + "epoch": 5.017219233418343, + "grad_norm": 0.25246289372444153, + "learning_rate": 1.0424281802566114e-05, + "loss": 1.2226, + "step": 16845 + }, + { + "epoch": 5.017517079618012, + "grad_norm": 0.29220494627952576, + "learning_rate": 1.042331810280139e-05, + "loss": 1.2143, + "step": 16846 + }, + { + "epoch": 5.017814925817681, + "grad_norm": 0.25921738147735596, + "learning_rate": 1.0422354399098165e-05, + "loss": 1.2365, + "step": 16847 + }, + { + "epoch": 5.01811277201735, + "grad_norm": 0.3867584466934204, + "learning_rate": 1.04213906914654e-05, + "loss": 1.2081, + "step": 16848 + }, + { + "epoch": 5.018410618217018, + "grad_norm": 0.3751278817653656, + "learning_rate": 1.0420426979912068e-05, + "loss": 1.2161, + "step": 16849 + }, + { + "epoch": 5.018708464416687, + "grad_norm": 0.329056054353714, + "learning_rate": 1.0419463264447133e-05, + "loss": 1.2059, + "step": 16850 + }, + { + "epoch": 5.019006310616356, + "grad_norm": 0.3369464576244354, + "learning_rate": 1.0418499545079562e-05, + "loss": 1.2106, + "step": 16851 + }, + { + "epoch": 5.019304156816024, + "grad_norm": 0.27653270959854126, + "learning_rate": 1.0417535821818319e-05, + "loss": 1.2201, + "step": 16852 + }, + { + "epoch": 5.0196020030156925, + "grad_norm": 0.2683281898498535, + "learning_rate": 1.041657209467237e-05, + "loss": 1.2077, + "step": 16853 + }, + { + "epoch": 5.019899849215362, + "grad_norm": 0.4458148181438446, + "learning_rate": 1.0415608363650685e-05, + "loss": 1.2174, + "step": 16854 + }, + { + "epoch": 5.02019769541503, + "grad_norm": 0.30316099524497986, + "learning_rate": 1.0414644628762227e-05, + "loss": 1.2164, + "step": 16855 + }, + { + "epoch": 5.020495541614698, + "grad_norm": 0.40766316652297974, + "learning_rate": 1.0413680890015965e-05, + "loss": 1.2128, + "step": 16856 + }, + { + "epoch": 5.020793387814368, + "grad_norm": 0.500697672367096, + "learning_rate": 1.0412717147420866e-05, + "loss": 1.2112, + "step": 16857 + }, + { + "epoch": 5.021091234014036, + "grad_norm": 0.2633652985095978, + "learning_rate": 1.0411753400985894e-05, + "loss": 1.2026, + "step": 16858 + }, + { + "epoch": 5.021389080213704, + "grad_norm": 0.42190057039260864, + "learning_rate": 1.0410789650720016e-05, + "loss": 1.2165, + "step": 16859 + }, + { + "epoch": 5.021686926413373, + "grad_norm": 0.2626347541809082, + "learning_rate": 1.0409825896632198e-05, + "loss": 1.2153, + "step": 16860 + }, + { + "epoch": 5.021984772613042, + "grad_norm": 0.5549313426017761, + "learning_rate": 1.040886213873141e-05, + "loss": 1.2187, + "step": 16861 + }, + { + "epoch": 5.022282618812711, + "grad_norm": 0.36056095361709595, + "learning_rate": 1.0407898377026615e-05, + "loss": 1.2166, + "step": 16862 + }, + { + "epoch": 5.022580465012379, + "grad_norm": 0.4121907353401184, + "learning_rate": 1.0406934611526785e-05, + "loss": 1.2091, + "step": 16863 + }, + { + "epoch": 5.022878311212048, + "grad_norm": 0.29632169008255005, + "learning_rate": 1.040597084224088e-05, + "loss": 1.2185, + "step": 16864 + }, + { + "epoch": 5.023176157411717, + "grad_norm": 0.4148094654083252, + "learning_rate": 1.0405007069177869e-05, + "loss": 1.2176, + "step": 16865 + }, + { + "epoch": 5.023474003611385, + "grad_norm": 0.31199824810028076, + "learning_rate": 1.0404043292346722e-05, + "loss": 1.2217, + "step": 16866 + }, + { + "epoch": 5.0237718498110535, + "grad_norm": 0.39971500635147095, + "learning_rate": 1.04030795117564e-05, + "loss": 1.2261, + "step": 16867 + }, + { + "epoch": 5.024069696010723, + "grad_norm": 0.35686275362968445, + "learning_rate": 1.040211572741588e-05, + "loss": 1.2113, + "step": 16868 + }, + { + "epoch": 5.024367542210391, + "grad_norm": 0.36970195174217224, + "learning_rate": 1.0401151939334118e-05, + "loss": 1.1995, + "step": 16869 + }, + { + "epoch": 5.024665388410059, + "grad_norm": 0.34617671370506287, + "learning_rate": 1.0400188147520084e-05, + "loss": 1.1871, + "step": 16870 + }, + { + "epoch": 5.024963234609729, + "grad_norm": 0.3224303424358368, + "learning_rate": 1.0399224351982748e-05, + "loss": 1.2256, + "step": 16871 + }, + { + "epoch": 5.025261080809397, + "grad_norm": 0.3524259328842163, + "learning_rate": 1.0398260552731076e-05, + "loss": 1.2142, + "step": 16872 + }, + { + "epoch": 5.025558927009065, + "grad_norm": 0.3058825433254242, + "learning_rate": 1.0397296749774034e-05, + "loss": 1.2107, + "step": 16873 + }, + { + "epoch": 5.0258567732087345, + "grad_norm": 0.34864315390586853, + "learning_rate": 1.0396332943120593e-05, + "loss": 1.2073, + "step": 16874 + }, + { + "epoch": 5.026154619408403, + "grad_norm": 0.42834699153900146, + "learning_rate": 1.0395369132779713e-05, + "loss": 1.2065, + "step": 16875 + }, + { + "epoch": 5.026452465608072, + "grad_norm": 0.25437748432159424, + "learning_rate": 1.0394405318760365e-05, + "loss": 1.2193, + "step": 16876 + }, + { + "epoch": 5.02675031180774, + "grad_norm": 0.3262318968772888, + "learning_rate": 1.0393441501071517e-05, + "loss": 1.2304, + "step": 16877 + }, + { + "epoch": 5.027048158007409, + "grad_norm": 0.3492392897605896, + "learning_rate": 1.0392477679722135e-05, + "loss": 1.2346, + "step": 16878 + }, + { + "epoch": 5.027346004207078, + "grad_norm": 0.31459948420524597, + "learning_rate": 1.0391513854721187e-05, + "loss": 1.2128, + "step": 16879 + }, + { + "epoch": 5.027643850406746, + "grad_norm": 0.4275921881198883, + "learning_rate": 1.0390550026077642e-05, + "loss": 1.2092, + "step": 16880 + }, + { + "epoch": 5.0279416966064145, + "grad_norm": 0.27070149779319763, + "learning_rate": 1.0389586193800462e-05, + "loss": 1.2135, + "step": 16881 + }, + { + "epoch": 5.028239542806084, + "grad_norm": 0.7566307187080383, + "learning_rate": 1.0388622357898621e-05, + "loss": 1.2071, + "step": 16882 + }, + { + "epoch": 5.028537389005752, + "grad_norm": 0.4427952170372009, + "learning_rate": 1.0387658518381078e-05, + "loss": 1.2148, + "step": 16883 + }, + { + "epoch": 5.02883523520542, + "grad_norm": 0.6872209906578064, + "learning_rate": 1.038669467525681e-05, + "loss": 1.2323, + "step": 16884 + }, + { + "epoch": 5.02913308140509, + "grad_norm": 0.30873167514801025, + "learning_rate": 1.038573082853478e-05, + "loss": 1.1977, + "step": 16885 + }, + { + "epoch": 5.029430927604758, + "grad_norm": 0.7701168060302734, + "learning_rate": 1.0384766978223954e-05, + "loss": 1.206, + "step": 16886 + }, + { + "epoch": 5.029728773804427, + "grad_norm": 0.3259108066558838, + "learning_rate": 1.0383803124333302e-05, + "loss": 1.2131, + "step": 16887 + }, + { + "epoch": 5.0300266200040955, + "grad_norm": 0.4072898030281067, + "learning_rate": 1.038283926687179e-05, + "loss": 1.2214, + "step": 16888 + }, + { + "epoch": 5.030324466203764, + "grad_norm": 0.31779375672340393, + "learning_rate": 1.0381875405848387e-05, + "loss": 1.2164, + "step": 16889 + }, + { + "epoch": 5.030622312403433, + "grad_norm": 0.27759429812431335, + "learning_rate": 1.0380911541272059e-05, + "loss": 1.208, + "step": 16890 + }, + { + "epoch": 5.030920158603101, + "grad_norm": 0.31237512826919556, + "learning_rate": 1.0379947673151778e-05, + "loss": 1.2114, + "step": 16891 + }, + { + "epoch": 5.03121800480277, + "grad_norm": 0.3162155747413635, + "learning_rate": 1.0378983801496505e-05, + "loss": 1.2206, + "step": 16892 + }, + { + "epoch": 5.031515851002439, + "grad_norm": 0.2899010479450226, + "learning_rate": 1.0378019926315212e-05, + "loss": 1.2239, + "step": 16893 + }, + { + "epoch": 5.031813697202107, + "grad_norm": 0.39960283041000366, + "learning_rate": 1.0377056047616864e-05, + "loss": 1.2271, + "step": 16894 + }, + { + "epoch": 5.032111543401776, + "grad_norm": 0.26622918248176575, + "learning_rate": 1.0376092165410434e-05, + "loss": 1.2148, + "step": 16895 + }, + { + "epoch": 5.032409389601445, + "grad_norm": 0.43999436497688293, + "learning_rate": 1.0375128279704885e-05, + "loss": 1.2341, + "step": 16896 + }, + { + "epoch": 5.032707235801113, + "grad_norm": 0.2580682635307312, + "learning_rate": 1.0374164390509187e-05, + "loss": 1.2047, + "step": 16897 + }, + { + "epoch": 5.0330050820007814, + "grad_norm": 0.3405083417892456, + "learning_rate": 1.037320049783231e-05, + "loss": 1.21, + "step": 16898 + }, + { + "epoch": 5.033302928200451, + "grad_norm": 0.34114450216293335, + "learning_rate": 1.0372236601683214e-05, + "loss": 1.2241, + "step": 16899 + }, + { + "epoch": 5.033600774400119, + "grad_norm": 0.37767964601516724, + "learning_rate": 1.0371272702070875e-05, + "loss": 1.2111, + "step": 16900 + }, + { + "epoch": 5.033898620599788, + "grad_norm": 0.3744494915008545, + "learning_rate": 1.0370308799004256e-05, + "loss": 1.2207, + "step": 16901 + }, + { + "epoch": 5.0341964667994565, + "grad_norm": 0.27336758375167847, + "learning_rate": 1.0369344892492333e-05, + "loss": 1.2101, + "step": 16902 + }, + { + "epoch": 5.034494312999125, + "grad_norm": 0.472644567489624, + "learning_rate": 1.0368380982544064e-05, + "loss": 1.2285, + "step": 16903 + }, + { + "epoch": 5.034792159198794, + "grad_norm": 0.2490297257900238, + "learning_rate": 1.0367417069168422e-05, + "loss": 1.2266, + "step": 16904 + }, + { + "epoch": 5.035090005398462, + "grad_norm": 0.5181571841239929, + "learning_rate": 1.0366453152374376e-05, + "loss": 1.2284, + "step": 16905 + }, + { + "epoch": 5.035387851598131, + "grad_norm": 0.3122056722640991, + "learning_rate": 1.0365489232170893e-05, + "loss": 1.2183, + "step": 16906 + }, + { + "epoch": 5.0356856977978, + "grad_norm": 0.4880363941192627, + "learning_rate": 1.0364525308566937e-05, + "loss": 1.2227, + "step": 16907 + }, + { + "epoch": 5.035983543997468, + "grad_norm": 0.47182410955429077, + "learning_rate": 1.0363561381571485e-05, + "loss": 1.2265, + "step": 16908 + }, + { + "epoch": 5.036281390197137, + "grad_norm": 0.36670053005218506, + "learning_rate": 1.0362597451193499e-05, + "loss": 1.2301, + "step": 16909 + }, + { + "epoch": 5.036579236396806, + "grad_norm": 0.64680016040802, + "learning_rate": 1.0361633517441949e-05, + "loss": 1.2088, + "step": 16910 + }, + { + "epoch": 5.036877082596474, + "grad_norm": 0.3548673689365387, + "learning_rate": 1.0360669580325802e-05, + "loss": 1.2177, + "step": 16911 + }, + { + "epoch": 5.0371749287961425, + "grad_norm": 0.4463796317577362, + "learning_rate": 1.0359705639854027e-05, + "loss": 1.2258, + "step": 16912 + }, + { + "epoch": 5.037472774995812, + "grad_norm": 0.3287397623062134, + "learning_rate": 1.0358741696035594e-05, + "loss": 1.2183, + "step": 16913 + }, + { + "epoch": 5.03777062119548, + "grad_norm": 0.40949130058288574, + "learning_rate": 1.0357777748879472e-05, + "loss": 1.2312, + "step": 16914 + }, + { + "epoch": 5.038068467395149, + "grad_norm": 0.4259876012802124, + "learning_rate": 1.0356813798394628e-05, + "loss": 1.1945, + "step": 16915 + }, + { + "epoch": 5.038366313594818, + "grad_norm": 0.27881747484207153, + "learning_rate": 1.0355849844590029e-05, + "loss": 1.2139, + "step": 16916 + }, + { + "epoch": 5.038664159794486, + "grad_norm": 0.3707573711872101, + "learning_rate": 1.0354885887474644e-05, + "loss": 1.2116, + "step": 16917 + }, + { + "epoch": 5.038962005994155, + "grad_norm": 0.31895044445991516, + "learning_rate": 1.0353921927057442e-05, + "loss": 1.2255, + "step": 16918 + }, + { + "epoch": 5.039259852193823, + "grad_norm": 0.2716042995452881, + "learning_rate": 1.0352957963347393e-05, + "loss": 1.2233, + "step": 16919 + }, + { + "epoch": 5.039557698393492, + "grad_norm": 0.3826630115509033, + "learning_rate": 1.0351993996353463e-05, + "loss": 1.2243, + "step": 16920 + }, + { + "epoch": 5.039855544593161, + "grad_norm": 0.2522062063217163, + "learning_rate": 1.0351030026084624e-05, + "loss": 1.2282, + "step": 16921 + }, + { + "epoch": 5.040153390792829, + "grad_norm": 0.3618514835834503, + "learning_rate": 1.0350066052549842e-05, + "loss": 1.2363, + "step": 16922 + }, + { + "epoch": 5.040451236992498, + "grad_norm": 0.29095321893692017, + "learning_rate": 1.0349102075758089e-05, + "loss": 1.2134, + "step": 16923 + }, + { + "epoch": 5.040749083192167, + "grad_norm": 0.30500462651252747, + "learning_rate": 1.0348138095718327e-05, + "loss": 1.1962, + "step": 16924 + }, + { + "epoch": 5.041046929391835, + "grad_norm": 0.35836461186408997, + "learning_rate": 1.0347174112439528e-05, + "loss": 1.2178, + "step": 16925 + }, + { + "epoch": 5.0413447755915035, + "grad_norm": 0.3042157292366028, + "learning_rate": 1.0346210125930667e-05, + "loss": 1.2136, + "step": 16926 + }, + { + "epoch": 5.041642621791173, + "grad_norm": 0.35214394330978394, + "learning_rate": 1.0345246136200704e-05, + "loss": 1.2086, + "step": 16927 + }, + { + "epoch": 5.041940467990841, + "grad_norm": 0.2565341889858246, + "learning_rate": 1.0344282143258612e-05, + "loss": 1.2281, + "step": 16928 + }, + { + "epoch": 5.04223831419051, + "grad_norm": 0.36944258213043213, + "learning_rate": 1.0343318147113364e-05, + "loss": 1.2198, + "step": 16929 + }, + { + "epoch": 5.042536160390179, + "grad_norm": 0.27086594700813293, + "learning_rate": 1.0342354147773917e-05, + "loss": 1.1991, + "step": 16930 + }, + { + "epoch": 5.042834006589847, + "grad_norm": 0.3638143837451935, + "learning_rate": 1.034139014524925e-05, + "loss": 1.2186, + "step": 16931 + }, + { + "epoch": 5.043131852789516, + "grad_norm": 0.34115177392959595, + "learning_rate": 1.0340426139548332e-05, + "loss": 1.2135, + "step": 16932 + }, + { + "epoch": 5.0434296989891845, + "grad_norm": 0.4475928843021393, + "learning_rate": 1.0339462130680125e-05, + "loss": 1.2159, + "step": 16933 + }, + { + "epoch": 5.043727545188853, + "grad_norm": 0.33746838569641113, + "learning_rate": 1.0338498118653604e-05, + "loss": 1.2213, + "step": 16934 + }, + { + "epoch": 5.044025391388522, + "grad_norm": 0.26538002490997314, + "learning_rate": 1.0337534103477736e-05, + "loss": 1.2115, + "step": 16935 + }, + { + "epoch": 5.04432323758819, + "grad_norm": 0.30209416151046753, + "learning_rate": 1.0336570085161489e-05, + "loss": 1.2304, + "step": 16936 + }, + { + "epoch": 5.044621083787859, + "grad_norm": 0.2903737723827362, + "learning_rate": 1.0335606063713835e-05, + "loss": 1.2124, + "step": 16937 + }, + { + "epoch": 5.044918929987528, + "grad_norm": 0.3577030599117279, + "learning_rate": 1.0334642039143741e-05, + "loss": 1.2226, + "step": 16938 + }, + { + "epoch": 5.045216776187196, + "grad_norm": 0.29479989409446716, + "learning_rate": 1.0333678011460178e-05, + "loss": 1.221, + "step": 16939 + }, + { + "epoch": 5.0455146223868645, + "grad_norm": 0.32492542266845703, + "learning_rate": 1.0332713980672113e-05, + "loss": 1.2151, + "step": 16940 + }, + { + "epoch": 5.045812468586534, + "grad_norm": 0.2970866560935974, + "learning_rate": 1.0331749946788514e-05, + "loss": 1.2135, + "step": 16941 + }, + { + "epoch": 5.046110314786202, + "grad_norm": 0.29935309290885925, + "learning_rate": 1.0330785909818355e-05, + "loss": 1.199, + "step": 16942 + }, + { + "epoch": 5.046408160985871, + "grad_norm": 0.39471688866615295, + "learning_rate": 1.0329821869770603e-05, + "loss": 1.2133, + "step": 16943 + }, + { + "epoch": 5.04670600718554, + "grad_norm": 0.3253411650657654, + "learning_rate": 1.0328857826654223e-05, + "loss": 1.2164, + "step": 16944 + }, + { + "epoch": 5.047003853385208, + "grad_norm": 0.25424107909202576, + "learning_rate": 1.0327893780478191e-05, + "loss": 1.2234, + "step": 16945 + }, + { + "epoch": 5.047301699584877, + "grad_norm": 0.3143296539783478, + "learning_rate": 1.0326929731251475e-05, + "loss": 1.2152, + "step": 16946 + }, + { + "epoch": 5.0475995457845455, + "grad_norm": 0.25473228096961975, + "learning_rate": 1.0325965678983044e-05, + "loss": 1.201, + "step": 16947 + }, + { + "epoch": 5.047897391984214, + "grad_norm": 0.33592459559440613, + "learning_rate": 1.0325001623681863e-05, + "loss": 1.2213, + "step": 16948 + }, + { + "epoch": 5.048195238183883, + "grad_norm": 0.3559499979019165, + "learning_rate": 1.0324037565356909e-05, + "loss": 1.2004, + "step": 16949 + }, + { + "epoch": 5.048493084383551, + "grad_norm": 0.28112372756004333, + "learning_rate": 1.0323073504017142e-05, + "loss": 1.1925, + "step": 16950 + }, + { + "epoch": 5.04879093058322, + "grad_norm": 0.29977861046791077, + "learning_rate": 1.0322109439671542e-05, + "loss": 1.2019, + "step": 16951 + }, + { + "epoch": 5.049088776782889, + "grad_norm": 0.4374518394470215, + "learning_rate": 1.0321145372329071e-05, + "loss": 1.2181, + "step": 16952 + }, + { + "epoch": 5.049386622982557, + "grad_norm": 0.3195934295654297, + "learning_rate": 1.0320181301998702e-05, + "loss": 1.2192, + "step": 16953 + }, + { + "epoch": 5.0496844691822265, + "grad_norm": 0.37285518646240234, + "learning_rate": 1.0319217228689401e-05, + "loss": 1.2041, + "step": 16954 + }, + { + "epoch": 5.049982315381895, + "grad_norm": 0.341904878616333, + "learning_rate": 1.0318253152410145e-05, + "loss": 1.2203, + "step": 16955 + }, + { + "epoch": 5.050280161581563, + "grad_norm": 0.41418173909187317, + "learning_rate": 1.0317289073169898e-05, + "loss": 1.2049, + "step": 16956 + }, + { + "epoch": 5.050578007781232, + "grad_norm": 0.4288500249385834, + "learning_rate": 1.0316324990977629e-05, + "loss": 1.2099, + "step": 16957 + }, + { + "epoch": 5.050875853980901, + "grad_norm": 0.31045854091644287, + "learning_rate": 1.031536090584231e-05, + "loss": 1.2121, + "step": 16958 + }, + { + "epoch": 5.051173700180569, + "grad_norm": 0.3993091285228729, + "learning_rate": 1.0314396817772909e-05, + "loss": 1.2185, + "step": 16959 + }, + { + "epoch": 5.051471546380238, + "grad_norm": 0.26940515637397766, + "learning_rate": 1.0313432726778398e-05, + "loss": 1.2201, + "step": 16960 + }, + { + "epoch": 5.0517693925799065, + "grad_norm": 0.2706749439239502, + "learning_rate": 1.0312468632867745e-05, + "loss": 1.2141, + "step": 16961 + }, + { + "epoch": 5.052067238779575, + "grad_norm": 0.35923677682876587, + "learning_rate": 1.031150453604992e-05, + "loss": 1.2192, + "step": 16962 + }, + { + "epoch": 5.052365084979244, + "grad_norm": 0.31290218234062195, + "learning_rate": 1.0310540436333898e-05, + "loss": 1.2273, + "step": 16963 + }, + { + "epoch": 5.052662931178912, + "grad_norm": 0.3103160858154297, + "learning_rate": 1.030957633372864e-05, + "loss": 1.2115, + "step": 16964 + }, + { + "epoch": 5.052960777378581, + "grad_norm": 0.330236554145813, + "learning_rate": 1.030861222824312e-05, + "loss": 1.2031, + "step": 16965 + }, + { + "epoch": 5.05325862357825, + "grad_norm": 0.2710285782814026, + "learning_rate": 1.0307648119886308e-05, + "loss": 1.2227, + "step": 16966 + }, + { + "epoch": 5.053556469777918, + "grad_norm": 0.329816609621048, + "learning_rate": 1.0306684008667173e-05, + "loss": 1.2153, + "step": 16967 + }, + { + "epoch": 5.0538543159775875, + "grad_norm": 0.3519997000694275, + "learning_rate": 1.0305719894594688e-05, + "loss": 1.2145, + "step": 16968 + }, + { + "epoch": 5.054152162177256, + "grad_norm": 0.25790297985076904, + "learning_rate": 1.0304755777677822e-05, + "loss": 1.2149, + "step": 16969 + }, + { + "epoch": 5.054450008376924, + "grad_norm": 0.3011535704135895, + "learning_rate": 1.0303791657925544e-05, + "loss": 1.2109, + "step": 16970 + }, + { + "epoch": 5.054747854576593, + "grad_norm": 0.25497984886169434, + "learning_rate": 1.030282753534682e-05, + "loss": 1.199, + "step": 16971 + }, + { + "epoch": 5.055045700776262, + "grad_norm": 0.32787832617759705, + "learning_rate": 1.0301863409950625e-05, + "loss": 1.2055, + "step": 16972 + }, + { + "epoch": 5.05534354697593, + "grad_norm": 0.2479628622531891, + "learning_rate": 1.0300899281745934e-05, + "loss": 1.216, + "step": 16973 + }, + { + "epoch": 5.055641393175599, + "grad_norm": 0.3591461181640625, + "learning_rate": 1.0299935150741707e-05, + "loss": 1.2179, + "step": 16974 + }, + { + "epoch": 5.055939239375268, + "grad_norm": 0.26028865575790405, + "learning_rate": 1.0298971016946917e-05, + "loss": 1.2252, + "step": 16975 + }, + { + "epoch": 5.056237085574936, + "grad_norm": 0.4700208306312561, + "learning_rate": 1.0298006880370536e-05, + "loss": 1.2343, + "step": 16976 + }, + { + "epoch": 5.056534931774605, + "grad_norm": 0.3031938970088959, + "learning_rate": 1.0297042741021537e-05, + "loss": 1.1976, + "step": 16977 + }, + { + "epoch": 5.056832777974273, + "grad_norm": 0.29954105615615845, + "learning_rate": 1.0296078598908885e-05, + "loss": 1.2042, + "step": 16978 + }, + { + "epoch": 5.057130624173942, + "grad_norm": 0.2670590579509735, + "learning_rate": 1.0295114454041553e-05, + "loss": 1.2224, + "step": 16979 + }, + { + "epoch": 5.057428470373611, + "grad_norm": 0.44915083050727844, + "learning_rate": 1.0294150306428513e-05, + "loss": 1.2038, + "step": 16980 + }, + { + "epoch": 5.057726316573279, + "grad_norm": 0.40954411029815674, + "learning_rate": 1.029318615607873e-05, + "loss": 1.2036, + "step": 16981 + }, + { + "epoch": 5.0580241627729485, + "grad_norm": 0.2915779948234558, + "learning_rate": 1.0292222003001178e-05, + "loss": 1.2119, + "step": 16982 + }, + { + "epoch": 5.058322008972617, + "grad_norm": 0.4107596278190613, + "learning_rate": 1.0291257847204827e-05, + "loss": 1.2288, + "step": 16983 + }, + { + "epoch": 5.058619855172285, + "grad_norm": 0.27059832215309143, + "learning_rate": 1.0290293688698647e-05, + "loss": 1.2141, + "step": 16984 + }, + { + "epoch": 5.058917701371954, + "grad_norm": 0.3820253908634186, + "learning_rate": 1.0289329527491607e-05, + "loss": 1.2276, + "step": 16985 + }, + { + "epoch": 5.059215547571623, + "grad_norm": 0.2683317959308624, + "learning_rate": 1.0288365363592681e-05, + "loss": 1.2337, + "step": 16986 + }, + { + "epoch": 5.059513393771291, + "grad_norm": 0.35911110043525696, + "learning_rate": 1.028740119701084e-05, + "loss": 1.2133, + "step": 16987 + }, + { + "epoch": 5.05981123997096, + "grad_norm": 0.2821185290813446, + "learning_rate": 1.0286437027755047e-05, + "loss": 1.2327, + "step": 16988 + }, + { + "epoch": 5.060109086170629, + "grad_norm": 0.31514233350753784, + "learning_rate": 1.0285472855834277e-05, + "loss": 1.2127, + "step": 16989 + }, + { + "epoch": 5.060406932370297, + "grad_norm": 0.36523517966270447, + "learning_rate": 1.0284508681257508e-05, + "loss": 1.2308, + "step": 16990 + }, + { + "epoch": 5.060704778569966, + "grad_norm": 0.36553582549095154, + "learning_rate": 1.0283544504033697e-05, + "loss": 1.212, + "step": 16991 + }, + { + "epoch": 5.0610026247696345, + "grad_norm": 0.34784194827079773, + "learning_rate": 1.0282580324171826e-05, + "loss": 1.2338, + "step": 16992 + }, + { + "epoch": 5.061300470969303, + "grad_norm": 0.34247714281082153, + "learning_rate": 1.0281616141680857e-05, + "loss": 1.2258, + "step": 16993 + }, + { + "epoch": 5.061598317168972, + "grad_norm": 0.40866032242774963, + "learning_rate": 1.0280651956569765e-05, + "loss": 1.215, + "step": 16994 + }, + { + "epoch": 5.06189616336864, + "grad_norm": 0.3296501636505127, + "learning_rate": 1.0279687768847519e-05, + "loss": 1.2232, + "step": 16995 + }, + { + "epoch": 5.06219400956831, + "grad_norm": 0.543762743473053, + "learning_rate": 1.0278723578523093e-05, + "loss": 1.1987, + "step": 16996 + }, + { + "epoch": 5.062491855767978, + "grad_norm": 0.5276671648025513, + "learning_rate": 1.0277759385605457e-05, + "loss": 1.2185, + "step": 16997 + }, + { + "epoch": 5.062789701967646, + "grad_norm": 0.3279116451740265, + "learning_rate": 1.027679519010358e-05, + "loss": 1.2224, + "step": 16998 + }, + { + "epoch": 5.063087548167315, + "grad_norm": 0.431532621383667, + "learning_rate": 1.027583099202643e-05, + "loss": 1.2311, + "step": 16999 + }, + { + "epoch": 5.063385394366984, + "grad_norm": 0.26742449402809143, + "learning_rate": 1.027486679138298e-05, + "loss": 1.208, + "step": 17000 + }, + { + "epoch": 5.063385394366984, + "eval_loss": 1.321560263633728, + "eval_runtime": 22.8145, + "eval_samples_per_second": 76.004, + "eval_steps_per_second": 4.778, + "step": 17000 + }, + { + "epoch": 5.063683240566652, + "grad_norm": 0.33452680706977844, + "learning_rate": 1.0273902588182205e-05, + "loss": 1.2093, + "step": 17001 + }, + { + "epoch": 5.063981086766321, + "grad_norm": 0.31214168667793274, + "learning_rate": 1.027293838243307e-05, + "loss": 1.2083, + "step": 17002 + }, + { + "epoch": 5.06427893296599, + "grad_norm": 0.2911657392978668, + "learning_rate": 1.027197417414455e-05, + "loss": 1.2323, + "step": 17003 + }, + { + "epoch": 5.064576779165658, + "grad_norm": 0.504849374294281, + "learning_rate": 1.0271009963325616e-05, + "loss": 1.2218, + "step": 17004 + }, + { + "epoch": 5.064874625365327, + "grad_norm": 0.35744398832321167, + "learning_rate": 1.0270045749985233e-05, + "loss": 1.2135, + "step": 17005 + }, + { + "epoch": 5.0651724715649955, + "grad_norm": 0.3834766447544098, + "learning_rate": 1.0269081534132377e-05, + "loss": 1.2088, + "step": 17006 + }, + { + "epoch": 5.065470317764664, + "grad_norm": 0.30820387601852417, + "learning_rate": 1.0268117315776018e-05, + "loss": 1.2363, + "step": 17007 + }, + { + "epoch": 5.065768163964333, + "grad_norm": 0.41436249017715454, + "learning_rate": 1.026715309492513e-05, + "loss": 1.2097, + "step": 17008 + }, + { + "epoch": 5.066066010164001, + "grad_norm": 0.28900203108787537, + "learning_rate": 1.0266188871588675e-05, + "loss": 1.2063, + "step": 17009 + }, + { + "epoch": 5.066363856363671, + "grad_norm": 0.5163151025772095, + "learning_rate": 1.0265224645775636e-05, + "loss": 1.2129, + "step": 17010 + }, + { + "epoch": 5.066661702563339, + "grad_norm": 0.28551915287971497, + "learning_rate": 1.0264260417494976e-05, + "loss": 1.2118, + "step": 17011 + }, + { + "epoch": 5.066959548763007, + "grad_norm": 0.6795360445976257, + "learning_rate": 1.0263296186755665e-05, + "loss": 1.2144, + "step": 17012 + }, + { + "epoch": 5.0672573949626765, + "grad_norm": 0.3976183831691742, + "learning_rate": 1.026233195356668e-05, + "loss": 1.224, + "step": 17013 + }, + { + "epoch": 5.067555241162345, + "grad_norm": 0.524844229221344, + "learning_rate": 1.0261367717936992e-05, + "loss": 1.2074, + "step": 17014 + }, + { + "epoch": 5.067853087362013, + "grad_norm": 0.350325345993042, + "learning_rate": 1.0260403479875565e-05, + "loss": 1.2154, + "step": 17015 + }, + { + "epoch": 5.068150933561682, + "grad_norm": 0.47722166776657104, + "learning_rate": 1.0259439239391376e-05, + "loss": 1.2163, + "step": 17016 + }, + { + "epoch": 5.068448779761351, + "grad_norm": 0.3014627695083618, + "learning_rate": 1.0258474996493395e-05, + "loss": 1.2247, + "step": 17017 + }, + { + "epoch": 5.068746625961019, + "grad_norm": 0.36576247215270996, + "learning_rate": 1.0257510751190592e-05, + "loss": 1.2153, + "step": 17018 + }, + { + "epoch": 5.069044472160688, + "grad_norm": 0.28036728501319885, + "learning_rate": 1.025654650349194e-05, + "loss": 1.2096, + "step": 17019 + }, + { + "epoch": 5.0693423183603565, + "grad_norm": 0.437121719121933, + "learning_rate": 1.025558225340641e-05, + "loss": 1.2081, + "step": 17020 + }, + { + "epoch": 5.069640164560026, + "grad_norm": 0.278429239988327, + "learning_rate": 1.025461800094297e-05, + "loss": 1.2183, + "step": 17021 + }, + { + "epoch": 5.069938010759694, + "grad_norm": 0.4991644620895386, + "learning_rate": 1.0253653746110597e-05, + "loss": 1.213, + "step": 17022 + }, + { + "epoch": 5.070235856959362, + "grad_norm": 0.2669813334941864, + "learning_rate": 1.0252689488918257e-05, + "loss": 1.2264, + "step": 17023 + }, + { + "epoch": 5.070533703159032, + "grad_norm": 0.3850683867931366, + "learning_rate": 1.0251725229374925e-05, + "loss": 1.2246, + "step": 17024 + }, + { + "epoch": 5.0708315493587, + "grad_norm": 0.299041748046875, + "learning_rate": 1.025076096748957e-05, + "loss": 1.2083, + "step": 17025 + }, + { + "epoch": 5.071129395558368, + "grad_norm": 0.3268550634384155, + "learning_rate": 1.0249796703271167e-05, + "loss": 1.2006, + "step": 17026 + }, + { + "epoch": 5.0714272417580375, + "grad_norm": 0.340476393699646, + "learning_rate": 1.0248832436728682e-05, + "loss": 1.2162, + "step": 17027 + }, + { + "epoch": 5.071725087957706, + "grad_norm": 0.288663387298584, + "learning_rate": 1.0247868167871091e-05, + "loss": 1.2188, + "step": 17028 + }, + { + "epoch": 5.072022934157374, + "grad_norm": 0.41877585649490356, + "learning_rate": 1.0246903896707364e-05, + "loss": 1.2143, + "step": 17029 + }, + { + "epoch": 5.072320780357043, + "grad_norm": 0.26447010040283203, + "learning_rate": 1.0245939623246468e-05, + "loss": 1.2057, + "step": 17030 + }, + { + "epoch": 5.072618626556712, + "grad_norm": 0.32551413774490356, + "learning_rate": 1.0244975347497383e-05, + "loss": 1.2274, + "step": 17031 + }, + { + "epoch": 5.07291647275638, + "grad_norm": 0.3196723163127899, + "learning_rate": 1.0244011069469072e-05, + "loss": 1.2249, + "step": 17032 + }, + { + "epoch": 5.073214318956049, + "grad_norm": 0.2677357792854309, + "learning_rate": 1.0243046789170515e-05, + "loss": 1.2089, + "step": 17033 + }, + { + "epoch": 5.073512165155718, + "grad_norm": 0.2594597637653351, + "learning_rate": 1.0242082506610677e-05, + "loss": 1.2218, + "step": 17034 + }, + { + "epoch": 5.073810011355387, + "grad_norm": 0.37905433773994446, + "learning_rate": 1.0241118221798534e-05, + "loss": 1.2289, + "step": 17035 + }, + { + "epoch": 5.074107857555055, + "grad_norm": 0.25232434272766113, + "learning_rate": 1.0240153934743052e-05, + "loss": 1.2154, + "step": 17036 + }, + { + "epoch": 5.074405703754723, + "grad_norm": 0.4842188060283661, + "learning_rate": 1.0239189645453204e-05, + "loss": 1.2251, + "step": 17037 + }, + { + "epoch": 5.074703549954393, + "grad_norm": 0.2551998794078827, + "learning_rate": 1.0238225353937972e-05, + "loss": 1.2072, + "step": 17038 + }, + { + "epoch": 5.075001396154061, + "grad_norm": 0.4909974932670593, + "learning_rate": 1.0237261060206312e-05, + "loss": 1.211, + "step": 17039 + }, + { + "epoch": 5.075299242353729, + "grad_norm": 0.2678253650665283, + "learning_rate": 1.0236296764267206e-05, + "loss": 1.218, + "step": 17040 + }, + { + "epoch": 5.0755970885533985, + "grad_norm": 0.42401137948036194, + "learning_rate": 1.0235332466129621e-05, + "loss": 1.2196, + "step": 17041 + }, + { + "epoch": 5.075894934753067, + "grad_norm": 0.26835349202156067, + "learning_rate": 1.0234368165802531e-05, + "loss": 1.2153, + "step": 17042 + }, + { + "epoch": 5.076192780952735, + "grad_norm": 0.30196458101272583, + "learning_rate": 1.0233403863294907e-05, + "loss": 1.2211, + "step": 17043 + }, + { + "epoch": 5.076490627152404, + "grad_norm": 0.25685325264930725, + "learning_rate": 1.023243955861572e-05, + "loss": 1.2164, + "step": 17044 + }, + { + "epoch": 5.076788473352073, + "grad_norm": 0.32649120688438416, + "learning_rate": 1.0231475251773945e-05, + "loss": 1.2201, + "step": 17045 + }, + { + "epoch": 5.077086319551741, + "grad_norm": 0.26368629932403564, + "learning_rate": 1.023051094277855e-05, + "loss": 1.2056, + "step": 17046 + }, + { + "epoch": 5.07738416575141, + "grad_norm": 0.3373166024684906, + "learning_rate": 1.0229546631638506e-05, + "loss": 1.2039, + "step": 17047 + }, + { + "epoch": 5.077682011951079, + "grad_norm": 0.262728214263916, + "learning_rate": 1.0228582318362791e-05, + "loss": 1.2269, + "step": 17048 + }, + { + "epoch": 5.077979858150748, + "grad_norm": 0.40728479623794556, + "learning_rate": 1.0227618002960371e-05, + "loss": 1.2304, + "step": 17049 + }, + { + "epoch": 5.078277704350416, + "grad_norm": 0.2462330013513565, + "learning_rate": 1.022665368544022e-05, + "loss": 1.1993, + "step": 17050 + }, + { + "epoch": 5.0785755505500845, + "grad_norm": 0.44621655344963074, + "learning_rate": 1.0225689365811308e-05, + "loss": 1.2073, + "step": 17051 + }, + { + "epoch": 5.078873396749754, + "grad_norm": 0.3645736575126648, + "learning_rate": 1.0224725044082612e-05, + "loss": 1.222, + "step": 17052 + }, + { + "epoch": 5.079171242949422, + "grad_norm": 0.26708006858825684, + "learning_rate": 1.0223760720263099e-05, + "loss": 1.2282, + "step": 17053 + }, + { + "epoch": 5.07946908914909, + "grad_norm": 0.3193638026714325, + "learning_rate": 1.022279639436174e-05, + "loss": 1.2154, + "step": 17054 + }, + { + "epoch": 5.07976693534876, + "grad_norm": 0.3130495846271515, + "learning_rate": 1.0221832066387514e-05, + "loss": 1.2205, + "step": 17055 + }, + { + "epoch": 5.080064781548428, + "grad_norm": 0.2514370083808899, + "learning_rate": 1.0220867736349384e-05, + "loss": 1.22, + "step": 17056 + }, + { + "epoch": 5.080362627748096, + "grad_norm": 0.35595253109931946, + "learning_rate": 1.021990340425633e-05, + "loss": 1.2175, + "step": 17057 + }, + { + "epoch": 5.080660473947765, + "grad_norm": 0.34136271476745605, + "learning_rate": 1.021893907011732e-05, + "loss": 1.2155, + "step": 17058 + }, + { + "epoch": 5.080958320147434, + "grad_norm": 0.3032478094100952, + "learning_rate": 1.0217974733941325e-05, + "loss": 1.2271, + "step": 17059 + }, + { + "epoch": 5.081256166347102, + "grad_norm": 0.5057386159896851, + "learning_rate": 1.021701039573732e-05, + "loss": 1.2074, + "step": 17060 + }, + { + "epoch": 5.081554012546771, + "grad_norm": 0.30139559507369995, + "learning_rate": 1.0216046055514273e-05, + "loss": 1.2203, + "step": 17061 + }, + { + "epoch": 5.08185185874644, + "grad_norm": 0.3969147801399231, + "learning_rate": 1.0215081713281162e-05, + "loss": 1.2123, + "step": 17062 + }, + { + "epoch": 5.082149704946109, + "grad_norm": 0.4047640860080719, + "learning_rate": 1.0214117369046957e-05, + "loss": 1.2153, + "step": 17063 + }, + { + "epoch": 5.082447551145777, + "grad_norm": 0.2553812265396118, + "learning_rate": 1.0213153022820625e-05, + "loss": 1.2176, + "step": 17064 + }, + { + "epoch": 5.0827453973454455, + "grad_norm": 0.2984394133090973, + "learning_rate": 1.0212188674611145e-05, + "loss": 1.2265, + "step": 17065 + }, + { + "epoch": 5.083043243545115, + "grad_norm": 0.27930763363838196, + "learning_rate": 1.0211224324427485e-05, + "loss": 1.2174, + "step": 17066 + }, + { + "epoch": 5.083341089744783, + "grad_norm": 0.2714507281780243, + "learning_rate": 1.021025997227862e-05, + "loss": 1.2113, + "step": 17067 + }, + { + "epoch": 5.083638935944451, + "grad_norm": 0.3868211507797241, + "learning_rate": 1.020929561817352e-05, + "loss": 1.2325, + "step": 17068 + }, + { + "epoch": 5.083936782144121, + "grad_norm": 0.26492103934288025, + "learning_rate": 1.020833126212116e-05, + "loss": 1.1994, + "step": 17069 + }, + { + "epoch": 5.084234628343789, + "grad_norm": 0.37112167477607727, + "learning_rate": 1.0207366904130508e-05, + "loss": 1.1966, + "step": 17070 + }, + { + "epoch": 5.084532474543457, + "grad_norm": 0.29438942670822144, + "learning_rate": 1.0206402544210537e-05, + "loss": 1.2228, + "step": 17071 + }, + { + "epoch": 5.0848303207431265, + "grad_norm": 0.31094077229499817, + "learning_rate": 1.0205438182370225e-05, + "loss": 1.1953, + "step": 17072 + }, + { + "epoch": 5.085128166942795, + "grad_norm": 0.3183203339576721, + "learning_rate": 1.0204473818618539e-05, + "loss": 1.2291, + "step": 17073 + }, + { + "epoch": 5.085426013142463, + "grad_norm": 0.25317835807800293, + "learning_rate": 1.0203509452964451e-05, + "loss": 1.2387, + "step": 17074 + }, + { + "epoch": 5.085723859342132, + "grad_norm": 0.2823701500892639, + "learning_rate": 1.0202545085416938e-05, + "loss": 1.2124, + "step": 17075 + }, + { + "epoch": 5.086021705541801, + "grad_norm": 0.3418533205986023, + "learning_rate": 1.0201580715984968e-05, + "loss": 1.2131, + "step": 17076 + }, + { + "epoch": 5.08631955174147, + "grad_norm": 0.2873172461986542, + "learning_rate": 1.0200616344677512e-05, + "loss": 1.2091, + "step": 17077 + }, + { + "epoch": 5.086617397941138, + "grad_norm": 0.3285514712333679, + "learning_rate": 1.0199651971503545e-05, + "loss": 1.2142, + "step": 17078 + }, + { + "epoch": 5.0869152441408065, + "grad_norm": 0.3224962055683136, + "learning_rate": 1.0198687596472044e-05, + "loss": 1.2327, + "step": 17079 + }, + { + "epoch": 5.087213090340476, + "grad_norm": 0.3088357448577881, + "learning_rate": 1.0197723219591975e-05, + "loss": 1.2268, + "step": 17080 + }, + { + "epoch": 5.087510936540144, + "grad_norm": 0.2855260372161865, + "learning_rate": 1.0196758840872311e-05, + "loss": 1.2197, + "step": 17081 + }, + { + "epoch": 5.087808782739812, + "grad_norm": 0.31933191418647766, + "learning_rate": 1.0195794460322026e-05, + "loss": 1.2124, + "step": 17082 + }, + { + "epoch": 5.088106628939482, + "grad_norm": 0.31816890835762024, + "learning_rate": 1.0194830077950092e-05, + "loss": 1.2271, + "step": 17083 + }, + { + "epoch": 5.08840447513915, + "grad_norm": 0.49468258023262024, + "learning_rate": 1.0193865693765483e-05, + "loss": 1.2125, + "step": 17084 + }, + { + "epoch": 5.088702321338818, + "grad_norm": 0.26526886224746704, + "learning_rate": 1.019290130777717e-05, + "loss": 1.2154, + "step": 17085 + }, + { + "epoch": 5.0890001675384875, + "grad_norm": 0.4613696038722992, + "learning_rate": 1.0191936919994127e-05, + "loss": 1.2211, + "step": 17086 + }, + { + "epoch": 5.089298013738156, + "grad_norm": 0.28655552864074707, + "learning_rate": 1.0190972530425324e-05, + "loss": 1.2128, + "step": 17087 + }, + { + "epoch": 5.089595859937825, + "grad_norm": 0.45737722516059875, + "learning_rate": 1.0190008139079734e-05, + "loss": 1.2246, + "step": 17088 + }, + { + "epoch": 5.089893706137493, + "grad_norm": 0.3493267893791199, + "learning_rate": 1.0189043745966334e-05, + "loss": 1.2185, + "step": 17089 + }, + { + "epoch": 5.090191552337162, + "grad_norm": 0.2617943286895752, + "learning_rate": 1.0188079351094088e-05, + "loss": 1.2251, + "step": 17090 + }, + { + "epoch": 5.090489398536831, + "grad_norm": 0.30303752422332764, + "learning_rate": 1.0187114954471978e-05, + "loss": 1.2262, + "step": 17091 + }, + { + "epoch": 5.090787244736499, + "grad_norm": 0.3064664304256439, + "learning_rate": 1.018615055610897e-05, + "loss": 1.2159, + "step": 17092 + }, + { + "epoch": 5.091085090936168, + "grad_norm": 0.2800910174846649, + "learning_rate": 1.0185186156014043e-05, + "loss": 1.2074, + "step": 17093 + }, + { + "epoch": 5.091382937135837, + "grad_norm": 0.30335918068885803, + "learning_rate": 1.0184221754196162e-05, + "loss": 1.2294, + "step": 17094 + }, + { + "epoch": 5.091680783335505, + "grad_norm": 0.25953903794288635, + "learning_rate": 1.0183257350664302e-05, + "loss": 1.2077, + "step": 17095 + }, + { + "epoch": 5.091978629535173, + "grad_norm": 0.28466346859931946, + "learning_rate": 1.018229294542744e-05, + "loss": 1.2151, + "step": 17096 + }, + { + "epoch": 5.092276475734843, + "grad_norm": 0.26710572838783264, + "learning_rate": 1.0181328538494545e-05, + "loss": 1.2169, + "step": 17097 + }, + { + "epoch": 5.092574321934511, + "grad_norm": 0.4515911042690277, + "learning_rate": 1.0180364129874591e-05, + "loss": 1.2093, + "step": 17098 + }, + { + "epoch": 5.092872168134179, + "grad_norm": 0.3598514497280121, + "learning_rate": 1.017939971957655e-05, + "loss": 1.2221, + "step": 17099 + }, + { + "epoch": 5.0931700143338485, + "grad_norm": 0.42421722412109375, + "learning_rate": 1.0178435307609396e-05, + "loss": 1.2121, + "step": 17100 + }, + { + "epoch": 5.093467860533517, + "grad_norm": 0.3820800483226776, + "learning_rate": 1.01774708939821e-05, + "loss": 1.215, + "step": 17101 + }, + { + "epoch": 5.093765706733186, + "grad_norm": 0.4031168818473816, + "learning_rate": 1.0176506478703634e-05, + "loss": 1.2076, + "step": 17102 + }, + { + "epoch": 5.094063552932854, + "grad_norm": 0.4623296856880188, + "learning_rate": 1.0175542061782977e-05, + "loss": 1.2228, + "step": 17103 + }, + { + "epoch": 5.094361399132523, + "grad_norm": 0.24452053010463715, + "learning_rate": 1.0174577643229093e-05, + "loss": 1.2068, + "step": 17104 + }, + { + "epoch": 5.094659245332192, + "grad_norm": 0.3492816090583801, + "learning_rate": 1.017361322305096e-05, + "loss": 1.2233, + "step": 17105 + }, + { + "epoch": 5.09495709153186, + "grad_norm": 0.30999088287353516, + "learning_rate": 1.017264880125755e-05, + "loss": 1.2133, + "step": 17106 + }, + { + "epoch": 5.095254937731529, + "grad_norm": 0.458389550447464, + "learning_rate": 1.0171684377857836e-05, + "loss": 1.2108, + "step": 17107 + }, + { + "epoch": 5.095552783931198, + "grad_norm": 0.2613278925418854, + "learning_rate": 1.017071995286079e-05, + "loss": 1.2163, + "step": 17108 + }, + { + "epoch": 5.095850630130866, + "grad_norm": 0.7034139037132263, + "learning_rate": 1.0169755526275386e-05, + "loss": 1.2313, + "step": 17109 + }, + { + "epoch": 5.0961484763305345, + "grad_norm": 0.4257388412952423, + "learning_rate": 1.01687910981106e-05, + "loss": 1.22, + "step": 17110 + }, + { + "epoch": 5.096446322530204, + "grad_norm": 0.363075315952301, + "learning_rate": 1.0167826668375398e-05, + "loss": 1.2138, + "step": 17111 + }, + { + "epoch": 5.096744168729872, + "grad_norm": 0.26515504717826843, + "learning_rate": 1.0166862237078756e-05, + "loss": 1.2068, + "step": 17112 + }, + { + "epoch": 5.09704201492954, + "grad_norm": 0.6181104779243469, + "learning_rate": 1.0165897804229647e-05, + "loss": 1.1972, + "step": 17113 + }, + { + "epoch": 5.09733986112921, + "grad_norm": 0.32947444915771484, + "learning_rate": 1.0164933369837045e-05, + "loss": 1.2239, + "step": 17114 + }, + { + "epoch": 5.097637707328878, + "grad_norm": 0.4215790033340454, + "learning_rate": 1.0163968933909922e-05, + "loss": 1.2431, + "step": 17115 + }, + { + "epoch": 5.097935553528547, + "grad_norm": 0.2851150333881378, + "learning_rate": 1.0163004496457252e-05, + "loss": 1.2091, + "step": 17116 + }, + { + "epoch": 5.098233399728215, + "grad_norm": 0.4571061432361603, + "learning_rate": 1.0162040057488007e-05, + "loss": 1.2235, + "step": 17117 + }, + { + "epoch": 5.098531245927884, + "grad_norm": 0.2850176692008972, + "learning_rate": 1.0161075617011159e-05, + "loss": 1.1987, + "step": 17118 + }, + { + "epoch": 5.098829092127553, + "grad_norm": 0.3262980580329895, + "learning_rate": 1.0160111175035684e-05, + "loss": 1.2172, + "step": 17119 + }, + { + "epoch": 5.099126938327221, + "grad_norm": 0.3843151032924652, + "learning_rate": 1.0159146731570556e-05, + "loss": 1.2077, + "step": 17120 + }, + { + "epoch": 5.09942478452689, + "grad_norm": 0.2500159740447998, + "learning_rate": 1.0158182286624741e-05, + "loss": 1.2274, + "step": 17121 + }, + { + "epoch": 5.099722630726559, + "grad_norm": 0.4776756465435028, + "learning_rate": 1.0157217840207217e-05, + "loss": 1.2176, + "step": 17122 + }, + { + "epoch": 5.100020476926227, + "grad_norm": 0.32060763239860535, + "learning_rate": 1.0156253392326958e-05, + "loss": 1.2113, + "step": 17123 + }, + { + "epoch": 5.1003183231258955, + "grad_norm": 0.49617981910705566, + "learning_rate": 1.0155288942992933e-05, + "loss": 1.2133, + "step": 17124 + }, + { + "epoch": 5.100616169325565, + "grad_norm": 0.30920445919036865, + "learning_rate": 1.0154324492214123e-05, + "loss": 1.2167, + "step": 17125 + }, + { + "epoch": 5.100914015525233, + "grad_norm": 0.5329804420471191, + "learning_rate": 1.0153360039999492e-05, + "loss": 1.2076, + "step": 17126 + }, + { + "epoch": 5.101211861724901, + "grad_norm": 0.38505634665489197, + "learning_rate": 1.015239558635802e-05, + "loss": 1.2309, + "step": 17127 + }, + { + "epoch": 5.101509707924571, + "grad_norm": 0.46117085218429565, + "learning_rate": 1.0151431131298674e-05, + "loss": 1.2127, + "step": 17128 + }, + { + "epoch": 5.101807554124239, + "grad_norm": 0.3312280774116516, + "learning_rate": 1.0150466674830433e-05, + "loss": 1.2096, + "step": 17129 + }, + { + "epoch": 5.102105400323908, + "grad_norm": 0.6624826192855835, + "learning_rate": 1.0149502216962269e-05, + "loss": 1.2088, + "step": 17130 + }, + { + "epoch": 5.1024032465235765, + "grad_norm": 0.39815059304237366, + "learning_rate": 1.0148537757703149e-05, + "loss": 1.2267, + "step": 17131 + }, + { + "epoch": 5.102701092723245, + "grad_norm": 0.6149324774742126, + "learning_rate": 1.0147573297062055e-05, + "loss": 1.2041, + "step": 17132 + }, + { + "epoch": 5.102998938922914, + "grad_norm": 0.4168391525745392, + "learning_rate": 1.0146608835047955e-05, + "loss": 1.2109, + "step": 17133 + }, + { + "epoch": 5.103296785122582, + "grad_norm": 0.5807532072067261, + "learning_rate": 1.0145644371669824e-05, + "loss": 1.2124, + "step": 17134 + }, + { + "epoch": 5.103594631322251, + "grad_norm": 0.25392472743988037, + "learning_rate": 1.0144679906936636e-05, + "loss": 1.2329, + "step": 17135 + }, + { + "epoch": 5.10389247752192, + "grad_norm": 1.1156970262527466, + "learning_rate": 1.0143715440857361e-05, + "loss": 1.2118, + "step": 17136 + }, + { + "epoch": 5.104190323721588, + "grad_norm": 0.3196142911911011, + "learning_rate": 1.0142750973440975e-05, + "loss": 1.2248, + "step": 17137 + }, + { + "epoch": 5.1044881699212565, + "grad_norm": 0.9182085990905762, + "learning_rate": 1.0141786504696452e-05, + "loss": 1.2096, + "step": 17138 + }, + { + "epoch": 5.104786016120926, + "grad_norm": 0.25468823313713074, + "learning_rate": 1.0140822034632763e-05, + "loss": 1.2209, + "step": 17139 + }, + { + "epoch": 5.105083862320594, + "grad_norm": 0.8345251083374023, + "learning_rate": 1.0139857563258883e-05, + "loss": 1.2238, + "step": 17140 + }, + { + "epoch": 5.105381708520262, + "grad_norm": 0.3281252384185791, + "learning_rate": 1.0138893090583783e-05, + "loss": 1.2343, + "step": 17141 + }, + { + "epoch": 5.105679554719932, + "grad_norm": 0.46737855672836304, + "learning_rate": 1.013792861661644e-05, + "loss": 1.2216, + "step": 17142 + }, + { + "epoch": 5.1059774009196, + "grad_norm": 0.45355433225631714, + "learning_rate": 1.013696414136582e-05, + "loss": 1.1997, + "step": 17143 + }, + { + "epoch": 5.106275247119269, + "grad_norm": 0.3648901879787445, + "learning_rate": 1.013599966484091e-05, + "loss": 1.2295, + "step": 17144 + }, + { + "epoch": 5.1065730933189375, + "grad_norm": 0.3974287211894989, + "learning_rate": 1.013503518705067e-05, + "loss": 1.2223, + "step": 17145 + }, + { + "epoch": 5.106870939518606, + "grad_norm": 0.34418654441833496, + "learning_rate": 1.013407070800408e-05, + "loss": 1.1982, + "step": 17146 + }, + { + "epoch": 5.107168785718275, + "grad_norm": 0.3958006501197815, + "learning_rate": 1.0133106227710113e-05, + "loss": 1.2162, + "step": 17147 + }, + { + "epoch": 5.107466631917943, + "grad_norm": 0.2721463143825531, + "learning_rate": 1.0132141746177737e-05, + "loss": 1.2272, + "step": 17148 + }, + { + "epoch": 5.107764478117612, + "grad_norm": 0.3730507493019104, + "learning_rate": 1.0131177263415933e-05, + "loss": 1.2221, + "step": 17149 + }, + { + "epoch": 5.108062324317281, + "grad_norm": 0.35416150093078613, + "learning_rate": 1.013021277943367e-05, + "loss": 1.2154, + "step": 17150 + }, + { + "epoch": 5.108360170516949, + "grad_norm": 0.28584015369415283, + "learning_rate": 1.0129248294239924e-05, + "loss": 1.2311, + "step": 17151 + }, + { + "epoch": 5.108658016716618, + "grad_norm": 0.5706966519355774, + "learning_rate": 1.0128283807843666e-05, + "loss": 1.2268, + "step": 17152 + }, + { + "epoch": 5.108955862916287, + "grad_norm": 0.340482622385025, + "learning_rate": 1.012731932025387e-05, + "loss": 1.2147, + "step": 17153 + }, + { + "epoch": 5.109253709115955, + "grad_norm": 0.42856964468955994, + "learning_rate": 1.012635483147951e-05, + "loss": 1.2214, + "step": 17154 + }, + { + "epoch": 5.109551555315624, + "grad_norm": 0.2704993486404419, + "learning_rate": 1.012539034152956e-05, + "loss": 1.2267, + "step": 17155 + }, + { + "epoch": 5.109849401515293, + "grad_norm": 0.3846414387226105, + "learning_rate": 1.0124425850412992e-05, + "loss": 1.221, + "step": 17156 + }, + { + "epoch": 5.110147247714961, + "grad_norm": 0.35049378871917725, + "learning_rate": 1.0123461358138782e-05, + "loss": 1.2149, + "step": 17157 + }, + { + "epoch": 5.11044509391463, + "grad_norm": 0.2807372212409973, + "learning_rate": 1.0122496864715903e-05, + "loss": 1.2131, + "step": 17158 + }, + { + "epoch": 5.1107429401142985, + "grad_norm": 0.3807115852832794, + "learning_rate": 1.0121532370153324e-05, + "loss": 1.2065, + "step": 17159 + }, + { + "epoch": 5.111040786313967, + "grad_norm": 0.271686315536499, + "learning_rate": 1.0120567874460024e-05, + "loss": 1.2185, + "step": 17160 + }, + { + "epoch": 5.111338632513636, + "grad_norm": 0.4014586806297302, + "learning_rate": 1.0119603377644976e-05, + "loss": 1.2171, + "step": 17161 + }, + { + "epoch": 5.111636478713304, + "grad_norm": 0.2593546509742737, + "learning_rate": 1.011863887971715e-05, + "loss": 1.2111, + "step": 17162 + }, + { + "epoch": 5.111934324912973, + "grad_norm": 0.2558513581752777, + "learning_rate": 1.0117674380685523e-05, + "loss": 1.1981, + "step": 17163 + }, + { + "epoch": 5.112232171112642, + "grad_norm": 0.28646785020828247, + "learning_rate": 1.0116709880559071e-05, + "loss": 1.2059, + "step": 17164 + }, + { + "epoch": 5.11253001731231, + "grad_norm": 0.3175455331802368, + "learning_rate": 1.0115745379346755e-05, + "loss": 1.2068, + "step": 17165 + }, + { + "epoch": 5.112827863511979, + "grad_norm": 0.49774348735809326, + "learning_rate": 1.0114780877057562e-05, + "loss": 1.226, + "step": 17166 + }, + { + "epoch": 5.113125709711648, + "grad_norm": 0.31071609258651733, + "learning_rate": 1.0113816373700462e-05, + "loss": 1.197, + "step": 17167 + }, + { + "epoch": 5.113423555911316, + "grad_norm": 0.2739827632904053, + "learning_rate": 1.011285186928443e-05, + "loss": 1.23, + "step": 17168 + }, + { + "epoch": 5.113721402110985, + "grad_norm": 0.2822209596633911, + "learning_rate": 1.0111887363818433e-05, + "loss": 1.2026, + "step": 17169 + }, + { + "epoch": 5.114019248310654, + "grad_norm": 0.37194135785102844, + "learning_rate": 1.0110922857311452e-05, + "loss": 1.2078, + "step": 17170 + }, + { + "epoch": 5.114317094510322, + "grad_norm": 0.388312429189682, + "learning_rate": 1.0109958349772457e-05, + "loss": 1.2242, + "step": 17171 + }, + { + "epoch": 5.114614940709991, + "grad_norm": 0.2602137625217438, + "learning_rate": 1.0108993841210421e-05, + "loss": 1.2086, + "step": 17172 + }, + { + "epoch": 5.11491278690966, + "grad_norm": 0.2920834422111511, + "learning_rate": 1.0108029331634322e-05, + "loss": 1.2221, + "step": 17173 + }, + { + "epoch": 5.115210633109328, + "grad_norm": 0.24184873700141907, + "learning_rate": 1.0107064821053127e-05, + "loss": 1.2071, + "step": 17174 + }, + { + "epoch": 5.115508479308997, + "grad_norm": 0.26793280243873596, + "learning_rate": 1.0106100309475816e-05, + "loss": 1.211, + "step": 17175 + }, + { + "epoch": 5.115806325508665, + "grad_norm": 0.2672754228115082, + "learning_rate": 1.010513579691136e-05, + "loss": 1.1999, + "step": 17176 + }, + { + "epoch": 5.116104171708334, + "grad_norm": 0.28690797090530396, + "learning_rate": 1.0104171283368731e-05, + "loss": 1.2122, + "step": 17177 + }, + { + "epoch": 5.116402017908003, + "grad_norm": 0.3485107123851776, + "learning_rate": 1.0103206768856909e-05, + "loss": 1.2072, + "step": 17178 + }, + { + "epoch": 5.116699864107671, + "grad_norm": 0.27620166540145874, + "learning_rate": 1.0102242253384859e-05, + "loss": 1.202, + "step": 17179 + }, + { + "epoch": 5.11699771030734, + "grad_norm": 0.29888781905174255, + "learning_rate": 1.0101277736961561e-05, + "loss": 1.202, + "step": 17180 + }, + { + "epoch": 5.117295556507009, + "grad_norm": 0.24393923580646515, + "learning_rate": 1.0100313219595988e-05, + "loss": 1.2174, + "step": 17181 + }, + { + "epoch": 5.117593402706677, + "grad_norm": 0.3704393208026886, + "learning_rate": 1.0099348701297107e-05, + "loss": 1.2037, + "step": 17182 + }, + { + "epoch": 5.117891248906346, + "grad_norm": 0.24490593373775482, + "learning_rate": 1.0098384182073902e-05, + "loss": 1.213, + "step": 17183 + }, + { + "epoch": 5.118189095106015, + "grad_norm": 0.5924113988876343, + "learning_rate": 1.0097419661935341e-05, + "loss": 1.2157, + "step": 17184 + }, + { + "epoch": 5.118486941305683, + "grad_norm": 0.44691896438598633, + "learning_rate": 1.0096455140890403e-05, + "loss": 1.2153, + "step": 17185 + }, + { + "epoch": 5.118784787505352, + "grad_norm": 0.4877368211746216, + "learning_rate": 1.0095490618948052e-05, + "loss": 1.2307, + "step": 17186 + }, + { + "epoch": 5.119082633705021, + "grad_norm": 0.4787835478782654, + "learning_rate": 1.009452609611727e-05, + "loss": 1.2202, + "step": 17187 + }, + { + "epoch": 5.119380479904689, + "grad_norm": 0.33387571573257446, + "learning_rate": 1.0093561572407027e-05, + "loss": 1.2269, + "step": 17188 + }, + { + "epoch": 5.119678326104358, + "grad_norm": 0.3363417983055115, + "learning_rate": 1.0092597047826298e-05, + "loss": 1.2053, + "step": 17189 + }, + { + "epoch": 5.1199761723040265, + "grad_norm": 0.29627305269241333, + "learning_rate": 1.0091632522384056e-05, + "loss": 1.228, + "step": 17190 + }, + { + "epoch": 5.120274018503695, + "grad_norm": 0.2782908082008362, + "learning_rate": 1.0090667996089278e-05, + "loss": 1.2095, + "step": 17191 + }, + { + "epoch": 5.120571864703364, + "grad_norm": 0.34067395329475403, + "learning_rate": 1.0089703468950937e-05, + "loss": 1.2176, + "step": 17192 + }, + { + "epoch": 5.120869710903032, + "grad_norm": 0.27354928851127625, + "learning_rate": 1.0088738940978003e-05, + "loss": 1.2138, + "step": 17193 + }, + { + "epoch": 5.121167557102701, + "grad_norm": 0.40996381640434265, + "learning_rate": 1.0087774412179452e-05, + "loss": 1.2236, + "step": 17194 + }, + { + "epoch": 5.12146540330237, + "grad_norm": 0.26813942193984985, + "learning_rate": 1.008680988256426e-05, + "loss": 1.2099, + "step": 17195 + }, + { + "epoch": 5.121763249502038, + "grad_norm": 0.35690394043922424, + "learning_rate": 1.0085845352141397e-05, + "loss": 1.2072, + "step": 17196 + }, + { + "epoch": 5.122061095701707, + "grad_norm": 0.27852919697761536, + "learning_rate": 1.008488082091984e-05, + "loss": 1.2167, + "step": 17197 + }, + { + "epoch": 5.122358941901376, + "grad_norm": 0.2842017412185669, + "learning_rate": 1.0083916288908565e-05, + "loss": 1.2292, + "step": 17198 + }, + { + "epoch": 5.122656788101044, + "grad_norm": 0.37780824303627014, + "learning_rate": 1.0082951756116536e-05, + "loss": 1.2029, + "step": 17199 + }, + { + "epoch": 5.122954634300713, + "grad_norm": 0.2934509515762329, + "learning_rate": 1.0081987222552735e-05, + "loss": 1.2238, + "step": 17200 + }, + { + "epoch": 5.123252480500382, + "grad_norm": 0.3488498330116272, + "learning_rate": 1.0081022688226136e-05, + "loss": 1.2137, + "step": 17201 + }, + { + "epoch": 5.12355032670005, + "grad_norm": 0.3076275885105133, + "learning_rate": 1.0080058153145715e-05, + "loss": 1.2141, + "step": 17202 + }, + { + "epoch": 5.123848172899719, + "grad_norm": 0.2605409026145935, + "learning_rate": 1.0079093617320438e-05, + "loss": 1.2083, + "step": 17203 + }, + { + "epoch": 5.1241460190993875, + "grad_norm": 0.3248927593231201, + "learning_rate": 1.0078129080759284e-05, + "loss": 1.2143, + "step": 17204 + }, + { + "epoch": 5.124443865299056, + "grad_norm": 0.31479761004447937, + "learning_rate": 1.007716454347123e-05, + "loss": 1.2165, + "step": 17205 + }, + { + "epoch": 5.124741711498725, + "grad_norm": 0.42612096667289734, + "learning_rate": 1.007620000546524e-05, + "loss": 1.222, + "step": 17206 + }, + { + "epoch": 5.125039557698393, + "grad_norm": 0.34823623299598694, + "learning_rate": 1.0075235466750298e-05, + "loss": 1.206, + "step": 17207 + }, + { + "epoch": 5.125337403898062, + "grad_norm": 0.31496137380599976, + "learning_rate": 1.007427092733537e-05, + "loss": 1.1924, + "step": 17208 + }, + { + "epoch": 5.125635250097731, + "grad_norm": 0.3191942870616913, + "learning_rate": 1.007330638722944e-05, + "loss": 1.2107, + "step": 17209 + }, + { + "epoch": 5.125933096297399, + "grad_norm": 0.3097342550754547, + "learning_rate": 1.0072341846441473e-05, + "loss": 1.2052, + "step": 17210 + }, + { + "epoch": 5.1262309424970685, + "grad_norm": 0.2566518187522888, + "learning_rate": 1.0071377304980445e-05, + "loss": 1.2096, + "step": 17211 + }, + { + "epoch": 5.126528788696737, + "grad_norm": 0.2853011190891266, + "learning_rate": 1.007041276285533e-05, + "loss": 1.2021, + "step": 17212 + }, + { + "epoch": 5.126826634896405, + "grad_norm": 0.4396412670612335, + "learning_rate": 1.0069448220075106e-05, + "loss": 1.2148, + "step": 17213 + }, + { + "epoch": 5.127124481096074, + "grad_norm": 0.5236174464225769, + "learning_rate": 1.0068483676648742e-05, + "loss": 1.2054, + "step": 17214 + }, + { + "epoch": 5.127422327295743, + "grad_norm": 0.32824137806892395, + "learning_rate": 1.0067519132585215e-05, + "loss": 1.2092, + "step": 17215 + }, + { + "epoch": 5.127720173495411, + "grad_norm": 0.6092197895050049, + "learning_rate": 1.0066554587893499e-05, + "loss": 1.2026, + "step": 17216 + }, + { + "epoch": 5.12801801969508, + "grad_norm": 0.25081637501716614, + "learning_rate": 1.0065590042582565e-05, + "loss": 1.202, + "step": 17217 + }, + { + "epoch": 5.1283158658947485, + "grad_norm": 0.4604050815105438, + "learning_rate": 1.0064625496661387e-05, + "loss": 1.219, + "step": 17218 + }, + { + "epoch": 5.128613712094417, + "grad_norm": 0.27120324969291687, + "learning_rate": 1.0063660950138945e-05, + "loss": 1.2328, + "step": 17219 + }, + { + "epoch": 5.128911558294086, + "grad_norm": 0.4967385232448578, + "learning_rate": 1.0062696403024206e-05, + "loss": 1.2301, + "step": 17220 + }, + { + "epoch": 5.129209404493754, + "grad_norm": 0.26437661051750183, + "learning_rate": 1.0061731855326149e-05, + "loss": 1.2166, + "step": 17221 + }, + { + "epoch": 5.129507250693424, + "grad_norm": 0.3599362373352051, + "learning_rate": 1.0060767307053746e-05, + "loss": 1.2215, + "step": 17222 + }, + { + "epoch": 5.129805096893092, + "grad_norm": 0.27200520038604736, + "learning_rate": 1.0059802758215971e-05, + "loss": 1.1992, + "step": 17223 + }, + { + "epoch": 5.13010294309276, + "grad_norm": 0.5720693469047546, + "learning_rate": 1.0058838208821795e-05, + "loss": 1.2138, + "step": 17224 + }, + { + "epoch": 5.1304007892924295, + "grad_norm": 0.3677472174167633, + "learning_rate": 1.0057873658880197e-05, + "loss": 1.1955, + "step": 17225 + }, + { + "epoch": 5.130698635492098, + "grad_norm": 0.3923113942146301, + "learning_rate": 1.0056909108400153e-05, + "loss": 1.2113, + "step": 17226 + }, + { + "epoch": 5.130996481691766, + "grad_norm": 0.37376493215560913, + "learning_rate": 1.0055944557390629e-05, + "loss": 1.2135, + "step": 17227 + }, + { + "epoch": 5.131294327891435, + "grad_norm": 0.29747623205184937, + "learning_rate": 1.0054980005860604e-05, + "loss": 1.2151, + "step": 17228 + }, + { + "epoch": 5.131592174091104, + "grad_norm": 0.2658478617668152, + "learning_rate": 1.0054015453819052e-05, + "loss": 1.2116, + "step": 17229 + }, + { + "epoch": 5.131890020290772, + "grad_norm": 0.2868218719959259, + "learning_rate": 1.0053050901274947e-05, + "loss": 1.215, + "step": 17230 + }, + { + "epoch": 5.132187866490441, + "grad_norm": 0.27445900440216064, + "learning_rate": 1.0052086348237262e-05, + "loss": 1.2141, + "step": 17231 + }, + { + "epoch": 5.13248571269011, + "grad_norm": 0.28511521220207214, + "learning_rate": 1.0051121794714971e-05, + "loss": 1.2288, + "step": 17232 + }, + { + "epoch": 5.132783558889778, + "grad_norm": 0.3396657705307007, + "learning_rate": 1.0050157240717053e-05, + "loss": 1.2153, + "step": 17233 + }, + { + "epoch": 5.133081405089447, + "grad_norm": 0.2693929076194763, + "learning_rate": 1.0049192686252474e-05, + "loss": 1.2167, + "step": 17234 + }, + { + "epoch": 5.133379251289115, + "grad_norm": 0.35997962951660156, + "learning_rate": 1.0048228131330214e-05, + "loss": 1.2178, + "step": 17235 + }, + { + "epoch": 5.133677097488785, + "grad_norm": 0.25350475311279297, + "learning_rate": 1.0047263575959243e-05, + "loss": 1.221, + "step": 17236 + }, + { + "epoch": 5.133974943688453, + "grad_norm": 0.5521942377090454, + "learning_rate": 1.0046299020148538e-05, + "loss": 1.2327, + "step": 17237 + }, + { + "epoch": 5.134272789888121, + "grad_norm": 0.37590649724006653, + "learning_rate": 1.004533446390707e-05, + "loss": 1.2087, + "step": 17238 + }, + { + "epoch": 5.1345706360877905, + "grad_norm": 0.46460583806037903, + "learning_rate": 1.004436990724382e-05, + "loss": 1.2286, + "step": 17239 + }, + { + "epoch": 5.134868482287459, + "grad_norm": 0.4416270852088928, + "learning_rate": 1.0043405350167757e-05, + "loss": 1.2159, + "step": 17240 + }, + { + "epoch": 5.135166328487127, + "grad_norm": 0.5738375186920166, + "learning_rate": 1.0042440792687854e-05, + "loss": 1.2152, + "step": 17241 + }, + { + "epoch": 5.135464174686796, + "grad_norm": 0.5010666847229004, + "learning_rate": 1.0041476234813084e-05, + "loss": 1.2202, + "step": 17242 + }, + { + "epoch": 5.135762020886465, + "grad_norm": 0.6015894412994385, + "learning_rate": 1.0040511676552428e-05, + "loss": 1.2079, + "step": 17243 + }, + { + "epoch": 5.136059867086133, + "grad_norm": 0.6593461632728577, + "learning_rate": 1.0039547117914856e-05, + "loss": 1.2118, + "step": 17244 + }, + { + "epoch": 5.136357713285802, + "grad_norm": 0.3574669063091278, + "learning_rate": 1.0038582558909342e-05, + "loss": 1.2159, + "step": 17245 + }, + { + "epoch": 5.136655559485471, + "grad_norm": 0.3862614035606384, + "learning_rate": 1.0037617999544862e-05, + "loss": 1.2239, + "step": 17246 + }, + { + "epoch": 5.136953405685139, + "grad_norm": 0.3918178379535675, + "learning_rate": 1.0036653439830384e-05, + "loss": 1.223, + "step": 17247 + }, + { + "epoch": 5.137251251884808, + "grad_norm": 0.27867281436920166, + "learning_rate": 1.0035688879774887e-05, + "loss": 1.218, + "step": 17248 + }, + { + "epoch": 5.1375490980844765, + "grad_norm": 0.5913288593292236, + "learning_rate": 1.0034724319387349e-05, + "loss": 1.2221, + "step": 17249 + }, + { + "epoch": 5.137846944284146, + "grad_norm": 0.257610023021698, + "learning_rate": 1.0033759758676738e-05, + "loss": 1.2002, + "step": 17250 + }, + { + "epoch": 5.138144790483814, + "grad_norm": 0.4588865041732788, + "learning_rate": 1.003279519765203e-05, + "loss": 1.22, + "step": 17251 + }, + { + "epoch": 5.138442636683482, + "grad_norm": 0.3236146867275238, + "learning_rate": 1.00318306363222e-05, + "loss": 1.2074, + "step": 17252 + }, + { + "epoch": 5.1387404828831516, + "grad_norm": 0.28302136063575745, + "learning_rate": 1.003086607469622e-05, + "loss": 1.2233, + "step": 17253 + }, + { + "epoch": 5.13903832908282, + "grad_norm": 0.41775062680244446, + "learning_rate": 1.0029901512783067e-05, + "loss": 1.2371, + "step": 17254 + }, + { + "epoch": 5.139336175282488, + "grad_norm": 0.2608824074268341, + "learning_rate": 1.0028936950591712e-05, + "loss": 1.2233, + "step": 17255 + }, + { + "epoch": 5.139634021482157, + "grad_norm": 0.3283870816230774, + "learning_rate": 1.0027972388131135e-05, + "loss": 1.2045, + "step": 17256 + }, + { + "epoch": 5.139931867681826, + "grad_norm": 0.3376162648200989, + "learning_rate": 1.0027007825410305e-05, + "loss": 1.194, + "step": 17257 + }, + { + "epoch": 5.140229713881494, + "grad_norm": 0.3030361533164978, + "learning_rate": 1.0026043262438195e-05, + "loss": 1.2076, + "step": 17258 + }, + { + "epoch": 5.140527560081163, + "grad_norm": 0.3645525872707367, + "learning_rate": 1.002507869922378e-05, + "loss": 1.2226, + "step": 17259 + }, + { + "epoch": 5.140825406280832, + "grad_norm": 0.30467063188552856, + "learning_rate": 1.002411413577604e-05, + "loss": 1.2206, + "step": 17260 + }, + { + "epoch": 5.141123252480501, + "grad_norm": 0.3725603222846985, + "learning_rate": 1.0023149572103943e-05, + "loss": 1.2415, + "step": 17261 + }, + { + "epoch": 5.141421098680169, + "grad_norm": 0.2602631151676178, + "learning_rate": 1.0022185008216463e-05, + "loss": 1.2129, + "step": 17262 + }, + { + "epoch": 5.1417189448798375, + "grad_norm": 0.5139057636260986, + "learning_rate": 1.002122044412258e-05, + "loss": 1.2146, + "step": 17263 + }, + { + "epoch": 5.142016791079507, + "grad_norm": 0.2883543372154236, + "learning_rate": 1.0020255879831264e-05, + "loss": 1.2277, + "step": 17264 + }, + { + "epoch": 5.142314637279175, + "grad_norm": 0.36388319730758667, + "learning_rate": 1.0019291315351487e-05, + "loss": 1.2146, + "step": 17265 + }, + { + "epoch": 5.142612483478843, + "grad_norm": 0.2720346748828888, + "learning_rate": 1.0018326750692226e-05, + "loss": 1.2423, + "step": 17266 + }, + { + "epoch": 5.142910329678513, + "grad_norm": 0.393311470746994, + "learning_rate": 1.0017362185862459e-05, + "loss": 1.2339, + "step": 17267 + }, + { + "epoch": 5.143208175878181, + "grad_norm": 0.30610939860343933, + "learning_rate": 1.0016397620871154e-05, + "loss": 1.2124, + "step": 17268 + }, + { + "epoch": 5.143506022077849, + "grad_norm": 0.30627381801605225, + "learning_rate": 1.0015433055727286e-05, + "loss": 1.2222, + "step": 17269 + }, + { + "epoch": 5.1438038682775185, + "grad_norm": 0.29076746106147766, + "learning_rate": 1.0014468490439833e-05, + "loss": 1.2219, + "step": 17270 + }, + { + "epoch": 5.144101714477187, + "grad_norm": 0.2629736065864563, + "learning_rate": 1.0013503925017764e-05, + "loss": 1.228, + "step": 17271 + }, + { + "epoch": 5.144399560676855, + "grad_norm": 0.29096749424934387, + "learning_rate": 1.0012539359470059e-05, + "loss": 1.2282, + "step": 17272 + }, + { + "epoch": 5.144697406876524, + "grad_norm": 0.3243168294429779, + "learning_rate": 1.0011574793805688e-05, + "loss": 1.2092, + "step": 17273 + }, + { + "epoch": 5.144995253076193, + "grad_norm": 0.2627241313457489, + "learning_rate": 1.0010610228033629e-05, + "loss": 1.2236, + "step": 17274 + }, + { + "epoch": 5.145293099275861, + "grad_norm": 0.29743027687072754, + "learning_rate": 1.000964566216285e-05, + "loss": 1.2117, + "step": 17275 + }, + { + "epoch": 5.14559094547553, + "grad_norm": 0.2733718156814575, + "learning_rate": 1.000868109620233e-05, + "loss": 1.2265, + "step": 17276 + }, + { + "epoch": 5.1458887916751985, + "grad_norm": 0.26167747378349304, + "learning_rate": 1.0007716530161042e-05, + "loss": 1.2074, + "step": 17277 + }, + { + "epoch": 5.146186637874868, + "grad_norm": 0.25628238916397095, + "learning_rate": 1.000675196404796e-05, + "loss": 1.2209, + "step": 17278 + }, + { + "epoch": 5.146484484074536, + "grad_norm": 0.3680971562862396, + "learning_rate": 1.000578739787206e-05, + "loss": 1.2127, + "step": 17279 + }, + { + "epoch": 5.146782330274204, + "grad_norm": 0.30542105436325073, + "learning_rate": 1.0004822831642316e-05, + "loss": 1.208, + "step": 17280 + }, + { + "epoch": 5.147080176473874, + "grad_norm": 0.3616526424884796, + "learning_rate": 1.0003858265367698e-05, + "loss": 1.2073, + "step": 17281 + }, + { + "epoch": 5.147378022673542, + "grad_norm": 0.39930588006973267, + "learning_rate": 1.0002893699057184e-05, + "loss": 1.2141, + "step": 17282 + }, + { + "epoch": 5.14767586887321, + "grad_norm": 0.28660163283348083, + "learning_rate": 1.0001929132719743e-05, + "loss": 1.2291, + "step": 17283 + }, + { + "epoch": 5.1479737150728795, + "grad_norm": 0.3477869927883148, + "learning_rate": 1.0000964566364361e-05, + "loss": 1.211, + "step": 17284 + }, + { + "epoch": 5.148271561272548, + "grad_norm": 0.3396701216697693, + "learning_rate": 1e-05, + "loss": 1.2195, + "step": 17285 + }, + { + "epoch": 5.148569407472216, + "grad_norm": 0.4077571630477905, + "learning_rate": 9.999035433635644e-06, + "loss": 1.2298, + "step": 17286 + }, + { + "epoch": 5.148867253671885, + "grad_norm": 0.27981382608413696, + "learning_rate": 9.998070867280256e-06, + "loss": 1.2029, + "step": 17287 + }, + { + "epoch": 5.149165099871554, + "grad_norm": 0.40182021260261536, + "learning_rate": 9.99710630094282e-06, + "loss": 1.2286, + "step": 17288 + }, + { + "epoch": 5.149462946071223, + "grad_norm": 0.31784218549728394, + "learning_rate": 9.996141734632308e-06, + "loss": 1.2135, + "step": 17289 + }, + { + "epoch": 5.149760792270891, + "grad_norm": 0.28021320700645447, + "learning_rate": 9.995177168357687e-06, + "loss": 1.2165, + "step": 17290 + }, + { + "epoch": 5.15005863847056, + "grad_norm": 0.4634260833263397, + "learning_rate": 9.994212602127944e-06, + "loss": 1.2148, + "step": 17291 + }, + { + "epoch": 5.150356484670229, + "grad_norm": 0.4654317796230316, + "learning_rate": 9.993248035952045e-06, + "loss": 1.2105, + "step": 17292 + }, + { + "epoch": 5.150654330869897, + "grad_norm": 0.2636204659938812, + "learning_rate": 9.99228346983896e-06, + "loss": 1.2008, + "step": 17293 + }, + { + "epoch": 5.150952177069565, + "grad_norm": 0.3003879487514496, + "learning_rate": 9.991318903797673e-06, + "loss": 1.2342, + "step": 17294 + }, + { + "epoch": 5.151250023269235, + "grad_norm": 0.5199518203735352, + "learning_rate": 9.99035433783715e-06, + "loss": 1.2129, + "step": 17295 + }, + { + "epoch": 5.151547869468903, + "grad_norm": 0.31848806142807007, + "learning_rate": 9.989389771966375e-06, + "loss": 1.2095, + "step": 17296 + }, + { + "epoch": 5.151845715668571, + "grad_norm": 0.6944403648376465, + "learning_rate": 9.988425206194315e-06, + "loss": 1.216, + "step": 17297 + }, + { + "epoch": 5.1521435618682405, + "grad_norm": 0.5113841891288757, + "learning_rate": 9.987460640529943e-06, + "loss": 1.2107, + "step": 17298 + }, + { + "epoch": 5.152441408067909, + "grad_norm": 0.5333827137947083, + "learning_rate": 9.986496074982239e-06, + "loss": 1.2103, + "step": 17299 + }, + { + "epoch": 5.152739254267577, + "grad_norm": 0.4030782878398895, + "learning_rate": 9.98553150956017e-06, + "loss": 1.1972, + "step": 17300 + }, + { + "epoch": 5.153037100467246, + "grad_norm": 0.28179246187210083, + "learning_rate": 9.984566944272714e-06, + "loss": 1.2131, + "step": 17301 + }, + { + "epoch": 5.153334946666915, + "grad_norm": 0.5105195045471191, + "learning_rate": 9.98360237912885e-06, + "loss": 1.2183, + "step": 17302 + }, + { + "epoch": 5.153632792866584, + "grad_norm": 0.3332255482673645, + "learning_rate": 9.982637814137546e-06, + "loss": 1.2217, + "step": 17303 + }, + { + "epoch": 5.153930639066252, + "grad_norm": 0.3595399260520935, + "learning_rate": 9.981673249307774e-06, + "loss": 1.2045, + "step": 17304 + }, + { + "epoch": 5.154228485265921, + "grad_norm": 0.3326016366481781, + "learning_rate": 9.980708684648516e-06, + "loss": 1.1971, + "step": 17305 + }, + { + "epoch": 5.15452633146559, + "grad_norm": 0.36100009083747864, + "learning_rate": 9.979744120168741e-06, + "loss": 1.2112, + "step": 17306 + }, + { + "epoch": 5.154824177665258, + "grad_norm": 0.28945910930633545, + "learning_rate": 9.97877955587742e-06, + "loss": 1.1938, + "step": 17307 + }, + { + "epoch": 5.1551220238649265, + "grad_norm": 0.4248252511024475, + "learning_rate": 9.977814991783539e-06, + "loss": 1.2257, + "step": 17308 + }, + { + "epoch": 5.155419870064596, + "grad_norm": 0.27031490206718445, + "learning_rate": 9.976850427896063e-06, + "loss": 1.225, + "step": 17309 + }, + { + "epoch": 5.155717716264264, + "grad_norm": 0.44048282504081726, + "learning_rate": 9.975885864223965e-06, + "loss": 1.231, + "step": 17310 + }, + { + "epoch": 5.156015562463932, + "grad_norm": 0.3263550102710724, + "learning_rate": 9.974921300776222e-06, + "loss": 1.2365, + "step": 17311 + }, + { + "epoch": 5.1563134086636015, + "grad_norm": 0.49528902769088745, + "learning_rate": 9.973956737561809e-06, + "loss": 1.2331, + "step": 17312 + }, + { + "epoch": 5.15661125486327, + "grad_norm": 0.2641638219356537, + "learning_rate": 9.9729921745897e-06, + "loss": 1.209, + "step": 17313 + }, + { + "epoch": 5.156909101062938, + "grad_norm": 0.36507704854011536, + "learning_rate": 9.97202761186887e-06, + "loss": 1.2132, + "step": 17314 + }, + { + "epoch": 5.157206947262607, + "grad_norm": 0.29481709003448486, + "learning_rate": 9.971063049408288e-06, + "loss": 1.2166, + "step": 17315 + }, + { + "epoch": 5.157504793462276, + "grad_norm": 0.33992066979408264, + "learning_rate": 9.970098487216937e-06, + "loss": 1.2195, + "step": 17316 + }, + { + "epoch": 5.157802639661945, + "grad_norm": 0.3174651861190796, + "learning_rate": 9.969133925303783e-06, + "loss": 1.2139, + "step": 17317 + }, + { + "epoch": 5.158100485861613, + "grad_norm": 0.3178800642490387, + "learning_rate": 9.968169363677803e-06, + "loss": 1.2255, + "step": 17318 + }, + { + "epoch": 5.158398332061282, + "grad_norm": 0.5044521689414978, + "learning_rate": 9.967204802347972e-06, + "loss": 1.1994, + "step": 17319 + }, + { + "epoch": 5.158696178260951, + "grad_norm": 0.2825145125389099, + "learning_rate": 9.966240241323267e-06, + "loss": 1.2096, + "step": 17320 + }, + { + "epoch": 5.158994024460619, + "grad_norm": 0.532430112361908, + "learning_rate": 9.965275680612653e-06, + "loss": 1.2184, + "step": 17321 + }, + { + "epoch": 5.1592918706602875, + "grad_norm": 0.3720751106739044, + "learning_rate": 9.964311120225116e-06, + "loss": 1.2142, + "step": 17322 + }, + { + "epoch": 5.159589716859957, + "grad_norm": 0.47451266646385193, + "learning_rate": 9.96334656016962e-06, + "loss": 1.219, + "step": 17323 + }, + { + "epoch": 5.159887563059625, + "grad_norm": 0.4867680072784424, + "learning_rate": 9.962382000455142e-06, + "loss": 1.2282, + "step": 17324 + }, + { + "epoch": 5.160185409259293, + "grad_norm": 0.34293991327285767, + "learning_rate": 9.961417441090661e-06, + "loss": 1.2111, + "step": 17325 + }, + { + "epoch": 5.160483255458963, + "grad_norm": 0.4056837260723114, + "learning_rate": 9.960452882085149e-06, + "loss": 1.2323, + "step": 17326 + }, + { + "epoch": 5.160781101658631, + "grad_norm": 0.33025863766670227, + "learning_rate": 9.959488323447573e-06, + "loss": 1.2088, + "step": 17327 + }, + { + "epoch": 5.1610789478583, + "grad_norm": 0.31803029775619507, + "learning_rate": 9.958523765186917e-06, + "loss": 1.2342, + "step": 17328 + }, + { + "epoch": 5.1613767940579685, + "grad_norm": 0.3175404667854309, + "learning_rate": 9.95755920731215e-06, + "loss": 1.2191, + "step": 17329 + }, + { + "epoch": 5.161674640257637, + "grad_norm": 0.30470433831214905, + "learning_rate": 9.956594649832246e-06, + "loss": 1.2125, + "step": 17330 + }, + { + "epoch": 5.161972486457306, + "grad_norm": 0.3382590115070343, + "learning_rate": 9.955630092756183e-06, + "loss": 1.2256, + "step": 17331 + }, + { + "epoch": 5.162270332656974, + "grad_norm": 0.27963191270828247, + "learning_rate": 9.95466553609293e-06, + "loss": 1.2183, + "step": 17332 + }, + { + "epoch": 5.162568178856643, + "grad_norm": 0.3190222978591919, + "learning_rate": 9.953700979851467e-06, + "loss": 1.2033, + "step": 17333 + }, + { + "epoch": 5.162866025056312, + "grad_norm": 0.2902185916900635, + "learning_rate": 9.952736424040764e-06, + "loss": 1.2371, + "step": 17334 + }, + { + "epoch": 5.16316387125598, + "grad_norm": 0.31622791290283203, + "learning_rate": 9.95177186866979e-06, + "loss": 1.2219, + "step": 17335 + }, + { + "epoch": 5.1634617174556485, + "grad_norm": 0.34580332040786743, + "learning_rate": 9.95080731374753e-06, + "loss": 1.2018, + "step": 17336 + }, + { + "epoch": 5.163759563655318, + "grad_norm": 0.430586040019989, + "learning_rate": 9.949842759282952e-06, + "loss": 1.2038, + "step": 17337 + }, + { + "epoch": 5.164057409854986, + "grad_norm": 0.265218049287796, + "learning_rate": 9.948878205285028e-06, + "loss": 1.2268, + "step": 17338 + }, + { + "epoch": 5.164355256054654, + "grad_norm": 0.4097771644592285, + "learning_rate": 9.947913651762741e-06, + "loss": 1.2067, + "step": 17339 + }, + { + "epoch": 5.164653102254324, + "grad_norm": 0.27291157841682434, + "learning_rate": 9.946949098725056e-06, + "loss": 1.217, + "step": 17340 + }, + { + "epoch": 5.164950948453992, + "grad_norm": 0.5387572050094604, + "learning_rate": 9.94598454618095e-06, + "loss": 1.2107, + "step": 17341 + }, + { + "epoch": 5.16524879465366, + "grad_norm": 0.5005273818969727, + "learning_rate": 9.9450199941394e-06, + "loss": 1.224, + "step": 17342 + }, + { + "epoch": 5.1655466408533295, + "grad_norm": 0.2867977023124695, + "learning_rate": 9.944055442609371e-06, + "loss": 1.1963, + "step": 17343 + }, + { + "epoch": 5.165844487052998, + "grad_norm": 0.33826544880867004, + "learning_rate": 9.94309089159985e-06, + "loss": 1.2124, + "step": 17344 + }, + { + "epoch": 5.166142333252667, + "grad_norm": 0.31648164987564087, + "learning_rate": 9.942126341119806e-06, + "loss": 1.2157, + "step": 17345 + }, + { + "epoch": 5.166440179452335, + "grad_norm": 0.3026304841041565, + "learning_rate": 9.941161791178206e-06, + "loss": 1.2132, + "step": 17346 + }, + { + "epoch": 5.166738025652004, + "grad_norm": 0.3801819086074829, + "learning_rate": 9.940197241784032e-06, + "loss": 1.1988, + "step": 17347 + }, + { + "epoch": 5.167035871851673, + "grad_norm": 0.3131085932254791, + "learning_rate": 9.939232692946259e-06, + "loss": 1.1996, + "step": 17348 + }, + { + "epoch": 5.167333718051341, + "grad_norm": 0.3868970572948456, + "learning_rate": 9.938268144673853e-06, + "loss": 1.2109, + "step": 17349 + }, + { + "epoch": 5.16763156425101, + "grad_norm": 0.2998538613319397, + "learning_rate": 9.937303596975798e-06, + "loss": 1.2008, + "step": 17350 + }, + { + "epoch": 5.167929410450679, + "grad_norm": 0.32032912969589233, + "learning_rate": 9.93633904986106e-06, + "loss": 1.2054, + "step": 17351 + }, + { + "epoch": 5.168227256650347, + "grad_norm": 0.2641001045703888, + "learning_rate": 9.935374503338614e-06, + "loss": 1.1972, + "step": 17352 + }, + { + "epoch": 5.168525102850015, + "grad_norm": 0.48422569036483765, + "learning_rate": 9.934409957417438e-06, + "loss": 1.207, + "step": 17353 + }, + { + "epoch": 5.168822949049685, + "grad_norm": 0.25899428129196167, + "learning_rate": 9.933445412106506e-06, + "loss": 1.2119, + "step": 17354 + }, + { + "epoch": 5.169120795249353, + "grad_norm": 0.4854237139225006, + "learning_rate": 9.932480867414786e-06, + "loss": 1.2002, + "step": 17355 + }, + { + "epoch": 5.169418641449022, + "grad_norm": 0.25135356187820435, + "learning_rate": 9.93151632335126e-06, + "loss": 1.2103, + "step": 17356 + }, + { + "epoch": 5.1697164876486905, + "grad_norm": 0.3396010100841522, + "learning_rate": 9.930551779924899e-06, + "loss": 1.2187, + "step": 17357 + }, + { + "epoch": 5.170014333848359, + "grad_norm": 0.3366449177265167, + "learning_rate": 9.929587237144671e-06, + "loss": 1.2255, + "step": 17358 + }, + { + "epoch": 5.170312180048028, + "grad_norm": 0.410990834236145, + "learning_rate": 9.928622695019558e-06, + "loss": 1.2166, + "step": 17359 + }, + { + "epoch": 5.170610026247696, + "grad_norm": 0.29336434602737427, + "learning_rate": 9.927658153558528e-06, + "loss": 1.2211, + "step": 17360 + }, + { + "epoch": 5.170907872447365, + "grad_norm": 0.3034501373767853, + "learning_rate": 9.926693612770563e-06, + "loss": 1.211, + "step": 17361 + }, + { + "epoch": 5.171205718647034, + "grad_norm": 0.38883015513420105, + "learning_rate": 9.925729072664632e-06, + "loss": 1.2069, + "step": 17362 + }, + { + "epoch": 5.171503564846702, + "grad_norm": 0.2636663317680359, + "learning_rate": 9.924764533249704e-06, + "loss": 1.208, + "step": 17363 + }, + { + "epoch": 5.171801411046371, + "grad_norm": 0.33574822545051575, + "learning_rate": 9.923799994534764e-06, + "loss": 1.2327, + "step": 17364 + }, + { + "epoch": 5.17209925724604, + "grad_norm": 0.28025588393211365, + "learning_rate": 9.922835456528775e-06, + "loss": 1.2241, + "step": 17365 + }, + { + "epoch": 5.172397103445708, + "grad_norm": 0.38576653599739075, + "learning_rate": 9.921870919240716e-06, + "loss": 1.2103, + "step": 17366 + }, + { + "epoch": 5.1726949496453765, + "grad_norm": 0.2783429026603699, + "learning_rate": 9.920906382679563e-06, + "loss": 1.2128, + "step": 17367 + }, + { + "epoch": 5.172992795845046, + "grad_norm": 0.5975845456123352, + "learning_rate": 9.91994184685429e-06, + "loss": 1.2154, + "step": 17368 + }, + { + "epoch": 5.173290642044714, + "grad_norm": 0.4833067059516907, + "learning_rate": 9.918977311773863e-06, + "loss": 1.2295, + "step": 17369 + }, + { + "epoch": 5.173588488244383, + "grad_norm": 0.33450496196746826, + "learning_rate": 9.918012777447266e-06, + "loss": 1.2057, + "step": 17370 + }, + { + "epoch": 5.1738863344440515, + "grad_norm": 0.28698986768722534, + "learning_rate": 9.917048243883469e-06, + "loss": 1.2013, + "step": 17371 + }, + { + "epoch": 5.17418418064372, + "grad_norm": 0.6007641553878784, + "learning_rate": 9.91608371109144e-06, + "loss": 1.2038, + "step": 17372 + }, + { + "epoch": 5.174482026843389, + "grad_norm": 0.3245409429073334, + "learning_rate": 9.915119179080163e-06, + "loss": 1.2287, + "step": 17373 + }, + { + "epoch": 5.174779873043057, + "grad_norm": 0.3489818274974823, + "learning_rate": 9.914154647858608e-06, + "loss": 1.2307, + "step": 17374 + }, + { + "epoch": 5.175077719242726, + "grad_norm": 0.36396849155426025, + "learning_rate": 9.913190117435744e-06, + "loss": 1.2152, + "step": 17375 + }, + { + "epoch": 5.175375565442395, + "grad_norm": 0.4610700011253357, + "learning_rate": 9.91222558782055e-06, + "loss": 1.2149, + "step": 17376 + }, + { + "epoch": 5.175673411642063, + "grad_norm": 0.3610832393169403, + "learning_rate": 9.911261059021999e-06, + "loss": 1.2042, + "step": 17377 + }, + { + "epoch": 5.175971257841732, + "grad_norm": 0.3699630796909332, + "learning_rate": 9.910296531049066e-06, + "loss": 1.2235, + "step": 17378 + }, + { + "epoch": 5.176269104041401, + "grad_norm": 0.32827654480934143, + "learning_rate": 9.909332003910725e-06, + "loss": 1.2286, + "step": 17379 + }, + { + "epoch": 5.176566950241069, + "grad_norm": 0.4925483167171478, + "learning_rate": 9.908367477615944e-06, + "loss": 1.2264, + "step": 17380 + }, + { + "epoch": 5.1768647964407375, + "grad_norm": 0.36989858746528625, + "learning_rate": 9.907402952173705e-06, + "loss": 1.218, + "step": 17381 + }, + { + "epoch": 5.177162642640407, + "grad_norm": 0.6347072720527649, + "learning_rate": 9.906438427592977e-06, + "loss": 1.2149, + "step": 17382 + }, + { + "epoch": 5.177460488840075, + "grad_norm": 0.2984049916267395, + "learning_rate": 9.905473903882732e-06, + "loss": 1.2115, + "step": 17383 + }, + { + "epoch": 5.177758335039744, + "grad_norm": 0.27661293745040894, + "learning_rate": 9.904509381051952e-06, + "loss": 1.231, + "step": 17384 + }, + { + "epoch": 5.178056181239413, + "grad_norm": 0.4410458207130432, + "learning_rate": 9.903544859109604e-06, + "loss": 1.2063, + "step": 17385 + }, + { + "epoch": 5.178354027439081, + "grad_norm": 0.33633315563201904, + "learning_rate": 9.90258033806466e-06, + "loss": 1.2144, + "step": 17386 + }, + { + "epoch": 5.17865187363875, + "grad_norm": 0.32096680998802185, + "learning_rate": 9.901615817926101e-06, + "loss": 1.2039, + "step": 17387 + }, + { + "epoch": 5.1789497198384185, + "grad_norm": 0.3884566128253937, + "learning_rate": 9.900651298702895e-06, + "loss": 1.2143, + "step": 17388 + }, + { + "epoch": 5.179247566038087, + "grad_norm": 0.27329471707344055, + "learning_rate": 9.899686780404015e-06, + "loss": 1.2063, + "step": 17389 + }, + { + "epoch": 5.179545412237756, + "grad_norm": 0.34216269850730896, + "learning_rate": 9.898722263038442e-06, + "loss": 1.2148, + "step": 17390 + }, + { + "epoch": 5.179843258437424, + "grad_norm": 0.2769976556301117, + "learning_rate": 9.897757746615146e-06, + "loss": 1.2117, + "step": 17391 + }, + { + "epoch": 5.180141104637093, + "grad_norm": 0.33744072914123535, + "learning_rate": 9.896793231143095e-06, + "loss": 1.2177, + "step": 17392 + }, + { + "epoch": 5.180438950836762, + "grad_norm": 0.3166804313659668, + "learning_rate": 9.895828716631272e-06, + "loss": 1.2223, + "step": 17393 + }, + { + "epoch": 5.18073679703643, + "grad_norm": 0.2969924211502075, + "learning_rate": 9.894864203088642e-06, + "loss": 1.2149, + "step": 17394 + }, + { + "epoch": 5.181034643236099, + "grad_norm": 0.449819952249527, + "learning_rate": 9.893899690524185e-06, + "loss": 1.2107, + "step": 17395 + }, + { + "epoch": 5.181332489435768, + "grad_norm": 0.2913896441459656, + "learning_rate": 9.892935178946875e-06, + "loss": 1.2222, + "step": 17396 + }, + { + "epoch": 5.181630335635436, + "grad_norm": 0.4125060439109802, + "learning_rate": 9.89197066836568e-06, + "loss": 1.2265, + "step": 17397 + }, + { + "epoch": 5.181928181835105, + "grad_norm": 0.25802966952323914, + "learning_rate": 9.89100615878958e-06, + "loss": 1.204, + "step": 17398 + }, + { + "epoch": 5.182226028034774, + "grad_norm": 0.6255542039871216, + "learning_rate": 9.890041650227546e-06, + "loss": 1.2259, + "step": 17399 + }, + { + "epoch": 5.182523874234442, + "grad_norm": 0.3694813549518585, + "learning_rate": 9.889077142688552e-06, + "loss": 1.2026, + "step": 17400 + }, + { + "epoch": 5.182821720434111, + "grad_norm": 0.484057754278183, + "learning_rate": 9.888112636181568e-06, + "loss": 1.2181, + "step": 17401 + }, + { + "epoch": 5.1831195666337795, + "grad_norm": 0.2654632329940796, + "learning_rate": 9.887148130715576e-06, + "loss": 1.2032, + "step": 17402 + }, + { + "epoch": 5.183417412833448, + "grad_norm": 0.4504523277282715, + "learning_rate": 9.886183626299538e-06, + "loss": 1.2116, + "step": 17403 + }, + { + "epoch": 5.183715259033117, + "grad_norm": 0.31855273246765137, + "learning_rate": 9.88521912294244e-06, + "loss": 1.2243, + "step": 17404 + }, + { + "epoch": 5.184013105232785, + "grad_norm": 0.42272868752479553, + "learning_rate": 9.884254620653247e-06, + "loss": 1.2259, + "step": 17405 + }, + { + "epoch": 5.184310951432454, + "grad_norm": 0.4378338158130646, + "learning_rate": 9.883290119440934e-06, + "loss": 1.2033, + "step": 17406 + }, + { + "epoch": 5.184608797632123, + "grad_norm": 0.41525697708129883, + "learning_rate": 9.88232561931448e-06, + "loss": 1.215, + "step": 17407 + }, + { + "epoch": 5.184906643831791, + "grad_norm": 0.34648171067237854, + "learning_rate": 9.881361120282854e-06, + "loss": 1.2228, + "step": 17408 + }, + { + "epoch": 5.18520449003146, + "grad_norm": 0.3187936544418335, + "learning_rate": 9.880396622355026e-06, + "loss": 1.2164, + "step": 17409 + }, + { + "epoch": 5.185502336231129, + "grad_norm": 0.31613689661026, + "learning_rate": 9.87943212553998e-06, + "loss": 1.2156, + "step": 17410 + }, + { + "epoch": 5.185800182430797, + "grad_norm": 0.35116851329803467, + "learning_rate": 9.878467629846676e-06, + "loss": 1.2082, + "step": 17411 + }, + { + "epoch": 5.186098028630466, + "grad_norm": 0.33654022216796875, + "learning_rate": 9.877503135284099e-06, + "loss": 1.2396, + "step": 17412 + }, + { + "epoch": 5.186395874830135, + "grad_norm": 0.3258071839809418, + "learning_rate": 9.87653864186122e-06, + "loss": 1.1907, + "step": 17413 + }, + { + "epoch": 5.186693721029803, + "grad_norm": 0.31604331731796265, + "learning_rate": 9.875574149587007e-06, + "loss": 1.2334, + "step": 17414 + }, + { + "epoch": 5.186991567229472, + "grad_norm": 0.3823065757751465, + "learning_rate": 9.874609658470442e-06, + "loss": 1.1986, + "step": 17415 + }, + { + "epoch": 5.1872894134291405, + "grad_norm": 0.30230191349983215, + "learning_rate": 9.873645168520494e-06, + "loss": 1.2282, + "step": 17416 + }, + { + "epoch": 5.187587259628809, + "grad_norm": 0.30811795592308044, + "learning_rate": 9.872680679746132e-06, + "loss": 1.2131, + "step": 17417 + }, + { + "epoch": 5.187885105828478, + "grad_norm": 0.3960643708705902, + "learning_rate": 9.871716192156337e-06, + "loss": 1.2352, + "step": 17418 + }, + { + "epoch": 5.188182952028146, + "grad_norm": 0.2723821997642517, + "learning_rate": 9.870751705760079e-06, + "loss": 1.2048, + "step": 17419 + }, + { + "epoch": 5.188480798227815, + "grad_norm": 0.30438661575317383, + "learning_rate": 9.869787220566332e-06, + "loss": 1.2186, + "step": 17420 + }, + { + "epoch": 5.188778644427484, + "grad_norm": 0.2908874452114105, + "learning_rate": 9.86882273658407e-06, + "loss": 1.2127, + "step": 17421 + }, + { + "epoch": 5.189076490627152, + "grad_norm": 0.24763324856758118, + "learning_rate": 9.867858253822265e-06, + "loss": 1.2073, + "step": 17422 + }, + { + "epoch": 5.1893743368268215, + "grad_norm": 0.43248996138572693, + "learning_rate": 9.866893772289892e-06, + "loss": 1.2118, + "step": 17423 + }, + { + "epoch": 5.18967218302649, + "grad_norm": 0.2770494520664215, + "learning_rate": 9.865929291995922e-06, + "loss": 1.2275, + "step": 17424 + }, + { + "epoch": 5.189970029226158, + "grad_norm": 0.40055957436561584, + "learning_rate": 9.864964812949334e-06, + "loss": 1.1986, + "step": 17425 + }, + { + "epoch": 5.190267875425827, + "grad_norm": 0.43054723739624023, + "learning_rate": 9.864000335159093e-06, + "loss": 1.2248, + "step": 17426 + }, + { + "epoch": 5.190565721625496, + "grad_norm": 0.2807416319847107, + "learning_rate": 9.863035858634181e-06, + "loss": 1.2126, + "step": 17427 + }, + { + "epoch": 5.190863567825164, + "grad_norm": 0.39825111627578735, + "learning_rate": 9.862071383383562e-06, + "loss": 1.2228, + "step": 17428 + }, + { + "epoch": 5.191161414024833, + "grad_norm": 0.2781282067298889, + "learning_rate": 9.86110690941622e-06, + "loss": 1.2319, + "step": 17429 + }, + { + "epoch": 5.1914592602245015, + "grad_norm": 0.4083573520183563, + "learning_rate": 9.860142436741122e-06, + "loss": 1.2199, + "step": 17430 + }, + { + "epoch": 5.19175710642417, + "grad_norm": 0.332750529050827, + "learning_rate": 9.859177965367239e-06, + "loss": 1.1956, + "step": 17431 + }, + { + "epoch": 5.192054952623839, + "grad_norm": 0.38965967297554016, + "learning_rate": 9.85821349530355e-06, + "loss": 1.206, + "step": 17432 + }, + { + "epoch": 5.192352798823507, + "grad_norm": 0.44553112983703613, + "learning_rate": 9.857249026559028e-06, + "loss": 1.2231, + "step": 17433 + }, + { + "epoch": 5.192650645023176, + "grad_norm": 0.26943814754486084, + "learning_rate": 9.85628455914264e-06, + "loss": 1.2109, + "step": 17434 + }, + { + "epoch": 5.192948491222845, + "grad_norm": 0.2865230143070221, + "learning_rate": 9.855320093063366e-06, + "loss": 1.2177, + "step": 17435 + }, + { + "epoch": 5.193246337422513, + "grad_norm": 0.26364055275917053, + "learning_rate": 9.854355628330179e-06, + "loss": 1.2138, + "step": 17436 + }, + { + "epoch": 5.1935441836221825, + "grad_norm": 0.2749254107475281, + "learning_rate": 9.853391164952045e-06, + "loss": 1.2055, + "step": 17437 + }, + { + "epoch": 5.193842029821851, + "grad_norm": 0.30862370133399963, + "learning_rate": 9.852426702937949e-06, + "loss": 1.2385, + "step": 17438 + }, + { + "epoch": 5.194139876021519, + "grad_norm": 0.23923292756080627, + "learning_rate": 9.851462242296856e-06, + "loss": 1.2168, + "step": 17439 + }, + { + "epoch": 5.194437722221188, + "grad_norm": 0.37787410616874695, + "learning_rate": 9.850497783037736e-06, + "loss": 1.222, + "step": 17440 + }, + { + "epoch": 5.194735568420857, + "grad_norm": 0.28293028473854065, + "learning_rate": 9.849533325169568e-06, + "loss": 1.2158, + "step": 17441 + }, + { + "epoch": 5.195033414620525, + "grad_norm": 0.420856773853302, + "learning_rate": 9.848568868701329e-06, + "loss": 1.2021, + "step": 17442 + }, + { + "epoch": 5.195331260820194, + "grad_norm": 0.4423983097076416, + "learning_rate": 9.847604413641982e-06, + "loss": 1.196, + "step": 17443 + }, + { + "epoch": 5.195629107019863, + "grad_norm": 0.26101332902908325, + "learning_rate": 9.846639960000512e-06, + "loss": 1.2212, + "step": 17444 + }, + { + "epoch": 5.195926953219531, + "grad_norm": 0.38214921951293945, + "learning_rate": 9.845675507785879e-06, + "loss": 1.2202, + "step": 17445 + }, + { + "epoch": 5.1962247994192, + "grad_norm": 0.2879766821861267, + "learning_rate": 9.844711057007068e-06, + "loss": 1.2254, + "step": 17446 + }, + { + "epoch": 5.1965226456188685, + "grad_norm": 0.4692278206348419, + "learning_rate": 9.843746607673045e-06, + "loss": 1.2331, + "step": 17447 + }, + { + "epoch": 5.196820491818537, + "grad_norm": 0.2960914373397827, + "learning_rate": 9.842782159792785e-06, + "loss": 1.2097, + "step": 17448 + }, + { + "epoch": 5.197118338018206, + "grad_norm": 0.49584126472473145, + "learning_rate": 9.841817713375262e-06, + "loss": 1.2179, + "step": 17449 + }, + { + "epoch": 5.197416184217874, + "grad_norm": 0.354303240776062, + "learning_rate": 9.840853268429451e-06, + "loss": 1.1989, + "step": 17450 + }, + { + "epoch": 5.1977140304175435, + "grad_norm": 0.5706049799919128, + "learning_rate": 9.839888824964318e-06, + "loss": 1.2098, + "step": 17451 + }, + { + "epoch": 5.198011876617212, + "grad_norm": 0.2618613839149475, + "learning_rate": 9.838924382988843e-06, + "loss": 1.2201, + "step": 17452 + }, + { + "epoch": 5.19830972281688, + "grad_norm": 0.30057284235954285, + "learning_rate": 9.837959942511996e-06, + "loss": 1.2042, + "step": 17453 + }, + { + "epoch": 5.198607569016549, + "grad_norm": 0.3330328166484833, + "learning_rate": 9.83699550354275e-06, + "loss": 1.2128, + "step": 17454 + }, + { + "epoch": 5.198905415216218, + "grad_norm": 0.30026569962501526, + "learning_rate": 9.836031066090081e-06, + "loss": 1.2241, + "step": 17455 + }, + { + "epoch": 5.199203261415886, + "grad_norm": 0.2807164788246155, + "learning_rate": 9.83506663016296e-06, + "loss": 1.2092, + "step": 17456 + }, + { + "epoch": 5.199501107615555, + "grad_norm": 0.28054577112197876, + "learning_rate": 9.834102195770356e-06, + "loss": 1.2205, + "step": 17457 + }, + { + "epoch": 5.199798953815224, + "grad_norm": 0.3039628267288208, + "learning_rate": 9.833137762921248e-06, + "loss": 1.2155, + "step": 17458 + }, + { + "epoch": 5.200096800014892, + "grad_norm": 0.256278932094574, + "learning_rate": 9.832173331624607e-06, + "loss": 1.2158, + "step": 17459 + }, + { + "epoch": 5.200394646214561, + "grad_norm": 0.2707452178001404, + "learning_rate": 9.831208901889405e-06, + "loss": 1.2053, + "step": 17460 + }, + { + "epoch": 5.2006924924142295, + "grad_norm": 0.28423362970352173, + "learning_rate": 9.830244473724616e-06, + "loss": 1.2057, + "step": 17461 + }, + { + "epoch": 5.200990338613899, + "grad_norm": 0.25178104639053345, + "learning_rate": 9.829280047139211e-06, + "loss": 1.2222, + "step": 17462 + }, + { + "epoch": 5.201288184813567, + "grad_norm": 0.25856050848960876, + "learning_rate": 9.828315622142167e-06, + "loss": 1.2144, + "step": 17463 + }, + { + "epoch": 5.201586031013235, + "grad_norm": 0.3158572316169739, + "learning_rate": 9.827351198742452e-06, + "loss": 1.2236, + "step": 17464 + }, + { + "epoch": 5.201883877212905, + "grad_norm": 0.3710111975669861, + "learning_rate": 9.826386776949041e-06, + "loss": 1.2191, + "step": 17465 + }, + { + "epoch": 5.202181723412573, + "grad_norm": 0.37602880597114563, + "learning_rate": 9.82542235677091e-06, + "loss": 1.2092, + "step": 17466 + }, + { + "epoch": 5.202479569612241, + "grad_norm": 0.27766236662864685, + "learning_rate": 9.824457938217028e-06, + "loss": 1.22, + "step": 17467 + }, + { + "epoch": 5.2027774158119104, + "grad_norm": 0.2806062400341034, + "learning_rate": 9.823493521296366e-06, + "loss": 1.2019, + "step": 17468 + }, + { + "epoch": 5.203075262011579, + "grad_norm": 0.33705195784568787, + "learning_rate": 9.822529106017904e-06, + "loss": 1.229, + "step": 17469 + }, + { + "epoch": 5.203373108211247, + "grad_norm": 0.2857660949230194, + "learning_rate": 9.821564692390607e-06, + "loss": 1.2103, + "step": 17470 + }, + { + "epoch": 5.203670954410916, + "grad_norm": 0.2896214425563812, + "learning_rate": 9.82060028042345e-06, + "loss": 1.2071, + "step": 17471 + }, + { + "epoch": 5.203968800610585, + "grad_norm": 0.3136449456214905, + "learning_rate": 9.81963587012541e-06, + "loss": 1.2203, + "step": 17472 + }, + { + "epoch": 5.204266646810253, + "grad_norm": 0.2500639855861664, + "learning_rate": 9.818671461505458e-06, + "loss": 1.1988, + "step": 17473 + }, + { + "epoch": 5.204564493009922, + "grad_norm": 0.3904259502887726, + "learning_rate": 9.81770705457256e-06, + "loss": 1.2142, + "step": 17474 + }, + { + "epoch": 5.2048623392095905, + "grad_norm": 0.2964465320110321, + "learning_rate": 9.8167426493357e-06, + "loss": 1.2073, + "step": 17475 + }, + { + "epoch": 5.205160185409259, + "grad_norm": 0.3154318630695343, + "learning_rate": 9.81577824580384e-06, + "loss": 1.2363, + "step": 17476 + }, + { + "epoch": 5.205458031608928, + "grad_norm": 0.3880579173564911, + "learning_rate": 9.81481384398596e-06, + "loss": 1.2217, + "step": 17477 + }, + { + "epoch": 5.205755877808596, + "grad_norm": 0.261506050825119, + "learning_rate": 9.813849443891031e-06, + "loss": 1.2332, + "step": 17478 + }, + { + "epoch": 5.206053724008266, + "grad_norm": 0.32930904626846313, + "learning_rate": 9.812885045528022e-06, + "loss": 1.213, + "step": 17479 + }, + { + "epoch": 5.206351570207934, + "grad_norm": 0.27337881922721863, + "learning_rate": 9.811920648905913e-06, + "loss": 1.2198, + "step": 17480 + }, + { + "epoch": 5.206649416407602, + "grad_norm": 0.32574567198753357, + "learning_rate": 9.810956254033673e-06, + "loss": 1.2195, + "step": 17481 + }, + { + "epoch": 5.2069472626072715, + "grad_norm": 0.33570176362991333, + "learning_rate": 9.809991860920267e-06, + "loss": 1.2148, + "step": 17482 + }, + { + "epoch": 5.20724510880694, + "grad_norm": 0.2787150740623474, + "learning_rate": 9.809027469574677e-06, + "loss": 1.2398, + "step": 17483 + }, + { + "epoch": 5.207542955006608, + "grad_norm": 0.3820635676383972, + "learning_rate": 9.808063080005878e-06, + "loss": 1.2077, + "step": 17484 + }, + { + "epoch": 5.207840801206277, + "grad_norm": 0.2835637629032135, + "learning_rate": 9.80709869222283e-06, + "loss": 1.2236, + "step": 17485 + }, + { + "epoch": 5.208138647405946, + "grad_norm": 0.29232335090637207, + "learning_rate": 9.806134306234519e-06, + "loss": 1.2161, + "step": 17486 + }, + { + "epoch": 5.208436493605614, + "grad_norm": 0.2579543888568878, + "learning_rate": 9.80516992204991e-06, + "loss": 1.221, + "step": 17487 + }, + { + "epoch": 5.208734339805283, + "grad_norm": 0.32679247856140137, + "learning_rate": 9.804205539677976e-06, + "loss": 1.2198, + "step": 17488 + }, + { + "epoch": 5.2090321860049515, + "grad_norm": 0.27821290493011475, + "learning_rate": 9.803241159127692e-06, + "loss": 1.2147, + "step": 17489 + }, + { + "epoch": 5.209330032204621, + "grad_norm": 0.3944130837917328, + "learning_rate": 9.802276780408031e-06, + "loss": 1.2221, + "step": 17490 + }, + { + "epoch": 5.209627878404289, + "grad_norm": 0.26376447081565857, + "learning_rate": 9.801312403527958e-06, + "loss": 1.2195, + "step": 17491 + }, + { + "epoch": 5.209925724603957, + "grad_norm": 0.45310312509536743, + "learning_rate": 9.800348028496456e-06, + "loss": 1.2403, + "step": 17492 + }, + { + "epoch": 5.210223570803627, + "grad_norm": 0.28856009244918823, + "learning_rate": 9.79938365532249e-06, + "loss": 1.2018, + "step": 17493 + }, + { + "epoch": 5.210521417003295, + "grad_norm": 0.37543657422065735, + "learning_rate": 9.798419284015034e-06, + "loss": 1.2027, + "step": 17494 + }, + { + "epoch": 5.210819263202963, + "grad_norm": 0.2522352635860443, + "learning_rate": 9.797454914583067e-06, + "loss": 1.2192, + "step": 17495 + }, + { + "epoch": 5.2111171094026325, + "grad_norm": 0.7417417168617249, + "learning_rate": 9.796490547035549e-06, + "loss": 1.1934, + "step": 17496 + }, + { + "epoch": 5.211414955602301, + "grad_norm": 0.5598888993263245, + "learning_rate": 9.795526181381464e-06, + "loss": 1.214, + "step": 17497 + }, + { + "epoch": 5.211712801801969, + "grad_norm": 0.48745712637901306, + "learning_rate": 9.79456181762978e-06, + "loss": 1.2204, + "step": 17498 + }, + { + "epoch": 5.212010648001638, + "grad_norm": 0.4387660026550293, + "learning_rate": 9.793597455789463e-06, + "loss": 1.2176, + "step": 17499 + }, + { + "epoch": 5.212308494201307, + "grad_norm": 0.49820026755332947, + "learning_rate": 9.792633095869495e-06, + "loss": 1.2248, + "step": 17500 + }, + { + "epoch": 5.212308494201307, + "eval_loss": 1.3193069696426392, + "eval_runtime": 24.0255, + "eval_samples_per_second": 72.173, + "eval_steps_per_second": 4.537, + "step": 17500 + }, + { + "epoch": 5.212606340400975, + "grad_norm": 0.2895793318748474, + "learning_rate": 9.791668737878846e-06, + "loss": 1.2119, + "step": 17501 + }, + { + "epoch": 5.212904186600644, + "grad_norm": 0.38679054379463196, + "learning_rate": 9.790704381826481e-06, + "loss": 1.2001, + "step": 17502 + }, + { + "epoch": 5.213202032800313, + "grad_norm": 0.3843201696872711, + "learning_rate": 9.789740027721384e-06, + "loss": 1.212, + "step": 17503 + }, + { + "epoch": 5.213499878999982, + "grad_norm": 0.38126033544540405, + "learning_rate": 9.78877567557252e-06, + "loss": 1.2244, + "step": 17504 + }, + { + "epoch": 5.21379772519965, + "grad_norm": 0.362287700176239, + "learning_rate": 9.787811325388858e-06, + "loss": 1.2214, + "step": 17505 + }, + { + "epoch": 5.2140955713993185, + "grad_norm": 0.2599650025367737, + "learning_rate": 9.786846977179377e-06, + "loss": 1.2052, + "step": 17506 + }, + { + "epoch": 5.214393417598988, + "grad_norm": 0.3888007700443268, + "learning_rate": 9.785882630953048e-06, + "loss": 1.206, + "step": 17507 + }, + { + "epoch": 5.214691263798656, + "grad_norm": 0.3158775269985199, + "learning_rate": 9.78491828671884e-06, + "loss": 1.2135, + "step": 17508 + }, + { + "epoch": 5.214989109998324, + "grad_norm": 0.3141838610172272, + "learning_rate": 9.783953944485729e-06, + "loss": 1.206, + "step": 17509 + }, + { + "epoch": 5.2152869561979935, + "grad_norm": 0.3854498565196991, + "learning_rate": 9.782989604262682e-06, + "loss": 1.2131, + "step": 17510 + }, + { + "epoch": 5.215584802397662, + "grad_norm": 0.2511853873729706, + "learning_rate": 9.782025266058679e-06, + "loss": 1.1993, + "step": 17511 + }, + { + "epoch": 5.21588264859733, + "grad_norm": 0.539507269859314, + "learning_rate": 9.781060929882684e-06, + "loss": 1.2137, + "step": 17512 + }, + { + "epoch": 5.216180494796999, + "grad_norm": 0.2853509485721588, + "learning_rate": 9.780096595743671e-06, + "loss": 1.2201, + "step": 17513 + }, + { + "epoch": 5.216478340996668, + "grad_norm": 0.5206080079078674, + "learning_rate": 9.779132263650618e-06, + "loss": 1.2328, + "step": 17514 + }, + { + "epoch": 5.216776187196336, + "grad_norm": 0.344535768032074, + "learning_rate": 9.778167933612492e-06, + "loss": 1.2202, + "step": 17515 + }, + { + "epoch": 5.217074033396005, + "grad_norm": 0.483694851398468, + "learning_rate": 9.777203605638261e-06, + "loss": 1.2102, + "step": 17516 + }, + { + "epoch": 5.217371879595674, + "grad_norm": 0.5337632298469543, + "learning_rate": 9.776239279736903e-06, + "loss": 1.218, + "step": 17517 + }, + { + "epoch": 5.217669725795343, + "grad_norm": 0.5443182587623596, + "learning_rate": 9.775274955917393e-06, + "loss": 1.2189, + "step": 17518 + }, + { + "epoch": 5.217967571995011, + "grad_norm": 0.460360050201416, + "learning_rate": 9.774310634188692e-06, + "loss": 1.2124, + "step": 17519 + }, + { + "epoch": 5.2182654181946795, + "grad_norm": 0.36664527654647827, + "learning_rate": 9.773346314559784e-06, + "loss": 1.2119, + "step": 17520 + }, + { + "epoch": 5.218563264394349, + "grad_norm": 0.3464254140853882, + "learning_rate": 9.772381997039634e-06, + "loss": 1.2099, + "step": 17521 + }, + { + "epoch": 5.218861110594017, + "grad_norm": 0.4611830711364746, + "learning_rate": 9.771417681637212e-06, + "loss": 1.223, + "step": 17522 + }, + { + "epoch": 5.219158956793685, + "grad_norm": 0.28207167983055115, + "learning_rate": 9.770453368361495e-06, + "loss": 1.211, + "step": 17523 + }, + { + "epoch": 5.219456802993355, + "grad_norm": 0.32353290915489197, + "learning_rate": 9.769489057221455e-06, + "loss": 1.2154, + "step": 17524 + }, + { + "epoch": 5.219754649193023, + "grad_norm": 0.3409768342971802, + "learning_rate": 9.768524748226056e-06, + "loss": 1.227, + "step": 17525 + }, + { + "epoch": 5.220052495392691, + "grad_norm": 0.29958558082580566, + "learning_rate": 9.767560441384283e-06, + "loss": 1.2078, + "step": 17526 + }, + { + "epoch": 5.22035034159236, + "grad_norm": 0.3059716522693634, + "learning_rate": 9.766596136705095e-06, + "loss": 1.2103, + "step": 17527 + }, + { + "epoch": 5.220648187792029, + "grad_norm": 0.4833516478538513, + "learning_rate": 9.765631834197472e-06, + "loss": 1.2104, + "step": 17528 + }, + { + "epoch": 5.220946033991698, + "grad_norm": 0.2676942050457001, + "learning_rate": 9.764667533870382e-06, + "loss": 1.2193, + "step": 17529 + }, + { + "epoch": 5.221243880191366, + "grad_norm": 0.4212324321269989, + "learning_rate": 9.763703235732796e-06, + "loss": 1.2406, + "step": 17530 + }, + { + "epoch": 5.221541726391035, + "grad_norm": 0.3484458029270172, + "learning_rate": 9.76273893979369e-06, + "loss": 1.2358, + "step": 17531 + }, + { + "epoch": 5.221839572590704, + "grad_norm": 0.4059697389602661, + "learning_rate": 9.761774646062035e-06, + "loss": 1.2106, + "step": 17532 + }, + { + "epoch": 5.222137418790372, + "grad_norm": 0.39221394062042236, + "learning_rate": 9.760810354546794e-06, + "loss": 1.2103, + "step": 17533 + }, + { + "epoch": 5.2224352649900405, + "grad_norm": 0.5751363039016724, + "learning_rate": 9.759846065256953e-06, + "loss": 1.215, + "step": 17534 + }, + { + "epoch": 5.22273311118971, + "grad_norm": 0.267691969871521, + "learning_rate": 9.758881778201471e-06, + "loss": 1.2276, + "step": 17535 + }, + { + "epoch": 5.223030957389378, + "grad_norm": 0.41326481103897095, + "learning_rate": 9.757917493389324e-06, + "loss": 1.2136, + "step": 17536 + }, + { + "epoch": 5.223328803589046, + "grad_norm": 0.34432196617126465, + "learning_rate": 9.756953210829489e-06, + "loss": 1.2152, + "step": 17537 + }, + { + "epoch": 5.223626649788716, + "grad_norm": 0.29226770997047424, + "learning_rate": 9.755988930530931e-06, + "loss": 1.2106, + "step": 17538 + }, + { + "epoch": 5.223924495988384, + "grad_norm": 0.2618924379348755, + "learning_rate": 9.75502465250262e-06, + "loss": 1.2285, + "step": 17539 + }, + { + "epoch": 5.224222342188052, + "grad_norm": 0.34921810030937195, + "learning_rate": 9.754060376753536e-06, + "loss": 1.216, + "step": 17540 + }, + { + "epoch": 5.2245201883877215, + "grad_norm": 0.29432594776153564, + "learning_rate": 9.753096103292641e-06, + "loss": 1.2133, + "step": 17541 + }, + { + "epoch": 5.22481803458739, + "grad_norm": 0.2998746335506439, + "learning_rate": 9.752131832128912e-06, + "loss": 1.2209, + "step": 17542 + }, + { + "epoch": 5.225115880787058, + "grad_norm": 0.28745922446250916, + "learning_rate": 9.751167563271322e-06, + "loss": 1.1923, + "step": 17543 + }, + { + "epoch": 5.225413726986727, + "grad_norm": 0.31892332434654236, + "learning_rate": 9.750203296728835e-06, + "loss": 1.2131, + "step": 17544 + }, + { + "epoch": 5.225711573186396, + "grad_norm": 0.26440367102622986, + "learning_rate": 9.749239032510432e-06, + "loss": 1.2247, + "step": 17545 + }, + { + "epoch": 5.226009419386065, + "grad_norm": 0.26981377601623535, + "learning_rate": 9.748274770625077e-06, + "loss": 1.214, + "step": 17546 + }, + { + "epoch": 5.226307265585733, + "grad_norm": 0.366583913564682, + "learning_rate": 9.747310511081745e-06, + "loss": 1.2091, + "step": 17547 + }, + { + "epoch": 5.2266051117854015, + "grad_norm": 0.30591365694999695, + "learning_rate": 9.746346253889406e-06, + "loss": 1.2096, + "step": 17548 + }, + { + "epoch": 5.226902957985071, + "grad_norm": 0.3112654387950897, + "learning_rate": 9.745381999057033e-06, + "loss": 1.211, + "step": 17549 + }, + { + "epoch": 5.227200804184739, + "grad_norm": 0.35584914684295654, + "learning_rate": 9.744417746593592e-06, + "loss": 1.2054, + "step": 17550 + }, + { + "epoch": 5.227498650384407, + "grad_norm": 0.26016563177108765, + "learning_rate": 9.743453496508065e-06, + "loss": 1.216, + "step": 17551 + }, + { + "epoch": 5.227796496584077, + "grad_norm": 0.3549031615257263, + "learning_rate": 9.742489248809411e-06, + "loss": 1.2164, + "step": 17552 + }, + { + "epoch": 5.228094342783745, + "grad_norm": 0.2862130403518677, + "learning_rate": 9.741525003506606e-06, + "loss": 1.1969, + "step": 17553 + }, + { + "epoch": 5.228392188983413, + "grad_norm": 0.3492962121963501, + "learning_rate": 9.740560760608627e-06, + "loss": 1.2084, + "step": 17554 + }, + { + "epoch": 5.2286900351830825, + "grad_norm": 0.4583175480365753, + "learning_rate": 9.73959652012444e-06, + "loss": 1.2102, + "step": 17555 + }, + { + "epoch": 5.228987881382751, + "grad_norm": 0.24967622756958008, + "learning_rate": 9.738632282063013e-06, + "loss": 1.2008, + "step": 17556 + }, + { + "epoch": 5.22928572758242, + "grad_norm": 0.4711858630180359, + "learning_rate": 9.737668046433322e-06, + "loss": 1.203, + "step": 17557 + }, + { + "epoch": 5.229583573782088, + "grad_norm": 0.28161999583244324, + "learning_rate": 9.736703813244336e-06, + "loss": 1.2067, + "step": 17558 + }, + { + "epoch": 5.229881419981757, + "grad_norm": 0.4732733368873596, + "learning_rate": 9.735739582505026e-06, + "loss": 1.2155, + "step": 17559 + }, + { + "epoch": 5.230179266181426, + "grad_norm": 0.366251677274704, + "learning_rate": 9.734775354224368e-06, + "loss": 1.1981, + "step": 17560 + }, + { + "epoch": 5.230477112381094, + "grad_norm": 0.3029963970184326, + "learning_rate": 9.733811128411323e-06, + "loss": 1.2099, + "step": 17561 + }, + { + "epoch": 5.230774958580763, + "grad_norm": 0.33303940296173096, + "learning_rate": 9.732846905074874e-06, + "loss": 1.2302, + "step": 17562 + }, + { + "epoch": 5.231072804780432, + "grad_norm": 0.3316311240196228, + "learning_rate": 9.731882684223985e-06, + "loss": 1.2288, + "step": 17563 + }, + { + "epoch": 5.2313706509801, + "grad_norm": 0.2685508728027344, + "learning_rate": 9.730918465867624e-06, + "loss": 1.2142, + "step": 17564 + }, + { + "epoch": 5.2316684971797685, + "grad_norm": 0.2979573905467987, + "learning_rate": 9.729954250014769e-06, + "loss": 1.2045, + "step": 17565 + }, + { + "epoch": 5.231966343379438, + "grad_norm": 0.24490252137184143, + "learning_rate": 9.728990036674391e-06, + "loss": 1.224, + "step": 17566 + }, + { + "epoch": 5.232264189579106, + "grad_norm": 0.2672775089740753, + "learning_rate": 9.728025825855452e-06, + "loss": 1.2051, + "step": 17567 + }, + { + "epoch": 5.232562035778774, + "grad_norm": 0.2893596887588501, + "learning_rate": 9.727061617566933e-06, + "loss": 1.2056, + "step": 17568 + }, + { + "epoch": 5.2328598819784435, + "grad_norm": 0.2536107301712036, + "learning_rate": 9.726097411817798e-06, + "loss": 1.219, + "step": 17569 + }, + { + "epoch": 5.233157728178112, + "grad_norm": 0.3155689835548401, + "learning_rate": 9.725133208617023e-06, + "loss": 1.2312, + "step": 17570 + }, + { + "epoch": 5.233455574377781, + "grad_norm": 0.3005101680755615, + "learning_rate": 9.724169007973575e-06, + "loss": 1.2119, + "step": 17571 + }, + { + "epoch": 5.233753420577449, + "grad_norm": 0.3837868273258209, + "learning_rate": 9.723204809896427e-06, + "loss": 1.2167, + "step": 17572 + }, + { + "epoch": 5.234051266777118, + "grad_norm": 0.4749842882156372, + "learning_rate": 9.722240614394546e-06, + "loss": 1.2254, + "step": 17573 + }, + { + "epoch": 5.234349112976787, + "grad_norm": 1.0652549266815186, + "learning_rate": 9.72127642147691e-06, + "loss": 1.2283, + "step": 17574 + }, + { + "epoch": 5.234646959176455, + "grad_norm": 0.4877004027366638, + "learning_rate": 9.720312231152481e-06, + "loss": 1.1977, + "step": 17575 + }, + { + "epoch": 5.234944805376124, + "grad_norm": 0.5479477643966675, + "learning_rate": 9.719348043430239e-06, + "loss": 1.209, + "step": 17576 + }, + { + "epoch": 5.235242651575793, + "grad_norm": 0.26567378640174866, + "learning_rate": 9.718383858319146e-06, + "loss": 1.212, + "step": 17577 + }, + { + "epoch": 5.235540497775461, + "grad_norm": 0.5459462404251099, + "learning_rate": 9.717419675828176e-06, + "loss": 1.2013, + "step": 17578 + }, + { + "epoch": 5.2358383439751295, + "grad_norm": 0.4284096360206604, + "learning_rate": 9.716455495966305e-06, + "loss": 1.2098, + "step": 17579 + }, + { + "epoch": 5.236136190174799, + "grad_norm": 0.34207162261009216, + "learning_rate": 9.715491318742499e-06, + "loss": 1.214, + "step": 17580 + }, + { + "epoch": 5.236434036374467, + "grad_norm": 0.3737085461616516, + "learning_rate": 9.714527144165721e-06, + "loss": 1.2201, + "step": 17581 + }, + { + "epoch": 5.236731882574135, + "grad_norm": 0.3189679980278015, + "learning_rate": 9.713562972244955e-06, + "loss": 1.2147, + "step": 17582 + }, + { + "epoch": 5.237029728773805, + "grad_norm": 0.4002085030078888, + "learning_rate": 9.712598802989166e-06, + "loss": 1.2063, + "step": 17583 + }, + { + "epoch": 5.237327574973473, + "grad_norm": 0.2849947214126587, + "learning_rate": 9.711634636407319e-06, + "loss": 1.2209, + "step": 17584 + }, + { + "epoch": 5.237625421173142, + "grad_norm": 0.31420210003852844, + "learning_rate": 9.710670472508395e-06, + "loss": 1.2154, + "step": 17585 + }, + { + "epoch": 5.23792326737281, + "grad_norm": 0.3056415915489197, + "learning_rate": 9.709706311301358e-06, + "loss": 1.2037, + "step": 17586 + }, + { + "epoch": 5.238221113572479, + "grad_norm": 0.24837148189544678, + "learning_rate": 9.708742152795176e-06, + "loss": 1.1954, + "step": 17587 + }, + { + "epoch": 5.238518959772148, + "grad_norm": 0.2823173403739929, + "learning_rate": 9.707777996998825e-06, + "loss": 1.2145, + "step": 17588 + }, + { + "epoch": 5.238816805971816, + "grad_norm": 0.33144593238830566, + "learning_rate": 9.706813843921274e-06, + "loss": 1.2109, + "step": 17589 + }, + { + "epoch": 5.239114652171485, + "grad_norm": 0.2580885887145996, + "learning_rate": 9.70584969357149e-06, + "loss": 1.2253, + "step": 17590 + }, + { + "epoch": 5.239412498371154, + "grad_norm": 0.2694717347621918, + "learning_rate": 9.70488554595845e-06, + "loss": 1.2105, + "step": 17591 + }, + { + "epoch": 5.239710344570822, + "grad_norm": 0.3748553991317749, + "learning_rate": 9.703921401091115e-06, + "loss": 1.2233, + "step": 17592 + }, + { + "epoch": 5.2400081907704905, + "grad_norm": 0.35062405467033386, + "learning_rate": 9.702957258978466e-06, + "loss": 1.2265, + "step": 17593 + }, + { + "epoch": 5.24030603697016, + "grad_norm": 0.31125640869140625, + "learning_rate": 9.701993119629465e-06, + "loss": 1.2274, + "step": 17594 + }, + { + "epoch": 5.240603883169828, + "grad_norm": 0.4129265248775482, + "learning_rate": 9.701028983053083e-06, + "loss": 1.217, + "step": 17595 + }, + { + "epoch": 5.240901729369497, + "grad_norm": 0.2553410530090332, + "learning_rate": 9.700064849258298e-06, + "loss": 1.2156, + "step": 17596 + }, + { + "epoch": 5.241199575569166, + "grad_norm": 0.36992135643959045, + "learning_rate": 9.699100718254071e-06, + "loss": 1.2004, + "step": 17597 + }, + { + "epoch": 5.241497421768834, + "grad_norm": 0.2936934530735016, + "learning_rate": 9.698136590049375e-06, + "loss": 1.2098, + "step": 17598 + }, + { + "epoch": 5.241795267968503, + "grad_norm": 0.4095652997493744, + "learning_rate": 9.697172464653184e-06, + "loss": 1.2192, + "step": 17599 + }, + { + "epoch": 5.2420931141681715, + "grad_norm": 0.27325400710105896, + "learning_rate": 9.696208342074461e-06, + "loss": 1.2032, + "step": 17600 + }, + { + "epoch": 5.24239096036784, + "grad_norm": 0.4788769483566284, + "learning_rate": 9.69524422232218e-06, + "loss": 1.2196, + "step": 17601 + }, + { + "epoch": 5.242688806567509, + "grad_norm": 0.2858133912086487, + "learning_rate": 9.694280105405314e-06, + "loss": 1.2188, + "step": 17602 + }, + { + "epoch": 5.242986652767177, + "grad_norm": 0.3255537748336792, + "learning_rate": 9.69331599133283e-06, + "loss": 1.2285, + "step": 17603 + }, + { + "epoch": 5.243284498966846, + "grad_norm": 0.31990641355514526, + "learning_rate": 9.692351880113695e-06, + "loss": 1.2128, + "step": 17604 + }, + { + "epoch": 5.243582345166515, + "grad_norm": 0.25648993253707886, + "learning_rate": 9.691387771756883e-06, + "loss": 1.2196, + "step": 17605 + }, + { + "epoch": 5.243880191366183, + "grad_norm": 0.2624090909957886, + "learning_rate": 9.690423666271365e-06, + "loss": 1.2048, + "step": 17606 + }, + { + "epoch": 5.2441780375658515, + "grad_norm": 0.2932678163051605, + "learning_rate": 9.689459563666105e-06, + "loss": 1.2028, + "step": 17607 + }, + { + "epoch": 5.244475883765521, + "grad_norm": 0.27394574880599976, + "learning_rate": 9.688495463950081e-06, + "loss": 1.2082, + "step": 17608 + }, + { + "epoch": 5.244773729965189, + "grad_norm": 0.2619938850402832, + "learning_rate": 9.687531367132257e-06, + "loss": 1.1827, + "step": 17609 + }, + { + "epoch": 5.245071576164857, + "grad_norm": 0.3157538175582886, + "learning_rate": 9.686567273221605e-06, + "loss": 1.2123, + "step": 17610 + }, + { + "epoch": 5.245369422364527, + "grad_norm": 0.38306286931037903, + "learning_rate": 9.685603182227093e-06, + "loss": 1.2202, + "step": 17611 + }, + { + "epoch": 5.245667268564195, + "grad_norm": 0.3174692988395691, + "learning_rate": 9.68463909415769e-06, + "loss": 1.2023, + "step": 17612 + }, + { + "epoch": 5.245965114763864, + "grad_norm": 0.2601618766784668, + "learning_rate": 9.683675009022375e-06, + "loss": 1.2242, + "step": 17613 + }, + { + "epoch": 5.2462629609635325, + "grad_norm": 0.30510395765304565, + "learning_rate": 9.682710926830107e-06, + "loss": 1.2169, + "step": 17614 + }, + { + "epoch": 5.246560807163201, + "grad_norm": 0.2462833821773529, + "learning_rate": 9.681746847589856e-06, + "loss": 1.221, + "step": 17615 + }, + { + "epoch": 5.24685865336287, + "grad_norm": 0.3740920424461365, + "learning_rate": 9.6807827713106e-06, + "loss": 1.1939, + "step": 17616 + }, + { + "epoch": 5.247156499562538, + "grad_norm": 0.33551445603370667, + "learning_rate": 9.679818698001303e-06, + "loss": 1.208, + "step": 17617 + }, + { + "epoch": 5.247454345762207, + "grad_norm": 0.3807724118232727, + "learning_rate": 9.67885462767093e-06, + "loss": 1.2273, + "step": 17618 + }, + { + "epoch": 5.247752191961876, + "grad_norm": 0.34905049204826355, + "learning_rate": 9.677890560328463e-06, + "loss": 1.1955, + "step": 17619 + }, + { + "epoch": 5.248050038161544, + "grad_norm": 0.41026571393013, + "learning_rate": 9.676926495982861e-06, + "loss": 1.2262, + "step": 17620 + }, + { + "epoch": 5.248347884361213, + "grad_norm": 0.4517008662223816, + "learning_rate": 9.675962434643096e-06, + "loss": 1.2145, + "step": 17621 + }, + { + "epoch": 5.248645730560882, + "grad_norm": 0.3837314546108246, + "learning_rate": 9.67499837631814e-06, + "loss": 1.2141, + "step": 17622 + }, + { + "epoch": 5.24894357676055, + "grad_norm": 0.7110933661460876, + "learning_rate": 9.674034321016961e-06, + "loss": 1.2122, + "step": 17623 + }, + { + "epoch": 5.249241422960219, + "grad_norm": 0.2553151547908783, + "learning_rate": 9.673070268748526e-06, + "loss": 1.2309, + "step": 17624 + }, + { + "epoch": 5.249539269159888, + "grad_norm": 0.6351589560508728, + "learning_rate": 9.67210621952181e-06, + "loss": 1.2229, + "step": 17625 + }, + { + "epoch": 5.249837115359556, + "grad_norm": 0.2685965299606323, + "learning_rate": 9.671142173345777e-06, + "loss": 1.2156, + "step": 17626 + }, + { + "epoch": 5.250134961559225, + "grad_norm": 0.5644192695617676, + "learning_rate": 9.670178130229402e-06, + "loss": 1.2083, + "step": 17627 + }, + { + "epoch": 5.2504328077588935, + "grad_norm": 0.30944108963012695, + "learning_rate": 9.669214090181649e-06, + "loss": 1.2108, + "step": 17628 + }, + { + "epoch": 5.250730653958562, + "grad_norm": 0.6028257012367249, + "learning_rate": 9.668250053211487e-06, + "loss": 1.2264, + "step": 17629 + }, + { + "epoch": 5.251028500158231, + "grad_norm": 0.426084041595459, + "learning_rate": 9.66728601932789e-06, + "loss": 1.2186, + "step": 17630 + }, + { + "epoch": 5.251326346357899, + "grad_norm": 0.4742308557033539, + "learning_rate": 9.666321988539827e-06, + "loss": 1.2166, + "step": 17631 + }, + { + "epoch": 5.251624192557568, + "grad_norm": 0.5128064751625061, + "learning_rate": 9.66535796085626e-06, + "loss": 1.2101, + "step": 17632 + }, + { + "epoch": 5.251922038757237, + "grad_norm": 0.28773415088653564, + "learning_rate": 9.664393936286169e-06, + "loss": 1.2323, + "step": 17633 + }, + { + "epoch": 5.252219884956905, + "grad_norm": 0.8632734417915344, + "learning_rate": 9.663429914838513e-06, + "loss": 1.2105, + "step": 17634 + }, + { + "epoch": 5.252517731156574, + "grad_norm": 0.32639697194099426, + "learning_rate": 9.662465896522267e-06, + "loss": 1.2116, + "step": 17635 + }, + { + "epoch": 5.252815577356243, + "grad_norm": 0.653701901435852, + "learning_rate": 9.6615018813464e-06, + "loss": 1.2181, + "step": 17636 + }, + { + "epoch": 5.253113423555911, + "grad_norm": 0.3668355345726013, + "learning_rate": 9.66053786931988e-06, + "loss": 1.2244, + "step": 17637 + }, + { + "epoch": 5.25341126975558, + "grad_norm": 0.3794378340244293, + "learning_rate": 9.659573860451671e-06, + "loss": 1.2342, + "step": 17638 + }, + { + "epoch": 5.253709115955249, + "grad_norm": 0.9495980739593506, + "learning_rate": 9.658609854750753e-06, + "loss": 1.2059, + "step": 17639 + }, + { + "epoch": 5.254006962154917, + "grad_norm": 0.3066650927066803, + "learning_rate": 9.657645852226086e-06, + "loss": 1.208, + "step": 17640 + }, + { + "epoch": 5.254304808354586, + "grad_norm": 0.6032782793045044, + "learning_rate": 9.65668185288664e-06, + "loss": 1.2145, + "step": 17641 + }, + { + "epoch": 5.254602654554255, + "grad_norm": 0.5128459334373474, + "learning_rate": 9.65571785674139e-06, + "loss": 1.2139, + "step": 17642 + }, + { + "epoch": 5.254900500753923, + "grad_norm": 0.5280553102493286, + "learning_rate": 9.654753863799296e-06, + "loss": 1.231, + "step": 17643 + }, + { + "epoch": 5.255198346953592, + "grad_norm": 0.6278181672096252, + "learning_rate": 9.653789874069337e-06, + "loss": 1.2198, + "step": 17644 + }, + { + "epoch": 5.25549619315326, + "grad_norm": 0.49107906222343445, + "learning_rate": 9.652825887560474e-06, + "loss": 1.2191, + "step": 17645 + }, + { + "epoch": 5.255794039352929, + "grad_norm": 0.5377057194709778, + "learning_rate": 9.651861904281675e-06, + "loss": 1.208, + "step": 17646 + }, + { + "epoch": 5.256091885552598, + "grad_norm": 0.3466222286224365, + "learning_rate": 9.650897924241916e-06, + "loss": 1.2331, + "step": 17647 + }, + { + "epoch": 5.256389731752266, + "grad_norm": 0.4054228663444519, + "learning_rate": 9.649933947450163e-06, + "loss": 1.2129, + "step": 17648 + }, + { + "epoch": 5.256687577951935, + "grad_norm": 0.29387709498405457, + "learning_rate": 9.648969973915378e-06, + "loss": 1.2155, + "step": 17649 + }, + { + "epoch": 5.256985424151604, + "grad_norm": 0.4338807165622711, + "learning_rate": 9.648006003646539e-06, + "loss": 1.2124, + "step": 17650 + }, + { + "epoch": 5.257283270351272, + "grad_norm": 0.2910458445549011, + "learning_rate": 9.647042036652612e-06, + "loss": 1.2221, + "step": 17651 + }, + { + "epoch": 5.257581116550941, + "grad_norm": 0.2524261474609375, + "learning_rate": 9.646078072942561e-06, + "loss": 1.2138, + "step": 17652 + }, + { + "epoch": 5.25787896275061, + "grad_norm": 0.28109875321388245, + "learning_rate": 9.64511411252536e-06, + "loss": 1.2215, + "step": 17653 + }, + { + "epoch": 5.258176808950278, + "grad_norm": 0.272085964679718, + "learning_rate": 9.644150155409976e-06, + "loss": 1.2126, + "step": 17654 + }, + { + "epoch": 5.258474655149947, + "grad_norm": 0.26784154772758484, + "learning_rate": 9.643186201605374e-06, + "loss": 1.2166, + "step": 17655 + }, + { + "epoch": 5.258772501349616, + "grad_norm": 0.2522640824317932, + "learning_rate": 9.642222251120531e-06, + "loss": 1.2058, + "step": 17656 + }, + { + "epoch": 5.259070347549284, + "grad_norm": 0.26999029517173767, + "learning_rate": 9.641258303964408e-06, + "loss": 1.2084, + "step": 17657 + }, + { + "epoch": 5.259368193748953, + "grad_norm": 0.2779795229434967, + "learning_rate": 9.640294360145975e-06, + "loss": 1.2173, + "step": 17658 + }, + { + "epoch": 5.2596660399486215, + "grad_norm": 0.29149943590164185, + "learning_rate": 9.639330419674201e-06, + "loss": 1.2034, + "step": 17659 + }, + { + "epoch": 5.25996388614829, + "grad_norm": 0.278231680393219, + "learning_rate": 9.638366482558052e-06, + "loss": 1.2171, + "step": 17660 + }, + { + "epoch": 5.260261732347959, + "grad_norm": 0.30518946051597595, + "learning_rate": 9.637402548806503e-06, + "loss": 1.2219, + "step": 17661 + }, + { + "epoch": 5.260559578547627, + "grad_norm": 0.2650560736656189, + "learning_rate": 9.63643861842852e-06, + "loss": 1.2044, + "step": 17662 + }, + { + "epoch": 5.260857424747297, + "grad_norm": 0.28559422492980957, + "learning_rate": 9.635474691433063e-06, + "loss": 1.2218, + "step": 17663 + }, + { + "epoch": 5.261155270946965, + "grad_norm": 0.2490164190530777, + "learning_rate": 9.63451076782911e-06, + "loss": 1.2148, + "step": 17664 + }, + { + "epoch": 5.261453117146633, + "grad_norm": 0.2811809480190277, + "learning_rate": 9.633546847625627e-06, + "loss": 1.232, + "step": 17665 + }, + { + "epoch": 5.261750963346302, + "grad_norm": 0.3814217150211334, + "learning_rate": 9.63258293083158e-06, + "loss": 1.2042, + "step": 17666 + }, + { + "epoch": 5.262048809545971, + "grad_norm": 0.29409468173980713, + "learning_rate": 9.63161901745594e-06, + "loss": 1.2282, + "step": 17667 + }, + { + "epoch": 5.262346655745639, + "grad_norm": 0.2859947681427002, + "learning_rate": 9.630655107507674e-06, + "loss": 1.2236, + "step": 17668 + }, + { + "epoch": 5.262644501945308, + "grad_norm": 0.32990679144859314, + "learning_rate": 9.629691200995744e-06, + "loss": 1.2226, + "step": 17669 + }, + { + "epoch": 5.262942348144977, + "grad_norm": 0.25081712007522583, + "learning_rate": 9.628727297929127e-06, + "loss": 1.2071, + "step": 17670 + }, + { + "epoch": 5.263240194344645, + "grad_norm": 0.357969731092453, + "learning_rate": 9.62776339831679e-06, + "loss": 1.2039, + "step": 17671 + }, + { + "epoch": 5.263538040544314, + "grad_norm": 0.2931002676486969, + "learning_rate": 9.626799502167694e-06, + "loss": 1.2117, + "step": 17672 + }, + { + "epoch": 5.2638358867439825, + "grad_norm": 0.33453667163848877, + "learning_rate": 9.625835609490817e-06, + "loss": 1.2101, + "step": 17673 + }, + { + "epoch": 5.264133732943651, + "grad_norm": 0.4000210165977478, + "learning_rate": 9.62487172029512e-06, + "loss": 1.2081, + "step": 17674 + }, + { + "epoch": 5.26443157914332, + "grad_norm": 0.25864914059638977, + "learning_rate": 9.62390783458957e-06, + "loss": 1.2191, + "step": 17675 + }, + { + "epoch": 5.264729425342988, + "grad_norm": 0.460506796836853, + "learning_rate": 9.622943952383138e-06, + "loss": 1.2148, + "step": 17676 + }, + { + "epoch": 5.265027271542657, + "grad_norm": 0.41654765605926514, + "learning_rate": 9.62198007368479e-06, + "loss": 1.2235, + "step": 17677 + }, + { + "epoch": 5.265325117742326, + "grad_norm": 0.26073628664016724, + "learning_rate": 9.621016198503498e-06, + "loss": 1.2126, + "step": 17678 + }, + { + "epoch": 5.265622963941994, + "grad_norm": 0.34627699851989746, + "learning_rate": 9.620052326848229e-06, + "loss": 1.2296, + "step": 17679 + }, + { + "epoch": 5.2659208101416635, + "grad_norm": 0.25202104449272156, + "learning_rate": 9.619088458727943e-06, + "loss": 1.2184, + "step": 17680 + }, + { + "epoch": 5.266218656341332, + "grad_norm": 0.263837605714798, + "learning_rate": 9.618124594151616e-06, + "loss": 1.2041, + "step": 17681 + }, + { + "epoch": 5.266516502541, + "grad_norm": 0.30499914288520813, + "learning_rate": 9.617160733128214e-06, + "loss": 1.223, + "step": 17682 + }, + { + "epoch": 5.266814348740669, + "grad_norm": 0.36672741174697876, + "learning_rate": 9.6161968756667e-06, + "loss": 1.196, + "step": 17683 + }, + { + "epoch": 5.267112194940338, + "grad_norm": 0.2590443789958954, + "learning_rate": 9.615233021776049e-06, + "loss": 1.2014, + "step": 17684 + }, + { + "epoch": 5.267410041140006, + "grad_norm": 0.380149781703949, + "learning_rate": 9.614269171465224e-06, + "loss": 1.2197, + "step": 17685 + }, + { + "epoch": 5.267707887339675, + "grad_norm": 0.25747448205947876, + "learning_rate": 9.613305324743191e-06, + "loss": 1.2121, + "step": 17686 + }, + { + "epoch": 5.2680057335393435, + "grad_norm": 0.4346485137939453, + "learning_rate": 9.612341481618924e-06, + "loss": 1.215, + "step": 17687 + }, + { + "epoch": 5.268303579739012, + "grad_norm": 0.3029664158821106, + "learning_rate": 9.611377642101386e-06, + "loss": 1.2256, + "step": 17688 + }, + { + "epoch": 5.268601425938681, + "grad_norm": 0.31039392948150635, + "learning_rate": 9.61041380619954e-06, + "loss": 1.2062, + "step": 17689 + }, + { + "epoch": 5.268899272138349, + "grad_norm": 0.3747265636920929, + "learning_rate": 9.609449973922363e-06, + "loss": 1.2106, + "step": 17690 + }, + { + "epoch": 5.269197118338019, + "grad_norm": 0.28945842385292053, + "learning_rate": 9.608486145278813e-06, + "loss": 1.2366, + "step": 17691 + }, + { + "epoch": 5.269494964537687, + "grad_norm": 0.2742080092430115, + "learning_rate": 9.607522320277866e-06, + "loss": 1.2289, + "step": 17692 + }, + { + "epoch": 5.269792810737355, + "grad_norm": 0.23953720927238464, + "learning_rate": 9.606558498928485e-06, + "loss": 1.214, + "step": 17693 + }, + { + "epoch": 5.2700906569370245, + "grad_norm": 0.30654656887054443, + "learning_rate": 9.605594681239636e-06, + "loss": 1.2046, + "step": 17694 + }, + { + "epoch": 5.270388503136693, + "grad_norm": 0.27098795771598816, + "learning_rate": 9.604630867220288e-06, + "loss": 1.2117, + "step": 17695 + }, + { + "epoch": 5.270686349336361, + "grad_norm": 0.3117823600769043, + "learning_rate": 9.603667056879412e-06, + "loss": 1.2012, + "step": 17696 + }, + { + "epoch": 5.27098419553603, + "grad_norm": 0.3452744781970978, + "learning_rate": 9.602703250225966e-06, + "loss": 1.2038, + "step": 17697 + }, + { + "epoch": 5.271282041735699, + "grad_norm": 0.2786961793899536, + "learning_rate": 9.601739447268926e-06, + "loss": 1.2262, + "step": 17698 + }, + { + "epoch": 5.271579887935367, + "grad_norm": 0.3331828713417053, + "learning_rate": 9.600775648017253e-06, + "loss": 1.2126, + "step": 17699 + }, + { + "epoch": 5.271877734135036, + "grad_norm": 0.26668357849121094, + "learning_rate": 9.599811852479916e-06, + "loss": 1.2136, + "step": 17700 + }, + { + "epoch": 5.272175580334705, + "grad_norm": 0.27471211552619934, + "learning_rate": 9.598848060665885e-06, + "loss": 1.2303, + "step": 17701 + }, + { + "epoch": 5.272473426534373, + "grad_norm": 0.2641032338142395, + "learning_rate": 9.597884272584126e-06, + "loss": 1.2158, + "step": 17702 + }, + { + "epoch": 5.272771272734042, + "grad_norm": 0.28214526176452637, + "learning_rate": 9.5969204882436e-06, + "loss": 1.2057, + "step": 17703 + }, + { + "epoch": 5.27306911893371, + "grad_norm": 0.37983354926109314, + "learning_rate": 9.595956707653282e-06, + "loss": 1.2041, + "step": 17704 + }, + { + "epoch": 5.27336696513338, + "grad_norm": 0.4316655397415161, + "learning_rate": 9.594992930822134e-06, + "loss": 1.2116, + "step": 17705 + }, + { + "epoch": 5.273664811333048, + "grad_norm": 0.285697340965271, + "learning_rate": 9.594029157759122e-06, + "loss": 1.2097, + "step": 17706 + }, + { + "epoch": 5.273962657532716, + "grad_norm": 0.3625587522983551, + "learning_rate": 9.59306538847322e-06, + "loss": 1.2089, + "step": 17707 + }, + { + "epoch": 5.2742605037323855, + "grad_norm": 0.33040204644203186, + "learning_rate": 9.592101622973385e-06, + "loss": 1.2388, + "step": 17708 + }, + { + "epoch": 5.274558349932054, + "grad_norm": 0.26028236746788025, + "learning_rate": 9.591137861268593e-06, + "loss": 1.2072, + "step": 17709 + }, + { + "epoch": 5.274856196131722, + "grad_norm": 0.27688753604888916, + "learning_rate": 9.590174103367807e-06, + "loss": 1.2125, + "step": 17710 + }, + { + "epoch": 5.275154042331391, + "grad_norm": 0.29101675748825073, + "learning_rate": 9.589210349279987e-06, + "loss": 1.1932, + "step": 17711 + }, + { + "epoch": 5.27545188853106, + "grad_norm": 0.4670671224594116, + "learning_rate": 9.588246599014109e-06, + "loss": 1.2191, + "step": 17712 + }, + { + "epoch": 5.275749734730728, + "grad_norm": 0.6348828077316284, + "learning_rate": 9.587282852579139e-06, + "loss": 1.1951, + "step": 17713 + }, + { + "epoch": 5.276047580930397, + "grad_norm": 0.3590271770954132, + "learning_rate": 9.586319109984035e-06, + "loss": 1.2174, + "step": 17714 + }, + { + "epoch": 5.276345427130066, + "grad_norm": 0.2943142056465149, + "learning_rate": 9.585355371237776e-06, + "loss": 1.2091, + "step": 17715 + }, + { + "epoch": 5.276643273329734, + "grad_norm": 0.3032686710357666, + "learning_rate": 9.584391636349319e-06, + "loss": 1.2194, + "step": 17716 + }, + { + "epoch": 5.276941119529403, + "grad_norm": 0.28097230195999146, + "learning_rate": 9.583427905327634e-06, + "loss": 1.2386, + "step": 17717 + }, + { + "epoch": 5.2772389657290715, + "grad_norm": 0.36082130670547485, + "learning_rate": 9.582464178181685e-06, + "loss": 1.2099, + "step": 17718 + }, + { + "epoch": 5.277536811928741, + "grad_norm": 0.2619275152683258, + "learning_rate": 9.581500454920443e-06, + "loss": 1.2171, + "step": 17719 + }, + { + "epoch": 5.277834658128409, + "grad_norm": 0.2929697334766388, + "learning_rate": 9.580536735552869e-06, + "loss": 1.1921, + "step": 17720 + }, + { + "epoch": 5.278132504328077, + "grad_norm": 0.2946698069572449, + "learning_rate": 9.579573020087935e-06, + "loss": 1.2141, + "step": 17721 + }, + { + "epoch": 5.278430350527747, + "grad_norm": 0.3796788156032562, + "learning_rate": 9.578609308534604e-06, + "loss": 1.2278, + "step": 17722 + }, + { + "epoch": 5.278728196727415, + "grad_norm": 0.2787896692752838, + "learning_rate": 9.577645600901838e-06, + "loss": 1.215, + "step": 17723 + }, + { + "epoch": 5.279026042927083, + "grad_norm": 0.5407164096832275, + "learning_rate": 9.576681897198613e-06, + "loss": 1.2331, + "step": 17724 + }, + { + "epoch": 5.279323889126752, + "grad_norm": 0.31933292746543884, + "learning_rate": 9.575718197433886e-06, + "loss": 1.2001, + "step": 17725 + }, + { + "epoch": 5.279621735326421, + "grad_norm": 0.3746192455291748, + "learning_rate": 9.574754501616631e-06, + "loss": 1.2093, + "step": 17726 + }, + { + "epoch": 5.279919581526089, + "grad_norm": 0.3737927973270416, + "learning_rate": 9.57379080975581e-06, + "loss": 1.2042, + "step": 17727 + }, + { + "epoch": 5.280217427725758, + "grad_norm": 0.2543933093547821, + "learning_rate": 9.572827121860387e-06, + "loss": 1.2233, + "step": 17728 + }, + { + "epoch": 5.280515273925427, + "grad_norm": 0.27333301305770874, + "learning_rate": 9.57186343793933e-06, + "loss": 1.225, + "step": 17729 + }, + { + "epoch": 5.280813120125096, + "grad_norm": 0.2692798376083374, + "learning_rate": 9.570899758001608e-06, + "loss": 1.2261, + "step": 17730 + }, + { + "epoch": 5.281110966324764, + "grad_norm": 0.25104212760925293, + "learning_rate": 9.569936082056181e-06, + "loss": 1.2219, + "step": 17731 + }, + { + "epoch": 5.2814088125244325, + "grad_norm": 0.3911302089691162, + "learning_rate": 9.568972410112023e-06, + "loss": 1.2354, + "step": 17732 + }, + { + "epoch": 5.281706658724102, + "grad_norm": 0.37411147356033325, + "learning_rate": 9.568008742178094e-06, + "loss": 1.2028, + "step": 17733 + }, + { + "epoch": 5.28200450492377, + "grad_norm": 0.290594220161438, + "learning_rate": 9.567045078263357e-06, + "loss": 1.2115, + "step": 17734 + }, + { + "epoch": 5.282302351123438, + "grad_norm": 0.3030377924442291, + "learning_rate": 9.566081418376784e-06, + "loss": 1.2088, + "step": 17735 + }, + { + "epoch": 5.282600197323108, + "grad_norm": 0.41590309143066406, + "learning_rate": 9.56511776252734e-06, + "loss": 1.2045, + "step": 17736 + }, + { + "epoch": 5.282898043522776, + "grad_norm": 0.4200705587863922, + "learning_rate": 9.564154110723986e-06, + "loss": 1.2218, + "step": 17737 + }, + { + "epoch": 5.283195889722444, + "grad_norm": 0.35149914026260376, + "learning_rate": 9.563190462975696e-06, + "loss": 1.2257, + "step": 17738 + }, + { + "epoch": 5.2834937359221135, + "grad_norm": 0.39634302258491516, + "learning_rate": 9.562226819291426e-06, + "loss": 1.2153, + "step": 17739 + }, + { + "epoch": 5.283791582121782, + "grad_norm": 0.27166733145713806, + "learning_rate": 9.561263179680149e-06, + "loss": 1.2203, + "step": 17740 + }, + { + "epoch": 5.28408942832145, + "grad_norm": 0.25245028734207153, + "learning_rate": 9.560299544150828e-06, + "loss": 1.2002, + "step": 17741 + }, + { + "epoch": 5.284387274521119, + "grad_norm": 0.3067881166934967, + "learning_rate": 9.559335912712425e-06, + "loss": 1.2105, + "step": 17742 + }, + { + "epoch": 5.284685120720788, + "grad_norm": 0.35038381814956665, + "learning_rate": 9.558372285373913e-06, + "loss": 1.1963, + "step": 17743 + }, + { + "epoch": 5.284982966920456, + "grad_norm": 0.2612840235233307, + "learning_rate": 9.557408662144254e-06, + "loss": 1.2286, + "step": 17744 + }, + { + "epoch": 5.285280813120125, + "grad_norm": 0.3415238559246063, + "learning_rate": 9.556445043032408e-06, + "loss": 1.2139, + "step": 17745 + }, + { + "epoch": 5.2855786593197935, + "grad_norm": 0.3501487672328949, + "learning_rate": 9.555481428047351e-06, + "loss": 1.1985, + "step": 17746 + }, + { + "epoch": 5.285876505519463, + "grad_norm": 0.2651747465133667, + "learning_rate": 9.55451781719804e-06, + "loss": 1.1988, + "step": 17747 + }, + { + "epoch": 5.286174351719131, + "grad_norm": 0.38092848658561707, + "learning_rate": 9.553554210493439e-06, + "loss": 1.2156, + "step": 17748 + }, + { + "epoch": 5.286472197918799, + "grad_norm": 0.26460573077201843, + "learning_rate": 9.552590607942524e-06, + "loss": 1.2366, + "step": 17749 + }, + { + "epoch": 5.286770044118469, + "grad_norm": 0.37430649995803833, + "learning_rate": 9.551627009554253e-06, + "loss": 1.2026, + "step": 17750 + }, + { + "epoch": 5.287067890318137, + "grad_norm": 0.30642634630203247, + "learning_rate": 9.550663415337587e-06, + "loss": 1.2131, + "step": 17751 + }, + { + "epoch": 5.287365736517805, + "grad_norm": 0.32907119393348694, + "learning_rate": 9.549699825301499e-06, + "loss": 1.2165, + "step": 17752 + }, + { + "epoch": 5.2876635827174745, + "grad_norm": 0.29158130288124084, + "learning_rate": 9.548736239454953e-06, + "loss": 1.2203, + "step": 17753 + }, + { + "epoch": 5.287961428917143, + "grad_norm": 0.4016822874546051, + "learning_rate": 9.547772657806905e-06, + "loss": 1.2239, + "step": 17754 + }, + { + "epoch": 5.288259275116811, + "grad_norm": 0.381194531917572, + "learning_rate": 9.546809080366335e-06, + "loss": 1.2095, + "step": 17755 + }, + { + "epoch": 5.28855712131648, + "grad_norm": 0.33159124851226807, + "learning_rate": 9.5458455071422e-06, + "loss": 1.2109, + "step": 17756 + }, + { + "epoch": 5.288854967516149, + "grad_norm": 0.4588654041290283, + "learning_rate": 9.544881938143457e-06, + "loss": 1.1984, + "step": 17757 + }, + { + "epoch": 5.289152813715818, + "grad_norm": 0.2766730487346649, + "learning_rate": 9.543918373379084e-06, + "loss": 1.2229, + "step": 17758 + }, + { + "epoch": 5.289450659915486, + "grad_norm": 0.3315102159976959, + "learning_rate": 9.54295481285804e-06, + "loss": 1.2087, + "step": 17759 + }, + { + "epoch": 5.289748506115155, + "grad_norm": 0.34840163588523865, + "learning_rate": 9.541991256589292e-06, + "loss": 1.2084, + "step": 17760 + }, + { + "epoch": 5.290046352314824, + "grad_norm": 0.33905497193336487, + "learning_rate": 9.541027704581807e-06, + "loss": 1.1986, + "step": 17761 + }, + { + "epoch": 5.290344198514492, + "grad_norm": 0.4713587760925293, + "learning_rate": 9.540064156844539e-06, + "loss": 1.2222, + "step": 17762 + }, + { + "epoch": 5.29064204471416, + "grad_norm": 0.35660380125045776, + "learning_rate": 9.539100613386468e-06, + "loss": 1.2171, + "step": 17763 + }, + { + "epoch": 5.29093989091383, + "grad_norm": 0.4753265976905823, + "learning_rate": 9.538137074216546e-06, + "loss": 1.2329, + "step": 17764 + }, + { + "epoch": 5.291237737113498, + "grad_norm": 0.3240477442741394, + "learning_rate": 9.53717353934374e-06, + "loss": 1.1892, + "step": 17765 + }, + { + "epoch": 5.291535583313166, + "grad_norm": 0.39129161834716797, + "learning_rate": 9.536210008777022e-06, + "loss": 1.2146, + "step": 17766 + }, + { + "epoch": 5.2918334295128355, + "grad_norm": 0.250642865896225, + "learning_rate": 9.535246482525353e-06, + "loss": 1.2152, + "step": 17767 + }, + { + "epoch": 5.292131275712504, + "grad_norm": 0.3610994219779968, + "learning_rate": 9.534282960597692e-06, + "loss": 1.2171, + "step": 17768 + }, + { + "epoch": 5.292429121912173, + "grad_norm": 0.25481879711151123, + "learning_rate": 9.533319443003011e-06, + "loss": 1.2119, + "step": 17769 + }, + { + "epoch": 5.292726968111841, + "grad_norm": 0.30706268548965454, + "learning_rate": 9.532355929750269e-06, + "loss": 1.2121, + "step": 17770 + }, + { + "epoch": 5.29302481431151, + "grad_norm": 0.28423357009887695, + "learning_rate": 9.53139242084843e-06, + "loss": 1.2116, + "step": 17771 + }, + { + "epoch": 5.293322660511179, + "grad_norm": 0.35022106766700745, + "learning_rate": 9.530428916306466e-06, + "loss": 1.1977, + "step": 17772 + }, + { + "epoch": 5.293620506710847, + "grad_norm": 0.2840099632740021, + "learning_rate": 9.529465416133336e-06, + "loss": 1.2053, + "step": 17773 + }, + { + "epoch": 5.293918352910516, + "grad_norm": 0.3796977400779724, + "learning_rate": 9.528501920338e-06, + "loss": 1.2172, + "step": 17774 + }, + { + "epoch": 5.294216199110185, + "grad_norm": 0.2766299843788147, + "learning_rate": 9.52753842892943e-06, + "loss": 1.1984, + "step": 17775 + }, + { + "epoch": 5.294514045309853, + "grad_norm": 0.4037684500217438, + "learning_rate": 9.526574941916587e-06, + "loss": 1.2149, + "step": 17776 + }, + { + "epoch": 5.2948118915095215, + "grad_norm": 0.25487518310546875, + "learning_rate": 9.525611459308434e-06, + "loss": 1.2085, + "step": 17777 + }, + { + "epoch": 5.295109737709191, + "grad_norm": 0.4670896530151367, + "learning_rate": 9.524647981113938e-06, + "loss": 1.2128, + "step": 17778 + }, + { + "epoch": 5.295407583908859, + "grad_norm": 0.2758674621582031, + "learning_rate": 9.523684507342059e-06, + "loss": 1.2209, + "step": 17779 + }, + { + "epoch": 5.295705430108527, + "grad_norm": 0.40095141530036926, + "learning_rate": 9.522721038001768e-06, + "loss": 1.2096, + "step": 17780 + }, + { + "epoch": 5.296003276308197, + "grad_norm": 0.29094254970550537, + "learning_rate": 9.521757573102021e-06, + "loss": 1.209, + "step": 17781 + }, + { + "epoch": 5.296301122507865, + "grad_norm": 0.2720484137535095, + "learning_rate": 9.520794112651786e-06, + "loss": 1.2176, + "step": 17782 + }, + { + "epoch": 5.296598968707533, + "grad_norm": 0.45520949363708496, + "learning_rate": 9.519830656660026e-06, + "loss": 1.2073, + "step": 17783 + }, + { + "epoch": 5.296896814907202, + "grad_norm": 0.27867141366004944, + "learning_rate": 9.518867205135707e-06, + "loss": 1.2325, + "step": 17784 + }, + { + "epoch": 5.297194661106871, + "grad_norm": 0.4318096339702606, + "learning_rate": 9.517903758087788e-06, + "loss": 1.2067, + "step": 17785 + }, + { + "epoch": 5.29749250730654, + "grad_norm": 0.29537105560302734, + "learning_rate": 9.516940315525241e-06, + "loss": 1.2041, + "step": 17786 + }, + { + "epoch": 5.297790353506208, + "grad_norm": 0.5036028027534485, + "learning_rate": 9.51597687745702e-06, + "loss": 1.2101, + "step": 17787 + }, + { + "epoch": 5.298088199705877, + "grad_norm": 0.4049065411090851, + "learning_rate": 9.515013443892094e-06, + "loss": 1.217, + "step": 17788 + }, + { + "epoch": 5.298386045905546, + "grad_norm": 0.3740525245666504, + "learning_rate": 9.514050014839429e-06, + "loss": 1.2225, + "step": 17789 + }, + { + "epoch": 5.298683892105214, + "grad_norm": 0.3414017856121063, + "learning_rate": 9.513086590307983e-06, + "loss": 1.204, + "step": 17790 + }, + { + "epoch": 5.2989817383048825, + "grad_norm": 0.35962656140327454, + "learning_rate": 9.512123170306722e-06, + "loss": 1.1991, + "step": 17791 + }, + { + "epoch": 5.299279584504552, + "grad_norm": 0.32481124997138977, + "learning_rate": 9.511159754844613e-06, + "loss": 1.2131, + "step": 17792 + }, + { + "epoch": 5.29957743070422, + "grad_norm": 0.3517327904701233, + "learning_rate": 9.510196343930611e-06, + "loss": 1.2195, + "step": 17793 + }, + { + "epoch": 5.299875276903888, + "grad_norm": 0.27785369753837585, + "learning_rate": 9.509232937573688e-06, + "loss": 1.2305, + "step": 17794 + }, + { + "epoch": 5.300173123103558, + "grad_norm": 0.5341268181800842, + "learning_rate": 9.508269535782805e-06, + "loss": 1.208, + "step": 17795 + }, + { + "epoch": 5.300470969303226, + "grad_norm": 0.2722247242927551, + "learning_rate": 9.50730613856692e-06, + "loss": 1.2207, + "step": 17796 + }, + { + "epoch": 5.300768815502895, + "grad_norm": 0.49601635336875916, + "learning_rate": 9.506342745935007e-06, + "loss": 1.198, + "step": 17797 + }, + { + "epoch": 5.3010666617025635, + "grad_norm": 0.2821829915046692, + "learning_rate": 9.505379357896023e-06, + "loss": 1.2195, + "step": 17798 + }, + { + "epoch": 5.301364507902232, + "grad_norm": 0.34916460514068604, + "learning_rate": 9.504415974458926e-06, + "loss": 1.2172, + "step": 17799 + }, + { + "epoch": 5.301662354101901, + "grad_norm": 0.2600118815898895, + "learning_rate": 9.503452595632688e-06, + "loss": 1.213, + "step": 17800 + }, + { + "epoch": 5.301960200301569, + "grad_norm": 0.37544959783554077, + "learning_rate": 9.502489221426272e-06, + "loss": 1.2135, + "step": 17801 + }, + { + "epoch": 5.302258046501238, + "grad_norm": 0.25689440965652466, + "learning_rate": 9.501525851848632e-06, + "loss": 1.2068, + "step": 17802 + }, + { + "epoch": 5.302555892700907, + "grad_norm": 0.2783738076686859, + "learning_rate": 9.500562486908739e-06, + "loss": 1.2299, + "step": 17803 + }, + { + "epoch": 5.302853738900575, + "grad_norm": 0.38854414224624634, + "learning_rate": 9.499599126615556e-06, + "loss": 1.2184, + "step": 17804 + }, + { + "epoch": 5.3031515851002435, + "grad_norm": 0.4283054769039154, + "learning_rate": 9.498635770978042e-06, + "loss": 1.2103, + "step": 17805 + }, + { + "epoch": 5.303449431299913, + "grad_norm": 0.3234732449054718, + "learning_rate": 9.497672420005164e-06, + "loss": 1.2152, + "step": 17806 + }, + { + "epoch": 5.303747277499581, + "grad_norm": 0.408430278301239, + "learning_rate": 9.496709073705878e-06, + "loss": 1.2168, + "step": 17807 + }, + { + "epoch": 5.304045123699249, + "grad_norm": 0.26550015807151794, + "learning_rate": 9.495745732089156e-06, + "loss": 1.2022, + "step": 17808 + }, + { + "epoch": 5.304342969898919, + "grad_norm": 0.5079704523086548, + "learning_rate": 9.494782395163958e-06, + "loss": 1.2119, + "step": 17809 + }, + { + "epoch": 5.304640816098587, + "grad_norm": 0.30164191126823425, + "learning_rate": 9.493819062939241e-06, + "loss": 1.2243, + "step": 17810 + }, + { + "epoch": 5.304938662298255, + "grad_norm": 0.3827284574508667, + "learning_rate": 9.492855735423973e-06, + "loss": 1.2108, + "step": 17811 + }, + { + "epoch": 5.3052365084979245, + "grad_norm": 0.4162214696407318, + "learning_rate": 9.491892412627121e-06, + "loss": 1.226, + "step": 17812 + }, + { + "epoch": 5.305534354697593, + "grad_norm": 0.27671313285827637, + "learning_rate": 9.490929094557636e-06, + "loss": 1.2261, + "step": 17813 + }, + { + "epoch": 5.305832200897262, + "grad_norm": 0.725034236907959, + "learning_rate": 9.489965781224491e-06, + "loss": 1.2205, + "step": 17814 + }, + { + "epoch": 5.30613004709693, + "grad_norm": 0.6562210321426392, + "learning_rate": 9.489002472636645e-06, + "loss": 1.2204, + "step": 17815 + }, + { + "epoch": 5.306427893296599, + "grad_norm": 0.2668152451515198, + "learning_rate": 9.488039168803054e-06, + "loss": 1.1972, + "step": 17816 + }, + { + "epoch": 5.306725739496268, + "grad_norm": 0.5385468006134033, + "learning_rate": 9.487075869732691e-06, + "loss": 1.2154, + "step": 17817 + }, + { + "epoch": 5.307023585695936, + "grad_norm": 0.30791765451431274, + "learning_rate": 9.486112575434516e-06, + "loss": 1.2202, + "step": 17818 + }, + { + "epoch": 5.307321431895605, + "grad_norm": 0.2882668375968933, + "learning_rate": 9.485149285917485e-06, + "loss": 1.2193, + "step": 17819 + }, + { + "epoch": 5.307619278095274, + "grad_norm": 0.2871219515800476, + "learning_rate": 9.484186001190569e-06, + "loss": 1.2082, + "step": 17820 + }, + { + "epoch": 5.307917124294942, + "grad_norm": 0.32335707545280457, + "learning_rate": 9.483222721262725e-06, + "loss": 1.2187, + "step": 17821 + }, + { + "epoch": 5.30821497049461, + "grad_norm": 0.3050103187561035, + "learning_rate": 9.482259446142913e-06, + "loss": 1.2008, + "step": 17822 + }, + { + "epoch": 5.30851281669428, + "grad_norm": 0.33260276913642883, + "learning_rate": 9.481296175840099e-06, + "loss": 1.2144, + "step": 17823 + }, + { + "epoch": 5.308810662893948, + "grad_norm": 0.3357332944869995, + "learning_rate": 9.480332910363243e-06, + "loss": 1.23, + "step": 17824 + }, + { + "epoch": 5.309108509093617, + "grad_norm": 0.2864167392253876, + "learning_rate": 9.479369649721314e-06, + "loss": 1.2138, + "step": 17825 + }, + { + "epoch": 5.3094063552932855, + "grad_norm": 0.42594629526138306, + "learning_rate": 9.478406393923267e-06, + "loss": 1.2064, + "step": 17826 + }, + { + "epoch": 5.309704201492954, + "grad_norm": 0.3915179669857025, + "learning_rate": 9.477443142978063e-06, + "loss": 1.2205, + "step": 17827 + }, + { + "epoch": 5.310002047692623, + "grad_norm": 0.27900174260139465, + "learning_rate": 9.47647989689467e-06, + "loss": 1.2366, + "step": 17828 + }, + { + "epoch": 5.310299893892291, + "grad_norm": 0.25326424837112427, + "learning_rate": 9.475516655682046e-06, + "loss": 1.21, + "step": 17829 + }, + { + "epoch": 5.31059774009196, + "grad_norm": 0.4116702973842621, + "learning_rate": 9.47455341934915e-06, + "loss": 1.2168, + "step": 17830 + }, + { + "epoch": 5.310895586291629, + "grad_norm": 0.3074418902397156, + "learning_rate": 9.473590187904952e-06, + "loss": 1.2171, + "step": 17831 + }, + { + "epoch": 5.311193432491297, + "grad_norm": 0.31306183338165283, + "learning_rate": 9.472626961358408e-06, + "loss": 1.2301, + "step": 17832 + }, + { + "epoch": 5.311491278690966, + "grad_norm": 0.2802980840206146, + "learning_rate": 9.471663739718479e-06, + "loss": 1.2108, + "step": 17833 + }, + { + "epoch": 5.311789124890635, + "grad_norm": 0.2544321119785309, + "learning_rate": 9.47070052299413e-06, + "loss": 1.2143, + "step": 17834 + }, + { + "epoch": 5.312086971090303, + "grad_norm": 0.292193740606308, + "learning_rate": 9.469737311194323e-06, + "loss": 1.2123, + "step": 17835 + }, + { + "epoch": 5.312384817289972, + "grad_norm": 0.3129338324069977, + "learning_rate": 9.468774104328013e-06, + "loss": 1.2038, + "step": 17836 + }, + { + "epoch": 5.312682663489641, + "grad_norm": 0.252285897731781, + "learning_rate": 9.46781090240417e-06, + "loss": 1.2071, + "step": 17837 + }, + { + "epoch": 5.312980509689309, + "grad_norm": 0.4913385212421417, + "learning_rate": 9.466847705431755e-06, + "loss": 1.2365, + "step": 17838 + }, + { + "epoch": 5.313278355888978, + "grad_norm": 0.34144484996795654, + "learning_rate": 9.46588451341972e-06, + "loss": 1.2136, + "step": 17839 + }, + { + "epoch": 5.313576202088647, + "grad_norm": 0.46132946014404297, + "learning_rate": 9.464921326377035e-06, + "loss": 1.222, + "step": 17840 + }, + { + "epoch": 5.313874048288315, + "grad_norm": 0.3557392954826355, + "learning_rate": 9.463958144312659e-06, + "loss": 1.2168, + "step": 17841 + }, + { + "epoch": 5.314171894487984, + "grad_norm": 0.4408482313156128, + "learning_rate": 9.462994967235555e-06, + "loss": 1.213, + "step": 17842 + }, + { + "epoch": 5.314469740687652, + "grad_norm": 0.3457951545715332, + "learning_rate": 9.462031795154685e-06, + "loss": 1.2156, + "step": 17843 + }, + { + "epoch": 5.314767586887321, + "grad_norm": 0.34607723355293274, + "learning_rate": 9.461068628079002e-06, + "loss": 1.2225, + "step": 17844 + }, + { + "epoch": 5.31506543308699, + "grad_norm": 0.2906036674976349, + "learning_rate": 9.460105466017478e-06, + "loss": 1.2226, + "step": 17845 + }, + { + "epoch": 5.315363279286658, + "grad_norm": 0.4787485599517822, + "learning_rate": 9.45914230897907e-06, + "loss": 1.1961, + "step": 17846 + }, + { + "epoch": 5.315661125486327, + "grad_norm": 0.37462329864501953, + "learning_rate": 9.458179156972733e-06, + "loss": 1.2196, + "step": 17847 + }, + { + "epoch": 5.315958971685996, + "grad_norm": 0.6380683779716492, + "learning_rate": 9.457216010007439e-06, + "loss": 1.2072, + "step": 17848 + }, + { + "epoch": 5.316256817885664, + "grad_norm": 0.3334093689918518, + "learning_rate": 9.456252868092145e-06, + "loss": 1.2152, + "step": 17849 + }, + { + "epoch": 5.3165546640853325, + "grad_norm": 0.622526228427887, + "learning_rate": 9.455289731235804e-06, + "loss": 1.2119, + "step": 17850 + }, + { + "epoch": 5.316852510285002, + "grad_norm": 0.3051876723766327, + "learning_rate": 9.454326599447388e-06, + "loss": 1.2189, + "step": 17851 + }, + { + "epoch": 5.31715035648467, + "grad_norm": 0.48956653475761414, + "learning_rate": 9.453363472735853e-06, + "loss": 1.2174, + "step": 17852 + }, + { + "epoch": 5.317448202684339, + "grad_norm": 0.2591938376426697, + "learning_rate": 9.452400351110155e-06, + "loss": 1.2161, + "step": 17853 + }, + { + "epoch": 5.317746048884008, + "grad_norm": 0.36875611543655396, + "learning_rate": 9.451437234579266e-06, + "loss": 1.1962, + "step": 17854 + }, + { + "epoch": 5.318043895083676, + "grad_norm": 0.33657798171043396, + "learning_rate": 9.450474123152142e-06, + "loss": 1.2126, + "step": 17855 + }, + { + "epoch": 5.318341741283345, + "grad_norm": 0.2821645736694336, + "learning_rate": 9.449511016837738e-06, + "loss": 1.2124, + "step": 17856 + }, + { + "epoch": 5.3186395874830135, + "grad_norm": 0.42428329586982727, + "learning_rate": 9.448547915645021e-06, + "loss": 1.217, + "step": 17857 + }, + { + "epoch": 5.318937433682682, + "grad_norm": 0.2546981871128082, + "learning_rate": 9.447584819582946e-06, + "loss": 1.2193, + "step": 17858 + }, + { + "epoch": 5.319235279882351, + "grad_norm": 0.44893303513526917, + "learning_rate": 9.446621728660479e-06, + "loss": 1.2146, + "step": 17859 + }, + { + "epoch": 5.319533126082019, + "grad_norm": 0.30277305841445923, + "learning_rate": 9.445658642886582e-06, + "loss": 1.2293, + "step": 17860 + }, + { + "epoch": 5.319830972281688, + "grad_norm": 0.30651018023490906, + "learning_rate": 9.444695562270207e-06, + "loss": 1.2051, + "step": 17861 + }, + { + "epoch": 5.320128818481357, + "grad_norm": 0.26027822494506836, + "learning_rate": 9.443732486820323e-06, + "loss": 1.22, + "step": 17862 + }, + { + "epoch": 5.320426664681025, + "grad_norm": 0.4425266981124878, + "learning_rate": 9.442769416545884e-06, + "loss": 1.2201, + "step": 17863 + }, + { + "epoch": 5.320724510880694, + "grad_norm": 0.24000824987888336, + "learning_rate": 9.441806351455855e-06, + "loss": 1.2179, + "step": 17864 + }, + { + "epoch": 5.321022357080363, + "grad_norm": 0.6749671697616577, + "learning_rate": 9.440843291559193e-06, + "loss": 1.2134, + "step": 17865 + }, + { + "epoch": 5.321320203280031, + "grad_norm": 0.337689608335495, + "learning_rate": 9.439880236864862e-06, + "loss": 1.2113, + "step": 17866 + }, + { + "epoch": 5.3216180494797, + "grad_norm": 0.4312625825405121, + "learning_rate": 9.438917187381815e-06, + "loss": 1.2298, + "step": 17867 + }, + { + "epoch": 5.321915895679369, + "grad_norm": 0.28401559591293335, + "learning_rate": 9.437954143119021e-06, + "loss": 1.233, + "step": 17868 + }, + { + "epoch": 5.322213741879037, + "grad_norm": 0.3657039403915405, + "learning_rate": 9.436991104085435e-06, + "loss": 1.2235, + "step": 17869 + }, + { + "epoch": 5.322511588078706, + "grad_norm": 0.3694859743118286, + "learning_rate": 9.436028070290013e-06, + "loss": 1.226, + "step": 17870 + }, + { + "epoch": 5.3228094342783745, + "grad_norm": 0.32885557413101196, + "learning_rate": 9.435065041741727e-06, + "loss": 1.246, + "step": 17871 + }, + { + "epoch": 5.323107280478043, + "grad_norm": 0.4674130380153656, + "learning_rate": 9.434102018449527e-06, + "loss": 1.2229, + "step": 17872 + }, + { + "epoch": 5.323405126677712, + "grad_norm": 0.5305304527282715, + "learning_rate": 9.433139000422373e-06, + "loss": 1.2012, + "step": 17873 + }, + { + "epoch": 5.32370297287738, + "grad_norm": 0.36852362751960754, + "learning_rate": 9.43217598766923e-06, + "loss": 1.2154, + "step": 17874 + }, + { + "epoch": 5.324000819077049, + "grad_norm": 0.3169606626033783, + "learning_rate": 9.43121298019905e-06, + "loss": 1.2203, + "step": 17875 + }, + { + "epoch": 5.324298665276718, + "grad_norm": 0.5542791485786438, + "learning_rate": 9.430249978020803e-06, + "loss": 1.2147, + "step": 17876 + }, + { + "epoch": 5.324596511476386, + "grad_norm": 0.3020332455635071, + "learning_rate": 9.429286981143445e-06, + "loss": 1.2097, + "step": 17877 + }, + { + "epoch": 5.324894357676055, + "grad_norm": 0.43916815519332886, + "learning_rate": 9.428323989575928e-06, + "loss": 1.2228, + "step": 17878 + }, + { + "epoch": 5.325192203875724, + "grad_norm": 0.307855486869812, + "learning_rate": 9.42736100332722e-06, + "loss": 1.2083, + "step": 17879 + }, + { + "epoch": 5.325490050075392, + "grad_norm": 0.38172781467437744, + "learning_rate": 9.426398022406283e-06, + "loss": 1.189, + "step": 17880 + }, + { + "epoch": 5.325787896275061, + "grad_norm": 0.4009075164794922, + "learning_rate": 9.425435046822064e-06, + "loss": 1.2176, + "step": 17881 + }, + { + "epoch": 5.32608574247473, + "grad_norm": 0.39744630455970764, + "learning_rate": 9.424472076583533e-06, + "loss": 1.2135, + "step": 17882 + }, + { + "epoch": 5.326383588674398, + "grad_norm": 0.4499882757663727, + "learning_rate": 9.423509111699648e-06, + "loss": 1.2094, + "step": 17883 + }, + { + "epoch": 5.326681434874067, + "grad_norm": 0.2811439335346222, + "learning_rate": 9.422546152179363e-06, + "loss": 1.2125, + "step": 17884 + }, + { + "epoch": 5.3269792810737355, + "grad_norm": 0.7157266139984131, + "learning_rate": 9.421583198031644e-06, + "loss": 1.2169, + "step": 17885 + }, + { + "epoch": 5.327277127273404, + "grad_norm": 0.3518158793449402, + "learning_rate": 9.420620249265446e-06, + "loss": 1.2124, + "step": 17886 + }, + { + "epoch": 5.327574973473073, + "grad_norm": 0.5372270941734314, + "learning_rate": 9.41965730588973e-06, + "loss": 1.2254, + "step": 17887 + }, + { + "epoch": 5.327872819672741, + "grad_norm": 0.24127618968486786, + "learning_rate": 9.418694367913452e-06, + "loss": 1.2182, + "step": 17888 + }, + { + "epoch": 5.32817066587241, + "grad_norm": 0.4268087148666382, + "learning_rate": 9.417731435345578e-06, + "loss": 1.2406, + "step": 17889 + }, + { + "epoch": 5.328468512072079, + "grad_norm": 0.35385340452194214, + "learning_rate": 9.416768508195057e-06, + "loss": 1.2134, + "step": 17890 + }, + { + "epoch": 5.328766358271747, + "grad_norm": 0.41965508460998535, + "learning_rate": 9.415805586470858e-06, + "loss": 1.2291, + "step": 17891 + }, + { + "epoch": 5.3290642044714165, + "grad_norm": 0.43825778365135193, + "learning_rate": 9.414842670181931e-06, + "loss": 1.2106, + "step": 17892 + }, + { + "epoch": 5.329362050671085, + "grad_norm": 0.41483011841773987, + "learning_rate": 9.413879759337242e-06, + "loss": 1.2263, + "step": 17893 + }, + { + "epoch": 5.329659896870753, + "grad_norm": 0.43688637018203735, + "learning_rate": 9.412916853945747e-06, + "loss": 1.2183, + "step": 17894 + }, + { + "epoch": 5.329957743070422, + "grad_norm": 0.31809523701667786, + "learning_rate": 9.411953954016402e-06, + "loss": 1.213, + "step": 17895 + }, + { + "epoch": 5.330255589270091, + "grad_norm": 0.37871140241622925, + "learning_rate": 9.410991059558172e-06, + "loss": 1.2118, + "step": 17896 + }, + { + "epoch": 5.330553435469759, + "grad_norm": 0.28996196389198303, + "learning_rate": 9.410028170580013e-06, + "loss": 1.2075, + "step": 17897 + }, + { + "epoch": 5.330851281669428, + "grad_norm": 0.5004257559776306, + "learning_rate": 9.409065287090878e-06, + "loss": 1.2207, + "step": 17898 + }, + { + "epoch": 5.331149127869097, + "grad_norm": 0.2947828471660614, + "learning_rate": 9.408102409099732e-06, + "loss": 1.2302, + "step": 17899 + }, + { + "epoch": 5.331446974068765, + "grad_norm": 0.538334846496582, + "learning_rate": 9.407139536615535e-06, + "loss": 1.2083, + "step": 17900 + }, + { + "epoch": 5.331744820268434, + "grad_norm": 0.29365020990371704, + "learning_rate": 9.406176669647237e-06, + "loss": 1.2143, + "step": 17901 + }, + { + "epoch": 5.332042666468102, + "grad_norm": 0.38196516036987305, + "learning_rate": 9.405213808203807e-06, + "loss": 1.202, + "step": 17902 + }, + { + "epoch": 5.332340512667772, + "grad_norm": 0.37977781891822815, + "learning_rate": 9.404250952294196e-06, + "loss": 1.2042, + "step": 17903 + }, + { + "epoch": 5.33263835886744, + "grad_norm": 0.37083926796913147, + "learning_rate": 9.403288101927361e-06, + "loss": 1.2232, + "step": 17904 + }, + { + "epoch": 5.332936205067108, + "grad_norm": 0.545269787311554, + "learning_rate": 9.402325257112265e-06, + "loss": 1.2124, + "step": 17905 + }, + { + "epoch": 5.3332340512667775, + "grad_norm": 0.4284341335296631, + "learning_rate": 9.401362417857869e-06, + "loss": 1.2099, + "step": 17906 + }, + { + "epoch": 5.333531897466446, + "grad_norm": 0.4500286877155304, + "learning_rate": 9.40039958417312e-06, + "loss": 1.2032, + "step": 17907 + }, + { + "epoch": 5.333829743666114, + "grad_norm": 0.40348461270332336, + "learning_rate": 9.39943675606699e-06, + "loss": 1.209, + "step": 17908 + }, + { + "epoch": 5.334127589865783, + "grad_norm": 0.3139120042324066, + "learning_rate": 9.398473933548424e-06, + "loss": 1.2135, + "step": 17909 + }, + { + "epoch": 5.334425436065452, + "grad_norm": 0.34571802616119385, + "learning_rate": 9.39751111662639e-06, + "loss": 1.2213, + "step": 17910 + }, + { + "epoch": 5.33472328226512, + "grad_norm": 0.3927350342273712, + "learning_rate": 9.396548305309841e-06, + "loss": 1.208, + "step": 17911 + }, + { + "epoch": 5.335021128464789, + "grad_norm": 0.3874667286872864, + "learning_rate": 9.395585499607733e-06, + "loss": 1.2036, + "step": 17912 + }, + { + "epoch": 5.335318974664458, + "grad_norm": 0.3238022029399872, + "learning_rate": 9.39462269952903e-06, + "loss": 1.2117, + "step": 17913 + }, + { + "epoch": 5.335616820864126, + "grad_norm": 0.4481665790081024, + "learning_rate": 9.393659905082687e-06, + "loss": 1.2279, + "step": 17914 + }, + { + "epoch": 5.335914667063795, + "grad_norm": 0.34377190470695496, + "learning_rate": 9.392697116277658e-06, + "loss": 1.2223, + "step": 17915 + }, + { + "epoch": 5.3362125132634635, + "grad_norm": 0.510254442691803, + "learning_rate": 9.391734333122908e-06, + "loss": 1.2092, + "step": 17916 + }, + { + "epoch": 5.336510359463132, + "grad_norm": 0.29175788164138794, + "learning_rate": 9.390771555627386e-06, + "loss": 1.2047, + "step": 17917 + }, + { + "epoch": 5.336808205662801, + "grad_norm": 0.3241147994995117, + "learning_rate": 9.389808783800054e-06, + "loss": 1.206, + "step": 17918 + }, + { + "epoch": 5.337106051862469, + "grad_norm": 0.278617799282074, + "learning_rate": 9.388846017649874e-06, + "loss": 1.2159, + "step": 17919 + }, + { + "epoch": 5.3374038980621386, + "grad_norm": 0.29171955585479736, + "learning_rate": 9.387883257185798e-06, + "loss": 1.2128, + "step": 17920 + }, + { + "epoch": 5.337701744261807, + "grad_norm": 0.2922581434249878, + "learning_rate": 9.38692050241678e-06, + "loss": 1.2119, + "step": 17921 + }, + { + "epoch": 5.337999590461475, + "grad_norm": 0.38868823647499084, + "learning_rate": 9.385957753351785e-06, + "loss": 1.2143, + "step": 17922 + }, + { + "epoch": 5.338297436661144, + "grad_norm": 0.4154965877532959, + "learning_rate": 9.384995009999765e-06, + "loss": 1.2204, + "step": 17923 + }, + { + "epoch": 5.338595282860813, + "grad_norm": 0.3261200487613678, + "learning_rate": 9.384032272369681e-06, + "loss": 1.2043, + "step": 17924 + }, + { + "epoch": 5.338893129060481, + "grad_norm": 0.5034027695655823, + "learning_rate": 9.38306954047049e-06, + "loss": 1.2086, + "step": 17925 + }, + { + "epoch": 5.33919097526015, + "grad_norm": 0.267456591129303, + "learning_rate": 9.382106814311144e-06, + "loss": 1.2194, + "step": 17926 + }, + { + "epoch": 5.339488821459819, + "grad_norm": 0.311736524105072, + "learning_rate": 9.381144093900608e-06, + "loss": 1.2, + "step": 17927 + }, + { + "epoch": 5.339786667659487, + "grad_norm": 0.35112830996513367, + "learning_rate": 9.380181379247833e-06, + "loss": 1.2188, + "step": 17928 + }, + { + "epoch": 5.340084513859156, + "grad_norm": 0.30734434723854065, + "learning_rate": 9.379218670361775e-06, + "loss": 1.2018, + "step": 17929 + }, + { + "epoch": 5.3403823600588245, + "grad_norm": 0.3171696662902832, + "learning_rate": 9.378255967251397e-06, + "loss": 1.2034, + "step": 17930 + }, + { + "epoch": 5.340680206258494, + "grad_norm": 0.38718852400779724, + "learning_rate": 9.377293269925653e-06, + "loss": 1.1832, + "step": 17931 + }, + { + "epoch": 5.340978052458162, + "grad_norm": 0.3817012310028076, + "learning_rate": 9.376330578393496e-06, + "loss": 1.2307, + "step": 17932 + }, + { + "epoch": 5.34127589865783, + "grad_norm": 0.28209900856018066, + "learning_rate": 9.375367892663891e-06, + "loss": 1.2172, + "step": 17933 + }, + { + "epoch": 5.3415737448575, + "grad_norm": 0.5042916536331177, + "learning_rate": 9.374405212745786e-06, + "loss": 1.2226, + "step": 17934 + }, + { + "epoch": 5.341871591057168, + "grad_norm": 0.3044802248477936, + "learning_rate": 9.37344253864814e-06, + "loss": 1.2117, + "step": 17935 + }, + { + "epoch": 5.342169437256836, + "grad_norm": 0.3037538230419159, + "learning_rate": 9.372479870379916e-06, + "loss": 1.2138, + "step": 17936 + }, + { + "epoch": 5.3424672834565055, + "grad_norm": 0.2792463004589081, + "learning_rate": 9.371517207950065e-06, + "loss": 1.2184, + "step": 17937 + }, + { + "epoch": 5.342765129656174, + "grad_norm": 0.3734724521636963, + "learning_rate": 9.37055455136754e-06, + "loss": 1.2171, + "step": 17938 + }, + { + "epoch": 5.343062975855842, + "grad_norm": 0.26259225606918335, + "learning_rate": 9.369591900641306e-06, + "loss": 1.2193, + "step": 17939 + }, + { + "epoch": 5.343360822055511, + "grad_norm": 0.5744872093200684, + "learning_rate": 9.36862925578031e-06, + "loss": 1.2346, + "step": 17940 + }, + { + "epoch": 5.34365866825518, + "grad_norm": 0.48147451877593994, + "learning_rate": 9.367666616793518e-06, + "loss": 1.2165, + "step": 17941 + }, + { + "epoch": 5.343956514454848, + "grad_norm": 0.43163394927978516, + "learning_rate": 9.366703983689881e-06, + "loss": 1.2116, + "step": 17942 + }, + { + "epoch": 5.344254360654517, + "grad_norm": 0.5540128946304321, + "learning_rate": 9.365741356478352e-06, + "loss": 1.2129, + "step": 17943 + }, + { + "epoch": 5.3445522068541855, + "grad_norm": 0.26069483160972595, + "learning_rate": 9.364778735167896e-06, + "loss": 1.1946, + "step": 17944 + }, + { + "epoch": 5.344850053053854, + "grad_norm": 0.3064899742603302, + "learning_rate": 9.363816119767462e-06, + "loss": 1.2158, + "step": 17945 + }, + { + "epoch": 5.345147899253523, + "grad_norm": 0.2740856111049652, + "learning_rate": 9.36285351028601e-06, + "loss": 1.2178, + "step": 17946 + }, + { + "epoch": 5.345445745453191, + "grad_norm": 0.276441365480423, + "learning_rate": 9.361890906732492e-06, + "loss": 1.2188, + "step": 17947 + }, + { + "epoch": 5.345743591652861, + "grad_norm": 0.2677435576915741, + "learning_rate": 9.360928309115869e-06, + "loss": 1.2214, + "step": 17948 + }, + { + "epoch": 5.346041437852529, + "grad_norm": 0.2829286754131317, + "learning_rate": 9.359965717445088e-06, + "loss": 1.2325, + "step": 17949 + }, + { + "epoch": 5.346339284052197, + "grad_norm": 0.26164767146110535, + "learning_rate": 9.359003131729117e-06, + "loss": 1.2073, + "step": 17950 + }, + { + "epoch": 5.3466371302518665, + "grad_norm": 0.2733970880508423, + "learning_rate": 9.358040551976905e-06, + "loss": 1.2282, + "step": 17951 + }, + { + "epoch": 5.346934976451535, + "grad_norm": 0.25731900334358215, + "learning_rate": 9.357077978197406e-06, + "loss": 1.2132, + "step": 17952 + }, + { + "epoch": 5.347232822651203, + "grad_norm": 0.2532481849193573, + "learning_rate": 9.356115410399578e-06, + "loss": 1.1961, + "step": 17953 + }, + { + "epoch": 5.347530668850872, + "grad_norm": 0.24681970477104187, + "learning_rate": 9.35515284859238e-06, + "loss": 1.2083, + "step": 17954 + }, + { + "epoch": 5.347828515050541, + "grad_norm": 0.26690852642059326, + "learning_rate": 9.35419029278476e-06, + "loss": 1.2168, + "step": 17955 + }, + { + "epoch": 5.348126361250209, + "grad_norm": 0.3608032464981079, + "learning_rate": 9.353227742985683e-06, + "loss": 1.2013, + "step": 17956 + }, + { + "epoch": 5.348424207449878, + "grad_norm": 0.3427022099494934, + "learning_rate": 9.352265199204093e-06, + "loss": 1.2331, + "step": 17957 + }, + { + "epoch": 5.348722053649547, + "grad_norm": 0.27666598558425903, + "learning_rate": 9.351302661448954e-06, + "loss": 1.2087, + "step": 17958 + }, + { + "epoch": 5.349019899849216, + "grad_norm": 0.30473482608795166, + "learning_rate": 9.350340129729222e-06, + "loss": 1.2099, + "step": 17959 + }, + { + "epoch": 5.349317746048884, + "grad_norm": 0.34101781249046326, + "learning_rate": 9.349377604053844e-06, + "loss": 1.2122, + "step": 17960 + }, + { + "epoch": 5.349615592248552, + "grad_norm": 0.5237231850624084, + "learning_rate": 9.348415084431786e-06, + "loss": 1.2027, + "step": 17961 + }, + { + "epoch": 5.349913438448222, + "grad_norm": 0.2598274350166321, + "learning_rate": 9.347452570871997e-06, + "loss": 1.2104, + "step": 17962 + }, + { + "epoch": 5.35021128464789, + "grad_norm": 0.629736602306366, + "learning_rate": 9.346490063383428e-06, + "loss": 1.2289, + "step": 17963 + }, + { + "epoch": 5.350509130847558, + "grad_norm": 0.3449530005455017, + "learning_rate": 9.345527561975042e-06, + "loss": 1.215, + "step": 17964 + }, + { + "epoch": 5.3508069770472275, + "grad_norm": 0.5009580254554749, + "learning_rate": 9.344565066655794e-06, + "loss": 1.22, + "step": 17965 + }, + { + "epoch": 5.351104823246896, + "grad_norm": 0.3275929093360901, + "learning_rate": 9.343602577434629e-06, + "loss": 1.2253, + "step": 17966 + }, + { + "epoch": 5.351402669446564, + "grad_norm": 0.45423343777656555, + "learning_rate": 9.342640094320512e-06, + "loss": 1.1974, + "step": 17967 + }, + { + "epoch": 5.351700515646233, + "grad_norm": 0.3593066930770874, + "learning_rate": 9.341677617322397e-06, + "loss": 1.2035, + "step": 17968 + }, + { + "epoch": 5.351998361845902, + "grad_norm": 0.3311804234981537, + "learning_rate": 9.340715146449231e-06, + "loss": 1.2092, + "step": 17969 + }, + { + "epoch": 5.352296208045571, + "grad_norm": 0.26455381512641907, + "learning_rate": 9.339752681709977e-06, + "loss": 1.2205, + "step": 17970 + }, + { + "epoch": 5.352594054245239, + "grad_norm": 0.5381209254264832, + "learning_rate": 9.338790223113588e-06, + "loss": 1.2136, + "step": 17971 + }, + { + "epoch": 5.352891900444908, + "grad_norm": 0.36212044954299927, + "learning_rate": 9.337827770669013e-06, + "loss": 1.2138, + "step": 17972 + }, + { + "epoch": 5.353189746644577, + "grad_norm": 0.37515610456466675, + "learning_rate": 9.336865324385216e-06, + "loss": 1.2114, + "step": 17973 + }, + { + "epoch": 5.353487592844245, + "grad_norm": 0.27836519479751587, + "learning_rate": 9.33590288427114e-06, + "loss": 1.2183, + "step": 17974 + }, + { + "epoch": 5.3537854390439135, + "grad_norm": 0.6861031651496887, + "learning_rate": 9.334940450335751e-06, + "loss": 1.2132, + "step": 17975 + }, + { + "epoch": 5.354083285243583, + "grad_norm": 0.3119991719722748, + "learning_rate": 9.333978022587996e-06, + "loss": 1.2244, + "step": 17976 + }, + { + "epoch": 5.354381131443251, + "grad_norm": 0.46049776673316956, + "learning_rate": 9.33301560103683e-06, + "loss": 1.2165, + "step": 17977 + }, + { + "epoch": 5.354678977642919, + "grad_norm": 0.30692294239997864, + "learning_rate": 9.332053185691213e-06, + "loss": 1.2065, + "step": 17978 + }, + { + "epoch": 5.3549768238425886, + "grad_norm": 0.4809039235115051, + "learning_rate": 9.331090776560093e-06, + "loss": 1.2108, + "step": 17979 + }, + { + "epoch": 5.355274670042257, + "grad_norm": 0.37557461857795715, + "learning_rate": 9.330128373652422e-06, + "loss": 1.1953, + "step": 17980 + }, + { + "epoch": 5.355572516241925, + "grad_norm": 0.3039226830005646, + "learning_rate": 9.32916597697716e-06, + "loss": 1.2384, + "step": 17981 + }, + { + "epoch": 5.355870362441594, + "grad_norm": 0.368528813123703, + "learning_rate": 9.328203586543263e-06, + "loss": 1.1951, + "step": 17982 + }, + { + "epoch": 5.356168208641263, + "grad_norm": 0.32374516129493713, + "learning_rate": 9.327241202359676e-06, + "loss": 1.2297, + "step": 17983 + }, + { + "epoch": 5.356466054840931, + "grad_norm": 0.3582445979118347, + "learning_rate": 9.326278824435362e-06, + "loss": 1.2151, + "step": 17984 + }, + { + "epoch": 5.3567639010406, + "grad_norm": 0.5023511052131653, + "learning_rate": 9.325316452779272e-06, + "loss": 1.213, + "step": 17985 + }, + { + "epoch": 5.357061747240269, + "grad_norm": 0.28617364168167114, + "learning_rate": 9.324354087400352e-06, + "loss": 1.2169, + "step": 17986 + }, + { + "epoch": 5.357359593439938, + "grad_norm": 0.5104385018348694, + "learning_rate": 9.323391728307566e-06, + "loss": 1.2146, + "step": 17987 + }, + { + "epoch": 5.357657439639606, + "grad_norm": 0.2949276268482208, + "learning_rate": 9.322429375509867e-06, + "loss": 1.2286, + "step": 17988 + }, + { + "epoch": 5.3579552858392745, + "grad_norm": 0.370175838470459, + "learning_rate": 9.3214670290162e-06, + "loss": 1.2187, + "step": 17989 + }, + { + "epoch": 5.358253132038944, + "grad_norm": 0.36740320920944214, + "learning_rate": 9.320504688835529e-06, + "loss": 1.2088, + "step": 17990 + }, + { + "epoch": 5.358550978238612, + "grad_norm": 0.34314897656440735, + "learning_rate": 9.319542354976798e-06, + "loss": 1.2278, + "step": 17991 + }, + { + "epoch": 5.35884882443828, + "grad_norm": 0.42298799753189087, + "learning_rate": 9.31858002744897e-06, + "loss": 1.2119, + "step": 17992 + }, + { + "epoch": 5.35914667063795, + "grad_norm": 0.2853969931602478, + "learning_rate": 9.31761770626099e-06, + "loss": 1.2096, + "step": 17993 + }, + { + "epoch": 5.359444516837618, + "grad_norm": 0.41448962688446045, + "learning_rate": 9.316655391421813e-06, + "loss": 1.2232, + "step": 17994 + }, + { + "epoch": 5.359742363037286, + "grad_norm": 0.2936389744281769, + "learning_rate": 9.3156930829404e-06, + "loss": 1.2195, + "step": 17995 + }, + { + "epoch": 5.3600402092369555, + "grad_norm": 0.4390643835067749, + "learning_rate": 9.314730780825696e-06, + "loss": 1.2135, + "step": 17996 + }, + { + "epoch": 5.360338055436624, + "grad_norm": 0.3160460889339447, + "learning_rate": 9.313768485086654e-06, + "loss": 1.2192, + "step": 17997 + }, + { + "epoch": 5.360635901636293, + "grad_norm": 0.3259376287460327, + "learning_rate": 9.312806195732234e-06, + "loss": 1.2287, + "step": 17998 + }, + { + "epoch": 5.360933747835961, + "grad_norm": 0.3526167571544647, + "learning_rate": 9.311843912771381e-06, + "loss": 1.203, + "step": 17999 + }, + { + "epoch": 5.36123159403563, + "grad_norm": 0.24719078838825226, + "learning_rate": 9.310881636213049e-06, + "loss": 1.2155, + "step": 18000 + }, + { + "epoch": 5.36123159403563, + "eval_loss": 1.3248833417892456, + "eval_runtime": 23.5169, + "eval_samples_per_second": 73.734, + "eval_steps_per_second": 4.635, + "step": 18000 + }, + { + "epoch": 5.361529440235299, + "grad_norm": 0.6050386428833008, + "learning_rate": 9.3099193660662e-06, + "loss": 1.2225, + "step": 18001 + }, + { + "epoch": 5.361827286434967, + "grad_norm": 0.3603726625442505, + "learning_rate": 9.30895710233978e-06, + "loss": 1.2119, + "step": 18002 + }, + { + "epoch": 5.3621251326346355, + "grad_norm": 0.49911609292030334, + "learning_rate": 9.307994845042737e-06, + "loss": 1.2209, + "step": 18003 + }, + { + "epoch": 5.362422978834305, + "grad_norm": 0.3440111577510834, + "learning_rate": 9.307032594184033e-06, + "loss": 1.207, + "step": 18004 + }, + { + "epoch": 5.362720825033973, + "grad_norm": 0.48627927899360657, + "learning_rate": 9.306070349772613e-06, + "loss": 1.2057, + "step": 18005 + }, + { + "epoch": 5.363018671233641, + "grad_norm": 0.29417282342910767, + "learning_rate": 9.305108111817433e-06, + "loss": 1.2203, + "step": 18006 + }, + { + "epoch": 5.363316517433311, + "grad_norm": 0.5244109630584717, + "learning_rate": 9.304145880327449e-06, + "loss": 1.2114, + "step": 18007 + }, + { + "epoch": 5.363614363632979, + "grad_norm": 0.3801937401294708, + "learning_rate": 9.303183655311606e-06, + "loss": 1.2046, + "step": 18008 + }, + { + "epoch": 5.363912209832647, + "grad_norm": 0.3669234812259674, + "learning_rate": 9.302221436778866e-06, + "loss": 1.1938, + "step": 18009 + }, + { + "epoch": 5.3642100560323165, + "grad_norm": 0.3036079704761505, + "learning_rate": 9.301259224738171e-06, + "loss": 1.2144, + "step": 18010 + }, + { + "epoch": 5.364507902231985, + "grad_norm": 0.29160866141319275, + "learning_rate": 9.300297019198481e-06, + "loss": 1.2146, + "step": 18011 + }, + { + "epoch": 5.364805748431653, + "grad_norm": 0.2609872817993164, + "learning_rate": 9.299334820168744e-06, + "loss": 1.2157, + "step": 18012 + }, + { + "epoch": 5.365103594631322, + "grad_norm": 0.31827905774116516, + "learning_rate": 9.298372627657916e-06, + "loss": 1.2018, + "step": 18013 + }, + { + "epoch": 5.365401440830991, + "grad_norm": 0.2499566376209259, + "learning_rate": 9.297410441674943e-06, + "loss": 1.218, + "step": 18014 + }, + { + "epoch": 5.36569928703066, + "grad_norm": 0.2731861174106598, + "learning_rate": 9.296448262228786e-06, + "loss": 1.2339, + "step": 18015 + }, + { + "epoch": 5.365997133230328, + "grad_norm": 0.31170159578323364, + "learning_rate": 9.295486089328389e-06, + "loss": 1.2083, + "step": 18016 + }, + { + "epoch": 5.366294979429997, + "grad_norm": 0.3354482352733612, + "learning_rate": 9.294523922982704e-06, + "loss": 1.2025, + "step": 18017 + }, + { + "epoch": 5.366592825629666, + "grad_norm": 0.24018102884292603, + "learning_rate": 9.293561763200689e-06, + "loss": 1.2288, + "step": 18018 + }, + { + "epoch": 5.366890671829334, + "grad_norm": 0.2768344283103943, + "learning_rate": 9.292599609991294e-06, + "loss": 1.216, + "step": 18019 + }, + { + "epoch": 5.367188518029002, + "grad_norm": 0.2534888684749603, + "learning_rate": 9.291637463363465e-06, + "loss": 1.2002, + "step": 18020 + }, + { + "epoch": 5.367486364228672, + "grad_norm": 0.2585897743701935, + "learning_rate": 9.290675323326163e-06, + "loss": 1.201, + "step": 18021 + }, + { + "epoch": 5.36778421042834, + "grad_norm": 0.25801509618759155, + "learning_rate": 9.289713189888331e-06, + "loss": 1.2112, + "step": 18022 + }, + { + "epoch": 5.368082056628008, + "grad_norm": 0.2865597903728485, + "learning_rate": 9.288751063058922e-06, + "loss": 1.2266, + "step": 18023 + }, + { + "epoch": 5.3683799028276775, + "grad_norm": 0.3403066098690033, + "learning_rate": 9.287788942846895e-06, + "loss": 1.2114, + "step": 18024 + }, + { + "epoch": 5.368677749027346, + "grad_norm": 0.3064972162246704, + "learning_rate": 9.286826829261191e-06, + "loss": 1.227, + "step": 18025 + }, + { + "epoch": 5.368975595227015, + "grad_norm": 0.2508450448513031, + "learning_rate": 9.285864722310771e-06, + "loss": 1.2192, + "step": 18026 + }, + { + "epoch": 5.369273441426683, + "grad_norm": 0.2641198933124542, + "learning_rate": 9.284902622004583e-06, + "loss": 1.2297, + "step": 18027 + }, + { + "epoch": 5.369571287626352, + "grad_norm": 0.270623117685318, + "learning_rate": 9.283940528351572e-06, + "loss": 1.1957, + "step": 18028 + }, + { + "epoch": 5.369869133826021, + "grad_norm": 0.2816285192966461, + "learning_rate": 9.282978441360695e-06, + "loss": 1.2281, + "step": 18029 + }, + { + "epoch": 5.370166980025689, + "grad_norm": 0.265140563249588, + "learning_rate": 9.282016361040908e-06, + "loss": 1.219, + "step": 18030 + }, + { + "epoch": 5.370464826225358, + "grad_norm": 0.259594589471817, + "learning_rate": 9.28105428740115e-06, + "loss": 1.2095, + "step": 18031 + }, + { + "epoch": 5.370762672425027, + "grad_norm": 0.297926664352417, + "learning_rate": 9.280092220450383e-06, + "loss": 1.2337, + "step": 18032 + }, + { + "epoch": 5.371060518624695, + "grad_norm": 0.27153486013412476, + "learning_rate": 9.279130160197552e-06, + "loss": 1.2038, + "step": 18033 + }, + { + "epoch": 5.3713583648243635, + "grad_norm": 0.28715774416923523, + "learning_rate": 9.278168106651609e-06, + "loss": 1.1976, + "step": 18034 + }, + { + "epoch": 5.371656211024033, + "grad_norm": 0.3049171268939972, + "learning_rate": 9.277206059821505e-06, + "loss": 1.2263, + "step": 18035 + }, + { + "epoch": 5.371954057223701, + "grad_norm": 0.3039532005786896, + "learning_rate": 9.276244019716194e-06, + "loss": 1.2125, + "step": 18036 + }, + { + "epoch": 5.37225190342337, + "grad_norm": 0.2982765734195709, + "learning_rate": 9.27528198634462e-06, + "loss": 1.2146, + "step": 18037 + }, + { + "epoch": 5.3725497496230386, + "grad_norm": 0.253120094537735, + "learning_rate": 9.274319959715742e-06, + "loss": 1.2187, + "step": 18038 + }, + { + "epoch": 5.372847595822707, + "grad_norm": 0.26992347836494446, + "learning_rate": 9.273357939838499e-06, + "loss": 1.2178, + "step": 18039 + }, + { + "epoch": 5.373145442022376, + "grad_norm": 0.2781268060207367, + "learning_rate": 9.272395926721855e-06, + "loss": 1.2198, + "step": 18040 + }, + { + "epoch": 5.373443288222044, + "grad_norm": 0.2756299376487732, + "learning_rate": 9.271433920374751e-06, + "loss": 1.214, + "step": 18041 + }, + { + "epoch": 5.373741134421713, + "grad_norm": 0.2943572700023651, + "learning_rate": 9.270471920806138e-06, + "loss": 1.2098, + "step": 18042 + }, + { + "epoch": 5.374038980621382, + "grad_norm": 0.2673749625682831, + "learning_rate": 9.269509928024973e-06, + "loss": 1.1977, + "step": 18043 + }, + { + "epoch": 5.37433682682105, + "grad_norm": 0.2815070152282715, + "learning_rate": 9.268547942040204e-06, + "loss": 1.2069, + "step": 18044 + }, + { + "epoch": 5.374634673020719, + "grad_norm": 0.26712632179260254, + "learning_rate": 9.267585962860772e-06, + "loss": 1.2251, + "step": 18045 + }, + { + "epoch": 5.374932519220388, + "grad_norm": 0.2705835998058319, + "learning_rate": 9.266623990495637e-06, + "loss": 1.2116, + "step": 18046 + }, + { + "epoch": 5.375230365420056, + "grad_norm": 0.2494700849056244, + "learning_rate": 9.26566202495375e-06, + "loss": 1.214, + "step": 18047 + }, + { + "epoch": 5.3755282116197245, + "grad_norm": 0.29077938199043274, + "learning_rate": 9.264700066244053e-06, + "loss": 1.2048, + "step": 18048 + }, + { + "epoch": 5.375826057819394, + "grad_norm": 0.2771974802017212, + "learning_rate": 9.263738114375504e-06, + "loss": 1.1968, + "step": 18049 + }, + { + "epoch": 5.376123904019062, + "grad_norm": 0.3069455623626709, + "learning_rate": 9.26277616935705e-06, + "loss": 1.2043, + "step": 18050 + }, + { + "epoch": 5.37642175021873, + "grad_norm": 0.5123701691627502, + "learning_rate": 9.261814231197634e-06, + "loss": 1.2239, + "step": 18051 + }, + { + "epoch": 5.3767195964184, + "grad_norm": 0.461436003446579, + "learning_rate": 9.260852299906216e-06, + "loss": 1.2143, + "step": 18052 + }, + { + "epoch": 5.377017442618068, + "grad_norm": 0.3305855691432953, + "learning_rate": 9.259890375491742e-06, + "loss": 1.2199, + "step": 18053 + }, + { + "epoch": 5.377315288817737, + "grad_norm": 0.2641879618167877, + "learning_rate": 9.258928457963158e-06, + "loss": 1.2123, + "step": 18054 + }, + { + "epoch": 5.3776131350174055, + "grad_norm": 0.28682488203048706, + "learning_rate": 9.25796654732942e-06, + "loss": 1.204, + "step": 18055 + }, + { + "epoch": 5.377910981217074, + "grad_norm": 0.3019677698612213, + "learning_rate": 9.257004643599471e-06, + "loss": 1.1973, + "step": 18056 + }, + { + "epoch": 5.378208827416743, + "grad_norm": 0.24850806593894958, + "learning_rate": 9.256042746782267e-06, + "loss": 1.1932, + "step": 18057 + }, + { + "epoch": 5.378506673616411, + "grad_norm": 0.25137320160865784, + "learning_rate": 9.255080856886752e-06, + "loss": 1.1915, + "step": 18058 + }, + { + "epoch": 5.37880451981608, + "grad_norm": 0.29007431864738464, + "learning_rate": 9.254118973921877e-06, + "loss": 1.2136, + "step": 18059 + }, + { + "epoch": 5.379102366015749, + "grad_norm": 0.2802312672138214, + "learning_rate": 9.253157097896594e-06, + "loss": 1.1988, + "step": 18060 + }, + { + "epoch": 5.379400212215417, + "grad_norm": 0.36415979266166687, + "learning_rate": 9.252195228819851e-06, + "loss": 1.2182, + "step": 18061 + }, + { + "epoch": 5.3796980584150855, + "grad_norm": 0.5965977907180786, + "learning_rate": 9.251233366700591e-06, + "loss": 1.235, + "step": 18062 + }, + { + "epoch": 5.379995904614755, + "grad_norm": 0.38883692026138306, + "learning_rate": 9.250271511547772e-06, + "loss": 1.193, + "step": 18063 + }, + { + "epoch": 5.380293750814423, + "grad_norm": 0.3269845247268677, + "learning_rate": 9.249309663370337e-06, + "loss": 1.2292, + "step": 18064 + }, + { + "epoch": 5.380591597014092, + "grad_norm": 0.34158313274383545, + "learning_rate": 9.248347822177234e-06, + "loss": 1.2024, + "step": 18065 + }, + { + "epoch": 5.380889443213761, + "grad_norm": 0.31512251496315, + "learning_rate": 9.24738598797742e-06, + "loss": 1.2162, + "step": 18066 + }, + { + "epoch": 5.381187289413429, + "grad_norm": 0.2715738117694855, + "learning_rate": 9.246424160779837e-06, + "loss": 1.2141, + "step": 18067 + }, + { + "epoch": 5.381485135613098, + "grad_norm": 0.3289284110069275, + "learning_rate": 9.245462340593433e-06, + "loss": 1.2114, + "step": 18068 + }, + { + "epoch": 5.3817829818127665, + "grad_norm": 0.30728328227996826, + "learning_rate": 9.24450052742716e-06, + "loss": 1.2002, + "step": 18069 + }, + { + "epoch": 5.382080828012435, + "grad_norm": 0.2602955102920532, + "learning_rate": 9.243538721289967e-06, + "loss": 1.2094, + "step": 18070 + }, + { + "epoch": 5.382378674212104, + "grad_norm": 0.3036993145942688, + "learning_rate": 9.242576922190799e-06, + "loss": 1.2198, + "step": 18071 + }, + { + "epoch": 5.382676520411772, + "grad_norm": 0.27439242601394653, + "learning_rate": 9.241615130138609e-06, + "loss": 1.2206, + "step": 18072 + }, + { + "epoch": 5.382974366611441, + "grad_norm": 0.41943836212158203, + "learning_rate": 9.240653345142339e-06, + "loss": 1.2101, + "step": 18073 + }, + { + "epoch": 5.38327221281111, + "grad_norm": 0.30936744809150696, + "learning_rate": 9.239691567210944e-06, + "loss": 1.2138, + "step": 18074 + }, + { + "epoch": 5.383570059010778, + "grad_norm": 0.4905613958835602, + "learning_rate": 9.23872979635337e-06, + "loss": 1.2157, + "step": 18075 + }, + { + "epoch": 5.383867905210447, + "grad_norm": 0.461593359708786, + "learning_rate": 9.237768032578562e-06, + "loss": 1.2015, + "step": 18076 + }, + { + "epoch": 5.384165751410116, + "grad_norm": 0.2584933042526245, + "learning_rate": 9.236806275895474e-06, + "loss": 1.2083, + "step": 18077 + }, + { + "epoch": 5.384463597609784, + "grad_norm": 0.3259392976760864, + "learning_rate": 9.235844526313052e-06, + "loss": 1.2151, + "step": 18078 + }, + { + "epoch": 5.384761443809452, + "grad_norm": 0.26622921228408813, + "learning_rate": 9.234882783840237e-06, + "loss": 1.2024, + "step": 18079 + }, + { + "epoch": 5.385059290009122, + "grad_norm": 0.32340699434280396, + "learning_rate": 9.23392104848599e-06, + "loss": 1.2195, + "step": 18080 + }, + { + "epoch": 5.38535713620879, + "grad_norm": 0.26871415972709656, + "learning_rate": 9.232959320259246e-06, + "loss": 1.212, + "step": 18081 + }, + { + "epoch": 5.385654982408459, + "grad_norm": 0.2671147286891937, + "learning_rate": 9.231997599168958e-06, + "loss": 1.2365, + "step": 18082 + }, + { + "epoch": 5.3859528286081275, + "grad_norm": 0.2677510678768158, + "learning_rate": 9.231035885224078e-06, + "loss": 1.2061, + "step": 18083 + }, + { + "epoch": 5.386250674807796, + "grad_norm": 0.2720658779144287, + "learning_rate": 9.230074178433552e-06, + "loss": 1.2226, + "step": 18084 + }, + { + "epoch": 5.386548521007465, + "grad_norm": 0.29764074087142944, + "learning_rate": 9.22911247880632e-06, + "loss": 1.2092, + "step": 18085 + }, + { + "epoch": 5.386846367207133, + "grad_norm": 0.2718268632888794, + "learning_rate": 9.228150786351338e-06, + "loss": 1.2151, + "step": 18086 + }, + { + "epoch": 5.387144213406802, + "grad_norm": 0.2869923412799835, + "learning_rate": 9.22718910107755e-06, + "loss": 1.2134, + "step": 18087 + }, + { + "epoch": 5.387442059606471, + "grad_norm": 0.32346364855766296, + "learning_rate": 9.226227422993902e-06, + "loss": 1.2153, + "step": 18088 + }, + { + "epoch": 5.387739905806139, + "grad_norm": 0.29786375164985657, + "learning_rate": 9.225265752109348e-06, + "loss": 1.2364, + "step": 18089 + }, + { + "epoch": 5.388037752005808, + "grad_norm": 0.34467774629592896, + "learning_rate": 9.224304088432825e-06, + "loss": 1.2188, + "step": 18090 + }, + { + "epoch": 5.388335598205477, + "grad_norm": 0.3004086911678314, + "learning_rate": 9.223342431973291e-06, + "loss": 1.2172, + "step": 18091 + }, + { + "epoch": 5.388633444405145, + "grad_norm": 0.26614007353782654, + "learning_rate": 9.222380782739684e-06, + "loss": 1.2121, + "step": 18092 + }, + { + "epoch": 5.388931290604814, + "grad_norm": 0.33353137969970703, + "learning_rate": 9.221419140740956e-06, + "loss": 1.2229, + "step": 18093 + }, + { + "epoch": 5.389229136804483, + "grad_norm": 0.3915778696537018, + "learning_rate": 9.220457505986054e-06, + "loss": 1.2106, + "step": 18094 + }, + { + "epoch": 5.389526983004151, + "grad_norm": 0.4370432198047638, + "learning_rate": 9.219495878483926e-06, + "loss": 1.2036, + "step": 18095 + }, + { + "epoch": 5.38982482920382, + "grad_norm": 0.29130157828330994, + "learning_rate": 9.218534258243513e-06, + "loss": 1.1929, + "step": 18096 + }, + { + "epoch": 5.3901226754034886, + "grad_norm": 0.8444786667823792, + "learning_rate": 9.21757264527377e-06, + "loss": 1.2169, + "step": 18097 + }, + { + "epoch": 5.390420521603157, + "grad_norm": 0.5754273533821106, + "learning_rate": 9.216611039583634e-06, + "loss": 1.2039, + "step": 18098 + }, + { + "epoch": 5.390718367802826, + "grad_norm": 0.48834800720214844, + "learning_rate": 9.21564944118206e-06, + "loss": 1.2149, + "step": 18099 + }, + { + "epoch": 5.391016214002494, + "grad_norm": 0.5807204246520996, + "learning_rate": 9.21468785007799e-06, + "loss": 1.2197, + "step": 18100 + }, + { + "epoch": 5.391314060202163, + "grad_norm": 0.31268003582954407, + "learning_rate": 9.213726266280376e-06, + "loss": 1.2186, + "step": 18101 + }, + { + "epoch": 5.391611906401832, + "grad_norm": 0.3523191809654236, + "learning_rate": 9.212764689798154e-06, + "loss": 1.2136, + "step": 18102 + }, + { + "epoch": 5.3919097526015, + "grad_norm": 0.42938998341560364, + "learning_rate": 9.211803120640284e-06, + "loss": 1.1947, + "step": 18103 + }, + { + "epoch": 5.3922075988011695, + "grad_norm": 0.3200710117816925, + "learning_rate": 9.210841558815701e-06, + "loss": 1.1886, + "step": 18104 + }, + { + "epoch": 5.392505445000838, + "grad_norm": 0.5177934765815735, + "learning_rate": 9.209880004333356e-06, + "loss": 1.2166, + "step": 18105 + }, + { + "epoch": 5.392803291200506, + "grad_norm": 0.3521498143672943, + "learning_rate": 9.208918457202196e-06, + "loss": 1.2151, + "step": 18106 + }, + { + "epoch": 5.393101137400175, + "grad_norm": 0.34822359681129456, + "learning_rate": 9.207956917431164e-06, + "loss": 1.2223, + "step": 18107 + }, + { + "epoch": 5.393398983599844, + "grad_norm": 0.26184678077697754, + "learning_rate": 9.206995385029212e-06, + "loss": 1.2318, + "step": 18108 + }, + { + "epoch": 5.393696829799512, + "grad_norm": 0.37167298793792725, + "learning_rate": 9.20603386000528e-06, + "loss": 1.2011, + "step": 18109 + }, + { + "epoch": 5.393994675999181, + "grad_norm": 0.3147827088832855, + "learning_rate": 9.205072342368313e-06, + "loss": 1.1932, + "step": 18110 + }, + { + "epoch": 5.39429252219885, + "grad_norm": 0.3623134195804596, + "learning_rate": 9.204110832127262e-06, + "loss": 1.2324, + "step": 18111 + }, + { + "epoch": 5.394590368398518, + "grad_norm": 0.2865893244743347, + "learning_rate": 9.203149329291072e-06, + "loss": 1.2222, + "step": 18112 + }, + { + "epoch": 5.394888214598187, + "grad_norm": 0.3469681441783905, + "learning_rate": 9.202187833868684e-06, + "loss": 1.206, + "step": 18113 + }, + { + "epoch": 5.3951860607978555, + "grad_norm": 0.3164348304271698, + "learning_rate": 9.20122634586905e-06, + "loss": 1.2132, + "step": 18114 + }, + { + "epoch": 5.395483906997524, + "grad_norm": 0.49953708052635193, + "learning_rate": 9.200264865301112e-06, + "loss": 1.2122, + "step": 18115 + }, + { + "epoch": 5.395781753197193, + "grad_norm": 0.25462424755096436, + "learning_rate": 9.199303392173813e-06, + "loss": 1.2072, + "step": 18116 + }, + { + "epoch": 5.396079599396861, + "grad_norm": 0.526871919631958, + "learning_rate": 9.198341926496103e-06, + "loss": 1.2332, + "step": 18117 + }, + { + "epoch": 5.39637744559653, + "grad_norm": 0.3667275607585907, + "learning_rate": 9.197380468276928e-06, + "loss": 1.2296, + "step": 18118 + }, + { + "epoch": 5.396675291796199, + "grad_norm": 0.4166170358657837, + "learning_rate": 9.196419017525226e-06, + "loss": 1.2276, + "step": 18119 + }, + { + "epoch": 5.396973137995867, + "grad_norm": 0.4676266610622406, + "learning_rate": 9.195457574249952e-06, + "loss": 1.2217, + "step": 18120 + }, + { + "epoch": 5.397270984195536, + "grad_norm": 0.26535314321517944, + "learning_rate": 9.194496138460045e-06, + "loss": 1.2196, + "step": 18121 + }, + { + "epoch": 5.397568830395205, + "grad_norm": 0.3117094039916992, + "learning_rate": 9.19353471016445e-06, + "loss": 1.2165, + "step": 18122 + }, + { + "epoch": 5.397866676594873, + "grad_norm": 0.2659699022769928, + "learning_rate": 9.192573289372115e-06, + "loss": 1.2111, + "step": 18123 + }, + { + "epoch": 5.398164522794542, + "grad_norm": 0.290687620639801, + "learning_rate": 9.19161187609198e-06, + "loss": 1.2025, + "step": 18124 + }, + { + "epoch": 5.398462368994211, + "grad_norm": 0.2657442092895508, + "learning_rate": 9.190650470332998e-06, + "loss": 1.1948, + "step": 18125 + }, + { + "epoch": 5.398760215193879, + "grad_norm": 0.27361974120140076, + "learning_rate": 9.18968907210411e-06, + "loss": 1.2114, + "step": 18126 + }, + { + "epoch": 5.399058061393548, + "grad_norm": 0.2510913908481598, + "learning_rate": 9.188727681414253e-06, + "loss": 1.2179, + "step": 18127 + }, + { + "epoch": 5.3993559075932165, + "grad_norm": 0.27142348885536194, + "learning_rate": 9.187766298272382e-06, + "loss": 1.2206, + "step": 18128 + }, + { + "epoch": 5.399653753792885, + "grad_norm": 0.29261237382888794, + "learning_rate": 9.18680492268744e-06, + "loss": 1.217, + "step": 18129 + }, + { + "epoch": 5.399951599992554, + "grad_norm": 0.26918327808380127, + "learning_rate": 9.185843554668367e-06, + "loss": 1.2178, + "step": 18130 + }, + { + "epoch": 5.400249446192222, + "grad_norm": 0.26643863320350647, + "learning_rate": 9.184882194224114e-06, + "loss": 1.2121, + "step": 18131 + }, + { + "epoch": 5.400547292391892, + "grad_norm": 0.38250744342803955, + "learning_rate": 9.18392084136362e-06, + "loss": 1.1919, + "step": 18132 + }, + { + "epoch": 5.40084513859156, + "grad_norm": 0.30113449692726135, + "learning_rate": 9.182959496095828e-06, + "loss": 1.2066, + "step": 18133 + }, + { + "epoch": 5.401142984791228, + "grad_norm": 0.2831449806690216, + "learning_rate": 9.181998158429687e-06, + "loss": 1.218, + "step": 18134 + }, + { + "epoch": 5.4014408309908974, + "grad_norm": 0.2621733248233795, + "learning_rate": 9.18103682837414e-06, + "loss": 1.2143, + "step": 18135 + }, + { + "epoch": 5.401738677190566, + "grad_norm": 0.31273043155670166, + "learning_rate": 9.180075505938128e-06, + "loss": 1.2174, + "step": 18136 + }, + { + "epoch": 5.402036523390234, + "grad_norm": 0.2662215828895569, + "learning_rate": 9.179114191130601e-06, + "loss": 1.209, + "step": 18137 + }, + { + "epoch": 5.402334369589903, + "grad_norm": 0.27449601888656616, + "learning_rate": 9.178152883960499e-06, + "loss": 1.2049, + "step": 18138 + }, + { + "epoch": 5.402632215789572, + "grad_norm": 0.27994704246520996, + "learning_rate": 9.177191584436761e-06, + "loss": 1.2015, + "step": 18139 + }, + { + "epoch": 5.40293006198924, + "grad_norm": 0.2788822650909424, + "learning_rate": 9.176230292568338e-06, + "loss": 1.1974, + "step": 18140 + }, + { + "epoch": 5.403227908188909, + "grad_norm": 0.2680191099643707, + "learning_rate": 9.175269008364171e-06, + "loss": 1.2183, + "step": 18141 + }, + { + "epoch": 5.4035257543885775, + "grad_norm": 0.2693394720554352, + "learning_rate": 9.174307731833209e-06, + "loss": 1.2179, + "step": 18142 + }, + { + "epoch": 5.403823600588246, + "grad_norm": 0.29945388436317444, + "learning_rate": 9.173346462984389e-06, + "loss": 1.2279, + "step": 18143 + }, + { + "epoch": 5.404121446787915, + "grad_norm": 0.35322317481040955, + "learning_rate": 9.172385201826655e-06, + "loss": 1.2076, + "step": 18144 + }, + { + "epoch": 5.404419292987583, + "grad_norm": 0.2653945982456207, + "learning_rate": 9.171423948368953e-06, + "loss": 1.2353, + "step": 18145 + }, + { + "epoch": 5.404717139187252, + "grad_norm": 0.29993048310279846, + "learning_rate": 9.170462702620226e-06, + "loss": 1.2341, + "step": 18146 + }, + { + "epoch": 5.405014985386921, + "grad_norm": 0.2617839574813843, + "learning_rate": 9.169501464589415e-06, + "loss": 1.1914, + "step": 18147 + }, + { + "epoch": 5.405312831586589, + "grad_norm": 0.3259333670139313, + "learning_rate": 9.168540234285467e-06, + "loss": 1.2127, + "step": 18148 + }, + { + "epoch": 5.4056106777862585, + "grad_norm": 0.27893179655075073, + "learning_rate": 9.167579011717325e-06, + "loss": 1.2069, + "step": 18149 + }, + { + "epoch": 5.405908523985927, + "grad_norm": 0.4722149968147278, + "learning_rate": 9.166617796893924e-06, + "loss": 1.2231, + "step": 18150 + }, + { + "epoch": 5.406206370185595, + "grad_norm": 0.38504910469055176, + "learning_rate": 9.165656589824217e-06, + "loss": 1.2233, + "step": 18151 + }, + { + "epoch": 5.406504216385264, + "grad_norm": 0.32826289534568787, + "learning_rate": 9.164695390517146e-06, + "loss": 1.2198, + "step": 18152 + }, + { + "epoch": 5.406802062584933, + "grad_norm": 0.5815978646278381, + "learning_rate": 9.163734198981646e-06, + "loss": 1.2187, + "step": 18153 + }, + { + "epoch": 5.407099908784601, + "grad_norm": 0.3500232398509979, + "learning_rate": 9.162773015226668e-06, + "loss": 1.2191, + "step": 18154 + }, + { + "epoch": 5.40739775498427, + "grad_norm": 0.35605713725090027, + "learning_rate": 9.161811839261148e-06, + "loss": 1.2168, + "step": 18155 + }, + { + "epoch": 5.4076956011839385, + "grad_norm": 0.25141337513923645, + "learning_rate": 9.160850671094039e-06, + "loss": 1.1975, + "step": 18156 + }, + { + "epoch": 5.407993447383607, + "grad_norm": 0.528760552406311, + "learning_rate": 9.159889510734272e-06, + "loss": 1.2074, + "step": 18157 + }, + { + "epoch": 5.408291293583276, + "grad_norm": 0.33496665954589844, + "learning_rate": 9.158928358190795e-06, + "loss": 1.2165, + "step": 18158 + }, + { + "epoch": 5.408589139782944, + "grad_norm": 0.41628533601760864, + "learning_rate": 9.157967213472551e-06, + "loss": 1.2201, + "step": 18159 + }, + { + "epoch": 5.408886985982614, + "grad_norm": 0.2675477862358093, + "learning_rate": 9.157006076588482e-06, + "loss": 1.2163, + "step": 18160 + }, + { + "epoch": 5.409184832182282, + "grad_norm": 0.5859951376914978, + "learning_rate": 9.156044947547527e-06, + "loss": 1.2208, + "step": 18161 + }, + { + "epoch": 5.40948267838195, + "grad_norm": 0.29776814579963684, + "learning_rate": 9.155083826358633e-06, + "loss": 1.2189, + "step": 18162 + }, + { + "epoch": 5.4097805245816195, + "grad_norm": 0.49345192313194275, + "learning_rate": 9.15412271303074e-06, + "loss": 1.2356, + "step": 18163 + }, + { + "epoch": 5.410078370781288, + "grad_norm": 0.30862030386924744, + "learning_rate": 9.153161607572787e-06, + "loss": 1.2398, + "step": 18164 + }, + { + "epoch": 5.410376216980956, + "grad_norm": 0.43222489953041077, + "learning_rate": 9.152200509993723e-06, + "loss": 1.2104, + "step": 18165 + }, + { + "epoch": 5.410674063180625, + "grad_norm": 0.39378243684768677, + "learning_rate": 9.151239420302486e-06, + "loss": 1.2025, + "step": 18166 + }, + { + "epoch": 5.410971909380294, + "grad_norm": 0.46720996499061584, + "learning_rate": 9.150278338508014e-06, + "loss": 1.2142, + "step": 18167 + }, + { + "epoch": 5.411269755579962, + "grad_norm": 0.45590829849243164, + "learning_rate": 9.149317264619258e-06, + "loss": 1.2117, + "step": 18168 + }, + { + "epoch": 5.411567601779631, + "grad_norm": 0.35495367646217346, + "learning_rate": 9.14835619864515e-06, + "loss": 1.2097, + "step": 18169 + }, + { + "epoch": 5.4118654479793, + "grad_norm": 0.5380088090896606, + "learning_rate": 9.147395140594636e-06, + "loss": 1.2135, + "step": 18170 + }, + { + "epoch": 5.412163294178969, + "grad_norm": 0.28789058327674866, + "learning_rate": 9.146434090476662e-06, + "loss": 1.2117, + "step": 18171 + }, + { + "epoch": 5.412461140378637, + "grad_norm": 0.526710033416748, + "learning_rate": 9.14547304830016e-06, + "loss": 1.2172, + "step": 18172 + }, + { + "epoch": 5.4127589865783055, + "grad_norm": 0.4094356596469879, + "learning_rate": 9.14451201407408e-06, + "loss": 1.2259, + "step": 18173 + }, + { + "epoch": 5.413056832777975, + "grad_norm": 0.6341694593429565, + "learning_rate": 9.143550987807362e-06, + "loss": 1.2236, + "step": 18174 + }, + { + "epoch": 5.413354678977643, + "grad_norm": 0.3391723036766052, + "learning_rate": 9.142589969508939e-06, + "loss": 1.2087, + "step": 18175 + }, + { + "epoch": 5.413652525177311, + "grad_norm": 0.3829106092453003, + "learning_rate": 9.14162895918776e-06, + "loss": 1.2058, + "step": 18176 + }, + { + "epoch": 5.4139503713769805, + "grad_norm": 0.4187778830528259, + "learning_rate": 9.14066795685277e-06, + "loss": 1.2121, + "step": 18177 + }, + { + "epoch": 5.414248217576649, + "grad_norm": 0.3036881387233734, + "learning_rate": 9.139706962512896e-06, + "loss": 1.2073, + "step": 18178 + }, + { + "epoch": 5.414546063776317, + "grad_norm": 0.5648795366287231, + "learning_rate": 9.138745976177095e-06, + "loss": 1.2092, + "step": 18179 + }, + { + "epoch": 5.414843909975986, + "grad_norm": 0.3204457461833954, + "learning_rate": 9.137784997854296e-06, + "loss": 1.2077, + "step": 18180 + }, + { + "epoch": 5.415141756175655, + "grad_norm": 0.34159207344055176, + "learning_rate": 9.136824027553446e-06, + "loss": 1.2049, + "step": 18181 + }, + { + "epoch": 5.415439602375323, + "grad_norm": 0.3680896759033203, + "learning_rate": 9.135863065283483e-06, + "loss": 1.2051, + "step": 18182 + }, + { + "epoch": 5.415737448574992, + "grad_norm": 0.2764698565006256, + "learning_rate": 9.134902111053351e-06, + "loss": 1.2079, + "step": 18183 + }, + { + "epoch": 5.416035294774661, + "grad_norm": 0.3397405445575714, + "learning_rate": 9.133941164871986e-06, + "loss": 1.2207, + "step": 18184 + }, + { + "epoch": 5.416333140974329, + "grad_norm": 0.4269631505012512, + "learning_rate": 9.132980226748332e-06, + "loss": 1.2093, + "step": 18185 + }, + { + "epoch": 5.416630987173998, + "grad_norm": 0.26480230689048767, + "learning_rate": 9.132019296691328e-06, + "loss": 1.2033, + "step": 18186 + }, + { + "epoch": 5.4169288333736665, + "grad_norm": 0.4031720459461212, + "learning_rate": 9.131058374709913e-06, + "loss": 1.2122, + "step": 18187 + }, + { + "epoch": 5.417226679573336, + "grad_norm": 0.31415700912475586, + "learning_rate": 9.130097460813033e-06, + "loss": 1.2125, + "step": 18188 + }, + { + "epoch": 5.417524525773004, + "grad_norm": 0.31522077322006226, + "learning_rate": 9.129136555009618e-06, + "loss": 1.226, + "step": 18189 + }, + { + "epoch": 5.417822371972672, + "grad_norm": 0.38622716069221497, + "learning_rate": 9.12817565730862e-06, + "loss": 1.2163, + "step": 18190 + }, + { + "epoch": 5.418120218172342, + "grad_norm": 0.3281724452972412, + "learning_rate": 9.127214767718974e-06, + "loss": 1.2082, + "step": 18191 + }, + { + "epoch": 5.41841806437201, + "grad_norm": 0.4519134759902954, + "learning_rate": 9.126253886249615e-06, + "loss": 1.2136, + "step": 18192 + }, + { + "epoch": 5.418715910571678, + "grad_norm": 0.2930494248867035, + "learning_rate": 9.125293012909488e-06, + "loss": 1.211, + "step": 18193 + }, + { + "epoch": 5.4190137567713474, + "grad_norm": 0.3956511616706848, + "learning_rate": 9.124332147707536e-06, + "loss": 1.1966, + "step": 18194 + }, + { + "epoch": 5.419311602971016, + "grad_norm": 0.2750784456729889, + "learning_rate": 9.12337129065269e-06, + "loss": 1.2092, + "step": 18195 + }, + { + "epoch": 5.419609449170684, + "grad_norm": 0.28932952880859375, + "learning_rate": 9.1224104417539e-06, + "loss": 1.2123, + "step": 18196 + }, + { + "epoch": 5.419907295370353, + "grad_norm": 0.3183478116989136, + "learning_rate": 9.1214496010201e-06, + "loss": 1.2372, + "step": 18197 + }, + { + "epoch": 5.420205141570022, + "grad_norm": 0.334583044052124, + "learning_rate": 9.120488768460225e-06, + "loss": 1.2048, + "step": 18198 + }, + { + "epoch": 5.420502987769691, + "grad_norm": 0.283008873462677, + "learning_rate": 9.119527944083221e-06, + "loss": 1.2254, + "step": 18199 + }, + { + "epoch": 5.420800833969359, + "grad_norm": 0.2623262107372284, + "learning_rate": 9.118567127898029e-06, + "loss": 1.2186, + "step": 18200 + }, + { + "epoch": 5.4210986801690275, + "grad_norm": 0.24411393702030182, + "learning_rate": 9.11760631991358e-06, + "loss": 1.1984, + "step": 18201 + }, + { + "epoch": 5.421396526368697, + "grad_norm": 0.29732614755630493, + "learning_rate": 9.116645520138825e-06, + "loss": 1.2079, + "step": 18202 + }, + { + "epoch": 5.421694372568365, + "grad_norm": 0.2932446300983429, + "learning_rate": 9.11568472858269e-06, + "loss": 1.2149, + "step": 18203 + }, + { + "epoch": 5.421992218768033, + "grad_norm": 0.2652064561843872, + "learning_rate": 9.114723945254124e-06, + "loss": 1.2271, + "step": 18204 + }, + { + "epoch": 5.422290064967703, + "grad_norm": 0.2633882761001587, + "learning_rate": 9.113763170162063e-06, + "loss": 1.2182, + "step": 18205 + }, + { + "epoch": 5.422587911167371, + "grad_norm": 0.2902883291244507, + "learning_rate": 9.11280240331544e-06, + "loss": 1.2125, + "step": 18206 + }, + { + "epoch": 5.422885757367039, + "grad_norm": 0.27402180433273315, + "learning_rate": 9.111841644723206e-06, + "loss": 1.2167, + "step": 18207 + }, + { + "epoch": 5.4231836035667085, + "grad_norm": 0.26779574155807495, + "learning_rate": 9.110880894394293e-06, + "loss": 1.2106, + "step": 18208 + }, + { + "epoch": 5.423481449766377, + "grad_norm": 0.31007951498031616, + "learning_rate": 9.109920152337636e-06, + "loss": 1.217, + "step": 18209 + }, + { + "epoch": 5.423779295966045, + "grad_norm": 0.32808202505111694, + "learning_rate": 9.108959418562181e-06, + "loss": 1.2104, + "step": 18210 + }, + { + "epoch": 5.424077142165714, + "grad_norm": 0.2879560589790344, + "learning_rate": 9.10799869307686e-06, + "loss": 1.2195, + "step": 18211 + }, + { + "epoch": 5.424374988365383, + "grad_norm": 0.33658215403556824, + "learning_rate": 9.107037975890615e-06, + "loss": 1.2163, + "step": 18212 + }, + { + "epoch": 5.424672834565051, + "grad_norm": 0.25510627031326294, + "learning_rate": 9.106077267012386e-06, + "loss": 1.2117, + "step": 18213 + }, + { + "epoch": 5.42497068076472, + "grad_norm": 0.3268873989582062, + "learning_rate": 9.10511656645111e-06, + "loss": 1.2161, + "step": 18214 + }, + { + "epoch": 5.4252685269643885, + "grad_norm": 0.2651354968547821, + "learning_rate": 9.10415587421572e-06, + "loss": 1.2273, + "step": 18215 + }, + { + "epoch": 5.425566373164058, + "grad_norm": 0.2550216317176819, + "learning_rate": 9.103195190315163e-06, + "loss": 1.2082, + "step": 18216 + }, + { + "epoch": 5.425864219363726, + "grad_norm": 0.3427906334400177, + "learning_rate": 9.102234514758372e-06, + "loss": 1.1958, + "step": 18217 + }, + { + "epoch": 5.426162065563394, + "grad_norm": 0.30091381072998047, + "learning_rate": 9.101273847554282e-06, + "loss": 1.185, + "step": 18218 + }, + { + "epoch": 5.426459911763064, + "grad_norm": 0.45391273498535156, + "learning_rate": 9.100313188711841e-06, + "loss": 1.2302, + "step": 18219 + }, + { + "epoch": 5.426757757962732, + "grad_norm": 0.4105113446712494, + "learning_rate": 9.09935253823998e-06, + "loss": 1.2077, + "step": 18220 + }, + { + "epoch": 5.4270556041624, + "grad_norm": 0.3463220000267029, + "learning_rate": 9.098391896147632e-06, + "loss": 1.2034, + "step": 18221 + }, + { + "epoch": 5.4273534503620695, + "grad_norm": 0.28957387804985046, + "learning_rate": 9.097431262443742e-06, + "loss": 1.219, + "step": 18222 + }, + { + "epoch": 5.427651296561738, + "grad_norm": 0.7690960764884949, + "learning_rate": 9.096470637137244e-06, + "loss": 1.1962, + "step": 18223 + }, + { + "epoch": 5.427949142761406, + "grad_norm": 0.8276326060295105, + "learning_rate": 9.095510020237081e-06, + "loss": 1.2095, + "step": 18224 + }, + { + "epoch": 5.428246988961075, + "grad_norm": 0.4270513355731964, + "learning_rate": 9.094549411752189e-06, + "loss": 1.2106, + "step": 18225 + }, + { + "epoch": 5.428544835160744, + "grad_norm": 0.6832343935966492, + "learning_rate": 9.093588811691496e-06, + "loss": 1.2024, + "step": 18226 + }, + { + "epoch": 5.428842681360413, + "grad_norm": 0.35265395045280457, + "learning_rate": 9.092628220063952e-06, + "loss": 1.2139, + "step": 18227 + }, + { + "epoch": 5.429140527560081, + "grad_norm": 0.4333440959453583, + "learning_rate": 9.091667636878485e-06, + "loss": 1.2171, + "step": 18228 + }, + { + "epoch": 5.42943837375975, + "grad_norm": 0.38746553659439087, + "learning_rate": 9.090707062144036e-06, + "loss": 1.2053, + "step": 18229 + }, + { + "epoch": 5.429736219959419, + "grad_norm": 0.27784475684165955, + "learning_rate": 9.089746495869546e-06, + "loss": 1.2268, + "step": 18230 + }, + { + "epoch": 5.430034066159087, + "grad_norm": 0.4725354313850403, + "learning_rate": 9.088785938063946e-06, + "loss": 1.2313, + "step": 18231 + }, + { + "epoch": 5.4303319123587555, + "grad_norm": 0.3004723787307739, + "learning_rate": 9.087825388736172e-06, + "loss": 1.2157, + "step": 18232 + }, + { + "epoch": 5.430629758558425, + "grad_norm": 0.4051141142845154, + "learning_rate": 9.086864847895167e-06, + "loss": 1.2036, + "step": 18233 + }, + { + "epoch": 5.430927604758093, + "grad_norm": 0.2720029056072235, + "learning_rate": 9.085904315549863e-06, + "loss": 1.2042, + "step": 18234 + }, + { + "epoch": 5.431225450957761, + "grad_norm": 0.3584952652454376, + "learning_rate": 9.084943791709195e-06, + "loss": 1.2117, + "step": 18235 + }, + { + "epoch": 5.4315232971574305, + "grad_norm": 0.2827838063240051, + "learning_rate": 9.083983276382107e-06, + "loss": 1.2068, + "step": 18236 + }, + { + "epoch": 5.431821143357099, + "grad_norm": 0.28493532538414, + "learning_rate": 9.083022769577533e-06, + "loss": 1.2201, + "step": 18237 + }, + { + "epoch": 5.432118989556768, + "grad_norm": 0.31898900866508484, + "learning_rate": 9.082062271304402e-06, + "loss": 1.23, + "step": 18238 + }, + { + "epoch": 5.432416835756436, + "grad_norm": 0.25817587971687317, + "learning_rate": 9.081101781571657e-06, + "loss": 1.2027, + "step": 18239 + }, + { + "epoch": 5.432714681956105, + "grad_norm": 0.295402467250824, + "learning_rate": 9.080141300388236e-06, + "loss": 1.2, + "step": 18240 + }, + { + "epoch": 5.433012528155774, + "grad_norm": 0.3077404797077179, + "learning_rate": 9.07918082776307e-06, + "loss": 1.2176, + "step": 18241 + }, + { + "epoch": 5.433310374355442, + "grad_norm": 0.31870508193969727, + "learning_rate": 9.0782203637051e-06, + "loss": 1.2089, + "step": 18242 + }, + { + "epoch": 5.433608220555111, + "grad_norm": 0.287725567817688, + "learning_rate": 9.077259908223258e-06, + "loss": 1.2217, + "step": 18243 + }, + { + "epoch": 5.43390606675478, + "grad_norm": 0.6407656669616699, + "learning_rate": 9.076299461326483e-06, + "loss": 1.2058, + "step": 18244 + }, + { + "epoch": 5.434203912954448, + "grad_norm": 0.4119032621383667, + "learning_rate": 9.075339023023708e-06, + "loss": 1.2338, + "step": 18245 + }, + { + "epoch": 5.4345017591541165, + "grad_norm": 0.3540785312652588, + "learning_rate": 9.074378593323871e-06, + "loss": 1.2247, + "step": 18246 + }, + { + "epoch": 5.434799605353786, + "grad_norm": 0.38739004731178284, + "learning_rate": 9.073418172235906e-06, + "loss": 1.2142, + "step": 18247 + }, + { + "epoch": 5.435097451553454, + "grad_norm": 0.2594635486602783, + "learning_rate": 9.072457759768752e-06, + "loss": 1.2107, + "step": 18248 + }, + { + "epoch": 5.435395297753122, + "grad_norm": 0.34582409262657166, + "learning_rate": 9.071497355931338e-06, + "loss": 1.2054, + "step": 18249 + }, + { + "epoch": 5.435693143952792, + "grad_norm": 0.2676573097705841, + "learning_rate": 9.070536960732608e-06, + "loss": 1.2036, + "step": 18250 + }, + { + "epoch": 5.43599099015246, + "grad_norm": 0.28851285576820374, + "learning_rate": 9.069576574181492e-06, + "loss": 1.2087, + "step": 18251 + }, + { + "epoch": 5.436288836352128, + "grad_norm": 0.2775110900402069, + "learning_rate": 9.068616196286923e-06, + "loss": 1.2247, + "step": 18252 + }, + { + "epoch": 5.4365866825517974, + "grad_norm": 0.2745726406574249, + "learning_rate": 9.067655827057845e-06, + "loss": 1.2229, + "step": 18253 + }, + { + "epoch": 5.436884528751466, + "grad_norm": 0.28324687480926514, + "learning_rate": 9.066695466503188e-06, + "loss": 1.2208, + "step": 18254 + }, + { + "epoch": 5.437182374951135, + "grad_norm": 0.3074977695941925, + "learning_rate": 9.065735114631882e-06, + "loss": 1.2083, + "step": 18255 + }, + { + "epoch": 5.437480221150803, + "grad_norm": 0.28714606165885925, + "learning_rate": 9.06477477145287e-06, + "loss": 1.2214, + "step": 18256 + }, + { + "epoch": 5.437778067350472, + "grad_norm": 0.3407638370990753, + "learning_rate": 9.063814436975083e-06, + "loss": 1.2245, + "step": 18257 + }, + { + "epoch": 5.438075913550141, + "grad_norm": 0.2992120087146759, + "learning_rate": 9.062854111207455e-06, + "loss": 1.203, + "step": 18258 + }, + { + "epoch": 5.438373759749809, + "grad_norm": 0.41462570428848267, + "learning_rate": 9.061893794158928e-06, + "loss": 1.2146, + "step": 18259 + }, + { + "epoch": 5.4386716059494775, + "grad_norm": 0.26136377453804016, + "learning_rate": 9.060933485838424e-06, + "loss": 1.2159, + "step": 18260 + }, + { + "epoch": 5.438969452149147, + "grad_norm": 0.3173859417438507, + "learning_rate": 9.05997318625489e-06, + "loss": 1.2015, + "step": 18261 + }, + { + "epoch": 5.439267298348815, + "grad_norm": 0.2862769067287445, + "learning_rate": 9.059012895417253e-06, + "loss": 1.2199, + "step": 18262 + }, + { + "epoch": 5.439565144548483, + "grad_norm": 0.4170054495334625, + "learning_rate": 9.05805261333445e-06, + "loss": 1.2215, + "step": 18263 + }, + { + "epoch": 5.439862990748153, + "grad_norm": 0.2662082016468048, + "learning_rate": 9.057092340015415e-06, + "loss": 1.2128, + "step": 18264 + }, + { + "epoch": 5.440160836947821, + "grad_norm": 0.2805964946746826, + "learning_rate": 9.056132075469084e-06, + "loss": 1.2208, + "step": 18265 + }, + { + "epoch": 5.44045868314749, + "grad_norm": 0.26170092821121216, + "learning_rate": 9.055171819704386e-06, + "loss": 1.2281, + "step": 18266 + }, + { + "epoch": 5.4407565293471585, + "grad_norm": 0.2577800452709198, + "learning_rate": 9.054211572730262e-06, + "loss": 1.2095, + "step": 18267 + }, + { + "epoch": 5.441054375546827, + "grad_norm": 0.3083866238594055, + "learning_rate": 9.053251334555642e-06, + "loss": 1.2248, + "step": 18268 + }, + { + "epoch": 5.441352221746496, + "grad_norm": 0.3356550335884094, + "learning_rate": 9.05229110518946e-06, + "loss": 1.2078, + "step": 18269 + }, + { + "epoch": 5.441650067946164, + "grad_norm": 0.3031206429004669, + "learning_rate": 9.051330884640649e-06, + "loss": 1.2214, + "step": 18270 + }, + { + "epoch": 5.441947914145833, + "grad_norm": 0.45580852031707764, + "learning_rate": 9.050370672918142e-06, + "loss": 1.2097, + "step": 18271 + }, + { + "epoch": 5.442245760345502, + "grad_norm": 0.2769217789173126, + "learning_rate": 9.049410470030882e-06, + "loss": 1.2442, + "step": 18272 + }, + { + "epoch": 5.44254360654517, + "grad_norm": 0.26227572560310364, + "learning_rate": 9.048450275987793e-06, + "loss": 1.2209, + "step": 18273 + }, + { + "epoch": 5.4428414527448385, + "grad_norm": 0.3485429883003235, + "learning_rate": 9.047490090797807e-06, + "loss": 1.1998, + "step": 18274 + }, + { + "epoch": 5.443139298944508, + "grad_norm": 0.27909791469573975, + "learning_rate": 9.046529914469865e-06, + "loss": 1.2104, + "step": 18275 + }, + { + "epoch": 5.443437145144176, + "grad_norm": 0.37177976965904236, + "learning_rate": 9.045569747012899e-06, + "loss": 1.2253, + "step": 18276 + }, + { + "epoch": 5.443734991343844, + "grad_norm": 0.3017202913761139, + "learning_rate": 9.044609588435833e-06, + "loss": 1.2138, + "step": 18277 + }, + { + "epoch": 5.444032837543514, + "grad_norm": 0.5364689826965332, + "learning_rate": 9.043649438747615e-06, + "loss": 1.2166, + "step": 18278 + }, + { + "epoch": 5.444330683743182, + "grad_norm": 0.6448546051979065, + "learning_rate": 9.042689297957169e-06, + "loss": 1.221, + "step": 18279 + }, + { + "epoch": 5.44462852994285, + "grad_norm": 0.31614118814468384, + "learning_rate": 9.041729166073425e-06, + "loss": 1.2145, + "step": 18280 + }, + { + "epoch": 5.4449263761425195, + "grad_norm": 0.7211610078811646, + "learning_rate": 9.040769043105324e-06, + "loss": 1.2261, + "step": 18281 + }, + { + "epoch": 5.445224222342188, + "grad_norm": 0.2672927677631378, + "learning_rate": 9.039808929061794e-06, + "loss": 1.2147, + "step": 18282 + }, + { + "epoch": 5.445522068541857, + "grad_norm": 0.4884949028491974, + "learning_rate": 9.038848823951767e-06, + "loss": 1.2154, + "step": 18283 + }, + { + "epoch": 5.445819914741525, + "grad_norm": 0.5389834642410278, + "learning_rate": 9.037888727784184e-06, + "loss": 1.2121, + "step": 18284 + }, + { + "epoch": 5.446117760941194, + "grad_norm": 0.5120760798454285, + "learning_rate": 9.03692864056797e-06, + "loss": 1.2138, + "step": 18285 + }, + { + "epoch": 5.446415607140863, + "grad_norm": 0.43643325567245483, + "learning_rate": 9.035968562312053e-06, + "loss": 1.2054, + "step": 18286 + }, + { + "epoch": 5.446713453340531, + "grad_norm": 0.3008381724357605, + "learning_rate": 9.035008493025376e-06, + "loss": 1.2004, + "step": 18287 + }, + { + "epoch": 5.4470112995402, + "grad_norm": 0.3887217044830322, + "learning_rate": 9.034048432716864e-06, + "loss": 1.2344, + "step": 18288 + }, + { + "epoch": 5.447309145739869, + "grad_norm": 0.3350813388824463, + "learning_rate": 9.033088381395457e-06, + "loss": 1.205, + "step": 18289 + }, + { + "epoch": 5.447606991939537, + "grad_norm": 0.4774436950683594, + "learning_rate": 9.032128339070082e-06, + "loss": 1.2161, + "step": 18290 + }, + { + "epoch": 5.4479048381392055, + "grad_norm": 0.278996080160141, + "learning_rate": 9.031168305749665e-06, + "loss": 1.2103, + "step": 18291 + }, + { + "epoch": 5.448202684338875, + "grad_norm": 0.38473767042160034, + "learning_rate": 9.03020828144315e-06, + "loss": 1.2301, + "step": 18292 + }, + { + "epoch": 5.448500530538543, + "grad_norm": 0.298493891954422, + "learning_rate": 9.029248266159463e-06, + "loss": 1.2248, + "step": 18293 + }, + { + "epoch": 5.448798376738212, + "grad_norm": 0.3384168744087219, + "learning_rate": 9.028288259907532e-06, + "loss": 1.1965, + "step": 18294 + }, + { + "epoch": 5.4490962229378805, + "grad_norm": 0.3702334761619568, + "learning_rate": 9.027328262696299e-06, + "loss": 1.2223, + "step": 18295 + }, + { + "epoch": 5.449394069137549, + "grad_norm": 0.30994391441345215, + "learning_rate": 9.026368274534687e-06, + "loss": 1.2046, + "step": 18296 + }, + { + "epoch": 5.449691915337218, + "grad_norm": 0.42713069915771484, + "learning_rate": 9.02540829543163e-06, + "loss": 1.2071, + "step": 18297 + }, + { + "epoch": 5.449989761536886, + "grad_norm": 0.4066449999809265, + "learning_rate": 9.024448325396061e-06, + "loss": 1.218, + "step": 18298 + }, + { + "epoch": 5.450287607736555, + "grad_norm": 0.26543983817100525, + "learning_rate": 9.02348836443691e-06, + "loss": 1.2051, + "step": 18299 + }, + { + "epoch": 5.450585453936224, + "grad_norm": 0.2768924832344055, + "learning_rate": 9.022528412563106e-06, + "loss": 1.2095, + "step": 18300 + }, + { + "epoch": 5.450883300135892, + "grad_norm": 0.3367660343647003, + "learning_rate": 9.021568469783588e-06, + "loss": 1.2271, + "step": 18301 + }, + { + "epoch": 5.451181146335561, + "grad_norm": 0.48054754734039307, + "learning_rate": 9.020608536107281e-06, + "loss": 1.2254, + "step": 18302 + }, + { + "epoch": 5.45147899253523, + "grad_norm": 0.283011257648468, + "learning_rate": 9.019648611543114e-06, + "loss": 1.2157, + "step": 18303 + }, + { + "epoch": 5.451776838734898, + "grad_norm": 0.5190119743347168, + "learning_rate": 9.018688696100024e-06, + "loss": 1.2134, + "step": 18304 + }, + { + "epoch": 5.452074684934567, + "grad_norm": 0.2879144847393036, + "learning_rate": 9.017728789786939e-06, + "loss": 1.2123, + "step": 18305 + }, + { + "epoch": 5.452372531134236, + "grad_norm": 0.4425755739212036, + "learning_rate": 9.016768892612789e-06, + "loss": 1.2165, + "step": 18306 + }, + { + "epoch": 5.452670377333904, + "grad_norm": 0.2664521634578705, + "learning_rate": 9.01580900458651e-06, + "loss": 1.2049, + "step": 18307 + }, + { + "epoch": 5.452968223533573, + "grad_norm": 0.39505434036254883, + "learning_rate": 9.014849125717023e-06, + "loss": 1.2173, + "step": 18308 + }, + { + "epoch": 5.453266069733242, + "grad_norm": 0.45414382219314575, + "learning_rate": 9.013889256013267e-06, + "loss": 1.2137, + "step": 18309 + }, + { + "epoch": 5.45356391593291, + "grad_norm": 0.41616225242614746, + "learning_rate": 9.01292939548417e-06, + "loss": 1.2219, + "step": 18310 + }, + { + "epoch": 5.453861762132579, + "grad_norm": 0.489369660615921, + "learning_rate": 9.01196954413866e-06, + "loss": 1.2301, + "step": 18311 + }, + { + "epoch": 5.4541596083322474, + "grad_norm": 0.4263423681259155, + "learning_rate": 9.011009701985672e-06, + "loss": 1.204, + "step": 18312 + }, + { + "epoch": 5.454457454531916, + "grad_norm": 0.5042636394500732, + "learning_rate": 9.010049869034134e-06, + "loss": 1.2235, + "step": 18313 + }, + { + "epoch": 5.454755300731585, + "grad_norm": 0.25747379660606384, + "learning_rate": 9.009090045292974e-06, + "loss": 1.2085, + "step": 18314 + }, + { + "epoch": 5.455053146931253, + "grad_norm": 0.5377143025398254, + "learning_rate": 9.008130230771126e-06, + "loss": 1.229, + "step": 18315 + }, + { + "epoch": 5.455350993130922, + "grad_norm": 0.2980526089668274, + "learning_rate": 9.007170425477516e-06, + "loss": 1.2155, + "step": 18316 + }, + { + "epoch": 5.455648839330591, + "grad_norm": 0.41524243354797363, + "learning_rate": 9.006210629421073e-06, + "loss": 1.2021, + "step": 18317 + }, + { + "epoch": 5.455946685530259, + "grad_norm": 0.30433499813079834, + "learning_rate": 9.005250842610737e-06, + "loss": 1.2032, + "step": 18318 + }, + { + "epoch": 5.4562445317299275, + "grad_norm": 0.33835574984550476, + "learning_rate": 9.004291065055429e-06, + "loss": 1.2287, + "step": 18319 + }, + { + "epoch": 5.456542377929597, + "grad_norm": 0.28729456663131714, + "learning_rate": 9.003331296764075e-06, + "loss": 1.213, + "step": 18320 + }, + { + "epoch": 5.456840224129265, + "grad_norm": 0.31967926025390625, + "learning_rate": 9.002371537745615e-06, + "loss": 1.2085, + "step": 18321 + }, + { + "epoch": 5.457138070328934, + "grad_norm": 0.3398250937461853, + "learning_rate": 9.001411788008969e-06, + "loss": 1.2216, + "step": 18322 + }, + { + "epoch": 5.457435916528603, + "grad_norm": 0.2628474831581116, + "learning_rate": 9.000452047563073e-06, + "loss": 1.2137, + "step": 18323 + }, + { + "epoch": 5.457733762728271, + "grad_norm": 0.2607848644256592, + "learning_rate": 8.999492316416854e-06, + "loss": 1.2297, + "step": 18324 + }, + { + "epoch": 5.45803160892794, + "grad_norm": 0.31650781631469727, + "learning_rate": 8.998532594579239e-06, + "loss": 1.2153, + "step": 18325 + }, + { + "epoch": 5.4583294551276085, + "grad_norm": 0.27923887968063354, + "learning_rate": 8.997572882059164e-06, + "loss": 1.2249, + "step": 18326 + }, + { + "epoch": 5.458627301327277, + "grad_norm": 0.28001120686531067, + "learning_rate": 8.99661317886555e-06, + "loss": 1.2096, + "step": 18327 + }, + { + "epoch": 5.458925147526946, + "grad_norm": 0.320626437664032, + "learning_rate": 8.99565348500733e-06, + "loss": 1.2126, + "step": 18328 + }, + { + "epoch": 5.459222993726614, + "grad_norm": 0.3825687766075134, + "learning_rate": 8.994693800493431e-06, + "loss": 1.2081, + "step": 18329 + }, + { + "epoch": 5.459520839926283, + "grad_norm": 0.2936956286430359, + "learning_rate": 8.993734125332787e-06, + "loss": 1.2197, + "step": 18330 + }, + { + "epoch": 5.459818686125952, + "grad_norm": 0.33461901545524597, + "learning_rate": 8.992774459534317e-06, + "loss": 1.2147, + "step": 18331 + }, + { + "epoch": 5.46011653232562, + "grad_norm": 0.3067677319049835, + "learning_rate": 8.99181480310696e-06, + "loss": 1.2107, + "step": 18332 + }, + { + "epoch": 5.460414378525289, + "grad_norm": 0.32353153824806213, + "learning_rate": 8.990855156059636e-06, + "loss": 1.209, + "step": 18333 + }, + { + "epoch": 5.460712224724958, + "grad_norm": 0.3095123767852783, + "learning_rate": 8.989895518401279e-06, + "loss": 1.2131, + "step": 18334 + }, + { + "epoch": 5.461010070924626, + "grad_norm": 0.33417394757270813, + "learning_rate": 8.988935890140816e-06, + "loss": 1.2177, + "step": 18335 + }, + { + "epoch": 5.461307917124295, + "grad_norm": 0.2994416356086731, + "learning_rate": 8.987976271287177e-06, + "loss": 1.2155, + "step": 18336 + }, + { + "epoch": 5.461605763323964, + "grad_norm": 0.28538838028907776, + "learning_rate": 8.987016661849284e-06, + "loss": 1.2049, + "step": 18337 + }, + { + "epoch": 5.461903609523632, + "grad_norm": 0.46026667952537537, + "learning_rate": 8.986057061836073e-06, + "loss": 1.2048, + "step": 18338 + }, + { + "epoch": 5.462201455723301, + "grad_norm": 0.4897211194038391, + "learning_rate": 8.985097471256463e-06, + "loss": 1.2067, + "step": 18339 + }, + { + "epoch": 5.4624993019229695, + "grad_norm": 0.2775269150733948, + "learning_rate": 8.984137890119391e-06, + "loss": 1.2, + "step": 18340 + }, + { + "epoch": 5.462797148122638, + "grad_norm": 0.2839283347129822, + "learning_rate": 8.983178318433782e-06, + "loss": 1.2031, + "step": 18341 + }, + { + "epoch": 5.463094994322307, + "grad_norm": 0.2751613259315491, + "learning_rate": 8.982218756208558e-06, + "loss": 1.218, + "step": 18342 + }, + { + "epoch": 5.463392840521975, + "grad_norm": 0.3411848545074463, + "learning_rate": 8.981259203452656e-06, + "loss": 1.2398, + "step": 18343 + }, + { + "epoch": 5.463690686721644, + "grad_norm": 0.33500733971595764, + "learning_rate": 8.980299660174999e-06, + "loss": 1.2235, + "step": 18344 + }, + { + "epoch": 5.463988532921313, + "grad_norm": 0.40132421255111694, + "learning_rate": 8.97934012638451e-06, + "loss": 1.2379, + "step": 18345 + }, + { + "epoch": 5.464286379120981, + "grad_norm": 0.3655446469783783, + "learning_rate": 8.978380602090123e-06, + "loss": 1.2182, + "step": 18346 + }, + { + "epoch": 5.46458422532065, + "grad_norm": 0.405704140663147, + "learning_rate": 8.977421087300766e-06, + "loss": 1.2228, + "step": 18347 + }, + { + "epoch": 5.464882071520319, + "grad_norm": 0.2523886263370514, + "learning_rate": 8.976461582025359e-06, + "loss": 1.2207, + "step": 18348 + }, + { + "epoch": 5.465179917719987, + "grad_norm": 0.2591855227947235, + "learning_rate": 8.975502086272838e-06, + "loss": 1.1991, + "step": 18349 + }, + { + "epoch": 5.465477763919656, + "grad_norm": 0.27837905287742615, + "learning_rate": 8.974542600052121e-06, + "loss": 1.2202, + "step": 18350 + }, + { + "epoch": 5.465775610119325, + "grad_norm": 0.2816462218761444, + "learning_rate": 8.973583123372141e-06, + "loss": 1.2033, + "step": 18351 + }, + { + "epoch": 5.466073456318993, + "grad_norm": 0.25594595074653625, + "learning_rate": 8.972623656241824e-06, + "loss": 1.2113, + "step": 18352 + }, + { + "epoch": 5.466371302518662, + "grad_norm": 0.33204081654548645, + "learning_rate": 8.971664198670096e-06, + "loss": 1.2096, + "step": 18353 + }, + { + "epoch": 5.4666691487183305, + "grad_norm": 0.27806761860847473, + "learning_rate": 8.970704750665881e-06, + "loss": 1.213, + "step": 18354 + }, + { + "epoch": 5.466966994917999, + "grad_norm": 0.4787549078464508, + "learning_rate": 8.969745312238111e-06, + "loss": 1.1913, + "step": 18355 + }, + { + "epoch": 5.467264841117668, + "grad_norm": 0.5014739632606506, + "learning_rate": 8.968785883395708e-06, + "loss": 1.1924, + "step": 18356 + }, + { + "epoch": 5.467562687317336, + "grad_norm": 0.3203149437904358, + "learning_rate": 8.967826464147602e-06, + "loss": 1.2279, + "step": 18357 + }, + { + "epoch": 5.467860533517005, + "grad_norm": 0.3924318552017212, + "learning_rate": 8.966867054502716e-06, + "loss": 1.2199, + "step": 18358 + }, + { + "epoch": 5.468158379716674, + "grad_norm": 0.43972548842430115, + "learning_rate": 8.965907654469977e-06, + "loss": 1.2139, + "step": 18359 + }, + { + "epoch": 5.468456225916342, + "grad_norm": 0.40511569380760193, + "learning_rate": 8.964948264058316e-06, + "loss": 1.2193, + "step": 18360 + }, + { + "epoch": 5.4687540721160115, + "grad_norm": 0.550166666507721, + "learning_rate": 8.963988883276655e-06, + "loss": 1.2036, + "step": 18361 + }, + { + "epoch": 5.46905191831568, + "grad_norm": 0.5369710326194763, + "learning_rate": 8.963029512133914e-06, + "loss": 1.1987, + "step": 18362 + }, + { + "epoch": 5.469349764515348, + "grad_norm": 0.39129918813705444, + "learning_rate": 8.96207015063903e-06, + "loss": 1.2158, + "step": 18363 + }, + { + "epoch": 5.469647610715017, + "grad_norm": 0.5112589001655579, + "learning_rate": 8.961110798800923e-06, + "loss": 1.2255, + "step": 18364 + }, + { + "epoch": 5.469945456914686, + "grad_norm": 0.31732383370399475, + "learning_rate": 8.960151456628515e-06, + "loss": 1.2118, + "step": 18365 + }, + { + "epoch": 5.470243303114354, + "grad_norm": 0.38156500458717346, + "learning_rate": 8.95919212413074e-06, + "loss": 1.2154, + "step": 18366 + }, + { + "epoch": 5.470541149314023, + "grad_norm": 0.3965311348438263, + "learning_rate": 8.958232801316522e-06, + "loss": 1.2045, + "step": 18367 + }, + { + "epoch": 5.470838995513692, + "grad_norm": 0.3953753411769867, + "learning_rate": 8.957273488194777e-06, + "loss": 1.2268, + "step": 18368 + }, + { + "epoch": 5.47113684171336, + "grad_norm": 0.3397037386894226, + "learning_rate": 8.956314184774441e-06, + "loss": 1.2149, + "step": 18369 + }, + { + "epoch": 5.471434687913029, + "grad_norm": 0.28182947635650635, + "learning_rate": 8.955354891064437e-06, + "loss": 1.2082, + "step": 18370 + }, + { + "epoch": 5.471732534112697, + "grad_norm": 0.42395129799842834, + "learning_rate": 8.954395607073683e-06, + "loss": 1.1916, + "step": 18371 + }, + { + "epoch": 5.472030380312367, + "grad_norm": 0.30112504959106445, + "learning_rate": 8.953436332811117e-06, + "loss": 1.219, + "step": 18372 + }, + { + "epoch": 5.472328226512035, + "grad_norm": 0.445017009973526, + "learning_rate": 8.95247706828565e-06, + "loss": 1.2093, + "step": 18373 + }, + { + "epoch": 5.472626072711703, + "grad_norm": 0.3907952606678009, + "learning_rate": 8.951517813506218e-06, + "loss": 1.2053, + "step": 18374 + }, + { + "epoch": 5.4729239189113725, + "grad_norm": 0.5662532448768616, + "learning_rate": 8.95055856848174e-06, + "loss": 1.2022, + "step": 18375 + }, + { + "epoch": 5.473221765111041, + "grad_norm": 0.31141963601112366, + "learning_rate": 8.949599333221139e-06, + "loss": 1.2199, + "step": 18376 + }, + { + "epoch": 5.473519611310709, + "grad_norm": 0.4439815580844879, + "learning_rate": 8.948640107733347e-06, + "loss": 1.2111, + "step": 18377 + }, + { + "epoch": 5.473817457510378, + "grad_norm": 0.3216348886489868, + "learning_rate": 8.947680892027284e-06, + "loss": 1.2042, + "step": 18378 + }, + { + "epoch": 5.474115303710047, + "grad_norm": 0.28621119260787964, + "learning_rate": 8.94672168611187e-06, + "loss": 1.2067, + "step": 18379 + }, + { + "epoch": 5.474413149909715, + "grad_norm": 0.4639444649219513, + "learning_rate": 8.94576248999604e-06, + "loss": 1.207, + "step": 18380 + }, + { + "epoch": 5.474710996109384, + "grad_norm": 0.3653220534324646, + "learning_rate": 8.944803303688709e-06, + "loss": 1.2242, + "step": 18381 + }, + { + "epoch": 5.475008842309053, + "grad_norm": 0.305291086435318, + "learning_rate": 8.943844127198802e-06, + "loss": 1.2147, + "step": 18382 + }, + { + "epoch": 5.475306688508721, + "grad_norm": 0.2845461666584015, + "learning_rate": 8.94288496053525e-06, + "loss": 1.2144, + "step": 18383 + }, + { + "epoch": 5.47560453470839, + "grad_norm": 0.295358270406723, + "learning_rate": 8.941925803706973e-06, + "loss": 1.1963, + "step": 18384 + }, + { + "epoch": 5.4759023809080585, + "grad_norm": 0.39815086126327515, + "learning_rate": 8.940966656722887e-06, + "loss": 1.2201, + "step": 18385 + }, + { + "epoch": 5.476200227107727, + "grad_norm": 0.28686094284057617, + "learning_rate": 8.940007519591928e-06, + "loss": 1.2102, + "step": 18386 + }, + { + "epoch": 5.476498073307396, + "grad_norm": 0.3080119490623474, + "learning_rate": 8.939048392323015e-06, + "loss": 1.2257, + "step": 18387 + }, + { + "epoch": 5.476795919507064, + "grad_norm": 0.29281705617904663, + "learning_rate": 8.93808927492507e-06, + "loss": 1.207, + "step": 18388 + }, + { + "epoch": 5.477093765706734, + "grad_norm": 0.3752022087574005, + "learning_rate": 8.93713016740702e-06, + "loss": 1.2202, + "step": 18389 + }, + { + "epoch": 5.477391611906402, + "grad_norm": 0.25749897956848145, + "learning_rate": 8.936171069777783e-06, + "loss": 1.2357, + "step": 18390 + }, + { + "epoch": 5.47768945810607, + "grad_norm": 0.34351646900177, + "learning_rate": 8.93521198204629e-06, + "loss": 1.2126, + "step": 18391 + }, + { + "epoch": 5.477987304305739, + "grad_norm": 0.336144357919693, + "learning_rate": 8.934252904221455e-06, + "loss": 1.2029, + "step": 18392 + }, + { + "epoch": 5.478285150505408, + "grad_norm": 0.7184677124023438, + "learning_rate": 8.933293836312208e-06, + "loss": 1.2172, + "step": 18393 + }, + { + "epoch": 5.478582996705076, + "grad_norm": 0.4423912465572357, + "learning_rate": 8.932334778327471e-06, + "loss": 1.2211, + "step": 18394 + }, + { + "epoch": 5.478880842904745, + "grad_norm": 0.49526455998420715, + "learning_rate": 8.931375730276168e-06, + "loss": 1.2241, + "step": 18395 + }, + { + "epoch": 5.479178689104414, + "grad_norm": 0.43283307552337646, + "learning_rate": 8.930416692167214e-06, + "loss": 1.2188, + "step": 18396 + }, + { + "epoch": 5.479476535304082, + "grad_norm": 0.4464777708053589, + "learning_rate": 8.929457664009541e-06, + "loss": 1.199, + "step": 18397 + }, + { + "epoch": 5.479774381503751, + "grad_norm": 0.29845792055130005, + "learning_rate": 8.928498645812068e-06, + "loss": 1.2119, + "step": 18398 + }, + { + "epoch": 5.4800722277034195, + "grad_norm": 0.5206387639045715, + "learning_rate": 8.927539637583718e-06, + "loss": 1.2363, + "step": 18399 + }, + { + "epoch": 5.480370073903089, + "grad_norm": 0.2644738554954529, + "learning_rate": 8.926580639333415e-06, + "loss": 1.2211, + "step": 18400 + }, + { + "epoch": 5.480667920102757, + "grad_norm": 0.6316187977790833, + "learning_rate": 8.92562165107008e-06, + "loss": 1.2272, + "step": 18401 + }, + { + "epoch": 5.480965766302425, + "grad_norm": 0.2890859842300415, + "learning_rate": 8.924662672802631e-06, + "loss": 1.1993, + "step": 18402 + }, + { + "epoch": 5.481263612502095, + "grad_norm": 0.4356417953968048, + "learning_rate": 8.92370370454e-06, + "loss": 1.2013, + "step": 18403 + }, + { + "epoch": 5.481561458701763, + "grad_norm": 0.3311325013637543, + "learning_rate": 8.922744746291098e-06, + "loss": 1.2298, + "step": 18404 + }, + { + "epoch": 5.481859304901431, + "grad_norm": 0.3121452033519745, + "learning_rate": 8.921785798064855e-06, + "loss": 1.2123, + "step": 18405 + }, + { + "epoch": 5.4821571511011005, + "grad_norm": 0.4611448645591736, + "learning_rate": 8.920826859870191e-06, + "loss": 1.2307, + "step": 18406 + }, + { + "epoch": 5.482454997300769, + "grad_norm": 0.3852958381175995, + "learning_rate": 8.919867931716025e-06, + "loss": 1.2162, + "step": 18407 + }, + { + "epoch": 5.482752843500437, + "grad_norm": 0.3547942638397217, + "learning_rate": 8.918909013611286e-06, + "loss": 1.2042, + "step": 18408 + }, + { + "epoch": 5.483050689700106, + "grad_norm": 0.28711792826652527, + "learning_rate": 8.917950105564888e-06, + "loss": 1.2028, + "step": 18409 + }, + { + "epoch": 5.483348535899775, + "grad_norm": 0.4506579637527466, + "learning_rate": 8.916991207585754e-06, + "loss": 1.2153, + "step": 18410 + }, + { + "epoch": 5.483646382099443, + "grad_norm": 0.2823413908481598, + "learning_rate": 8.916032319682808e-06, + "loss": 1.239, + "step": 18411 + }, + { + "epoch": 5.483944228299112, + "grad_norm": 0.44218409061431885, + "learning_rate": 8.915073441864972e-06, + "loss": 1.2009, + "step": 18412 + }, + { + "epoch": 5.4842420744987805, + "grad_norm": 0.28987812995910645, + "learning_rate": 8.914114574141161e-06, + "loss": 1.2083, + "step": 18413 + }, + { + "epoch": 5.484539920698449, + "grad_norm": 0.43456321954727173, + "learning_rate": 8.913155716520305e-06, + "loss": 1.2262, + "step": 18414 + }, + { + "epoch": 5.484837766898118, + "grad_norm": 0.3784896731376648, + "learning_rate": 8.91219686901132e-06, + "loss": 1.1998, + "step": 18415 + }, + { + "epoch": 5.485135613097786, + "grad_norm": 0.41341546177864075, + "learning_rate": 8.911238031623126e-06, + "loss": 1.2083, + "step": 18416 + }, + { + "epoch": 5.485433459297456, + "grad_norm": 0.4241754114627838, + "learning_rate": 8.910279204364646e-06, + "loss": 1.2373, + "step": 18417 + }, + { + "epoch": 5.485731305497124, + "grad_norm": 0.35157403349876404, + "learning_rate": 8.909320387244803e-06, + "loss": 1.2006, + "step": 18418 + }, + { + "epoch": 5.486029151696792, + "grad_norm": 0.5017350316047668, + "learning_rate": 8.908361580272512e-06, + "loss": 1.2114, + "step": 18419 + }, + { + "epoch": 5.4863269978964615, + "grad_norm": 0.3144020140171051, + "learning_rate": 8.907402783456698e-06, + "loss": 1.225, + "step": 18420 + }, + { + "epoch": 5.48662484409613, + "grad_norm": 0.5000543594360352, + "learning_rate": 8.90644399680628e-06, + "loss": 1.2153, + "step": 18421 + }, + { + "epoch": 5.486922690295798, + "grad_norm": 0.25704795122146606, + "learning_rate": 8.905485220330178e-06, + "loss": 1.1973, + "step": 18422 + }, + { + "epoch": 5.487220536495467, + "grad_norm": 0.7779128551483154, + "learning_rate": 8.904526454037316e-06, + "loss": 1.2219, + "step": 18423 + }, + { + "epoch": 5.487518382695136, + "grad_norm": 0.35181763768196106, + "learning_rate": 8.903567697936607e-06, + "loss": 1.2091, + "step": 18424 + }, + { + "epoch": 5.487816228894804, + "grad_norm": 0.6172589063644409, + "learning_rate": 8.902608952036981e-06, + "loss": 1.2107, + "step": 18425 + }, + { + "epoch": 5.488114075094473, + "grad_norm": 0.3031884431838989, + "learning_rate": 8.901650216347351e-06, + "loss": 1.219, + "step": 18426 + }, + { + "epoch": 5.488411921294142, + "grad_norm": 0.6517040729522705, + "learning_rate": 8.900691490876636e-06, + "loss": 1.2083, + "step": 18427 + }, + { + "epoch": 5.488709767493811, + "grad_norm": 0.36943522095680237, + "learning_rate": 8.89973277563376e-06, + "loss": 1.2268, + "step": 18428 + }, + { + "epoch": 5.489007613693479, + "grad_norm": 0.37507328391075134, + "learning_rate": 8.898774070627643e-06, + "loss": 1.2116, + "step": 18429 + }, + { + "epoch": 5.489305459893147, + "grad_norm": 0.4282570481300354, + "learning_rate": 8.897815375867198e-06, + "loss": 1.2036, + "step": 18430 + }, + { + "epoch": 5.489603306092817, + "grad_norm": 0.25638723373413086, + "learning_rate": 8.896856691361355e-06, + "loss": 1.2073, + "step": 18431 + }, + { + "epoch": 5.489901152292485, + "grad_norm": 0.4131999909877777, + "learning_rate": 8.895898017119027e-06, + "loss": 1.2227, + "step": 18432 + }, + { + "epoch": 5.490198998492153, + "grad_norm": 0.3734094202518463, + "learning_rate": 8.894939353149132e-06, + "loss": 1.2203, + "step": 18433 + }, + { + "epoch": 5.4904968446918225, + "grad_norm": 0.5136287212371826, + "learning_rate": 8.893980699460592e-06, + "loss": 1.2178, + "step": 18434 + }, + { + "epoch": 5.490794690891491, + "grad_norm": 0.4026089012622833, + "learning_rate": 8.893022056062329e-06, + "loss": 1.2056, + "step": 18435 + }, + { + "epoch": 5.491092537091159, + "grad_norm": 0.38764455914497375, + "learning_rate": 8.892063422963252e-06, + "loss": 1.2145, + "step": 18436 + }, + { + "epoch": 5.491390383290828, + "grad_norm": 0.4066968560218811, + "learning_rate": 8.891104800172294e-06, + "loss": 1.221, + "step": 18437 + }, + { + "epoch": 5.491688229490497, + "grad_norm": 0.2829129993915558, + "learning_rate": 8.890146187698363e-06, + "loss": 1.2168, + "step": 18438 + }, + { + "epoch": 5.491986075690166, + "grad_norm": 0.3591943085193634, + "learning_rate": 8.889187585550386e-06, + "loss": 1.2168, + "step": 18439 + }, + { + "epoch": 5.492283921889834, + "grad_norm": 0.3042803704738617, + "learning_rate": 8.888228993737274e-06, + "loss": 1.2036, + "step": 18440 + }, + { + "epoch": 5.492581768089503, + "grad_norm": 0.30777686834335327, + "learning_rate": 8.887270412267947e-06, + "loss": 1.1961, + "step": 18441 + }, + { + "epoch": 5.492879614289172, + "grad_norm": 0.3260495364665985, + "learning_rate": 8.88631184115133e-06, + "loss": 1.1952, + "step": 18442 + }, + { + "epoch": 5.49317746048884, + "grad_norm": 0.29427823424339294, + "learning_rate": 8.885353280396336e-06, + "loss": 1.2176, + "step": 18443 + }, + { + "epoch": 5.4934753066885085, + "grad_norm": 0.3360000252723694, + "learning_rate": 8.884394730011882e-06, + "loss": 1.2074, + "step": 18444 + }, + { + "epoch": 5.493773152888178, + "grad_norm": 0.27873557806015015, + "learning_rate": 8.88343619000689e-06, + "loss": 1.2113, + "step": 18445 + }, + { + "epoch": 5.494070999087846, + "grad_norm": 0.3372797966003418, + "learning_rate": 8.88247766039028e-06, + "loss": 1.2249, + "step": 18446 + }, + { + "epoch": 5.494368845287514, + "grad_norm": 0.29132211208343506, + "learning_rate": 8.881519141170961e-06, + "loss": 1.2078, + "step": 18447 + }, + { + "epoch": 5.494666691487184, + "grad_norm": 0.2600608468055725, + "learning_rate": 8.880560632357863e-06, + "loss": 1.2101, + "step": 18448 + }, + { + "epoch": 5.494964537686852, + "grad_norm": 0.3094232678413391, + "learning_rate": 8.879602133959895e-06, + "loss": 1.2166, + "step": 18449 + }, + { + "epoch": 5.49526238388652, + "grad_norm": 0.25211331248283386, + "learning_rate": 8.878643645985973e-06, + "loss": 1.2241, + "step": 18450 + }, + { + "epoch": 5.495560230086189, + "grad_norm": 0.2739332616329193, + "learning_rate": 8.877685168445023e-06, + "loss": 1.2284, + "step": 18451 + }, + { + "epoch": 5.495858076285858, + "grad_norm": 0.31273096799850464, + "learning_rate": 8.87672670134596e-06, + "loss": 1.2279, + "step": 18452 + }, + { + "epoch": 5.496155922485526, + "grad_norm": 0.27651849389076233, + "learning_rate": 8.875768244697696e-06, + "loss": 1.2305, + "step": 18453 + }, + { + "epoch": 5.496453768685195, + "grad_norm": 0.41167640686035156, + "learning_rate": 8.874809798509157e-06, + "loss": 1.2204, + "step": 18454 + }, + { + "epoch": 5.496751614884864, + "grad_norm": 0.32396993041038513, + "learning_rate": 8.87385136278925e-06, + "loss": 1.2188, + "step": 18455 + }, + { + "epoch": 5.497049461084533, + "grad_norm": 0.34511902928352356, + "learning_rate": 8.872892937546904e-06, + "loss": 1.2022, + "step": 18456 + }, + { + "epoch": 5.497347307284201, + "grad_norm": 0.37354591488838196, + "learning_rate": 8.871934522791027e-06, + "loss": 1.2257, + "step": 18457 + }, + { + "epoch": 5.4976451534838695, + "grad_norm": 0.25477805733680725, + "learning_rate": 8.870976118530536e-06, + "loss": 1.2144, + "step": 18458 + }, + { + "epoch": 5.497942999683539, + "grad_norm": 0.3068343698978424, + "learning_rate": 8.870017724774356e-06, + "loss": 1.2195, + "step": 18459 + }, + { + "epoch": 5.498240845883207, + "grad_norm": 0.2564184069633484, + "learning_rate": 8.869059341531398e-06, + "loss": 1.2271, + "step": 18460 + }, + { + "epoch": 5.498538692082875, + "grad_norm": 0.3234730660915375, + "learning_rate": 8.868100968810575e-06, + "loss": 1.226, + "step": 18461 + }, + { + "epoch": 5.498836538282545, + "grad_norm": 0.2749391496181488, + "learning_rate": 8.867142606620814e-06, + "loss": 1.217, + "step": 18462 + }, + { + "epoch": 5.499134384482213, + "grad_norm": 0.2855779528617859, + "learning_rate": 8.866184254971021e-06, + "loss": 1.2347, + "step": 18463 + }, + { + "epoch": 5.499432230681881, + "grad_norm": 0.3118726909160614, + "learning_rate": 8.865225913870114e-06, + "loss": 1.2201, + "step": 18464 + }, + { + "epoch": 5.4997300768815505, + "grad_norm": 0.3345693051815033, + "learning_rate": 8.864267583327019e-06, + "loss": 1.2166, + "step": 18465 + }, + { + "epoch": 5.500027923081219, + "grad_norm": 0.6133779287338257, + "learning_rate": 8.863309263350644e-06, + "loss": 1.2284, + "step": 18466 + }, + { + "epoch": 5.500325769280888, + "grad_norm": 0.3372969329357147, + "learning_rate": 8.862350953949901e-06, + "loss": 1.1898, + "step": 18467 + }, + { + "epoch": 5.500623615480556, + "grad_norm": 0.5956374406814575, + "learning_rate": 8.861392655133717e-06, + "loss": 1.2131, + "step": 18468 + }, + { + "epoch": 5.500921461680225, + "grad_norm": 0.45349788665771484, + "learning_rate": 8.860434366910999e-06, + "loss": 1.2157, + "step": 18469 + }, + { + "epoch": 5.501219307879894, + "grad_norm": 0.2968503534793854, + "learning_rate": 8.859476089290666e-06, + "loss": 1.2075, + "step": 18470 + }, + { + "epoch": 5.501517154079562, + "grad_norm": 0.2783360779285431, + "learning_rate": 8.858517822281635e-06, + "loss": 1.2184, + "step": 18471 + }, + { + "epoch": 5.5018150002792305, + "grad_norm": 0.4133557677268982, + "learning_rate": 8.857559565892818e-06, + "loss": 1.1962, + "step": 18472 + }, + { + "epoch": 5.5021128464789, + "grad_norm": 0.278022438287735, + "learning_rate": 8.856601320133136e-06, + "loss": 1.2136, + "step": 18473 + }, + { + "epoch": 5.502410692678568, + "grad_norm": 0.4163760840892792, + "learning_rate": 8.8556430850115e-06, + "loss": 1.2145, + "step": 18474 + }, + { + "epoch": 5.502708538878236, + "grad_norm": 0.3640947639942169, + "learning_rate": 8.854684860536826e-06, + "loss": 1.1836, + "step": 18475 + }, + { + "epoch": 5.503006385077906, + "grad_norm": 0.3118014335632324, + "learning_rate": 8.85372664671803e-06, + "loss": 1.2163, + "step": 18476 + }, + { + "epoch": 5.503304231277574, + "grad_norm": 0.3818114399909973, + "learning_rate": 8.852768443564028e-06, + "loss": 1.2028, + "step": 18477 + }, + { + "epoch": 5.503602077477243, + "grad_norm": 0.25287437438964844, + "learning_rate": 8.851810251083731e-06, + "loss": 1.2116, + "step": 18478 + }, + { + "epoch": 5.5038999236769115, + "grad_norm": 0.31228604912757874, + "learning_rate": 8.85085206928606e-06, + "loss": 1.2138, + "step": 18479 + }, + { + "epoch": 5.50419776987658, + "grad_norm": 0.4218159317970276, + "learning_rate": 8.849893898179926e-06, + "loss": 1.2261, + "step": 18480 + }, + { + "epoch": 5.504495616076248, + "grad_norm": 0.33540377020835876, + "learning_rate": 8.84893573777424e-06, + "loss": 1.2219, + "step": 18481 + }, + { + "epoch": 5.504793462275917, + "grad_norm": 0.44699543714523315, + "learning_rate": 8.847977588077927e-06, + "loss": 1.2134, + "step": 18482 + }, + { + "epoch": 5.505091308475586, + "grad_norm": 0.34513965249061584, + "learning_rate": 8.847019449099895e-06, + "loss": 1.2288, + "step": 18483 + }, + { + "epoch": 5.505389154675255, + "grad_norm": 0.3132277727127075, + "learning_rate": 8.846061320849055e-06, + "loss": 1.2226, + "step": 18484 + }, + { + "epoch": 5.505687000874923, + "grad_norm": 0.3509070575237274, + "learning_rate": 8.845103203334329e-06, + "loss": 1.2312, + "step": 18485 + }, + { + "epoch": 5.505984847074592, + "grad_norm": 0.3406696617603302, + "learning_rate": 8.844145096564624e-06, + "loss": 1.2114, + "step": 18486 + }, + { + "epoch": 5.506282693274261, + "grad_norm": 0.39734143018722534, + "learning_rate": 8.843187000548857e-06, + "loss": 1.189, + "step": 18487 + }, + { + "epoch": 5.506580539473929, + "grad_norm": 0.25761228799819946, + "learning_rate": 8.842228915295945e-06, + "loss": 1.2072, + "step": 18488 + }, + { + "epoch": 5.506878385673597, + "grad_norm": 0.3563632071018219, + "learning_rate": 8.841270840814798e-06, + "loss": 1.2216, + "step": 18489 + }, + { + "epoch": 5.507176231873267, + "grad_norm": 0.3468446135520935, + "learning_rate": 8.840312777114333e-06, + "loss": 1.2229, + "step": 18490 + }, + { + "epoch": 5.507474078072935, + "grad_norm": 0.3897448182106018, + "learning_rate": 8.839354724203463e-06, + "loss": 1.2155, + "step": 18491 + }, + { + "epoch": 5.507771924272603, + "grad_norm": 0.30430126190185547, + "learning_rate": 8.838396682091095e-06, + "loss": 1.2164, + "step": 18492 + }, + { + "epoch": 5.5080697704722725, + "grad_norm": 0.4468208849430084, + "learning_rate": 8.837438650786152e-06, + "loss": 1.2188, + "step": 18493 + }, + { + "epoch": 5.508367616671941, + "grad_norm": 0.4163860082626343, + "learning_rate": 8.836480630297544e-06, + "loss": 1.1951, + "step": 18494 + }, + { + "epoch": 5.50866546287161, + "grad_norm": 0.3813307285308838, + "learning_rate": 8.83552262063418e-06, + "loss": 1.2161, + "step": 18495 + }, + { + "epoch": 5.508963309071278, + "grad_norm": 0.4980100989341736, + "learning_rate": 8.834564621804982e-06, + "loss": 1.214, + "step": 18496 + }, + { + "epoch": 5.509261155270947, + "grad_norm": 0.2579140067100525, + "learning_rate": 8.833606633818855e-06, + "loss": 1.1984, + "step": 18497 + }, + { + "epoch": 5.509559001470616, + "grad_norm": 0.32582640647888184, + "learning_rate": 8.832648656684715e-06, + "loss": 1.2078, + "step": 18498 + }, + { + "epoch": 5.509856847670284, + "grad_norm": 0.3629567325115204, + "learning_rate": 8.831690690411476e-06, + "loss": 1.2263, + "step": 18499 + }, + { + "epoch": 5.510154693869953, + "grad_norm": 0.3442101776599884, + "learning_rate": 8.830732735008052e-06, + "loss": 1.2194, + "step": 18500 + }, + { + "epoch": 5.510154693869953, + "eval_loss": 1.3182575702667236, + "eval_runtime": 24.6666, + "eval_samples_per_second": 70.298, + "eval_steps_per_second": 4.419, + "step": 18500 + }, + { + "epoch": 5.510452540069622, + "grad_norm": 0.3001834452152252, + "learning_rate": 8.829774790483349e-06, + "loss": 1.2075, + "step": 18501 + }, + { + "epoch": 5.51075038626929, + "grad_norm": 0.32820916175842285, + "learning_rate": 8.828816856846288e-06, + "loss": 1.2052, + "step": 18502 + }, + { + "epoch": 5.5110482324689585, + "grad_norm": 0.3135179281234741, + "learning_rate": 8.827858934105773e-06, + "loss": 1.2233, + "step": 18503 + }, + { + "epoch": 5.511346078668628, + "grad_norm": 0.2526983916759491, + "learning_rate": 8.826901022270723e-06, + "loss": 1.2112, + "step": 18504 + }, + { + "epoch": 5.511643924868296, + "grad_norm": 0.33921104669570923, + "learning_rate": 8.825943121350052e-06, + "loss": 1.2052, + "step": 18505 + }, + { + "epoch": 5.511941771067965, + "grad_norm": 0.29194679856300354, + "learning_rate": 8.824985231352664e-06, + "loss": 1.2115, + "step": 18506 + }, + { + "epoch": 5.512239617267634, + "grad_norm": 0.30259567499160767, + "learning_rate": 8.824027352287479e-06, + "loss": 1.2089, + "step": 18507 + }, + { + "epoch": 5.512537463467302, + "grad_norm": 0.26926037669181824, + "learning_rate": 8.823069484163406e-06, + "loss": 1.2225, + "step": 18508 + }, + { + "epoch": 5.512835309666971, + "grad_norm": 0.27251744270324707, + "learning_rate": 8.82211162698935e-06, + "loss": 1.1978, + "step": 18509 + }, + { + "epoch": 5.513133155866639, + "grad_norm": 0.2935759425163269, + "learning_rate": 8.821153780774235e-06, + "loss": 1.2113, + "step": 18510 + }, + { + "epoch": 5.513431002066308, + "grad_norm": 0.3544779121875763, + "learning_rate": 8.820195945526969e-06, + "loss": 1.2084, + "step": 18511 + }, + { + "epoch": 5.513728848265977, + "grad_norm": 0.2574149966239929, + "learning_rate": 8.819238121256455e-06, + "loss": 1.2179, + "step": 18512 + }, + { + "epoch": 5.514026694465645, + "grad_norm": 0.4223068058490753, + "learning_rate": 8.818280307971617e-06, + "loss": 1.1989, + "step": 18513 + }, + { + "epoch": 5.514324540665314, + "grad_norm": 0.2965776324272156, + "learning_rate": 8.81732250568136e-06, + "loss": 1.2182, + "step": 18514 + }, + { + "epoch": 5.514622386864983, + "grad_norm": 0.3070499002933502, + "learning_rate": 8.81636471439459e-06, + "loss": 1.2067, + "step": 18515 + }, + { + "epoch": 5.514920233064651, + "grad_norm": 0.2785414755344391, + "learning_rate": 8.815406934120228e-06, + "loss": 1.2132, + "step": 18516 + }, + { + "epoch": 5.5152180792643195, + "grad_norm": 0.2849379777908325, + "learning_rate": 8.814449164867183e-06, + "loss": 1.2151, + "step": 18517 + }, + { + "epoch": 5.515515925463989, + "grad_norm": 0.2550511658191681, + "learning_rate": 8.813491406644358e-06, + "loss": 1.1991, + "step": 18518 + }, + { + "epoch": 5.515813771663657, + "grad_norm": 0.5555850863456726, + "learning_rate": 8.812533659460675e-06, + "loss": 1.231, + "step": 18519 + }, + { + "epoch": 5.516111617863325, + "grad_norm": 0.4058053493499756, + "learning_rate": 8.811575923325036e-06, + "loss": 1.2164, + "step": 18520 + }, + { + "epoch": 5.516409464062995, + "grad_norm": 0.38803279399871826, + "learning_rate": 8.81061819824636e-06, + "loss": 1.22, + "step": 18521 + }, + { + "epoch": 5.516707310262663, + "grad_norm": 0.26936450600624084, + "learning_rate": 8.809660484233547e-06, + "loss": 1.2179, + "step": 18522 + }, + { + "epoch": 5.517005156462332, + "grad_norm": 0.6515083909034729, + "learning_rate": 8.808702781295514e-06, + "loss": 1.2145, + "step": 18523 + }, + { + "epoch": 5.5173030026620005, + "grad_norm": 0.3911673128604889, + "learning_rate": 8.807745089441175e-06, + "loss": 1.2138, + "step": 18524 + }, + { + "epoch": 5.517600848861669, + "grad_norm": 0.5121654868125916, + "learning_rate": 8.806787408679436e-06, + "loss": 1.2131, + "step": 18525 + }, + { + "epoch": 5.517898695061338, + "grad_norm": 0.26042410731315613, + "learning_rate": 8.805829739019201e-06, + "loss": 1.2077, + "step": 18526 + }, + { + "epoch": 5.518196541261006, + "grad_norm": 0.7270402908325195, + "learning_rate": 8.804872080469393e-06, + "loss": 1.2068, + "step": 18527 + }, + { + "epoch": 5.518494387460675, + "grad_norm": 0.2767256796360016, + "learning_rate": 8.803914433038912e-06, + "loss": 1.2086, + "step": 18528 + }, + { + "epoch": 5.518792233660344, + "grad_norm": 0.5118314027786255, + "learning_rate": 8.802956796736668e-06, + "loss": 1.2073, + "step": 18529 + }, + { + "epoch": 5.519090079860012, + "grad_norm": 0.2819366753101349, + "learning_rate": 8.801999171571577e-06, + "loss": 1.1999, + "step": 18530 + }, + { + "epoch": 5.5193879260596805, + "grad_norm": 0.440983384847641, + "learning_rate": 8.801041557552546e-06, + "loss": 1.2039, + "step": 18531 + }, + { + "epoch": 5.51968577225935, + "grad_norm": 0.3366145193576813, + "learning_rate": 8.800083954688481e-06, + "loss": 1.1966, + "step": 18532 + }, + { + "epoch": 5.519983618459018, + "grad_norm": 0.27563244104385376, + "learning_rate": 8.799126362988295e-06, + "loss": 1.2102, + "step": 18533 + }, + { + "epoch": 5.520281464658687, + "grad_norm": 0.370993047952652, + "learning_rate": 8.798168782460899e-06, + "loss": 1.1929, + "step": 18534 + }, + { + "epoch": 5.520579310858356, + "grad_norm": 0.2635239362716675, + "learning_rate": 8.797211213115195e-06, + "loss": 1.2147, + "step": 18535 + }, + { + "epoch": 5.520877157058024, + "grad_norm": 0.3107627034187317, + "learning_rate": 8.7962536549601e-06, + "loss": 1.2119, + "step": 18536 + }, + { + "epoch": 5.521175003257693, + "grad_norm": 0.3534769117832184, + "learning_rate": 8.795296108004517e-06, + "loss": 1.2235, + "step": 18537 + }, + { + "epoch": 5.5214728494573615, + "grad_norm": 0.25283482670783997, + "learning_rate": 8.794338572257362e-06, + "loss": 1.2023, + "step": 18538 + }, + { + "epoch": 5.52177069565703, + "grad_norm": 0.3561493754386902, + "learning_rate": 8.793381047727538e-06, + "loss": 1.2204, + "step": 18539 + }, + { + "epoch": 5.522068541856699, + "grad_norm": 0.26294341683387756, + "learning_rate": 8.792423534423951e-06, + "loss": 1.2209, + "step": 18540 + }, + { + "epoch": 5.522366388056367, + "grad_norm": 0.4278562664985657, + "learning_rate": 8.79146603235552e-06, + "loss": 1.2298, + "step": 18541 + }, + { + "epoch": 5.522664234256036, + "grad_norm": 0.2937805652618408, + "learning_rate": 8.790508541531147e-06, + "loss": 1.2144, + "step": 18542 + }, + { + "epoch": 5.522962080455705, + "grad_norm": 0.34140509366989136, + "learning_rate": 8.789551061959736e-06, + "loss": 1.2189, + "step": 18543 + }, + { + "epoch": 5.523259926655373, + "grad_norm": 0.27104368805885315, + "learning_rate": 8.788593593650204e-06, + "loss": 1.2167, + "step": 18544 + }, + { + "epoch": 5.5235577728550425, + "grad_norm": 0.45311427116394043, + "learning_rate": 8.787636136611455e-06, + "loss": 1.2114, + "step": 18545 + }, + { + "epoch": 5.523855619054711, + "grad_norm": 0.36299455165863037, + "learning_rate": 8.786678690852393e-06, + "loss": 1.2189, + "step": 18546 + }, + { + "epoch": 5.524153465254379, + "grad_norm": 0.31058499217033386, + "learning_rate": 8.785721256381935e-06, + "loss": 1.2253, + "step": 18547 + }, + { + "epoch": 5.524451311454047, + "grad_norm": 0.2723679840564728, + "learning_rate": 8.784763833208983e-06, + "loss": 1.2067, + "step": 18548 + }, + { + "epoch": 5.524749157653717, + "grad_norm": 0.34415721893310547, + "learning_rate": 8.783806421342444e-06, + "loss": 1.2154, + "step": 18549 + }, + { + "epoch": 5.525047003853385, + "grad_norm": 0.26098328828811646, + "learning_rate": 8.782849020791229e-06, + "loss": 1.2159, + "step": 18550 + }, + { + "epoch": 5.525344850053054, + "grad_norm": 0.3889535367488861, + "learning_rate": 8.781891631564244e-06, + "loss": 1.2156, + "step": 18551 + }, + { + "epoch": 5.5256426962527225, + "grad_norm": 0.30614978075027466, + "learning_rate": 8.780934253670392e-06, + "loss": 1.2065, + "step": 18552 + }, + { + "epoch": 5.525940542452391, + "grad_norm": 0.2868693768978119, + "learning_rate": 8.77997688711859e-06, + "loss": 1.2106, + "step": 18553 + }, + { + "epoch": 5.52623838865206, + "grad_norm": 0.2587234675884247, + "learning_rate": 8.779019531917738e-06, + "loss": 1.2135, + "step": 18554 + }, + { + "epoch": 5.526536234851728, + "grad_norm": 0.2812231183052063, + "learning_rate": 8.778062188076747e-06, + "loss": 1.2034, + "step": 18555 + }, + { + "epoch": 5.526834081051397, + "grad_norm": 0.27662885189056396, + "learning_rate": 8.77710485560452e-06, + "loss": 1.2187, + "step": 18556 + }, + { + "epoch": 5.527131927251066, + "grad_norm": 0.2594603896141052, + "learning_rate": 8.776147534509966e-06, + "loss": 1.2186, + "step": 18557 + }, + { + "epoch": 5.527429773450734, + "grad_norm": 0.27191632986068726, + "learning_rate": 8.775190224801993e-06, + "loss": 1.2423, + "step": 18558 + }, + { + "epoch": 5.527727619650403, + "grad_norm": 0.32991641759872437, + "learning_rate": 8.774232926489507e-06, + "loss": 1.2202, + "step": 18559 + }, + { + "epoch": 5.528025465850072, + "grad_norm": 0.330274373292923, + "learning_rate": 8.773275639581412e-06, + "loss": 1.2141, + "step": 18560 + }, + { + "epoch": 5.52832331204974, + "grad_norm": 0.4951980710029602, + "learning_rate": 8.77231836408662e-06, + "loss": 1.2053, + "step": 18561 + }, + { + "epoch": 5.528621158249409, + "grad_norm": 0.32925066351890564, + "learning_rate": 8.77136110001403e-06, + "loss": 1.2068, + "step": 18562 + }, + { + "epoch": 5.528919004449078, + "grad_norm": 0.7124512195587158, + "learning_rate": 8.770403847372555e-06, + "loss": 1.2128, + "step": 18563 + }, + { + "epoch": 5.529216850648746, + "grad_norm": 0.29837125539779663, + "learning_rate": 8.769446606171097e-06, + "loss": 1.217, + "step": 18564 + }, + { + "epoch": 5.529514696848415, + "grad_norm": 0.7584240436553955, + "learning_rate": 8.768489376418566e-06, + "loss": 1.2025, + "step": 18565 + }, + { + "epoch": 5.529812543048084, + "grad_norm": 0.3047201931476593, + "learning_rate": 8.767532158123861e-06, + "loss": 1.2153, + "step": 18566 + }, + { + "epoch": 5.530110389247752, + "grad_norm": 0.34465089440345764, + "learning_rate": 8.766574951295895e-06, + "loss": 1.2253, + "step": 18567 + }, + { + "epoch": 5.530408235447421, + "grad_norm": 0.3658747375011444, + "learning_rate": 8.76561775594357e-06, + "loss": 1.2177, + "step": 18568 + }, + { + "epoch": 5.530706081647089, + "grad_norm": 0.39325231313705444, + "learning_rate": 8.764660572075791e-06, + "loss": 1.2274, + "step": 18569 + }, + { + "epoch": 5.531003927846758, + "grad_norm": 0.32030919194221497, + "learning_rate": 8.76370339970147e-06, + "loss": 1.2102, + "step": 18570 + }, + { + "epoch": 5.531301774046427, + "grad_norm": 0.3121950626373291, + "learning_rate": 8.762746238829502e-06, + "loss": 1.2034, + "step": 18571 + }, + { + "epoch": 5.531599620246095, + "grad_norm": 0.2956571877002716, + "learning_rate": 8.761789089468803e-06, + "loss": 1.2112, + "step": 18572 + }, + { + "epoch": 5.5318974664457645, + "grad_norm": 0.28109005093574524, + "learning_rate": 8.760831951628274e-06, + "loss": 1.2143, + "step": 18573 + }, + { + "epoch": 5.532195312645433, + "grad_norm": 0.2740158438682556, + "learning_rate": 8.759874825316813e-06, + "loss": 1.2155, + "step": 18574 + }, + { + "epoch": 5.532493158845101, + "grad_norm": 0.2645137310028076, + "learning_rate": 8.758917710543334e-06, + "loss": 1.2234, + "step": 18575 + }, + { + "epoch": 5.53279100504477, + "grad_norm": 0.340607613325119, + "learning_rate": 8.757960607316742e-06, + "loss": 1.2356, + "step": 18576 + }, + { + "epoch": 5.533088851244439, + "grad_norm": 0.26851311326026917, + "learning_rate": 8.757003515645932e-06, + "loss": 1.212, + "step": 18577 + }, + { + "epoch": 5.533386697444107, + "grad_norm": 0.4204433560371399, + "learning_rate": 8.756046435539823e-06, + "loss": 1.217, + "step": 18578 + }, + { + "epoch": 5.533684543643776, + "grad_norm": 0.2745577096939087, + "learning_rate": 8.75508936700731e-06, + "loss": 1.2187, + "step": 18579 + }, + { + "epoch": 5.533982389843445, + "grad_norm": 0.40016692876815796, + "learning_rate": 8.754132310057297e-06, + "loss": 1.2246, + "step": 18580 + }, + { + "epoch": 5.534280236043113, + "grad_norm": 0.3720231354236603, + "learning_rate": 8.753175264698692e-06, + "loss": 1.1945, + "step": 18581 + }, + { + "epoch": 5.534578082242782, + "grad_norm": 0.24383926391601562, + "learning_rate": 8.7522182309404e-06, + "loss": 1.2113, + "step": 18582 + }, + { + "epoch": 5.5348759284424505, + "grad_norm": 0.27411460876464844, + "learning_rate": 8.751261208791321e-06, + "loss": 1.2208, + "step": 18583 + }, + { + "epoch": 5.535173774642119, + "grad_norm": 0.30837035179138184, + "learning_rate": 8.750304198260363e-06, + "loss": 1.2063, + "step": 18584 + }, + { + "epoch": 5.535471620841788, + "grad_norm": 0.28115174174308777, + "learning_rate": 8.749347199356427e-06, + "loss": 1.2168, + "step": 18585 + }, + { + "epoch": 5.535769467041456, + "grad_norm": 0.35087934136390686, + "learning_rate": 8.748390212088417e-06, + "loss": 1.2076, + "step": 18586 + }, + { + "epoch": 5.536067313241125, + "grad_norm": 0.33117544651031494, + "learning_rate": 8.74743323646524e-06, + "loss": 1.2235, + "step": 18587 + }, + { + "epoch": 5.536365159440794, + "grad_norm": 0.26276567578315735, + "learning_rate": 8.746476272495794e-06, + "loss": 1.2051, + "step": 18588 + }, + { + "epoch": 5.536663005640462, + "grad_norm": 0.40405192971229553, + "learning_rate": 8.74551932018899e-06, + "loss": 1.2053, + "step": 18589 + }, + { + "epoch": 5.536960851840131, + "grad_norm": 0.31916743516921997, + "learning_rate": 8.744562379553728e-06, + "loss": 1.2294, + "step": 18590 + }, + { + "epoch": 5.5372586980398, + "grad_norm": 0.28235921263694763, + "learning_rate": 8.743605450598904e-06, + "loss": 1.1922, + "step": 18591 + }, + { + "epoch": 5.537556544239468, + "grad_norm": 0.3274117410182953, + "learning_rate": 8.74264853333343e-06, + "loss": 1.2165, + "step": 18592 + }, + { + "epoch": 5.537854390439137, + "grad_norm": 0.365242063999176, + "learning_rate": 8.74169162776621e-06, + "loss": 1.2052, + "step": 18593 + }, + { + "epoch": 5.538152236638806, + "grad_norm": 0.25951096415519714, + "learning_rate": 8.74073473390614e-06, + "loss": 1.2147, + "step": 18594 + }, + { + "epoch": 5.538450082838474, + "grad_norm": 0.5193450450897217, + "learning_rate": 8.73977785176213e-06, + "loss": 1.215, + "step": 18595 + }, + { + "epoch": 5.538747929038143, + "grad_norm": 0.36995893716812134, + "learning_rate": 8.738820981343079e-06, + "loss": 1.2198, + "step": 18596 + }, + { + "epoch": 5.5390457752378115, + "grad_norm": 0.31989729404449463, + "learning_rate": 8.737864122657884e-06, + "loss": 1.2118, + "step": 18597 + }, + { + "epoch": 5.53934362143748, + "grad_norm": 0.4452241361141205, + "learning_rate": 8.736907275715455e-06, + "loss": 1.2305, + "step": 18598 + }, + { + "epoch": 5.539641467637149, + "grad_norm": 0.29764696955680847, + "learning_rate": 8.735950440524696e-06, + "loss": 1.2093, + "step": 18599 + }, + { + "epoch": 5.539939313836817, + "grad_norm": 0.27439337968826294, + "learning_rate": 8.734993617094503e-06, + "loss": 1.2046, + "step": 18600 + }, + { + "epoch": 5.540237160036487, + "grad_norm": 0.40285763144493103, + "learning_rate": 8.734036805433784e-06, + "loss": 1.2097, + "step": 18601 + }, + { + "epoch": 5.540535006236155, + "grad_norm": 0.30859941244125366, + "learning_rate": 8.733080005551437e-06, + "loss": 1.2013, + "step": 18602 + }, + { + "epoch": 5.540832852435823, + "grad_norm": 0.41414734721183777, + "learning_rate": 8.732123217456363e-06, + "loss": 1.204, + "step": 18603 + }, + { + "epoch": 5.5411306986354925, + "grad_norm": 0.3561864197254181, + "learning_rate": 8.731166441157467e-06, + "loss": 1.2186, + "step": 18604 + }, + { + "epoch": 5.541428544835161, + "grad_norm": 0.31468212604522705, + "learning_rate": 8.730209676663647e-06, + "loss": 1.2236, + "step": 18605 + }, + { + "epoch": 5.541726391034829, + "grad_norm": 0.2603742182254791, + "learning_rate": 8.729252923983812e-06, + "loss": 1.2058, + "step": 18606 + }, + { + "epoch": 5.542024237234498, + "grad_norm": 0.42395567893981934, + "learning_rate": 8.72829618312686e-06, + "loss": 1.2131, + "step": 18607 + }, + { + "epoch": 5.542322083434167, + "grad_norm": 0.2771393060684204, + "learning_rate": 8.727339454101685e-06, + "loss": 1.2138, + "step": 18608 + }, + { + "epoch": 5.542619929633835, + "grad_norm": 0.3564935028553009, + "learning_rate": 8.726382736917198e-06, + "loss": 1.208, + "step": 18609 + }, + { + "epoch": 5.542917775833504, + "grad_norm": 0.313340425491333, + "learning_rate": 8.725426031582297e-06, + "loss": 1.2148, + "step": 18610 + }, + { + "epoch": 5.5432156220331725, + "grad_norm": 0.43277180194854736, + "learning_rate": 8.724469338105879e-06, + "loss": 1.2183, + "step": 18611 + }, + { + "epoch": 5.543513468232842, + "grad_norm": 0.6412144899368286, + "learning_rate": 8.723512656496855e-06, + "loss": 1.2178, + "step": 18612 + }, + { + "epoch": 5.54381131443251, + "grad_norm": 0.3044149577617645, + "learning_rate": 8.722555986764117e-06, + "loss": 1.202, + "step": 18613 + }, + { + "epoch": 5.544109160632178, + "grad_norm": 0.5289282202720642, + "learning_rate": 8.721599328916566e-06, + "loss": 1.2052, + "step": 18614 + }, + { + "epoch": 5.544407006831847, + "grad_norm": 0.34878963232040405, + "learning_rate": 8.720642682963105e-06, + "loss": 1.2101, + "step": 18615 + }, + { + "epoch": 5.544704853031516, + "grad_norm": 0.35936352610588074, + "learning_rate": 8.719686048912638e-06, + "loss": 1.2199, + "step": 18616 + }, + { + "epoch": 5.545002699231184, + "grad_norm": 0.2650383412837982, + "learning_rate": 8.718729426774057e-06, + "loss": 1.2374, + "step": 18617 + }, + { + "epoch": 5.5453005454308535, + "grad_norm": 0.44902336597442627, + "learning_rate": 8.717772816556273e-06, + "loss": 1.2209, + "step": 18618 + }, + { + "epoch": 5.545598391630522, + "grad_norm": 0.3704821765422821, + "learning_rate": 8.716816218268174e-06, + "loss": 1.2021, + "step": 18619 + }, + { + "epoch": 5.54589623783019, + "grad_norm": 0.3121505677700043, + "learning_rate": 8.715859631918671e-06, + "loss": 1.2147, + "step": 18620 + }, + { + "epoch": 5.546194084029859, + "grad_norm": 0.3140326738357544, + "learning_rate": 8.714903057516658e-06, + "loss": 1.2442, + "step": 18621 + }, + { + "epoch": 5.546491930229528, + "grad_norm": 0.391445130109787, + "learning_rate": 8.713946495071036e-06, + "loss": 1.2062, + "step": 18622 + }, + { + "epoch": 5.546789776429196, + "grad_norm": 0.30727216601371765, + "learning_rate": 8.712989944590707e-06, + "loss": 1.2142, + "step": 18623 + }, + { + "epoch": 5.547087622628865, + "grad_norm": 0.6961880922317505, + "learning_rate": 8.712033406084569e-06, + "loss": 1.2115, + "step": 18624 + }, + { + "epoch": 5.547385468828534, + "grad_norm": 0.2826353907585144, + "learning_rate": 8.711076879561516e-06, + "loss": 1.2188, + "step": 18625 + }, + { + "epoch": 5.547683315028202, + "grad_norm": 0.5970001816749573, + "learning_rate": 8.710120365030456e-06, + "loss": 1.2031, + "step": 18626 + }, + { + "epoch": 5.547981161227871, + "grad_norm": 0.3100568652153015, + "learning_rate": 8.709163862500286e-06, + "loss": 1.2106, + "step": 18627 + }, + { + "epoch": 5.548279007427539, + "grad_norm": 0.3357331156730652, + "learning_rate": 8.708207371979899e-06, + "loss": 1.2049, + "step": 18628 + }, + { + "epoch": 5.548576853627209, + "grad_norm": 0.5884115695953369, + "learning_rate": 8.707250893478205e-06, + "loss": 1.2124, + "step": 18629 + }, + { + "epoch": 5.548874699826877, + "grad_norm": 0.2585620582103729, + "learning_rate": 8.706294427004096e-06, + "loss": 1.2203, + "step": 18630 + }, + { + "epoch": 5.549172546026545, + "grad_norm": 0.585599958896637, + "learning_rate": 8.705337972566468e-06, + "loss": 1.2078, + "step": 18631 + }, + { + "epoch": 5.5494703922262145, + "grad_norm": 0.3208726644515991, + "learning_rate": 8.70438153017423e-06, + "loss": 1.2138, + "step": 18632 + }, + { + "epoch": 5.549768238425883, + "grad_norm": 0.35780036449432373, + "learning_rate": 8.70342509983627e-06, + "loss": 1.2029, + "step": 18633 + }, + { + "epoch": 5.550066084625551, + "grad_norm": 0.39285334944725037, + "learning_rate": 8.702468681561489e-06, + "loss": 1.2305, + "step": 18634 + }, + { + "epoch": 5.55036393082522, + "grad_norm": 0.3960075080394745, + "learning_rate": 8.701512275358792e-06, + "loss": 1.2247, + "step": 18635 + }, + { + "epoch": 5.550661777024889, + "grad_norm": 0.28272077441215515, + "learning_rate": 8.700555881237067e-06, + "loss": 1.2208, + "step": 18636 + }, + { + "epoch": 5.550959623224557, + "grad_norm": 0.35603034496307373, + "learning_rate": 8.699599499205223e-06, + "loss": 1.2145, + "step": 18637 + }, + { + "epoch": 5.551257469424226, + "grad_norm": 0.3324495851993561, + "learning_rate": 8.698643129272153e-06, + "loss": 1.2128, + "step": 18638 + }, + { + "epoch": 5.551555315623895, + "grad_norm": 0.4525752067565918, + "learning_rate": 8.69768677144675e-06, + "loss": 1.1984, + "step": 18639 + }, + { + "epoch": 5.551853161823564, + "grad_norm": 0.31580206751823425, + "learning_rate": 8.696730425737919e-06, + "loss": 1.2108, + "step": 18640 + }, + { + "epoch": 5.552151008023232, + "grad_norm": 0.4952149987220764, + "learning_rate": 8.695774092154557e-06, + "loss": 1.2176, + "step": 18641 + }, + { + "epoch": 5.5524488542229005, + "grad_norm": 0.24655117094516754, + "learning_rate": 8.694817770705556e-06, + "loss": 1.1995, + "step": 18642 + }, + { + "epoch": 5.55274670042257, + "grad_norm": 0.4545476734638214, + "learning_rate": 8.693861461399822e-06, + "loss": 1.2256, + "step": 18643 + }, + { + "epoch": 5.553044546622238, + "grad_norm": 0.2850292921066284, + "learning_rate": 8.692905164246246e-06, + "loss": 1.2114, + "step": 18644 + }, + { + "epoch": 5.553342392821906, + "grad_norm": 0.3026027977466583, + "learning_rate": 8.691948879253725e-06, + "loss": 1.2075, + "step": 18645 + }, + { + "epoch": 5.5536402390215756, + "grad_norm": 0.2584564983844757, + "learning_rate": 8.69099260643116e-06, + "loss": 1.2205, + "step": 18646 + }, + { + "epoch": 5.553938085221244, + "grad_norm": 0.26178839802742004, + "learning_rate": 8.690036345787448e-06, + "loss": 1.2336, + "step": 18647 + }, + { + "epoch": 5.554235931420912, + "grad_norm": 0.31421634554862976, + "learning_rate": 8.68908009733148e-06, + "loss": 1.1965, + "step": 18648 + }, + { + "epoch": 5.554533777620581, + "grad_norm": 0.33658066391944885, + "learning_rate": 8.688123861072161e-06, + "loss": 1.2293, + "step": 18649 + }, + { + "epoch": 5.55483162382025, + "grad_norm": 0.279276967048645, + "learning_rate": 8.687167637018382e-06, + "loss": 1.2275, + "step": 18650 + }, + { + "epoch": 5.555129470019918, + "grad_norm": 0.328046590089798, + "learning_rate": 8.68621142517904e-06, + "loss": 1.2176, + "step": 18651 + }, + { + "epoch": 5.555427316219587, + "grad_norm": 0.41847747564315796, + "learning_rate": 8.685255225563036e-06, + "loss": 1.1987, + "step": 18652 + }, + { + "epoch": 5.555725162419256, + "grad_norm": 0.2721949517726898, + "learning_rate": 8.68429903817926e-06, + "loss": 1.2135, + "step": 18653 + }, + { + "epoch": 5.556023008618924, + "grad_norm": 0.3741403818130493, + "learning_rate": 8.683342863036614e-06, + "loss": 1.2309, + "step": 18654 + }, + { + "epoch": 5.556320854818593, + "grad_norm": 0.26806801557540894, + "learning_rate": 8.682386700143992e-06, + "loss": 1.2134, + "step": 18655 + }, + { + "epoch": 5.5566187010182615, + "grad_norm": 0.33318209648132324, + "learning_rate": 8.681430549510285e-06, + "loss": 1.206, + "step": 18656 + }, + { + "epoch": 5.556916547217931, + "grad_norm": 0.24343553185462952, + "learning_rate": 8.680474411144397e-06, + "loss": 1.2038, + "step": 18657 + }, + { + "epoch": 5.557214393417599, + "grad_norm": 0.31716734170913696, + "learning_rate": 8.679518285055223e-06, + "loss": 1.2201, + "step": 18658 + }, + { + "epoch": 5.557512239617267, + "grad_norm": 0.2625264823436737, + "learning_rate": 8.67856217125165e-06, + "loss": 1.2048, + "step": 18659 + }, + { + "epoch": 5.557810085816937, + "grad_norm": 0.2422669231891632, + "learning_rate": 8.677606069742585e-06, + "loss": 1.2226, + "step": 18660 + }, + { + "epoch": 5.558107932016605, + "grad_norm": 0.260936439037323, + "learning_rate": 8.676649980536917e-06, + "loss": 1.2042, + "step": 18661 + }, + { + "epoch": 5.558405778216273, + "grad_norm": 0.27901342511177063, + "learning_rate": 8.67569390364354e-06, + "loss": 1.2098, + "step": 18662 + }, + { + "epoch": 5.5587036244159425, + "grad_norm": 0.2622801661491394, + "learning_rate": 8.674737839071352e-06, + "loss": 1.2123, + "step": 18663 + }, + { + "epoch": 5.559001470615611, + "grad_norm": 0.28370293974876404, + "learning_rate": 8.67378178682925e-06, + "loss": 1.2006, + "step": 18664 + }, + { + "epoch": 5.559299316815279, + "grad_norm": 0.29662513732910156, + "learning_rate": 8.672825746926124e-06, + "loss": 1.2139, + "step": 18665 + }, + { + "epoch": 5.559597163014948, + "grad_norm": 0.2562781572341919, + "learning_rate": 8.671869719370875e-06, + "loss": 1.2201, + "step": 18666 + }, + { + "epoch": 5.559895009214617, + "grad_norm": 0.24717770516872406, + "learning_rate": 8.670913704172392e-06, + "loss": 1.2094, + "step": 18667 + }, + { + "epoch": 5.560192855414286, + "grad_norm": 0.33261075615882874, + "learning_rate": 8.669957701339571e-06, + "loss": 1.2179, + "step": 18668 + }, + { + "epoch": 5.560490701613954, + "grad_norm": 0.4968486726284027, + "learning_rate": 8.669001710881309e-06, + "loss": 1.2071, + "step": 18669 + }, + { + "epoch": 5.5607885478136225, + "grad_norm": 0.27463075518608093, + "learning_rate": 8.668045732806496e-06, + "loss": 1.2028, + "step": 18670 + }, + { + "epoch": 5.561086394013292, + "grad_norm": 0.615999698638916, + "learning_rate": 8.667089767124036e-06, + "loss": 1.2257, + "step": 18671 + }, + { + "epoch": 5.56138424021296, + "grad_norm": 0.36775702238082886, + "learning_rate": 8.666133813842814e-06, + "loss": 1.211, + "step": 18672 + }, + { + "epoch": 5.561682086412628, + "grad_norm": 0.3622463345527649, + "learning_rate": 8.665177872971722e-06, + "loss": 1.1954, + "step": 18673 + }, + { + "epoch": 5.561979932612298, + "grad_norm": 0.3058466613292694, + "learning_rate": 8.664221944519665e-06, + "loss": 1.2134, + "step": 18674 + }, + { + "epoch": 5.562277778811966, + "grad_norm": 0.3505442142486572, + "learning_rate": 8.663266028495526e-06, + "loss": 1.1974, + "step": 18675 + }, + { + "epoch": 5.562575625011634, + "grad_norm": 0.2794327437877655, + "learning_rate": 8.662310124908202e-06, + "loss": 1.2144, + "step": 18676 + }, + { + "epoch": 5.5628734712113035, + "grad_norm": 0.32328954339027405, + "learning_rate": 8.661354233766593e-06, + "loss": 1.2119, + "step": 18677 + }, + { + "epoch": 5.563171317410972, + "grad_norm": 0.2575834095478058, + "learning_rate": 8.660398355079585e-06, + "loss": 1.2076, + "step": 18678 + }, + { + "epoch": 5.563469163610641, + "grad_norm": 0.30577370524406433, + "learning_rate": 8.65944248885607e-06, + "loss": 1.229, + "step": 18679 + }, + { + "epoch": 5.563767009810309, + "grad_norm": 0.2884595990180969, + "learning_rate": 8.658486635104948e-06, + "loss": 1.2007, + "step": 18680 + }, + { + "epoch": 5.564064856009978, + "grad_norm": 0.4116250276565552, + "learning_rate": 8.657530793835111e-06, + "loss": 1.2152, + "step": 18681 + }, + { + "epoch": 5.564362702209646, + "grad_norm": 0.2661183476448059, + "learning_rate": 8.656574965055445e-06, + "loss": 1.2063, + "step": 18682 + }, + { + "epoch": 5.564660548409315, + "grad_norm": 0.5799434185028076, + "learning_rate": 8.655619148774853e-06, + "loss": 1.2191, + "step": 18683 + }, + { + "epoch": 5.564958394608984, + "grad_norm": 0.47470203042030334, + "learning_rate": 8.654663345002222e-06, + "loss": 1.214, + "step": 18684 + }, + { + "epoch": 5.565256240808653, + "grad_norm": 0.43432360887527466, + "learning_rate": 8.653707553746441e-06, + "loss": 1.2132, + "step": 18685 + }, + { + "epoch": 5.565554087008321, + "grad_norm": 0.6057737469673157, + "learning_rate": 8.652751775016411e-06, + "loss": 1.2212, + "step": 18686 + }, + { + "epoch": 5.565851933207989, + "grad_norm": 0.2544526755809784, + "learning_rate": 8.651796008821018e-06, + "loss": 1.1973, + "step": 18687 + }, + { + "epoch": 5.566149779407659, + "grad_norm": 0.4432823061943054, + "learning_rate": 8.650840255169161e-06, + "loss": 1.2099, + "step": 18688 + }, + { + "epoch": 5.566447625607327, + "grad_norm": 0.2819180488586426, + "learning_rate": 8.649884514069726e-06, + "loss": 1.1989, + "step": 18689 + }, + { + "epoch": 5.566745471806995, + "grad_norm": 0.3026280105113983, + "learning_rate": 8.648928785531606e-06, + "loss": 1.2189, + "step": 18690 + }, + { + "epoch": 5.5670433180066645, + "grad_norm": 0.3398076593875885, + "learning_rate": 8.647973069563696e-06, + "loss": 1.2319, + "step": 18691 + }, + { + "epoch": 5.567341164206333, + "grad_norm": 0.3294442892074585, + "learning_rate": 8.647017366174884e-06, + "loss": 1.2072, + "step": 18692 + }, + { + "epoch": 5.567639010406001, + "grad_norm": 0.316977322101593, + "learning_rate": 8.646061675374062e-06, + "loss": 1.2243, + "step": 18693 + }, + { + "epoch": 5.56793685660567, + "grad_norm": 0.28699055314064026, + "learning_rate": 8.645105997170128e-06, + "loss": 1.2149, + "step": 18694 + }, + { + "epoch": 5.568234702805339, + "grad_norm": 0.3433881402015686, + "learning_rate": 8.64415033157197e-06, + "loss": 1.2225, + "step": 18695 + }, + { + "epoch": 5.568532549005008, + "grad_norm": 0.2621270716190338, + "learning_rate": 8.643194678588473e-06, + "loss": 1.2206, + "step": 18696 + }, + { + "epoch": 5.568830395204676, + "grad_norm": 0.28355029225349426, + "learning_rate": 8.642239038228537e-06, + "loss": 1.2218, + "step": 18697 + }, + { + "epoch": 5.569128241404345, + "grad_norm": 0.41647031903266907, + "learning_rate": 8.641283410501048e-06, + "loss": 1.2096, + "step": 18698 + }, + { + "epoch": 5.569426087604014, + "grad_norm": 0.3549540340900421, + "learning_rate": 8.640327795414898e-06, + "loss": 1.2188, + "step": 18699 + }, + { + "epoch": 5.569723933803682, + "grad_norm": 0.31762734055519104, + "learning_rate": 8.639372192978982e-06, + "loss": 1.1985, + "step": 18700 + }, + { + "epoch": 5.5700217800033505, + "grad_norm": 0.336806982755661, + "learning_rate": 8.638416603202188e-06, + "loss": 1.2094, + "step": 18701 + }, + { + "epoch": 5.57031962620302, + "grad_norm": 0.27429044246673584, + "learning_rate": 8.6374610260934e-06, + "loss": 1.223, + "step": 18702 + }, + { + "epoch": 5.570617472402688, + "grad_norm": 0.29460474848747253, + "learning_rate": 8.636505461661521e-06, + "loss": 1.2174, + "step": 18703 + }, + { + "epoch": 5.570915318602356, + "grad_norm": 0.3771706819534302, + "learning_rate": 8.635549909915433e-06, + "loss": 1.2074, + "step": 18704 + }, + { + "epoch": 5.5712131648020256, + "grad_norm": 0.26914507150650024, + "learning_rate": 8.634594370864029e-06, + "loss": 1.2124, + "step": 18705 + }, + { + "epoch": 5.571511011001694, + "grad_norm": 0.31377163529396057, + "learning_rate": 8.6336388445162e-06, + "loss": 1.2175, + "step": 18706 + }, + { + "epoch": 5.571808857201363, + "grad_norm": 0.3108786344528198, + "learning_rate": 8.632683330880832e-06, + "loss": 1.2009, + "step": 18707 + }, + { + "epoch": 5.572106703401031, + "grad_norm": 0.29319754242897034, + "learning_rate": 8.63172782996682e-06, + "loss": 1.1998, + "step": 18708 + }, + { + "epoch": 5.5724045496007, + "grad_norm": 0.30423447489738464, + "learning_rate": 8.630772341783051e-06, + "loss": 1.2221, + "step": 18709 + }, + { + "epoch": 5.572702395800369, + "grad_norm": 0.36488884687423706, + "learning_rate": 8.629816866338414e-06, + "loss": 1.2176, + "step": 18710 + }, + { + "epoch": 5.573000242000037, + "grad_norm": 0.27655303478240967, + "learning_rate": 8.628861403641804e-06, + "loss": 1.2165, + "step": 18711 + }, + { + "epoch": 5.573298088199706, + "grad_norm": 0.31800028681755066, + "learning_rate": 8.627905953702106e-06, + "loss": 1.2167, + "step": 18712 + }, + { + "epoch": 5.573595934399375, + "grad_norm": 0.282143771648407, + "learning_rate": 8.626950516528208e-06, + "loss": 1.2117, + "step": 18713 + }, + { + "epoch": 5.573893780599043, + "grad_norm": 0.4625825583934784, + "learning_rate": 8.625995092129004e-06, + "loss": 1.2113, + "step": 18714 + }, + { + "epoch": 5.5741916267987115, + "grad_norm": 0.2888241410255432, + "learning_rate": 8.625039680513378e-06, + "loss": 1.2345, + "step": 18715 + }, + { + "epoch": 5.574489472998381, + "grad_norm": 0.4537830054759979, + "learning_rate": 8.624084281690221e-06, + "loss": 1.2212, + "step": 18716 + }, + { + "epoch": 5.574787319198049, + "grad_norm": 0.28274741768836975, + "learning_rate": 8.623128895668426e-06, + "loss": 1.1989, + "step": 18717 + }, + { + "epoch": 5.575085165397717, + "grad_norm": 0.48294222354888916, + "learning_rate": 8.622173522456877e-06, + "loss": 1.2164, + "step": 18718 + }, + { + "epoch": 5.575383011597387, + "grad_norm": 0.2852092385292053, + "learning_rate": 8.621218162064463e-06, + "loss": 1.2211, + "step": 18719 + }, + { + "epoch": 5.575680857797055, + "grad_norm": 0.27762648463249207, + "learning_rate": 8.620262814500074e-06, + "loss": 1.2014, + "step": 18720 + }, + { + "epoch": 5.575978703996723, + "grad_norm": 0.3186171352863312, + "learning_rate": 8.619307479772597e-06, + "loss": 1.2106, + "step": 18721 + }, + { + "epoch": 5.5762765501963925, + "grad_norm": 0.3749951124191284, + "learning_rate": 8.618352157890921e-06, + "loss": 1.2206, + "step": 18722 + }, + { + "epoch": 5.576574396396061, + "grad_norm": 0.28278741240501404, + "learning_rate": 8.617396848863937e-06, + "loss": 1.2109, + "step": 18723 + }, + { + "epoch": 5.57687224259573, + "grad_norm": 0.2718892991542816, + "learning_rate": 8.616441552700528e-06, + "loss": 1.2149, + "step": 18724 + }, + { + "epoch": 5.577170088795398, + "grad_norm": 0.4734122157096863, + "learning_rate": 8.615486269409587e-06, + "loss": 1.221, + "step": 18725 + }, + { + "epoch": 5.577467934995067, + "grad_norm": 0.3258187472820282, + "learning_rate": 8.614530998999996e-06, + "loss": 1.1993, + "step": 18726 + }, + { + "epoch": 5.577765781194736, + "grad_norm": 0.3898892402648926, + "learning_rate": 8.61357574148065e-06, + "loss": 1.2075, + "step": 18727 + }, + { + "epoch": 5.578063627394404, + "grad_norm": 0.32334861159324646, + "learning_rate": 8.61262049686043e-06, + "loss": 1.2034, + "step": 18728 + }, + { + "epoch": 5.5783614735940725, + "grad_norm": 0.3380454182624817, + "learning_rate": 8.61166526514823e-06, + "loss": 1.2182, + "step": 18729 + }, + { + "epoch": 5.578659319793742, + "grad_norm": 0.2916748821735382, + "learning_rate": 8.610710046352929e-06, + "loss": 1.2187, + "step": 18730 + }, + { + "epoch": 5.57895716599341, + "grad_norm": 0.3558313846588135, + "learning_rate": 8.609754840483422e-06, + "loss": 1.208, + "step": 18731 + }, + { + "epoch": 5.579255012193078, + "grad_norm": 0.32697343826293945, + "learning_rate": 8.60879964754859e-06, + "loss": 1.2068, + "step": 18732 + }, + { + "epoch": 5.579552858392748, + "grad_norm": 0.26370006799697876, + "learning_rate": 8.607844467557324e-06, + "loss": 1.2128, + "step": 18733 + }, + { + "epoch": 5.579850704592416, + "grad_norm": 0.3471877872943878, + "learning_rate": 8.60688930051851e-06, + "loss": 1.2152, + "step": 18734 + }, + { + "epoch": 5.580148550792085, + "grad_norm": 0.2663270831108093, + "learning_rate": 8.605934146441032e-06, + "loss": 1.2091, + "step": 18735 + }, + { + "epoch": 5.5804463969917535, + "grad_norm": 0.35175296664237976, + "learning_rate": 8.604979005333784e-06, + "loss": 1.2053, + "step": 18736 + }, + { + "epoch": 5.580744243191422, + "grad_norm": 0.28595876693725586, + "learning_rate": 8.604023877205649e-06, + "loss": 1.2011, + "step": 18737 + }, + { + "epoch": 5.581042089391091, + "grad_norm": 0.3111506998538971, + "learning_rate": 8.603068762065507e-06, + "loss": 1.2178, + "step": 18738 + }, + { + "epoch": 5.581339935590759, + "grad_norm": 0.4185241758823395, + "learning_rate": 8.60211365992225e-06, + "loss": 1.2049, + "step": 18739 + }, + { + "epoch": 5.581637781790428, + "grad_norm": 0.2716710567474365, + "learning_rate": 8.601158570784768e-06, + "loss": 1.2111, + "step": 18740 + }, + { + "epoch": 5.581935627990097, + "grad_norm": 0.4536549746990204, + "learning_rate": 8.600203494661936e-06, + "loss": 1.1988, + "step": 18741 + }, + { + "epoch": 5.582233474189765, + "grad_norm": 0.25362464785575867, + "learning_rate": 8.599248431562652e-06, + "loss": 1.2091, + "step": 18742 + }, + { + "epoch": 5.582531320389434, + "grad_norm": 0.5159428119659424, + "learning_rate": 8.598293381495797e-06, + "loss": 1.224, + "step": 18743 + }, + { + "epoch": 5.582829166589103, + "grad_norm": 0.2671842873096466, + "learning_rate": 8.597338344470252e-06, + "loss": 1.2188, + "step": 18744 + }, + { + "epoch": 5.583127012788771, + "grad_norm": 0.4450357258319855, + "learning_rate": 8.596383320494907e-06, + "loss": 1.2102, + "step": 18745 + }, + { + "epoch": 5.58342485898844, + "grad_norm": 0.2808002233505249, + "learning_rate": 8.595428309578649e-06, + "loss": 1.2119, + "step": 18746 + }, + { + "epoch": 5.583722705188109, + "grad_norm": 0.3414168953895569, + "learning_rate": 8.594473311730357e-06, + "loss": 1.2076, + "step": 18747 + }, + { + "epoch": 5.584020551387777, + "grad_norm": 0.2827218770980835, + "learning_rate": 8.593518326958927e-06, + "loss": 1.1986, + "step": 18748 + }, + { + "epoch": 5.584318397587445, + "grad_norm": 0.45315414667129517, + "learning_rate": 8.592563355273237e-06, + "loss": 1.206, + "step": 18749 + }, + { + "epoch": 5.5846162437871145, + "grad_norm": 0.4909801781177521, + "learning_rate": 8.591608396682168e-06, + "loss": 1.207, + "step": 18750 + }, + { + "epoch": 5.584914089986783, + "grad_norm": 0.28537270426750183, + "learning_rate": 8.59065345119461e-06, + "loss": 1.1959, + "step": 18751 + }, + { + "epoch": 5.585211936186452, + "grad_norm": 0.2702919542789459, + "learning_rate": 8.589698518819445e-06, + "loss": 1.2125, + "step": 18752 + }, + { + "epoch": 5.58550978238612, + "grad_norm": 0.2869855463504791, + "learning_rate": 8.588743599565565e-06, + "loss": 1.2124, + "step": 18753 + }, + { + "epoch": 5.585807628585789, + "grad_norm": 0.2739764451980591, + "learning_rate": 8.58778869344185e-06, + "loss": 1.2187, + "step": 18754 + }, + { + "epoch": 5.586105474785458, + "grad_norm": 0.2951570153236389, + "learning_rate": 8.586833800457178e-06, + "loss": 1.1987, + "step": 18755 + }, + { + "epoch": 5.586403320985126, + "grad_norm": 0.2770441770553589, + "learning_rate": 8.585878920620442e-06, + "loss": 1.2057, + "step": 18756 + }, + { + "epoch": 5.586701167184795, + "grad_norm": 0.3250736594200134, + "learning_rate": 8.58492405394052e-06, + "loss": 1.2193, + "step": 18757 + }, + { + "epoch": 5.586999013384464, + "grad_norm": 0.2726987600326538, + "learning_rate": 8.5839692004263e-06, + "loss": 1.2042, + "step": 18758 + }, + { + "epoch": 5.587296859584132, + "grad_norm": 0.35027015209198, + "learning_rate": 8.583014360086666e-06, + "loss": 1.2219, + "step": 18759 + }, + { + "epoch": 5.5875947057838005, + "grad_norm": 0.26397666335105896, + "learning_rate": 8.5820595329305e-06, + "loss": 1.2154, + "step": 18760 + }, + { + "epoch": 5.58789255198347, + "grad_norm": 0.297736257314682, + "learning_rate": 8.581104718966683e-06, + "loss": 1.1915, + "step": 18761 + }, + { + "epoch": 5.588190398183138, + "grad_norm": 0.2567819356918335, + "learning_rate": 8.580149918204103e-06, + "loss": 1.2153, + "step": 18762 + }, + { + "epoch": 5.588488244382807, + "grad_norm": 0.25121933221817017, + "learning_rate": 8.579195130651643e-06, + "loss": 1.2009, + "step": 18763 + }, + { + "epoch": 5.5887860905824756, + "grad_norm": 0.2932000160217285, + "learning_rate": 8.578240356318182e-06, + "loss": 1.2024, + "step": 18764 + }, + { + "epoch": 5.589083936782144, + "grad_norm": 0.2614770531654358, + "learning_rate": 8.577285595212609e-06, + "loss": 1.2157, + "step": 18765 + }, + { + "epoch": 5.589381782981813, + "grad_norm": 0.4117249548435211, + "learning_rate": 8.576330847343805e-06, + "loss": 1.2268, + "step": 18766 + }, + { + "epoch": 5.589679629181481, + "grad_norm": 0.2886642813682556, + "learning_rate": 8.575376112720646e-06, + "loss": 1.2096, + "step": 18767 + }, + { + "epoch": 5.58997747538115, + "grad_norm": 0.6023201942443848, + "learning_rate": 8.574421391352025e-06, + "loss": 1.2227, + "step": 18768 + }, + { + "epoch": 5.590275321580819, + "grad_norm": 0.3868212401866913, + "learning_rate": 8.573466683246818e-06, + "loss": 1.197, + "step": 18769 + }, + { + "epoch": 5.590573167780487, + "grad_norm": 0.555033266544342, + "learning_rate": 8.572511988413911e-06, + "loss": 1.2141, + "step": 18770 + }, + { + "epoch": 5.590871013980156, + "grad_norm": 0.41188016533851624, + "learning_rate": 8.571557306862187e-06, + "loss": 1.1943, + "step": 18771 + }, + { + "epoch": 5.591168860179825, + "grad_norm": 0.30140528082847595, + "learning_rate": 8.570602638600522e-06, + "loss": 1.2171, + "step": 18772 + }, + { + "epoch": 5.591466706379493, + "grad_norm": 0.2842493951320648, + "learning_rate": 8.569647983637807e-06, + "loss": 1.2128, + "step": 18773 + }, + { + "epoch": 5.591764552579162, + "grad_norm": 0.28710734844207764, + "learning_rate": 8.568693341982916e-06, + "loss": 1.2281, + "step": 18774 + }, + { + "epoch": 5.592062398778831, + "grad_norm": 0.3069527745246887, + "learning_rate": 8.567738713644733e-06, + "loss": 1.2262, + "step": 18775 + }, + { + "epoch": 5.592360244978499, + "grad_norm": 0.2823931872844696, + "learning_rate": 8.566784098632145e-06, + "loss": 1.2168, + "step": 18776 + }, + { + "epoch": 5.592658091178168, + "grad_norm": 0.2592270076274872, + "learning_rate": 8.565829496954031e-06, + "loss": 1.2094, + "step": 18777 + }, + { + "epoch": 5.592955937377837, + "grad_norm": 0.2832771837711334, + "learning_rate": 8.564874908619267e-06, + "loss": 1.2212, + "step": 18778 + }, + { + "epoch": 5.593253783577505, + "grad_norm": 0.3229985535144806, + "learning_rate": 8.563920333636741e-06, + "loss": 1.2213, + "step": 18779 + }, + { + "epoch": 5.593551629777174, + "grad_norm": 0.36390164494514465, + "learning_rate": 8.56296577201533e-06, + "loss": 1.2064, + "step": 18780 + }, + { + "epoch": 5.5938494759768425, + "grad_norm": 0.3118399977684021, + "learning_rate": 8.562011223763915e-06, + "loss": 1.2143, + "step": 18781 + }, + { + "epoch": 5.594147322176511, + "grad_norm": 0.3940849304199219, + "learning_rate": 8.561056688891384e-06, + "loss": 1.2118, + "step": 18782 + }, + { + "epoch": 5.59444516837618, + "grad_norm": 0.32886838912963867, + "learning_rate": 8.560102167406613e-06, + "loss": 1.2108, + "step": 18783 + }, + { + "epoch": 5.594743014575848, + "grad_norm": 0.4006761610507965, + "learning_rate": 8.559147659318477e-06, + "loss": 1.2062, + "step": 18784 + }, + { + "epoch": 5.595040860775517, + "grad_norm": 0.36049771308898926, + "learning_rate": 8.558193164635867e-06, + "loss": 1.2015, + "step": 18785 + }, + { + "epoch": 5.595338706975186, + "grad_norm": 0.5363227128982544, + "learning_rate": 8.557238683367654e-06, + "loss": 1.2056, + "step": 18786 + }, + { + "epoch": 5.595636553174854, + "grad_norm": 0.2785434126853943, + "learning_rate": 8.556284215522728e-06, + "loss": 1.2278, + "step": 18787 + }, + { + "epoch": 5.5959343993745225, + "grad_norm": 0.4219331443309784, + "learning_rate": 8.555329761109964e-06, + "loss": 1.2122, + "step": 18788 + }, + { + "epoch": 5.596232245574192, + "grad_norm": 0.353397011756897, + "learning_rate": 8.55437532013824e-06, + "loss": 1.2174, + "step": 18789 + }, + { + "epoch": 5.59653009177386, + "grad_norm": 0.37983760237693787, + "learning_rate": 8.55342089261644e-06, + "loss": 1.2183, + "step": 18790 + }, + { + "epoch": 5.596827937973529, + "grad_norm": 0.47747302055358887, + "learning_rate": 8.55246647855344e-06, + "loss": 1.2159, + "step": 18791 + }, + { + "epoch": 5.597125784173198, + "grad_norm": 0.3402019143104553, + "learning_rate": 8.551512077958125e-06, + "loss": 1.2093, + "step": 18792 + }, + { + "epoch": 5.597423630372866, + "grad_norm": 0.43803659081459045, + "learning_rate": 8.55055769083937e-06, + "loss": 1.2114, + "step": 18793 + }, + { + "epoch": 5.597721476572535, + "grad_norm": 0.2545213997364044, + "learning_rate": 8.549603317206058e-06, + "loss": 1.2043, + "step": 18794 + }, + { + "epoch": 5.5980193227722035, + "grad_norm": 0.4699476361274719, + "learning_rate": 8.548648957067065e-06, + "loss": 1.2068, + "step": 18795 + }, + { + "epoch": 5.598317168971872, + "grad_norm": 0.36989253759384155, + "learning_rate": 8.547694610431275e-06, + "loss": 1.1999, + "step": 18796 + }, + { + "epoch": 5.598615015171541, + "grad_norm": 0.6188362836837769, + "learning_rate": 8.54674027730756e-06, + "loss": 1.2172, + "step": 18797 + }, + { + "epoch": 5.598912861371209, + "grad_norm": 0.349924236536026, + "learning_rate": 8.545785957704803e-06, + "loss": 1.2266, + "step": 18798 + }, + { + "epoch": 5.599210707570878, + "grad_norm": 0.2815835475921631, + "learning_rate": 8.544831651631887e-06, + "loss": 1.2155, + "step": 18799 + }, + { + "epoch": 5.599508553770547, + "grad_norm": 0.6178797483444214, + "learning_rate": 8.543877359097685e-06, + "loss": 1.2115, + "step": 18800 + }, + { + "epoch": 5.599806399970215, + "grad_norm": 0.30827033519744873, + "learning_rate": 8.542923080111074e-06, + "loss": 1.2162, + "step": 18801 + }, + { + "epoch": 5.6001042461698844, + "grad_norm": 0.6834231615066528, + "learning_rate": 8.54196881468094e-06, + "loss": 1.2137, + "step": 18802 + }, + { + "epoch": 5.600402092369553, + "grad_norm": 0.27194470167160034, + "learning_rate": 8.541014562816155e-06, + "loss": 1.2091, + "step": 18803 + }, + { + "epoch": 5.600699938569221, + "grad_norm": 0.5771031379699707, + "learning_rate": 8.540060324525599e-06, + "loss": 1.2368, + "step": 18804 + }, + { + "epoch": 5.60099778476889, + "grad_norm": 0.31596338748931885, + "learning_rate": 8.539106099818153e-06, + "loss": 1.2241, + "step": 18805 + }, + { + "epoch": 5.601295630968559, + "grad_norm": 0.5005790591239929, + "learning_rate": 8.538151888702689e-06, + "loss": 1.2164, + "step": 18806 + }, + { + "epoch": 5.601593477168227, + "grad_norm": 0.48484519124031067, + "learning_rate": 8.53719769118809e-06, + "loss": 1.2168, + "step": 18807 + }, + { + "epoch": 5.601891323367896, + "grad_norm": 0.37810784578323364, + "learning_rate": 8.536243507283237e-06, + "loss": 1.2111, + "step": 18808 + }, + { + "epoch": 5.6021891695675645, + "grad_norm": 0.4823022782802582, + "learning_rate": 8.535289336996994e-06, + "loss": 1.2149, + "step": 18809 + }, + { + "epoch": 5.602487015767233, + "grad_norm": 0.2944742441177368, + "learning_rate": 8.534335180338252e-06, + "loss": 1.2198, + "step": 18810 + }, + { + "epoch": 5.602784861966902, + "grad_norm": 0.5242235064506531, + "learning_rate": 8.533381037315886e-06, + "loss": 1.2257, + "step": 18811 + }, + { + "epoch": 5.60308270816657, + "grad_norm": 0.3142121136188507, + "learning_rate": 8.532426907938766e-06, + "loss": 1.2181, + "step": 18812 + }, + { + "epoch": 5.60338055436624, + "grad_norm": 0.6312505602836609, + "learning_rate": 8.531472792215777e-06, + "loss": 1.2207, + "step": 18813 + }, + { + "epoch": 5.603678400565908, + "grad_norm": 0.2981034517288208, + "learning_rate": 8.530518690155791e-06, + "loss": 1.2019, + "step": 18814 + }, + { + "epoch": 5.603976246765576, + "grad_norm": 0.47545936703681946, + "learning_rate": 8.529564601767688e-06, + "loss": 1.2126, + "step": 18815 + }, + { + "epoch": 5.604274092965245, + "grad_norm": 0.33774271607398987, + "learning_rate": 8.528610527060343e-06, + "loss": 1.205, + "step": 18816 + }, + { + "epoch": 5.604571939164914, + "grad_norm": 0.5593737959861755, + "learning_rate": 8.527656466042635e-06, + "loss": 1.2098, + "step": 18817 + }, + { + "epoch": 5.604869785364582, + "grad_norm": 0.2931670844554901, + "learning_rate": 8.526702418723434e-06, + "loss": 1.2033, + "step": 18818 + }, + { + "epoch": 5.605167631564251, + "grad_norm": 0.3417098820209503, + "learning_rate": 8.525748385111627e-06, + "loss": 1.2194, + "step": 18819 + }, + { + "epoch": 5.60546547776392, + "grad_norm": 0.30952945351600647, + "learning_rate": 8.524794365216079e-06, + "loss": 1.2114, + "step": 18820 + }, + { + "epoch": 5.605763323963588, + "grad_norm": 0.28921374678611755, + "learning_rate": 8.523840359045671e-06, + "loss": 1.2054, + "step": 18821 + }, + { + "epoch": 5.606061170163257, + "grad_norm": 0.4081917703151703, + "learning_rate": 8.522886366609284e-06, + "loss": 1.2007, + "step": 18822 + }, + { + "epoch": 5.6063590163629256, + "grad_norm": 0.27021026611328125, + "learning_rate": 8.521932387915785e-06, + "loss": 1.2161, + "step": 18823 + }, + { + "epoch": 5.606656862562594, + "grad_norm": 0.3343936800956726, + "learning_rate": 8.520978422974056e-06, + "loss": 1.2213, + "step": 18824 + }, + { + "epoch": 5.606954708762263, + "grad_norm": 0.38951677083969116, + "learning_rate": 8.520024471792973e-06, + "loss": 1.2167, + "step": 18825 + }, + { + "epoch": 5.607252554961931, + "grad_norm": 0.28023195266723633, + "learning_rate": 8.519070534381402e-06, + "loss": 1.212, + "step": 18826 + }, + { + "epoch": 5.6075504011616, + "grad_norm": 0.3627184331417084, + "learning_rate": 8.518116610748229e-06, + "loss": 1.2015, + "step": 18827 + }, + { + "epoch": 5.607848247361269, + "grad_norm": 0.4622196555137634, + "learning_rate": 8.517162700902327e-06, + "loss": 1.2171, + "step": 18828 + }, + { + "epoch": 5.608146093560937, + "grad_norm": 0.24727940559387207, + "learning_rate": 8.516208804852563e-06, + "loss": 1.2236, + "step": 18829 + }, + { + "epoch": 5.6084439397606065, + "grad_norm": 0.49555546045303345, + "learning_rate": 8.515254922607826e-06, + "loss": 1.2069, + "step": 18830 + }, + { + "epoch": 5.608741785960275, + "grad_norm": 0.33231067657470703, + "learning_rate": 8.51430105417698e-06, + "loss": 1.2274, + "step": 18831 + }, + { + "epoch": 5.609039632159943, + "grad_norm": 0.3582731783390045, + "learning_rate": 8.5133471995689e-06, + "loss": 1.221, + "step": 18832 + }, + { + "epoch": 5.609337478359612, + "grad_norm": 0.29329031705856323, + "learning_rate": 8.512393358792464e-06, + "loss": 1.2344, + "step": 18833 + }, + { + "epoch": 5.609635324559281, + "grad_norm": 0.8112807273864746, + "learning_rate": 8.511439531856546e-06, + "loss": 1.2053, + "step": 18834 + }, + { + "epoch": 5.609933170758949, + "grad_norm": 0.42413321137428284, + "learning_rate": 8.510485718770023e-06, + "loss": 1.2241, + "step": 18835 + }, + { + "epoch": 5.610231016958618, + "grad_norm": 0.509304404258728, + "learning_rate": 8.509531919541766e-06, + "loss": 1.2085, + "step": 18836 + }, + { + "epoch": 5.610528863158287, + "grad_norm": 0.2927802801132202, + "learning_rate": 8.508578134180646e-06, + "loss": 1.222, + "step": 18837 + }, + { + "epoch": 5.610826709357955, + "grad_norm": 0.6966512799263, + "learning_rate": 8.507624362695544e-06, + "loss": 1.1927, + "step": 18838 + }, + { + "epoch": 5.611124555557624, + "grad_norm": 0.2601867914199829, + "learning_rate": 8.506670605095327e-06, + "loss": 1.2122, + "step": 18839 + }, + { + "epoch": 5.6114224017572925, + "grad_norm": 0.619441568851471, + "learning_rate": 8.50571686138887e-06, + "loss": 1.224, + "step": 18840 + }, + { + "epoch": 5.611720247956962, + "grad_norm": 0.25965413451194763, + "learning_rate": 8.504763131585051e-06, + "loss": 1.2239, + "step": 18841 + }, + { + "epoch": 5.61201809415663, + "grad_norm": 0.41319066286087036, + "learning_rate": 8.503809415692743e-06, + "loss": 1.2229, + "step": 18842 + }, + { + "epoch": 5.612315940356298, + "grad_norm": 0.4615105390548706, + "learning_rate": 8.50285571372081e-06, + "loss": 1.236, + "step": 18843 + }, + { + "epoch": 5.6126137865559675, + "grad_norm": 0.3761954605579376, + "learning_rate": 8.501902025678139e-06, + "loss": 1.2214, + "step": 18844 + }, + { + "epoch": 5.612911632755636, + "grad_norm": 0.6200788617134094, + "learning_rate": 8.500948351573591e-06, + "loss": 1.1989, + "step": 18845 + }, + { + "epoch": 5.613209478955304, + "grad_norm": 0.24761039018630981, + "learning_rate": 8.499994691416043e-06, + "loss": 1.2109, + "step": 18846 + }, + { + "epoch": 5.613507325154973, + "grad_norm": 0.6030553579330444, + "learning_rate": 8.499041045214373e-06, + "loss": 1.2018, + "step": 18847 + }, + { + "epoch": 5.613805171354642, + "grad_norm": 0.42672887444496155, + "learning_rate": 8.498087412977448e-06, + "loss": 1.2113, + "step": 18848 + }, + { + "epoch": 5.61410301755431, + "grad_norm": 0.44532349705696106, + "learning_rate": 8.497133794714138e-06, + "loss": 1.2105, + "step": 18849 + }, + { + "epoch": 5.614400863753979, + "grad_norm": 0.624701976776123, + "learning_rate": 8.49618019043332e-06, + "loss": 1.2215, + "step": 18850 + }, + { + "epoch": 5.614698709953648, + "grad_norm": 0.2653880715370178, + "learning_rate": 8.495226600143867e-06, + "loss": 1.2222, + "step": 18851 + }, + { + "epoch": 5.614996556153316, + "grad_norm": 0.5351306200027466, + "learning_rate": 8.494273023854649e-06, + "loss": 1.1984, + "step": 18852 + }, + { + "epoch": 5.615294402352985, + "grad_norm": 0.38506999611854553, + "learning_rate": 8.493319461574539e-06, + "loss": 1.1987, + "step": 18853 + }, + { + "epoch": 5.6155922485526535, + "grad_norm": 0.38100340962409973, + "learning_rate": 8.492365913312404e-06, + "loss": 1.204, + "step": 18854 + }, + { + "epoch": 5.615890094752322, + "grad_norm": 0.44310852885246277, + "learning_rate": 8.491412379077125e-06, + "loss": 1.2164, + "step": 18855 + }, + { + "epoch": 5.616187940951991, + "grad_norm": 0.32990390062332153, + "learning_rate": 8.490458858877566e-06, + "loss": 1.221, + "step": 18856 + }, + { + "epoch": 5.616485787151659, + "grad_norm": 0.4279313385486603, + "learning_rate": 8.489505352722598e-06, + "loss": 1.2042, + "step": 18857 + }, + { + "epoch": 5.616783633351329, + "grad_norm": 0.4555051922798157, + "learning_rate": 8.4885518606211e-06, + "loss": 1.2099, + "step": 18858 + }, + { + "epoch": 5.617081479550997, + "grad_norm": 0.2626411020755768, + "learning_rate": 8.487598382581939e-06, + "loss": 1.218, + "step": 18859 + }, + { + "epoch": 5.617379325750665, + "grad_norm": 0.534856915473938, + "learning_rate": 8.48664491861398e-06, + "loss": 1.2085, + "step": 18860 + }, + { + "epoch": 5.6176771719503344, + "grad_norm": 0.3554114103317261, + "learning_rate": 8.485691468726103e-06, + "loss": 1.2312, + "step": 18861 + }, + { + "epoch": 5.617975018150003, + "grad_norm": 0.2575479745864868, + "learning_rate": 8.484738032927173e-06, + "loss": 1.2201, + "step": 18862 + }, + { + "epoch": 5.618272864349671, + "grad_norm": 0.34061622619628906, + "learning_rate": 8.48378461122606e-06, + "loss": 1.2132, + "step": 18863 + }, + { + "epoch": 5.61857071054934, + "grad_norm": 0.37505868077278137, + "learning_rate": 8.482831203631643e-06, + "loss": 1.2189, + "step": 18864 + }, + { + "epoch": 5.618868556749009, + "grad_norm": 0.25552624464035034, + "learning_rate": 8.481877810152785e-06, + "loss": 1.2091, + "step": 18865 + }, + { + "epoch": 5.619166402948677, + "grad_norm": 0.35332217812538147, + "learning_rate": 8.480924430798356e-06, + "loss": 1.2025, + "step": 18866 + }, + { + "epoch": 5.619464249148346, + "grad_norm": 0.27599403262138367, + "learning_rate": 8.47997106557723e-06, + "loss": 1.2023, + "step": 18867 + }, + { + "epoch": 5.6197620953480145, + "grad_norm": 0.2798996567726135, + "learning_rate": 8.479017714498272e-06, + "loss": 1.205, + "step": 18868 + }, + { + "epoch": 5.620059941547684, + "grad_norm": 0.2830234467983246, + "learning_rate": 8.478064377570356e-06, + "loss": 1.2093, + "step": 18869 + }, + { + "epoch": 5.620357787747352, + "grad_norm": 0.24677562713623047, + "learning_rate": 8.477111054802353e-06, + "loss": 1.2056, + "step": 18870 + }, + { + "epoch": 5.62065563394702, + "grad_norm": 0.2698245048522949, + "learning_rate": 8.476157746203127e-06, + "loss": 1.2119, + "step": 18871 + }, + { + "epoch": 5.62095348014669, + "grad_norm": 0.318942666053772, + "learning_rate": 8.475204451781552e-06, + "loss": 1.21, + "step": 18872 + }, + { + "epoch": 5.621251326346358, + "grad_norm": 0.3281908929347992, + "learning_rate": 8.474251171546497e-06, + "loss": 1.2293, + "step": 18873 + }, + { + "epoch": 5.621549172546026, + "grad_norm": 0.27140331268310547, + "learning_rate": 8.473297905506827e-06, + "loss": 1.2012, + "step": 18874 + }, + { + "epoch": 5.6218470187456955, + "grad_norm": 0.32080644369125366, + "learning_rate": 8.472344653671418e-06, + "loss": 1.221, + "step": 18875 + }, + { + "epoch": 5.622144864945364, + "grad_norm": 0.42807522416114807, + "learning_rate": 8.471391416049135e-06, + "loss": 1.2233, + "step": 18876 + }, + { + "epoch": 5.622442711145032, + "grad_norm": 0.33594784140586853, + "learning_rate": 8.470438192648842e-06, + "loss": 1.2104, + "step": 18877 + }, + { + "epoch": 5.622740557344701, + "grad_norm": 0.3213953375816345, + "learning_rate": 8.469484983479418e-06, + "loss": 1.2106, + "step": 18878 + }, + { + "epoch": 5.62303840354437, + "grad_norm": 0.4297904372215271, + "learning_rate": 8.468531788549725e-06, + "loss": 1.2107, + "step": 18879 + }, + { + "epoch": 5.623336249744039, + "grad_norm": 0.28557974100112915, + "learning_rate": 8.467578607868632e-06, + "loss": 1.2018, + "step": 18880 + }, + { + "epoch": 5.623634095943707, + "grad_norm": 0.2677142322063446, + "learning_rate": 8.466625441445008e-06, + "loss": 1.2246, + "step": 18881 + }, + { + "epoch": 5.6239319421433756, + "grad_norm": 0.2982311546802521, + "learning_rate": 8.465672289287723e-06, + "loss": 1.1946, + "step": 18882 + }, + { + "epoch": 5.624229788343044, + "grad_norm": 0.25173237919807434, + "learning_rate": 8.464719151405637e-06, + "loss": 1.2024, + "step": 18883 + }, + { + "epoch": 5.624527634542713, + "grad_norm": 0.3390488922595978, + "learning_rate": 8.463766027807631e-06, + "loss": 1.2011, + "step": 18884 + }, + { + "epoch": 5.624825480742381, + "grad_norm": 0.2797108292579651, + "learning_rate": 8.462812918502561e-06, + "loss": 1.2204, + "step": 18885 + }, + { + "epoch": 5.625123326942051, + "grad_norm": 0.403302937746048, + "learning_rate": 8.461859823499301e-06, + "loss": 1.2188, + "step": 18886 + }, + { + "epoch": 5.625421173141719, + "grad_norm": 0.2568717300891876, + "learning_rate": 8.460906742806719e-06, + "loss": 1.2233, + "step": 18887 + }, + { + "epoch": 5.625719019341387, + "grad_norm": 0.27788710594177246, + "learning_rate": 8.459953676433676e-06, + "loss": 1.2051, + "step": 18888 + }, + { + "epoch": 5.6260168655410565, + "grad_norm": 0.35678115487098694, + "learning_rate": 8.459000624389048e-06, + "loss": 1.2251, + "step": 18889 + }, + { + "epoch": 5.626314711740725, + "grad_norm": 0.31350693106651306, + "learning_rate": 8.458047586681697e-06, + "loss": 1.2183, + "step": 18890 + }, + { + "epoch": 5.626612557940393, + "grad_norm": 0.31936779618263245, + "learning_rate": 8.457094563320486e-06, + "loss": 1.2098, + "step": 18891 + }, + { + "epoch": 5.626910404140062, + "grad_norm": 0.33972495794296265, + "learning_rate": 8.456141554314289e-06, + "loss": 1.2224, + "step": 18892 + }, + { + "epoch": 5.627208250339731, + "grad_norm": 0.32694512605667114, + "learning_rate": 8.455188559671972e-06, + "loss": 1.2177, + "step": 18893 + }, + { + "epoch": 5.627506096539399, + "grad_norm": 0.2845608592033386, + "learning_rate": 8.454235579402395e-06, + "loss": 1.2339, + "step": 18894 + }, + { + "epoch": 5.627803942739068, + "grad_norm": 0.5377947092056274, + "learning_rate": 8.453282613514432e-06, + "loss": 1.2087, + "step": 18895 + }, + { + "epoch": 5.628101788938737, + "grad_norm": 0.41585099697113037, + "learning_rate": 8.452329662016946e-06, + "loss": 1.194, + "step": 18896 + }, + { + "epoch": 5.628399635138406, + "grad_norm": 0.4627498388290405, + "learning_rate": 8.451376724918802e-06, + "loss": 1.1897, + "step": 18897 + }, + { + "epoch": 5.628697481338074, + "grad_norm": 0.320951372385025, + "learning_rate": 8.450423802228868e-06, + "loss": 1.2084, + "step": 18898 + }, + { + "epoch": 5.6289953275377425, + "grad_norm": 0.5758249759674072, + "learning_rate": 8.449470893956012e-06, + "loss": 1.2189, + "step": 18899 + }, + { + "epoch": 5.629293173737412, + "grad_norm": 0.29355067014694214, + "learning_rate": 8.44851800010909e-06, + "loss": 1.2192, + "step": 18900 + }, + { + "epoch": 5.62959101993708, + "grad_norm": 0.44087085127830505, + "learning_rate": 8.447565120696982e-06, + "loss": 1.2107, + "step": 18901 + }, + { + "epoch": 5.629888866136748, + "grad_norm": 0.2866421341896057, + "learning_rate": 8.44661225572854e-06, + "loss": 1.2108, + "step": 18902 + }, + { + "epoch": 5.6301867123364175, + "grad_norm": 0.2748970091342926, + "learning_rate": 8.445659405212641e-06, + "loss": 1.1986, + "step": 18903 + }, + { + "epoch": 5.630484558536086, + "grad_norm": 0.37564316391944885, + "learning_rate": 8.444706569158141e-06, + "loss": 1.2284, + "step": 18904 + }, + { + "epoch": 5.630782404735754, + "grad_norm": 0.29964256286621094, + "learning_rate": 8.443753747573907e-06, + "loss": 1.219, + "step": 18905 + }, + { + "epoch": 5.631080250935423, + "grad_norm": 0.3106033504009247, + "learning_rate": 8.442800940468811e-06, + "loss": 1.1973, + "step": 18906 + }, + { + "epoch": 5.631378097135092, + "grad_norm": 0.473921537399292, + "learning_rate": 8.44184814785171e-06, + "loss": 1.2174, + "step": 18907 + }, + { + "epoch": 5.631675943334761, + "grad_norm": 0.38011687994003296, + "learning_rate": 8.44089536973147e-06, + "loss": 1.2218, + "step": 18908 + }, + { + "epoch": 5.631973789534429, + "grad_norm": 0.5878388285636902, + "learning_rate": 8.439942606116958e-06, + "loss": 1.2024, + "step": 18909 + }, + { + "epoch": 5.632271635734098, + "grad_norm": 0.35579267144203186, + "learning_rate": 8.438989857017036e-06, + "loss": 1.2191, + "step": 18910 + }, + { + "epoch": 5.632569481933767, + "grad_norm": 0.4992106854915619, + "learning_rate": 8.438037122440568e-06, + "loss": 1.216, + "step": 18911 + }, + { + "epoch": 5.632867328133435, + "grad_norm": 0.32686930894851685, + "learning_rate": 8.437084402396424e-06, + "loss": 1.1955, + "step": 18912 + }, + { + "epoch": 5.6331651743331035, + "grad_norm": 0.3842843770980835, + "learning_rate": 8.436131696893462e-06, + "loss": 1.205, + "step": 18913 + }, + { + "epoch": 5.633463020532773, + "grad_norm": 0.40132343769073486, + "learning_rate": 8.435179005940545e-06, + "loss": 1.226, + "step": 18914 + }, + { + "epoch": 5.633760866732441, + "grad_norm": 0.3229738473892212, + "learning_rate": 8.43422632954654e-06, + "loss": 1.2151, + "step": 18915 + }, + { + "epoch": 5.634058712932109, + "grad_norm": 0.4361542761325836, + "learning_rate": 8.433273667720311e-06, + "loss": 1.1996, + "step": 18916 + }, + { + "epoch": 5.634356559131779, + "grad_norm": 0.3054964244365692, + "learning_rate": 8.432321020470716e-06, + "loss": 1.2243, + "step": 18917 + }, + { + "epoch": 5.634654405331447, + "grad_norm": 0.4058511555194855, + "learning_rate": 8.431368387806628e-06, + "loss": 1.2217, + "step": 18918 + }, + { + "epoch": 5.634952251531116, + "grad_norm": 0.27990689873695374, + "learning_rate": 8.430415769736899e-06, + "loss": 1.2277, + "step": 18919 + }, + { + "epoch": 5.6352500977307844, + "grad_norm": 0.372094988822937, + "learning_rate": 8.429463166270402e-06, + "loss": 1.2076, + "step": 18920 + }, + { + "epoch": 5.635547943930453, + "grad_norm": 0.2758762538433075, + "learning_rate": 8.428510577415994e-06, + "loss": 1.2251, + "step": 18921 + }, + { + "epoch": 5.635845790130121, + "grad_norm": 0.4484846591949463, + "learning_rate": 8.427558003182537e-06, + "loss": 1.2135, + "step": 18922 + }, + { + "epoch": 5.63614363632979, + "grad_norm": 0.28194659948349, + "learning_rate": 8.4266054435789e-06, + "loss": 1.2089, + "step": 18923 + }, + { + "epoch": 5.636441482529459, + "grad_norm": 0.3799077868461609, + "learning_rate": 8.42565289861394e-06, + "loss": 1.1986, + "step": 18924 + }, + { + "epoch": 5.636739328729128, + "grad_norm": 0.297743558883667, + "learning_rate": 8.42470036829652e-06, + "loss": 1.2172, + "step": 18925 + }, + { + "epoch": 5.637037174928796, + "grad_norm": 0.3965313136577606, + "learning_rate": 8.423747852635505e-06, + "loss": 1.2199, + "step": 18926 + }, + { + "epoch": 5.6373350211284645, + "grad_norm": 0.3734728693962097, + "learning_rate": 8.422795351639753e-06, + "loss": 1.2079, + "step": 18927 + }, + { + "epoch": 5.637632867328134, + "grad_norm": 0.43451786041259766, + "learning_rate": 8.421842865318126e-06, + "loss": 1.2205, + "step": 18928 + }, + { + "epoch": 5.637930713527802, + "grad_norm": 0.45394381880760193, + "learning_rate": 8.420890393679493e-06, + "loss": 1.207, + "step": 18929 + }, + { + "epoch": 5.63822855972747, + "grad_norm": 0.3637072443962097, + "learning_rate": 8.41993793673271e-06, + "loss": 1.205, + "step": 18930 + }, + { + "epoch": 5.63852640592714, + "grad_norm": 0.5764725208282471, + "learning_rate": 8.418985494486634e-06, + "loss": 1.2403, + "step": 18931 + }, + { + "epoch": 5.638824252126808, + "grad_norm": 0.2863178551197052, + "learning_rate": 8.418033066950135e-06, + "loss": 1.2124, + "step": 18932 + }, + { + "epoch": 5.639122098326476, + "grad_norm": 0.5326183438301086, + "learning_rate": 8.417080654132073e-06, + "loss": 1.2185, + "step": 18933 + }, + { + "epoch": 5.6394199445261455, + "grad_norm": 0.26585251092910767, + "learning_rate": 8.416128256041303e-06, + "loss": 1.213, + "step": 18934 + }, + { + "epoch": 5.639717790725814, + "grad_norm": 0.528961181640625, + "learning_rate": 8.415175872686692e-06, + "loss": 1.2138, + "step": 18935 + }, + { + "epoch": 5.640015636925483, + "grad_norm": 0.2646341919898987, + "learning_rate": 8.414223504077097e-06, + "loss": 1.2055, + "step": 18936 + }, + { + "epoch": 5.640313483125151, + "grad_norm": 0.45840033888816833, + "learning_rate": 8.413271150221385e-06, + "loss": 1.2116, + "step": 18937 + }, + { + "epoch": 5.64061132932482, + "grad_norm": 0.3243200182914734, + "learning_rate": 8.412318811128408e-06, + "loss": 1.208, + "step": 18938 + }, + { + "epoch": 5.640909175524489, + "grad_norm": 0.3127284049987793, + "learning_rate": 8.411366486807032e-06, + "loss": 1.2072, + "step": 18939 + }, + { + "epoch": 5.641207021724157, + "grad_norm": 0.44704189896583557, + "learning_rate": 8.410414177266115e-06, + "loss": 1.2271, + "step": 18940 + }, + { + "epoch": 5.6415048679238256, + "grad_norm": 0.3079887330532074, + "learning_rate": 8.409461882514522e-06, + "loss": 1.2151, + "step": 18941 + }, + { + "epoch": 5.641802714123495, + "grad_norm": 0.35030487179756165, + "learning_rate": 8.408509602561104e-06, + "loss": 1.2223, + "step": 18942 + }, + { + "epoch": 5.642100560323163, + "grad_norm": 0.2610160708427429, + "learning_rate": 8.407557337414729e-06, + "loss": 1.2239, + "step": 18943 + }, + { + "epoch": 5.642398406522831, + "grad_norm": 0.25979888439178467, + "learning_rate": 8.406605087084252e-06, + "loss": 1.2207, + "step": 18944 + }, + { + "epoch": 5.642696252722501, + "grad_norm": 0.3337382376194, + "learning_rate": 8.405652851578533e-06, + "loss": 1.2228, + "step": 18945 + }, + { + "epoch": 5.642994098922169, + "grad_norm": 0.34534624218940735, + "learning_rate": 8.404700630906437e-06, + "loss": 1.2325, + "step": 18946 + }, + { + "epoch": 5.643291945121838, + "grad_norm": 0.25939908623695374, + "learning_rate": 8.40374842507682e-06, + "loss": 1.2174, + "step": 18947 + }, + { + "epoch": 5.6435897913215065, + "grad_norm": 0.2573728859424591, + "learning_rate": 8.402796234098535e-06, + "loss": 1.2184, + "step": 18948 + }, + { + "epoch": 5.643887637521175, + "grad_norm": 0.3470657467842102, + "learning_rate": 8.401844057980452e-06, + "loss": 1.2045, + "step": 18949 + }, + { + "epoch": 5.644185483720843, + "grad_norm": 0.2562240958213806, + "learning_rate": 8.40089189673142e-06, + "loss": 1.2018, + "step": 18950 + }, + { + "epoch": 5.644483329920512, + "grad_norm": 0.3188422620296478, + "learning_rate": 8.399939750360302e-06, + "loss": 1.2067, + "step": 18951 + }, + { + "epoch": 5.644781176120181, + "grad_norm": 0.29142096638679504, + "learning_rate": 8.398987618875963e-06, + "loss": 1.2017, + "step": 18952 + }, + { + "epoch": 5.64507902231985, + "grad_norm": 0.2580066919326782, + "learning_rate": 8.398035502287247e-06, + "loss": 1.2134, + "step": 18953 + }, + { + "epoch": 5.645376868519518, + "grad_norm": 0.3329210877418518, + "learning_rate": 8.397083400603028e-06, + "loss": 1.2081, + "step": 18954 + }, + { + "epoch": 5.645674714719187, + "grad_norm": 0.2585013806819916, + "learning_rate": 8.396131313832157e-06, + "loss": 1.2118, + "step": 18955 + }, + { + "epoch": 5.645972560918856, + "grad_norm": 0.34125280380249023, + "learning_rate": 8.395179241983486e-06, + "loss": 1.2101, + "step": 18956 + }, + { + "epoch": 5.646270407118524, + "grad_norm": 0.27053216099739075, + "learning_rate": 8.394227185065883e-06, + "loss": 1.198, + "step": 18957 + }, + { + "epoch": 5.6465682533181925, + "grad_norm": 0.3379208743572235, + "learning_rate": 8.393275143088205e-06, + "loss": 1.2102, + "step": 18958 + }, + { + "epoch": 5.646866099517862, + "grad_norm": 0.26291197538375854, + "learning_rate": 8.3923231160593e-06, + "loss": 1.2126, + "step": 18959 + }, + { + "epoch": 5.64716394571753, + "grad_norm": 0.29343560338020325, + "learning_rate": 8.391371103988037e-06, + "loss": 1.2033, + "step": 18960 + }, + { + "epoch": 5.647461791917198, + "grad_norm": 0.30597466230392456, + "learning_rate": 8.390419106883268e-06, + "loss": 1.219, + "step": 18961 + }, + { + "epoch": 5.6477596381168675, + "grad_norm": 0.3227294385433197, + "learning_rate": 8.38946712475385e-06, + "loss": 1.2069, + "step": 18962 + }, + { + "epoch": 5.648057484316536, + "grad_norm": 0.42713555693626404, + "learning_rate": 8.38851515760864e-06, + "loss": 1.195, + "step": 18963 + }, + { + "epoch": 5.648355330516205, + "grad_norm": 0.6076002717018127, + "learning_rate": 8.3875632054565e-06, + "loss": 1.225, + "step": 18964 + }, + { + "epoch": 5.648653176715873, + "grad_norm": 0.2540733218193054, + "learning_rate": 8.386611268306275e-06, + "loss": 1.2237, + "step": 18965 + }, + { + "epoch": 5.648951022915542, + "grad_norm": 0.41091498732566833, + "learning_rate": 8.385659346166837e-06, + "loss": 1.233, + "step": 18966 + }, + { + "epoch": 5.649248869115211, + "grad_norm": 0.28372156620025635, + "learning_rate": 8.384707439047031e-06, + "loss": 1.2191, + "step": 18967 + }, + { + "epoch": 5.649546715314879, + "grad_norm": 0.3598971962928772, + "learning_rate": 8.383755546955718e-06, + "loss": 1.2235, + "step": 18968 + }, + { + "epoch": 5.649844561514548, + "grad_norm": 0.2593168020248413, + "learning_rate": 8.382803669901758e-06, + "loss": 1.217, + "step": 18969 + }, + { + "epoch": 5.650142407714217, + "grad_norm": 0.3058864176273346, + "learning_rate": 8.381851807893997e-06, + "loss": 1.2026, + "step": 18970 + }, + { + "epoch": 5.650440253913885, + "grad_norm": 0.2763136625289917, + "learning_rate": 8.380899960941303e-06, + "loss": 1.2111, + "step": 18971 + }, + { + "epoch": 5.6507381001135535, + "grad_norm": 0.26642781496047974, + "learning_rate": 8.379948129052524e-06, + "loss": 1.2084, + "step": 18972 + }, + { + "epoch": 5.651035946313223, + "grad_norm": 0.36185839772224426, + "learning_rate": 8.378996312236515e-06, + "loss": 1.207, + "step": 18973 + }, + { + "epoch": 5.651333792512891, + "grad_norm": 0.29904067516326904, + "learning_rate": 8.378044510502135e-06, + "loss": 1.2122, + "step": 18974 + }, + { + "epoch": 5.65163163871256, + "grad_norm": 0.32451102137565613, + "learning_rate": 8.377092723858242e-06, + "loss": 1.2137, + "step": 18975 + }, + { + "epoch": 5.651929484912229, + "grad_norm": 0.26365503668785095, + "learning_rate": 8.376140952313683e-06, + "loss": 1.2039, + "step": 18976 + }, + { + "epoch": 5.652227331111897, + "grad_norm": 0.42056193947792053, + "learning_rate": 8.375189195877322e-06, + "loss": 1.2356, + "step": 18977 + }, + { + "epoch": 5.652525177311566, + "grad_norm": 0.281960666179657, + "learning_rate": 8.374237454558012e-06, + "loss": 1.2018, + "step": 18978 + }, + { + "epoch": 5.6528230235112344, + "grad_norm": 0.3317055404186249, + "learning_rate": 8.3732857283646e-06, + "loss": 1.2156, + "step": 18979 + }, + { + "epoch": 5.653120869710903, + "grad_norm": 0.289728581905365, + "learning_rate": 8.37233401730595e-06, + "loss": 1.2128, + "step": 18980 + }, + { + "epoch": 5.653418715910572, + "grad_norm": 0.3112536072731018, + "learning_rate": 8.371382321390914e-06, + "loss": 1.2087, + "step": 18981 + }, + { + "epoch": 5.65371656211024, + "grad_norm": 0.26634669303894043, + "learning_rate": 8.370430640628342e-06, + "loss": 1.1908, + "step": 18982 + }, + { + "epoch": 5.654014408309909, + "grad_norm": 0.26398199796676636, + "learning_rate": 8.369478975027098e-06, + "loss": 1.2136, + "step": 18983 + }, + { + "epoch": 5.654312254509578, + "grad_norm": 0.28051403164863586, + "learning_rate": 8.368527324596026e-06, + "loss": 1.2097, + "step": 18984 + }, + { + "epoch": 5.654610100709246, + "grad_norm": 0.2899719178676605, + "learning_rate": 8.367575689343987e-06, + "loss": 1.2244, + "step": 18985 + }, + { + "epoch": 5.654907946908915, + "grad_norm": 0.3312186896800995, + "learning_rate": 8.366624069279832e-06, + "loss": 1.2207, + "step": 18986 + }, + { + "epoch": 5.655205793108584, + "grad_norm": 0.26433223485946655, + "learning_rate": 8.365672464412412e-06, + "loss": 1.2051, + "step": 18987 + }, + { + "epoch": 5.655503639308252, + "grad_norm": 0.28281447291374207, + "learning_rate": 8.364720874750589e-06, + "loss": 1.1937, + "step": 18988 + }, + { + "epoch": 5.65580148550792, + "grad_norm": 0.2555307447910309, + "learning_rate": 8.36376930030321e-06, + "loss": 1.2254, + "step": 18989 + }, + { + "epoch": 5.65609933170759, + "grad_norm": 0.30408424139022827, + "learning_rate": 8.362817741079126e-06, + "loss": 1.2226, + "step": 18990 + }, + { + "epoch": 5.656397177907258, + "grad_norm": 0.27140143513679504, + "learning_rate": 8.361866197087198e-06, + "loss": 1.2, + "step": 18991 + }, + { + "epoch": 5.656695024106927, + "grad_norm": 0.4277518689632416, + "learning_rate": 8.360914668336273e-06, + "loss": 1.2277, + "step": 18992 + }, + { + "epoch": 5.6569928703065955, + "grad_norm": 0.33056220412254333, + "learning_rate": 8.359963154835202e-06, + "loss": 1.2102, + "step": 18993 + }, + { + "epoch": 5.657290716506264, + "grad_norm": 0.4178558886051178, + "learning_rate": 8.359011656592847e-06, + "loss": 1.2317, + "step": 18994 + }, + { + "epoch": 5.657588562705933, + "grad_norm": 0.33721283078193665, + "learning_rate": 8.358060173618055e-06, + "loss": 1.2214, + "step": 18995 + }, + { + "epoch": 5.657886408905601, + "grad_norm": 0.3312799334526062, + "learning_rate": 8.357108705919674e-06, + "loss": 1.1996, + "step": 18996 + }, + { + "epoch": 5.65818425510527, + "grad_norm": 0.2949334979057312, + "learning_rate": 8.356157253506563e-06, + "loss": 1.1997, + "step": 18997 + }, + { + "epoch": 5.658482101304939, + "grad_norm": 0.2550343871116638, + "learning_rate": 8.355205816387574e-06, + "loss": 1.2174, + "step": 18998 + }, + { + "epoch": 5.658779947504607, + "grad_norm": 0.29912087321281433, + "learning_rate": 8.354254394571555e-06, + "loss": 1.2096, + "step": 18999 + }, + { + "epoch": 5.6590777937042755, + "grad_norm": 0.2648436725139618, + "learning_rate": 8.353302988067364e-06, + "loss": 1.2185, + "step": 19000 + }, + { + "epoch": 5.6590777937042755, + "eval_loss": 1.319551944732666, + "eval_runtime": 24.0597, + "eval_samples_per_second": 72.071, + "eval_steps_per_second": 4.53, + "step": 19000 + }, + { + "epoch": 5.659375639903945, + "grad_norm": 0.3021543622016907, + "learning_rate": 8.352351596883842e-06, + "loss": 1.2241, + "step": 19001 + }, + { + "epoch": 5.659673486103613, + "grad_norm": 0.25659576058387756, + "learning_rate": 8.351400221029854e-06, + "loss": 1.2227, + "step": 19002 + }, + { + "epoch": 5.659971332303282, + "grad_norm": 0.3784889578819275, + "learning_rate": 8.350448860514243e-06, + "loss": 1.2191, + "step": 19003 + }, + { + "epoch": 5.660269178502951, + "grad_norm": 0.3093247711658478, + "learning_rate": 8.349497515345859e-06, + "loss": 1.2171, + "step": 19004 + }, + { + "epoch": 5.660567024702619, + "grad_norm": 0.29522302746772766, + "learning_rate": 8.348546185533562e-06, + "loss": 1.2271, + "step": 19005 + }, + { + "epoch": 5.660864870902288, + "grad_norm": 0.2940315306186676, + "learning_rate": 8.347594871086196e-06, + "loss": 1.2193, + "step": 19006 + }, + { + "epoch": 5.6611627171019565, + "grad_norm": 0.2734469175338745, + "learning_rate": 8.34664357201261e-06, + "loss": 1.2209, + "step": 19007 + }, + { + "epoch": 5.661460563301625, + "grad_norm": 0.2689495086669922, + "learning_rate": 8.345692288321664e-06, + "loss": 1.2209, + "step": 19008 + }, + { + "epoch": 5.661758409501294, + "grad_norm": 0.27630409598350525, + "learning_rate": 8.344741020022199e-06, + "loss": 1.2123, + "step": 19009 + }, + { + "epoch": 5.662056255700962, + "grad_norm": 0.26288482546806335, + "learning_rate": 8.343789767123067e-06, + "loss": 1.2226, + "step": 19010 + }, + { + "epoch": 5.662354101900631, + "grad_norm": 0.2920735478401184, + "learning_rate": 8.342838529633127e-06, + "loss": 1.1971, + "step": 19011 + }, + { + "epoch": 5.6626519481003, + "grad_norm": 0.24532313644886017, + "learning_rate": 8.341887307561222e-06, + "loss": 1.2094, + "step": 19012 + }, + { + "epoch": 5.662949794299968, + "grad_norm": 0.27133235335350037, + "learning_rate": 8.3409361009162e-06, + "loss": 1.2149, + "step": 19013 + }, + { + "epoch": 5.6632476404996375, + "grad_norm": 0.30312278866767883, + "learning_rate": 8.339984909706917e-06, + "loss": 1.2144, + "step": 19014 + }, + { + "epoch": 5.663545486699306, + "grad_norm": 0.35235607624053955, + "learning_rate": 8.339033733942217e-06, + "loss": 1.2117, + "step": 19015 + }, + { + "epoch": 5.663843332898974, + "grad_norm": 0.3692496716976166, + "learning_rate": 8.33808257363095e-06, + "loss": 1.2185, + "step": 19016 + }, + { + "epoch": 5.6641411790986425, + "grad_norm": 0.29738691449165344, + "learning_rate": 8.337131428781974e-06, + "loss": 1.2064, + "step": 19017 + }, + { + "epoch": 5.664439025298312, + "grad_norm": 0.41260096430778503, + "learning_rate": 8.336180299404126e-06, + "loss": 1.2193, + "step": 19018 + }, + { + "epoch": 5.66473687149798, + "grad_norm": 0.2588588297367096, + "learning_rate": 8.335229185506267e-06, + "loss": 1.211, + "step": 19019 + }, + { + "epoch": 5.665034717697649, + "grad_norm": 0.28206971287727356, + "learning_rate": 8.33427808709724e-06, + "loss": 1.2098, + "step": 19020 + }, + { + "epoch": 5.6653325638973175, + "grad_norm": 0.3220142126083374, + "learning_rate": 8.333327004185889e-06, + "loss": 1.2041, + "step": 19021 + }, + { + "epoch": 5.665630410096986, + "grad_norm": 0.2897467315196991, + "learning_rate": 8.332375936781072e-06, + "loss": 1.2004, + "step": 19022 + }, + { + "epoch": 5.665928256296655, + "grad_norm": 0.3630657196044922, + "learning_rate": 8.331424884891636e-06, + "loss": 1.2017, + "step": 19023 + }, + { + "epoch": 5.666226102496323, + "grad_norm": 0.3009297251701355, + "learning_rate": 8.330473848526421e-06, + "loss": 1.2175, + "step": 19024 + }, + { + "epoch": 5.666523948695992, + "grad_norm": 0.450808048248291, + "learning_rate": 8.329522827694288e-06, + "loss": 1.2163, + "step": 19025 + }, + { + "epoch": 5.666821794895661, + "grad_norm": 0.292876273393631, + "learning_rate": 8.328571822404074e-06, + "loss": 1.2134, + "step": 19026 + }, + { + "epoch": 5.667119641095329, + "grad_norm": 0.5328595638275146, + "learning_rate": 8.327620832664632e-06, + "loss": 1.2056, + "step": 19027 + }, + { + "epoch": 5.667417487294998, + "grad_norm": 0.3636288642883301, + "learning_rate": 8.326669858484814e-06, + "loss": 1.1931, + "step": 19028 + }, + { + "epoch": 5.667715333494667, + "grad_norm": 0.5162214636802673, + "learning_rate": 8.325718899873461e-06, + "loss": 1.211, + "step": 19029 + }, + { + "epoch": 5.668013179694335, + "grad_norm": 0.36680299043655396, + "learning_rate": 8.324767956839422e-06, + "loss": 1.2063, + "step": 19030 + }, + { + "epoch": 5.668311025894004, + "grad_norm": 0.461832731962204, + "learning_rate": 8.323817029391548e-06, + "loss": 1.208, + "step": 19031 + }, + { + "epoch": 5.668608872093673, + "grad_norm": 0.2879669666290283, + "learning_rate": 8.322866117538681e-06, + "loss": 1.2145, + "step": 19032 + }, + { + "epoch": 5.668906718293341, + "grad_norm": 0.36926543712615967, + "learning_rate": 8.32191522128967e-06, + "loss": 1.1999, + "step": 19033 + }, + { + "epoch": 5.66920456449301, + "grad_norm": 0.36631879210472107, + "learning_rate": 8.320964340653365e-06, + "loss": 1.2087, + "step": 19034 + }, + { + "epoch": 5.669502410692679, + "grad_norm": 0.4344761073589325, + "learning_rate": 8.32001347563861e-06, + "loss": 1.2041, + "step": 19035 + }, + { + "epoch": 5.669800256892347, + "grad_norm": 0.39795640110969543, + "learning_rate": 8.319062626254256e-06, + "loss": 1.2098, + "step": 19036 + }, + { + "epoch": 5.670098103092016, + "grad_norm": 0.36752912402153015, + "learning_rate": 8.318111792509144e-06, + "loss": 1.2046, + "step": 19037 + }, + { + "epoch": 5.6703959492916844, + "grad_norm": 0.44404342770576477, + "learning_rate": 8.31716097441212e-06, + "loss": 1.1956, + "step": 19038 + }, + { + "epoch": 5.670693795491353, + "grad_norm": 0.4044566750526428, + "learning_rate": 8.316210171972035e-06, + "loss": 1.2201, + "step": 19039 + }, + { + "epoch": 5.670991641691022, + "grad_norm": 0.4066852033138275, + "learning_rate": 8.315259385197736e-06, + "loss": 1.2309, + "step": 19040 + }, + { + "epoch": 5.67128948789069, + "grad_norm": 0.3474169969558716, + "learning_rate": 8.31430861409806e-06, + "loss": 1.2164, + "step": 19041 + }, + { + "epoch": 5.6715873340903595, + "grad_norm": 0.3894362151622772, + "learning_rate": 8.313357858681866e-06, + "loss": 1.213, + "step": 19042 + }, + { + "epoch": 5.671885180290028, + "grad_norm": 0.2640187442302704, + "learning_rate": 8.312407118957987e-06, + "loss": 1.2096, + "step": 19043 + }, + { + "epoch": 5.672183026489696, + "grad_norm": 0.5282630920410156, + "learning_rate": 8.311456394935276e-06, + "loss": 1.2067, + "step": 19044 + }, + { + "epoch": 5.672480872689365, + "grad_norm": 0.3081777095794678, + "learning_rate": 8.310505686622578e-06, + "loss": 1.1988, + "step": 19045 + }, + { + "epoch": 5.672778718889034, + "grad_norm": 0.31176304817199707, + "learning_rate": 8.309554994028737e-06, + "loss": 1.2089, + "step": 19046 + }, + { + "epoch": 5.673076565088702, + "grad_norm": 0.2757018506526947, + "learning_rate": 8.308604317162595e-06, + "loss": 1.2213, + "step": 19047 + }, + { + "epoch": 5.673374411288371, + "grad_norm": 0.30400386452674866, + "learning_rate": 8.307653656033004e-06, + "loss": 1.2051, + "step": 19048 + }, + { + "epoch": 5.67367225748804, + "grad_norm": 0.25698554515838623, + "learning_rate": 8.306703010648803e-06, + "loss": 1.2182, + "step": 19049 + }, + { + "epoch": 5.673970103687708, + "grad_norm": 0.35520604252815247, + "learning_rate": 8.305752381018838e-06, + "loss": 1.2223, + "step": 19050 + }, + { + "epoch": 5.674267949887377, + "grad_norm": 0.322625070810318, + "learning_rate": 8.304801767151955e-06, + "loss": 1.2248, + "step": 19051 + }, + { + "epoch": 5.6745657960870455, + "grad_norm": 0.44615495204925537, + "learning_rate": 8.303851169056996e-06, + "loss": 1.2166, + "step": 19052 + }, + { + "epoch": 5.674863642286715, + "grad_norm": 0.31453388929367065, + "learning_rate": 8.30290058674281e-06, + "loss": 1.2085, + "step": 19053 + }, + { + "epoch": 5.675161488486383, + "grad_norm": 0.5074651837348938, + "learning_rate": 8.301950020218239e-06, + "loss": 1.2077, + "step": 19054 + }, + { + "epoch": 5.675459334686051, + "grad_norm": 0.264446496963501, + "learning_rate": 8.300999469492119e-06, + "loss": 1.2259, + "step": 19055 + }, + { + "epoch": 5.67575718088572, + "grad_norm": 0.503488302230835, + "learning_rate": 8.300048934573306e-06, + "loss": 1.2303, + "step": 19056 + }, + { + "epoch": 5.676055027085389, + "grad_norm": 0.4028741419315338, + "learning_rate": 8.29909841547064e-06, + "loss": 1.2191, + "step": 19057 + }, + { + "epoch": 5.676352873285057, + "grad_norm": 0.5179157853126526, + "learning_rate": 8.298147912192957e-06, + "loss": 1.2093, + "step": 19058 + }, + { + "epoch": 5.676650719484726, + "grad_norm": 0.3718646168708801, + "learning_rate": 8.29719742474911e-06, + "loss": 1.2216, + "step": 19059 + }, + { + "epoch": 5.676948565684395, + "grad_norm": 0.41000795364379883, + "learning_rate": 8.29624695314794e-06, + "loss": 1.2196, + "step": 19060 + }, + { + "epoch": 5.677246411884063, + "grad_norm": 0.30932697653770447, + "learning_rate": 8.295296497398285e-06, + "loss": 1.2069, + "step": 19061 + }, + { + "epoch": 5.677544258083732, + "grad_norm": 0.2911328077316284, + "learning_rate": 8.294346057508992e-06, + "loss": 1.2038, + "step": 19062 + }, + { + "epoch": 5.677842104283401, + "grad_norm": 0.36987268924713135, + "learning_rate": 8.293395633488905e-06, + "loss": 1.2031, + "step": 19063 + }, + { + "epoch": 5.678139950483069, + "grad_norm": 0.3083730638027191, + "learning_rate": 8.292445225346863e-06, + "loss": 1.21, + "step": 19064 + }, + { + "epoch": 5.678437796682738, + "grad_norm": 0.2953846752643585, + "learning_rate": 8.291494833091714e-06, + "loss": 1.2102, + "step": 19065 + }, + { + "epoch": 5.6787356428824065, + "grad_norm": 0.28472769260406494, + "learning_rate": 8.29054445673229e-06, + "loss": 1.1993, + "step": 19066 + }, + { + "epoch": 5.679033489082075, + "grad_norm": 0.3901467025279999, + "learning_rate": 8.289594096277447e-06, + "loss": 1.2172, + "step": 19067 + }, + { + "epoch": 5.679331335281744, + "grad_norm": 0.3293989300727844, + "learning_rate": 8.288643751736016e-06, + "loss": 1.2302, + "step": 19068 + }, + { + "epoch": 5.679629181481412, + "grad_norm": 0.46018221974372864, + "learning_rate": 8.287693423116841e-06, + "loss": 1.1961, + "step": 19069 + }, + { + "epoch": 5.679927027681082, + "grad_norm": 0.34103935956954956, + "learning_rate": 8.28674311042877e-06, + "loss": 1.2098, + "step": 19070 + }, + { + "epoch": 5.68022487388075, + "grad_norm": 0.4321513772010803, + "learning_rate": 8.285792813680641e-06, + "loss": 1.193, + "step": 19071 + }, + { + "epoch": 5.680522720080418, + "grad_norm": 0.3278217315673828, + "learning_rate": 8.28484253288129e-06, + "loss": 1.2055, + "step": 19072 + }, + { + "epoch": 5.6808205662800875, + "grad_norm": 0.5738650560379028, + "learning_rate": 8.283892268039568e-06, + "loss": 1.2072, + "step": 19073 + }, + { + "epoch": 5.681118412479756, + "grad_norm": 0.3378513753414154, + "learning_rate": 8.282942019164308e-06, + "loss": 1.2192, + "step": 19074 + }, + { + "epoch": 5.681416258679424, + "grad_norm": 0.5449102520942688, + "learning_rate": 8.281991786264352e-06, + "loss": 1.2096, + "step": 19075 + }, + { + "epoch": 5.681714104879093, + "grad_norm": 0.33484184741973877, + "learning_rate": 8.281041569348547e-06, + "loss": 1.2169, + "step": 19076 + }, + { + "epoch": 5.682011951078762, + "grad_norm": 0.5389516353607178, + "learning_rate": 8.280091368425731e-06, + "loss": 1.2295, + "step": 19077 + }, + { + "epoch": 5.68230979727843, + "grad_norm": 0.3976714611053467, + "learning_rate": 8.279141183504737e-06, + "loss": 1.2135, + "step": 19078 + }, + { + "epoch": 5.682607643478099, + "grad_norm": 0.48223838210105896, + "learning_rate": 8.278191014594415e-06, + "loss": 1.2308, + "step": 19079 + }, + { + "epoch": 5.6829054896777675, + "grad_norm": 0.41766807436943054, + "learning_rate": 8.277240861703604e-06, + "loss": 1.2225, + "step": 19080 + }, + { + "epoch": 5.683203335877437, + "grad_norm": 0.3423973023891449, + "learning_rate": 8.276290724841138e-06, + "loss": 1.2155, + "step": 19081 + }, + { + "epoch": 5.683501182077105, + "grad_norm": 0.7863313555717468, + "learning_rate": 8.275340604015864e-06, + "loss": 1.22, + "step": 19082 + }, + { + "epoch": 5.683799028276773, + "grad_norm": 0.3401656746864319, + "learning_rate": 8.274390499236616e-06, + "loss": 1.2122, + "step": 19083 + }, + { + "epoch": 5.684096874476442, + "grad_norm": 1.1781949996948242, + "learning_rate": 8.273440410512238e-06, + "loss": 1.211, + "step": 19084 + }, + { + "epoch": 5.684394720676111, + "grad_norm": 0.3618912398815155, + "learning_rate": 8.272490337851568e-06, + "loss": 1.2213, + "step": 19085 + }, + { + "epoch": 5.684692566875779, + "grad_norm": 0.677024245262146, + "learning_rate": 8.271540281263446e-06, + "loss": 1.219, + "step": 19086 + }, + { + "epoch": 5.6849904130754485, + "grad_norm": 0.34416821599006653, + "learning_rate": 8.270590240756709e-06, + "loss": 1.2082, + "step": 19087 + }, + { + "epoch": 5.685288259275117, + "grad_norm": 0.4384899437427521, + "learning_rate": 8.2696402163402e-06, + "loss": 1.2183, + "step": 19088 + }, + { + "epoch": 5.685586105474785, + "grad_norm": 0.39428257942199707, + "learning_rate": 8.268690208022752e-06, + "loss": 1.2003, + "step": 19089 + }, + { + "epoch": 5.685883951674454, + "grad_norm": 0.29664862155914307, + "learning_rate": 8.267740215813212e-06, + "loss": 1.2021, + "step": 19090 + }, + { + "epoch": 5.686181797874123, + "grad_norm": 0.3758775293827057, + "learning_rate": 8.26679023972041e-06, + "loss": 1.2317, + "step": 19091 + }, + { + "epoch": 5.686479644073791, + "grad_norm": 0.3397117853164673, + "learning_rate": 8.265840279753187e-06, + "loss": 1.2138, + "step": 19092 + }, + { + "epoch": 5.68677749027346, + "grad_norm": 0.3522505760192871, + "learning_rate": 8.264890335920387e-06, + "loss": 1.2023, + "step": 19093 + }, + { + "epoch": 5.687075336473129, + "grad_norm": 0.37819424271583557, + "learning_rate": 8.263940408230844e-06, + "loss": 1.2054, + "step": 19094 + }, + { + "epoch": 5.687373182672797, + "grad_norm": 0.3502015769481659, + "learning_rate": 8.262990496693391e-06, + "loss": 1.2097, + "step": 19095 + }, + { + "epoch": 5.687671028872466, + "grad_norm": 0.3551258444786072, + "learning_rate": 8.262040601316876e-06, + "loss": 1.1903, + "step": 19096 + }, + { + "epoch": 5.6879688750721344, + "grad_norm": 0.28365689516067505, + "learning_rate": 8.261090722110128e-06, + "loss": 1.2216, + "step": 19097 + }, + { + "epoch": 5.688266721271804, + "grad_norm": 0.29984399676322937, + "learning_rate": 8.260140859081988e-06, + "loss": 1.2217, + "step": 19098 + }, + { + "epoch": 5.688564567471472, + "grad_norm": 0.41037264466285706, + "learning_rate": 8.259191012241296e-06, + "loss": 1.2084, + "step": 19099 + }, + { + "epoch": 5.68886241367114, + "grad_norm": 0.2847413122653961, + "learning_rate": 8.258241181596881e-06, + "loss": 1.2201, + "step": 19100 + }, + { + "epoch": 5.6891602598708095, + "grad_norm": 0.4303090572357178, + "learning_rate": 8.257291367157593e-06, + "loss": 1.2142, + "step": 19101 + }, + { + "epoch": 5.689458106070478, + "grad_norm": 0.27760329842567444, + "learning_rate": 8.25634156893226e-06, + "loss": 1.2067, + "step": 19102 + }, + { + "epoch": 5.689755952270146, + "grad_norm": 0.4031229019165039, + "learning_rate": 8.255391786929718e-06, + "loss": 1.2062, + "step": 19103 + }, + { + "epoch": 5.690053798469815, + "grad_norm": 0.2993726432323456, + "learning_rate": 8.254442021158807e-06, + "loss": 1.2055, + "step": 19104 + }, + { + "epoch": 5.690351644669484, + "grad_norm": 0.46615636348724365, + "learning_rate": 8.253492271628365e-06, + "loss": 1.2108, + "step": 19105 + }, + { + "epoch": 5.690649490869152, + "grad_norm": 0.3425360918045044, + "learning_rate": 8.252542538347222e-06, + "loss": 1.2313, + "step": 19106 + }, + { + "epoch": 5.690947337068821, + "grad_norm": 0.4575381278991699, + "learning_rate": 8.251592821324222e-06, + "loss": 1.2252, + "step": 19107 + }, + { + "epoch": 5.69124518326849, + "grad_norm": 0.2913895547389984, + "learning_rate": 8.250643120568197e-06, + "loss": 1.2329, + "step": 19108 + }, + { + "epoch": 5.691543029468159, + "grad_norm": 0.38088569045066833, + "learning_rate": 8.249693436087982e-06, + "loss": 1.2112, + "step": 19109 + }, + { + "epoch": 5.691840875667827, + "grad_norm": 0.3749718964099884, + "learning_rate": 8.248743767892413e-06, + "loss": 1.2124, + "step": 19110 + }, + { + "epoch": 5.6921387218674955, + "grad_norm": 0.37592044472694397, + "learning_rate": 8.247794115990331e-06, + "loss": 1.2101, + "step": 19111 + }, + { + "epoch": 5.692436568067165, + "grad_norm": 0.31984758377075195, + "learning_rate": 8.246844480390561e-06, + "loss": 1.2035, + "step": 19112 + }, + { + "epoch": 5.692734414266833, + "grad_norm": 0.2917548716068268, + "learning_rate": 8.24589486110195e-06, + "loss": 1.1921, + "step": 19113 + }, + { + "epoch": 5.693032260466501, + "grad_norm": 0.3189534842967987, + "learning_rate": 8.244945258133324e-06, + "loss": 1.194, + "step": 19114 + }, + { + "epoch": 5.693330106666171, + "grad_norm": 0.272670179605484, + "learning_rate": 8.24399567149352e-06, + "loss": 1.2146, + "step": 19115 + }, + { + "epoch": 5.693627952865839, + "grad_norm": 0.25924283266067505, + "learning_rate": 8.243046101191379e-06, + "loss": 1.2105, + "step": 19116 + }, + { + "epoch": 5.693925799065507, + "grad_norm": 0.2516448199748993, + "learning_rate": 8.242096547235726e-06, + "loss": 1.2196, + "step": 19117 + }, + { + "epoch": 5.694223645265176, + "grad_norm": 0.29821452498435974, + "learning_rate": 8.241147009635405e-06, + "loss": 1.205, + "step": 19118 + }, + { + "epoch": 5.694521491464845, + "grad_norm": 0.2622608542442322, + "learning_rate": 8.240197488399246e-06, + "loss": 1.2152, + "step": 19119 + }, + { + "epoch": 5.694819337664514, + "grad_norm": 0.4225327968597412, + "learning_rate": 8.23924798353608e-06, + "loss": 1.2009, + "step": 19120 + }, + { + "epoch": 5.695117183864182, + "grad_norm": 0.38297656178474426, + "learning_rate": 8.238298495054745e-06, + "loss": 1.2149, + "step": 19121 + }, + { + "epoch": 5.695415030063851, + "grad_norm": 0.300771027803421, + "learning_rate": 8.237349022964077e-06, + "loss": 1.2134, + "step": 19122 + }, + { + "epoch": 5.695712876263519, + "grad_norm": 0.3504418134689331, + "learning_rate": 8.236399567272903e-06, + "loss": 1.2099, + "step": 19123 + }, + { + "epoch": 5.696010722463188, + "grad_norm": 0.24541020393371582, + "learning_rate": 8.235450127990064e-06, + "loss": 1.1963, + "step": 19124 + }, + { + "epoch": 5.6963085686628565, + "grad_norm": 0.32109183073043823, + "learning_rate": 8.23450070512439e-06, + "loss": 1.2078, + "step": 19125 + }, + { + "epoch": 5.696606414862526, + "grad_norm": 0.3012921214103699, + "learning_rate": 8.233551298684711e-06, + "loss": 1.2171, + "step": 19126 + }, + { + "epoch": 5.696904261062194, + "grad_norm": 0.29415959119796753, + "learning_rate": 8.232601908679865e-06, + "loss": 1.231, + "step": 19127 + }, + { + "epoch": 5.697202107261862, + "grad_norm": 0.33890417218208313, + "learning_rate": 8.231652535118687e-06, + "loss": 1.2107, + "step": 19128 + }, + { + "epoch": 5.697499953461532, + "grad_norm": 0.30771562457084656, + "learning_rate": 8.230703178010002e-06, + "loss": 1.2321, + "step": 19129 + }, + { + "epoch": 5.6977977996612, + "grad_norm": 0.310462087392807, + "learning_rate": 8.22975383736265e-06, + "loss": 1.2197, + "step": 19130 + }, + { + "epoch": 5.698095645860868, + "grad_norm": 0.2630466818809509, + "learning_rate": 8.22880451318546e-06, + "loss": 1.1987, + "step": 19131 + }, + { + "epoch": 5.6983934920605375, + "grad_norm": 0.3235372006893158, + "learning_rate": 8.227855205487264e-06, + "loss": 1.2148, + "step": 19132 + }, + { + "epoch": 5.698691338260206, + "grad_norm": 0.26138269901275635, + "learning_rate": 8.226905914276895e-06, + "loss": 1.2171, + "step": 19133 + }, + { + "epoch": 5.698989184459874, + "grad_norm": 0.3011850118637085, + "learning_rate": 8.225956639563186e-06, + "loss": 1.2199, + "step": 19134 + }, + { + "epoch": 5.699287030659543, + "grad_norm": 0.3192721903324127, + "learning_rate": 8.22500738135497e-06, + "loss": 1.2137, + "step": 19135 + }, + { + "epoch": 5.699584876859212, + "grad_norm": 0.46231967210769653, + "learning_rate": 8.224058139661077e-06, + "loss": 1.2038, + "step": 19136 + }, + { + "epoch": 5.699882723058881, + "grad_norm": 0.28124892711639404, + "learning_rate": 8.223108914490337e-06, + "loss": 1.222, + "step": 19137 + }, + { + "epoch": 5.700180569258549, + "grad_norm": 0.5450407862663269, + "learning_rate": 8.222159705851583e-06, + "loss": 1.2043, + "step": 19138 + }, + { + "epoch": 5.7004784154582175, + "grad_norm": 0.26735734939575195, + "learning_rate": 8.22121051375365e-06, + "loss": 1.2262, + "step": 19139 + }, + { + "epoch": 5.700776261657887, + "grad_norm": 0.6248392462730408, + "learning_rate": 8.220261338205361e-06, + "loss": 1.2291, + "step": 19140 + }, + { + "epoch": 5.701074107857555, + "grad_norm": 0.3227476179599762, + "learning_rate": 8.219312179215557e-06, + "loss": 1.2025, + "step": 19141 + }, + { + "epoch": 5.701371954057223, + "grad_norm": 0.4438363313674927, + "learning_rate": 8.218363036793063e-06, + "loss": 1.2093, + "step": 19142 + }, + { + "epoch": 5.701669800256893, + "grad_norm": 0.5310785174369812, + "learning_rate": 8.217413910946705e-06, + "loss": 1.2251, + "step": 19143 + }, + { + "epoch": 5.701967646456561, + "grad_norm": 0.42326900362968445, + "learning_rate": 8.216464801685324e-06, + "loss": 1.2351, + "step": 19144 + }, + { + "epoch": 5.702265492656229, + "grad_norm": 0.6061856150627136, + "learning_rate": 8.215515709017745e-06, + "loss": 1.2225, + "step": 19145 + }, + { + "epoch": 5.7025633388558985, + "grad_norm": 0.4341288208961487, + "learning_rate": 8.214566632952795e-06, + "loss": 1.2215, + "step": 19146 + }, + { + "epoch": 5.702861185055567, + "grad_norm": 0.5088263750076294, + "learning_rate": 8.21361757349931e-06, + "loss": 1.1955, + "step": 19147 + }, + { + "epoch": 5.703159031255236, + "grad_norm": 0.2771662473678589, + "learning_rate": 8.212668530666122e-06, + "loss": 1.2127, + "step": 19148 + }, + { + "epoch": 5.703456877454904, + "grad_norm": 0.4809740483760834, + "learning_rate": 8.211719504462047e-06, + "loss": 1.2099, + "step": 19149 + }, + { + "epoch": 5.703754723654573, + "grad_norm": 0.47004154324531555, + "learning_rate": 8.21077049489593e-06, + "loss": 1.2326, + "step": 19150 + }, + { + "epoch": 5.704052569854241, + "grad_norm": 0.35316845774650574, + "learning_rate": 8.209821501976591e-06, + "loss": 1.236, + "step": 19151 + }, + { + "epoch": 5.70435041605391, + "grad_norm": 0.5005456209182739, + "learning_rate": 8.208872525712868e-06, + "loss": 1.21, + "step": 19152 + }, + { + "epoch": 5.704648262253579, + "grad_norm": 0.35879549384117126, + "learning_rate": 8.207923566113584e-06, + "loss": 1.2116, + "step": 19153 + }, + { + "epoch": 5.704946108453248, + "grad_norm": 0.3176892399787903, + "learning_rate": 8.206974623187564e-06, + "loss": 1.2238, + "step": 19154 + }, + { + "epoch": 5.705243954652916, + "grad_norm": 0.6765245199203491, + "learning_rate": 8.206025696943648e-06, + "loss": 1.2009, + "step": 19155 + }, + { + "epoch": 5.7055418008525844, + "grad_norm": 0.29464155435562134, + "learning_rate": 8.205076787390655e-06, + "loss": 1.2245, + "step": 19156 + }, + { + "epoch": 5.705839647052254, + "grad_norm": 0.49003055691719055, + "learning_rate": 8.204127894537417e-06, + "loss": 1.2129, + "step": 19157 + }, + { + "epoch": 5.706137493251922, + "grad_norm": 0.4680720567703247, + "learning_rate": 8.203179018392764e-06, + "loss": 1.1992, + "step": 19158 + }, + { + "epoch": 5.70643533945159, + "grad_norm": 0.3750953674316406, + "learning_rate": 8.202230158965524e-06, + "loss": 1.2268, + "step": 19159 + }, + { + "epoch": 5.7067331856512595, + "grad_norm": 0.38773584365844727, + "learning_rate": 8.201281316264518e-06, + "loss": 1.213, + "step": 19160 + }, + { + "epoch": 5.707031031850928, + "grad_norm": 0.3487052619457245, + "learning_rate": 8.200332490298587e-06, + "loss": 1.2156, + "step": 19161 + }, + { + "epoch": 5.707328878050596, + "grad_norm": 0.32750964164733887, + "learning_rate": 8.199383681076547e-06, + "loss": 1.228, + "step": 19162 + }, + { + "epoch": 5.707626724250265, + "grad_norm": 0.45693573355674744, + "learning_rate": 8.198434888607229e-06, + "loss": 1.2036, + "step": 19163 + }, + { + "epoch": 5.707924570449934, + "grad_norm": 0.26374465227127075, + "learning_rate": 8.197486112899465e-06, + "loss": 1.2312, + "step": 19164 + }, + { + "epoch": 5.708222416649603, + "grad_norm": 0.30079081654548645, + "learning_rate": 8.19653735396208e-06, + "loss": 1.2031, + "step": 19165 + }, + { + "epoch": 5.708520262849271, + "grad_norm": 0.34954196214675903, + "learning_rate": 8.195588611803894e-06, + "loss": 1.2035, + "step": 19166 + }, + { + "epoch": 5.70881810904894, + "grad_norm": 0.3157309889793396, + "learning_rate": 8.194639886433745e-06, + "loss": 1.2155, + "step": 19167 + }, + { + "epoch": 5.709115955248609, + "grad_norm": 0.2708793580532074, + "learning_rate": 8.19369117786045e-06, + "loss": 1.2207, + "step": 19168 + }, + { + "epoch": 5.709413801448277, + "grad_norm": 0.2903445065021515, + "learning_rate": 8.192742486092844e-06, + "loss": 1.2214, + "step": 19169 + }, + { + "epoch": 5.7097116476479455, + "grad_norm": 0.4261883795261383, + "learning_rate": 8.191793811139753e-06, + "loss": 1.2022, + "step": 19170 + }, + { + "epoch": 5.710009493847615, + "grad_norm": 0.2846320569515228, + "learning_rate": 8.190845153009994e-06, + "loss": 1.2159, + "step": 19171 + }, + { + "epoch": 5.710307340047283, + "grad_norm": 0.447803258895874, + "learning_rate": 8.189896511712404e-06, + "loss": 1.2066, + "step": 19172 + }, + { + "epoch": 5.710605186246951, + "grad_norm": 0.5086649656295776, + "learning_rate": 8.188947887255803e-06, + "loss": 1.2152, + "step": 19173 + }, + { + "epoch": 5.710903032446621, + "grad_norm": 0.27532675862312317, + "learning_rate": 8.187999279649016e-06, + "loss": 1.213, + "step": 19174 + }, + { + "epoch": 5.711200878646289, + "grad_norm": 0.40517324209213257, + "learning_rate": 8.187050688900874e-06, + "loss": 1.2157, + "step": 19175 + }, + { + "epoch": 5.711498724845958, + "grad_norm": 0.31303679943084717, + "learning_rate": 8.186102115020203e-06, + "loss": 1.2161, + "step": 19176 + }, + { + "epoch": 5.711796571045626, + "grad_norm": 0.2991512715816498, + "learning_rate": 8.18515355801582e-06, + "loss": 1.1982, + "step": 19177 + }, + { + "epoch": 5.712094417245295, + "grad_norm": 0.32588645815849304, + "learning_rate": 8.184205017896558e-06, + "loss": 1.2086, + "step": 19178 + }, + { + "epoch": 5.712392263444964, + "grad_norm": 0.27404963970184326, + "learning_rate": 8.183256494671239e-06, + "loss": 1.237, + "step": 19179 + }, + { + "epoch": 5.712690109644632, + "grad_norm": 0.2768762409687042, + "learning_rate": 8.182307988348686e-06, + "loss": 1.2134, + "step": 19180 + }, + { + "epoch": 5.712987955844301, + "grad_norm": 0.28320375084877014, + "learning_rate": 8.181359498937731e-06, + "loss": 1.2211, + "step": 19181 + }, + { + "epoch": 5.71328580204397, + "grad_norm": 0.2991725206375122, + "learning_rate": 8.180411026447188e-06, + "loss": 1.2194, + "step": 19182 + }, + { + "epoch": 5.713583648243638, + "grad_norm": 0.2722277343273163, + "learning_rate": 8.179462570885893e-06, + "loss": 1.2049, + "step": 19183 + }, + { + "epoch": 5.7138814944433065, + "grad_norm": 0.33952224254608154, + "learning_rate": 8.178514132262665e-06, + "loss": 1.188, + "step": 19184 + }, + { + "epoch": 5.714179340642976, + "grad_norm": 0.4549599885940552, + "learning_rate": 8.177565710586324e-06, + "loss": 1.1926, + "step": 19185 + }, + { + "epoch": 5.714477186842644, + "grad_norm": 0.38919007778167725, + "learning_rate": 8.1766173058657e-06, + "loss": 1.2194, + "step": 19186 + }, + { + "epoch": 5.714775033042313, + "grad_norm": 0.4535031318664551, + "learning_rate": 8.175668918109614e-06, + "loss": 1.1982, + "step": 19187 + }, + { + "epoch": 5.715072879241982, + "grad_norm": 0.6928510665893555, + "learning_rate": 8.17472054732689e-06, + "loss": 1.207, + "step": 19188 + }, + { + "epoch": 5.71537072544165, + "grad_norm": 0.3158282935619354, + "learning_rate": 8.173772193526355e-06, + "loss": 1.2142, + "step": 19189 + }, + { + "epoch": 5.715668571641318, + "grad_norm": 0.471537709236145, + "learning_rate": 8.172823856716826e-06, + "loss": 1.2081, + "step": 19190 + }, + { + "epoch": 5.7159664178409875, + "grad_norm": 0.43880006670951843, + "learning_rate": 8.171875536907131e-06, + "loss": 1.2195, + "step": 19191 + }, + { + "epoch": 5.716264264040656, + "grad_norm": 0.4804430902004242, + "learning_rate": 8.17092723410609e-06, + "loss": 1.2173, + "step": 19192 + }, + { + "epoch": 5.716562110240325, + "grad_norm": 0.45681121945381165, + "learning_rate": 8.169978948322532e-06, + "loss": 1.2119, + "step": 19193 + }, + { + "epoch": 5.716859956439993, + "grad_norm": 0.4069162607192993, + "learning_rate": 8.16903067956527e-06, + "loss": 1.222, + "step": 19194 + }, + { + "epoch": 5.717157802639662, + "grad_norm": 0.49170956015586853, + "learning_rate": 8.168082427843136e-06, + "loss": 1.2319, + "step": 19195 + }, + { + "epoch": 5.717455648839331, + "grad_norm": 0.3608390688896179, + "learning_rate": 8.167134193164945e-06, + "loss": 1.2169, + "step": 19196 + }, + { + "epoch": 5.717753495038999, + "grad_norm": 0.3721214830875397, + "learning_rate": 8.166185975539525e-06, + "loss": 1.2056, + "step": 19197 + }, + { + "epoch": 5.7180513412386675, + "grad_norm": 0.26780083775520325, + "learning_rate": 8.165237774975694e-06, + "loss": 1.2252, + "step": 19198 + }, + { + "epoch": 5.718349187438337, + "grad_norm": 0.3509320914745331, + "learning_rate": 8.164289591482273e-06, + "loss": 1.1987, + "step": 19199 + }, + { + "epoch": 5.718647033638005, + "grad_norm": 0.3072911500930786, + "learning_rate": 8.163341425068091e-06, + "loss": 1.2136, + "step": 19200 + }, + { + "epoch": 5.718944879837673, + "grad_norm": 0.34929409623146057, + "learning_rate": 8.162393275741965e-06, + "loss": 1.2269, + "step": 19201 + }, + { + "epoch": 5.719242726037343, + "grad_norm": 0.2987555265426636, + "learning_rate": 8.161445143512714e-06, + "loss": 1.2274, + "step": 19202 + }, + { + "epoch": 5.719540572237011, + "grad_norm": 0.26666340231895447, + "learning_rate": 8.160497028389162e-06, + "loss": 1.2093, + "step": 19203 + }, + { + "epoch": 5.71983841843668, + "grad_norm": 0.3248349130153656, + "learning_rate": 8.159548930380132e-06, + "loss": 1.1982, + "step": 19204 + }, + { + "epoch": 5.7201362646363485, + "grad_norm": 0.2517450451850891, + "learning_rate": 8.158600849494439e-06, + "loss": 1.1918, + "step": 19205 + }, + { + "epoch": 5.720434110836017, + "grad_norm": 0.3940218687057495, + "learning_rate": 8.157652785740913e-06, + "loss": 1.2134, + "step": 19206 + }, + { + "epoch": 5.720731957035686, + "grad_norm": 0.3380928337574005, + "learning_rate": 8.156704739128368e-06, + "loss": 1.2055, + "step": 19207 + }, + { + "epoch": 5.721029803235354, + "grad_norm": 0.7472136616706848, + "learning_rate": 8.155756709665623e-06, + "loss": 1.2026, + "step": 19208 + }, + { + "epoch": 5.721327649435023, + "grad_norm": 0.29066941142082214, + "learning_rate": 8.154808697361503e-06, + "loss": 1.2207, + "step": 19209 + }, + { + "epoch": 5.721625495634692, + "grad_norm": 0.472308486700058, + "learning_rate": 8.153860702224827e-06, + "loss": 1.2335, + "step": 19210 + }, + { + "epoch": 5.72192334183436, + "grad_norm": 0.2740098536014557, + "learning_rate": 8.152912724264413e-06, + "loss": 1.2163, + "step": 19211 + }, + { + "epoch": 5.722221188034029, + "grad_norm": 0.3556009531021118, + "learning_rate": 8.151964763489084e-06, + "loss": 1.198, + "step": 19212 + }, + { + "epoch": 5.722519034233698, + "grad_norm": 0.3266046643257141, + "learning_rate": 8.15101681990766e-06, + "loss": 1.2253, + "step": 19213 + }, + { + "epoch": 5.722816880433366, + "grad_norm": 0.3187634348869324, + "learning_rate": 8.150068893528953e-06, + "loss": 1.2037, + "step": 19214 + }, + { + "epoch": 5.723114726633035, + "grad_norm": 0.2581399977207184, + "learning_rate": 8.149120984361792e-06, + "loss": 1.2068, + "step": 19215 + }, + { + "epoch": 5.723412572832704, + "grad_norm": 0.2611939013004303, + "learning_rate": 8.14817309241499e-06, + "loss": 1.2066, + "step": 19216 + }, + { + "epoch": 5.723710419032372, + "grad_norm": 0.2910304367542267, + "learning_rate": 8.147225217697372e-06, + "loss": 1.2156, + "step": 19217 + }, + { + "epoch": 5.72400826523204, + "grad_norm": 0.3033101260662079, + "learning_rate": 8.146277360217753e-06, + "loss": 1.2008, + "step": 19218 + }, + { + "epoch": 5.7243061114317095, + "grad_norm": 0.2592804431915283, + "learning_rate": 8.145329519984948e-06, + "loss": 1.2114, + "step": 19219 + }, + { + "epoch": 5.724603957631378, + "grad_norm": 0.337984174489975, + "learning_rate": 8.144381697007784e-06, + "loss": 1.2291, + "step": 19220 + }, + { + "epoch": 5.724901803831047, + "grad_norm": 0.3439250588417053, + "learning_rate": 8.143433891295073e-06, + "loss": 1.2158, + "step": 19221 + }, + { + "epoch": 5.725199650030715, + "grad_norm": 0.316287636756897, + "learning_rate": 8.142486102855633e-06, + "loss": 1.2024, + "step": 19222 + }, + { + "epoch": 5.725497496230384, + "grad_norm": 0.32141873240470886, + "learning_rate": 8.141538331698289e-06, + "loss": 1.2128, + "step": 19223 + }, + { + "epoch": 5.725795342430053, + "grad_norm": 0.29094937443733215, + "learning_rate": 8.140590577831855e-06, + "loss": 1.2066, + "step": 19224 + }, + { + "epoch": 5.726093188629721, + "grad_norm": 0.2842795252799988, + "learning_rate": 8.139642841265143e-06, + "loss": 1.2027, + "step": 19225 + }, + { + "epoch": 5.72639103482939, + "grad_norm": 0.25643765926361084, + "learning_rate": 8.138695122006978e-06, + "loss": 1.2219, + "step": 19226 + }, + { + "epoch": 5.726688881029059, + "grad_norm": 0.3652380704879761, + "learning_rate": 8.137747420066178e-06, + "loss": 1.1971, + "step": 19227 + }, + { + "epoch": 5.726986727228727, + "grad_norm": 0.39302146434783936, + "learning_rate": 8.136799735451553e-06, + "loss": 1.2139, + "step": 19228 + }, + { + "epoch": 5.7272845734283955, + "grad_norm": 0.2970762252807617, + "learning_rate": 8.135852068171929e-06, + "loss": 1.1927, + "step": 19229 + }, + { + "epoch": 5.727582419628065, + "grad_norm": 0.26464593410491943, + "learning_rate": 8.13490441823612e-06, + "loss": 1.2139, + "step": 19230 + }, + { + "epoch": 5.727880265827733, + "grad_norm": 0.3146645724773407, + "learning_rate": 8.133956785652935e-06, + "loss": 1.2171, + "step": 19231 + }, + { + "epoch": 5.728178112027402, + "grad_norm": 0.2746916711330414, + "learning_rate": 8.1330091704312e-06, + "loss": 1.2115, + "step": 19232 + }, + { + "epoch": 5.728475958227071, + "grad_norm": 0.3675459027290344, + "learning_rate": 8.132061572579728e-06, + "loss": 1.2101, + "step": 19233 + }, + { + "epoch": 5.728773804426739, + "grad_norm": 0.4824284017086029, + "learning_rate": 8.13111399210734e-06, + "loss": 1.2117, + "step": 19234 + }, + { + "epoch": 5.729071650626408, + "grad_norm": 0.2716885209083557, + "learning_rate": 8.130166429022848e-06, + "loss": 1.2091, + "step": 19235 + }, + { + "epoch": 5.729369496826076, + "grad_norm": 0.4524732828140259, + "learning_rate": 8.129218883335063e-06, + "loss": 1.2208, + "step": 19236 + }, + { + "epoch": 5.729667343025745, + "grad_norm": 0.25957584381103516, + "learning_rate": 8.128271355052812e-06, + "loss": 1.2136, + "step": 19237 + }, + { + "epoch": 5.729965189225414, + "grad_norm": 0.4286913275718689, + "learning_rate": 8.127323844184901e-06, + "loss": 1.2279, + "step": 19238 + }, + { + "epoch": 5.730263035425082, + "grad_norm": 0.3456757664680481, + "learning_rate": 8.126376350740148e-06, + "loss": 1.2104, + "step": 19239 + }, + { + "epoch": 5.730560881624751, + "grad_norm": 0.34896788001060486, + "learning_rate": 8.125428874727374e-06, + "loss": 1.2083, + "step": 19240 + }, + { + "epoch": 5.73085872782442, + "grad_norm": 0.31779733300209045, + "learning_rate": 8.12448141615539e-06, + "loss": 1.207, + "step": 19241 + }, + { + "epoch": 5.731156574024088, + "grad_norm": 0.2977970242500305, + "learning_rate": 8.123533975033007e-06, + "loss": 1.2074, + "step": 19242 + }, + { + "epoch": 5.731454420223757, + "grad_norm": 0.27172422409057617, + "learning_rate": 8.122586551369049e-06, + "loss": 1.209, + "step": 19243 + }, + { + "epoch": 5.731752266423426, + "grad_norm": 0.24826082587242126, + "learning_rate": 8.121639145172322e-06, + "loss": 1.2047, + "step": 19244 + }, + { + "epoch": 5.732050112623094, + "grad_norm": 0.34129610657691956, + "learning_rate": 8.120691756451642e-06, + "loss": 1.2173, + "step": 19245 + }, + { + "epoch": 5.732347958822763, + "grad_norm": 0.2712298631668091, + "learning_rate": 8.119744385215829e-06, + "loss": 1.2168, + "step": 19246 + }, + { + "epoch": 5.732645805022432, + "grad_norm": 0.34561648964881897, + "learning_rate": 8.118797031473696e-06, + "loss": 1.2094, + "step": 19247 + }, + { + "epoch": 5.7329436512221, + "grad_norm": 0.2662794888019562, + "learning_rate": 8.11784969523405e-06, + "loss": 1.1895, + "step": 19248 + }, + { + "epoch": 5.733241497421769, + "grad_norm": 0.4298684000968933, + "learning_rate": 8.11690237650571e-06, + "loss": 1.2255, + "step": 19249 + }, + { + "epoch": 5.7335393436214375, + "grad_norm": 0.41783541440963745, + "learning_rate": 8.11595507529749e-06, + "loss": 1.2065, + "step": 19250 + }, + { + "epoch": 5.733837189821106, + "grad_norm": 0.26658281683921814, + "learning_rate": 8.115007791618204e-06, + "loss": 1.222, + "step": 19251 + }, + { + "epoch": 5.734135036020775, + "grad_norm": 0.33240213990211487, + "learning_rate": 8.114060525476667e-06, + "loss": 1.219, + "step": 19252 + }, + { + "epoch": 5.734432882220443, + "grad_norm": 0.3223637640476227, + "learning_rate": 8.113113276881685e-06, + "loss": 1.2047, + "step": 19253 + }, + { + "epoch": 5.7347307284201126, + "grad_norm": 0.2844304144382477, + "learning_rate": 8.112166045842079e-06, + "loss": 1.2111, + "step": 19254 + }, + { + "epoch": 5.735028574619781, + "grad_norm": 0.3888123631477356, + "learning_rate": 8.111218832366657e-06, + "loss": 1.211, + "step": 19255 + }, + { + "epoch": 5.735326420819449, + "grad_norm": 0.24945160746574402, + "learning_rate": 8.110271636464236e-06, + "loss": 1.2113, + "step": 19256 + }, + { + "epoch": 5.7356242670191175, + "grad_norm": 0.4705652594566345, + "learning_rate": 8.109324458143622e-06, + "loss": 1.2278, + "step": 19257 + }, + { + "epoch": 5.735922113218787, + "grad_norm": 0.25226935744285583, + "learning_rate": 8.108377297413636e-06, + "loss": 1.2071, + "step": 19258 + }, + { + "epoch": 5.736219959418455, + "grad_norm": 0.40976861119270325, + "learning_rate": 8.10743015428308e-06, + "loss": 1.2139, + "step": 19259 + }, + { + "epoch": 5.736517805618124, + "grad_norm": 0.3066800832748413, + "learning_rate": 8.106483028760778e-06, + "loss": 1.2137, + "step": 19260 + }, + { + "epoch": 5.736815651817793, + "grad_norm": 0.31210604310035706, + "learning_rate": 8.105535920855534e-06, + "loss": 1.218, + "step": 19261 + }, + { + "epoch": 5.737113498017461, + "grad_norm": 0.2538163661956787, + "learning_rate": 8.104588830576158e-06, + "loss": 1.217, + "step": 19262 + }, + { + "epoch": 5.73741134421713, + "grad_norm": 0.3894919455051422, + "learning_rate": 8.103641757931472e-06, + "loss": 1.2154, + "step": 19263 + }, + { + "epoch": 5.7377091904167985, + "grad_norm": 0.27999603748321533, + "learning_rate": 8.102694702930278e-06, + "loss": 1.2243, + "step": 19264 + }, + { + "epoch": 5.738007036616467, + "grad_norm": 0.28781571984291077, + "learning_rate": 8.101747665581386e-06, + "loss": 1.2122, + "step": 19265 + }, + { + "epoch": 5.738304882816136, + "grad_norm": 0.5580193400382996, + "learning_rate": 8.100800645893616e-06, + "loss": 1.2131, + "step": 19266 + }, + { + "epoch": 5.738602729015804, + "grad_norm": 0.29654157161712646, + "learning_rate": 8.099853643875771e-06, + "loss": 1.2186, + "step": 19267 + }, + { + "epoch": 5.738900575215473, + "grad_norm": 0.5443137884140015, + "learning_rate": 8.098906659536664e-06, + "loss": 1.2124, + "step": 19268 + }, + { + "epoch": 5.739198421415142, + "grad_norm": 0.2780085802078247, + "learning_rate": 8.097959692885111e-06, + "loss": 1.2158, + "step": 19269 + }, + { + "epoch": 5.73949626761481, + "grad_norm": 0.39644768834114075, + "learning_rate": 8.097012743929913e-06, + "loss": 1.2163, + "step": 19270 + }, + { + "epoch": 5.7397941138144795, + "grad_norm": 0.4487845301628113, + "learning_rate": 8.09606581267989e-06, + "loss": 1.1921, + "step": 19271 + }, + { + "epoch": 5.740091960014148, + "grad_norm": 0.5084857940673828, + "learning_rate": 8.095118899143848e-06, + "loss": 1.2046, + "step": 19272 + }, + { + "epoch": 5.740389806213816, + "grad_norm": 0.36046403646469116, + "learning_rate": 8.09417200333059e-06, + "loss": 1.224, + "step": 19273 + }, + { + "epoch": 5.740687652413485, + "grad_norm": 0.27846071124076843, + "learning_rate": 8.093225125248936e-06, + "loss": 1.2185, + "step": 19274 + }, + { + "epoch": 5.740985498613154, + "grad_norm": 0.4942079186439514, + "learning_rate": 8.092278264907694e-06, + "loss": 1.2153, + "step": 19275 + }, + { + "epoch": 5.741283344812822, + "grad_norm": 0.2698473632335663, + "learning_rate": 8.091331422315667e-06, + "loss": 1.2119, + "step": 19276 + }, + { + "epoch": 5.741581191012491, + "grad_norm": 0.4069843292236328, + "learning_rate": 8.090384597481672e-06, + "loss": 1.2127, + "step": 19277 + }, + { + "epoch": 5.7418790372121595, + "grad_norm": 0.26671722531318665, + "learning_rate": 8.089437790414513e-06, + "loss": 1.2021, + "step": 19278 + }, + { + "epoch": 5.742176883411828, + "grad_norm": 0.2694877088069916, + "learning_rate": 8.088491001123e-06, + "loss": 1.2091, + "step": 19279 + }, + { + "epoch": 5.742474729611497, + "grad_norm": 0.35932037234306335, + "learning_rate": 8.087544229615945e-06, + "loss": 1.2067, + "step": 19280 + }, + { + "epoch": 5.742772575811165, + "grad_norm": 0.3522062599658966, + "learning_rate": 8.086597475902155e-06, + "loss": 1.2123, + "step": 19281 + }, + { + "epoch": 5.743070422010835, + "grad_norm": 0.4137420356273651, + "learning_rate": 8.085650739990432e-06, + "loss": 1.203, + "step": 19282 + }, + { + "epoch": 5.743368268210503, + "grad_norm": 0.3549848198890686, + "learning_rate": 8.084704021889596e-06, + "loss": 1.2117, + "step": 19283 + }, + { + "epoch": 5.743666114410171, + "grad_norm": 0.4859996736049652, + "learning_rate": 8.083757321608443e-06, + "loss": 1.2066, + "step": 19284 + }, + { + "epoch": 5.74396396060984, + "grad_norm": 0.2760433554649353, + "learning_rate": 8.08281063915579e-06, + "loss": 1.2112, + "step": 19285 + }, + { + "epoch": 5.744261806809509, + "grad_norm": 0.6055020689964294, + "learning_rate": 8.081863974540447e-06, + "loss": 1.2284, + "step": 19286 + }, + { + "epoch": 5.744559653009177, + "grad_norm": 0.28894391655921936, + "learning_rate": 8.080917327771209e-06, + "loss": 1.2234, + "step": 19287 + }, + { + "epoch": 5.744857499208846, + "grad_norm": 0.45702478289604187, + "learning_rate": 8.079970698856897e-06, + "loss": 1.217, + "step": 19288 + }, + { + "epoch": 5.745155345408515, + "grad_norm": 0.25937530398368835, + "learning_rate": 8.079024087806313e-06, + "loss": 1.2195, + "step": 19289 + }, + { + "epoch": 5.745453191608183, + "grad_norm": 0.35895881056785583, + "learning_rate": 8.078077494628258e-06, + "loss": 1.2191, + "step": 19290 + }, + { + "epoch": 5.745751037807852, + "grad_norm": 0.28511467576026917, + "learning_rate": 8.077130919331547e-06, + "loss": 1.2254, + "step": 19291 + }, + { + "epoch": 5.746048884007521, + "grad_norm": 0.3195737302303314, + "learning_rate": 8.076184361924986e-06, + "loss": 1.2095, + "step": 19292 + }, + { + "epoch": 5.746346730207189, + "grad_norm": 0.30785128474235535, + "learning_rate": 8.075237822417378e-06, + "loss": 1.2191, + "step": 19293 + }, + { + "epoch": 5.746644576406858, + "grad_norm": 0.3183385729789734, + "learning_rate": 8.074291300817534e-06, + "loss": 1.2129, + "step": 19294 + }, + { + "epoch": 5.746942422606526, + "grad_norm": 0.5007601976394653, + "learning_rate": 8.073344797134258e-06, + "loss": 1.2093, + "step": 19295 + }, + { + "epoch": 5.747240268806195, + "grad_norm": 0.39805346727371216, + "learning_rate": 8.072398311376352e-06, + "loss": 1.2024, + "step": 19296 + }, + { + "epoch": 5.747538115005864, + "grad_norm": 0.43453508615493774, + "learning_rate": 8.07145184355263e-06, + "loss": 1.217, + "step": 19297 + }, + { + "epoch": 5.747835961205532, + "grad_norm": 0.3058512210845947, + "learning_rate": 8.07050539367189e-06, + "loss": 1.2059, + "step": 19298 + }, + { + "epoch": 5.7481338074052015, + "grad_norm": 0.5790659785270691, + "learning_rate": 8.069558961742945e-06, + "loss": 1.2206, + "step": 19299 + }, + { + "epoch": 5.74843165360487, + "grad_norm": 0.3180426359176636, + "learning_rate": 8.068612547774598e-06, + "loss": 1.2102, + "step": 19300 + }, + { + "epoch": 5.748729499804538, + "grad_norm": 0.40445476770401, + "learning_rate": 8.067666151775649e-06, + "loss": 1.2152, + "step": 19301 + }, + { + "epoch": 5.749027346004207, + "grad_norm": 0.26040560007095337, + "learning_rate": 8.066719773754911e-06, + "loss": 1.2103, + "step": 19302 + }, + { + "epoch": 5.749325192203876, + "grad_norm": 0.3587285280227661, + "learning_rate": 8.065773413721186e-06, + "loss": 1.2262, + "step": 19303 + }, + { + "epoch": 5.749623038403544, + "grad_norm": 0.27444958686828613, + "learning_rate": 8.064827071683275e-06, + "loss": 1.229, + "step": 19304 + }, + { + "epoch": 5.749920884603213, + "grad_norm": 0.28268489241600037, + "learning_rate": 8.06388074764999e-06, + "loss": 1.2281, + "step": 19305 + }, + { + "epoch": 5.750218730802882, + "grad_norm": 0.28504547476768494, + "learning_rate": 8.062934441630131e-06, + "loss": 1.2039, + "step": 19306 + }, + { + "epoch": 5.75051657700255, + "grad_norm": 0.42083004117012024, + "learning_rate": 8.061988153632499e-06, + "loss": 1.1994, + "step": 19307 + }, + { + "epoch": 5.750814423202219, + "grad_norm": 0.26682978868484497, + "learning_rate": 8.061041883665906e-06, + "loss": 1.2367, + "step": 19308 + }, + { + "epoch": 5.7511122694018875, + "grad_norm": 0.46401992440223694, + "learning_rate": 8.060095631739148e-06, + "loss": 1.2209, + "step": 19309 + }, + { + "epoch": 5.751410115601557, + "grad_norm": 0.2903793454170227, + "learning_rate": 8.059149397861034e-06, + "loss": 1.2298, + "step": 19310 + }, + { + "epoch": 5.751707961801225, + "grad_norm": 0.4303167164325714, + "learning_rate": 8.058203182040369e-06, + "loss": 1.2184, + "step": 19311 + }, + { + "epoch": 5.752005808000893, + "grad_norm": 0.2538061738014221, + "learning_rate": 8.057256984285954e-06, + "loss": 1.204, + "step": 19312 + }, + { + "epoch": 5.7523036542005626, + "grad_norm": 0.31230324506759644, + "learning_rate": 8.056310804606587e-06, + "loss": 1.2003, + "step": 19313 + }, + { + "epoch": 5.752601500400231, + "grad_norm": 0.2916463315486908, + "learning_rate": 8.05536464301108e-06, + "loss": 1.2084, + "step": 19314 + }, + { + "epoch": 5.752899346599899, + "grad_norm": 0.2824610769748688, + "learning_rate": 8.054418499508232e-06, + "loss": 1.2125, + "step": 19315 + }, + { + "epoch": 5.753197192799568, + "grad_norm": 0.3339831829071045, + "learning_rate": 8.053472374106845e-06, + "loss": 1.215, + "step": 19316 + }, + { + "epoch": 5.753495038999237, + "grad_norm": 0.2544691264629364, + "learning_rate": 8.052526266815726e-06, + "loss": 1.1975, + "step": 19317 + }, + { + "epoch": 5.753792885198905, + "grad_norm": 0.2540312707424164, + "learning_rate": 8.05158017764367e-06, + "loss": 1.2128, + "step": 19318 + }, + { + "epoch": 5.754090731398574, + "grad_norm": 0.2933233380317688, + "learning_rate": 8.050634106599488e-06, + "loss": 1.2117, + "step": 19319 + }, + { + "epoch": 5.754388577598243, + "grad_norm": 0.40847817063331604, + "learning_rate": 8.049688053691976e-06, + "loss": 1.1959, + "step": 19320 + }, + { + "epoch": 5.754686423797912, + "grad_norm": 0.2613731026649475, + "learning_rate": 8.048742018929934e-06, + "loss": 1.2155, + "step": 19321 + }, + { + "epoch": 5.75498426999758, + "grad_norm": 0.3122713267803192, + "learning_rate": 8.047796002322171e-06, + "loss": 1.2108, + "step": 19322 + }, + { + "epoch": 5.7552821161972485, + "grad_norm": 0.2656697630882263, + "learning_rate": 8.046850003877487e-06, + "loss": 1.2146, + "step": 19323 + }, + { + "epoch": 5.755579962396917, + "grad_norm": 0.29155322909355164, + "learning_rate": 8.045904023604677e-06, + "loss": 1.2178, + "step": 19324 + }, + { + "epoch": 5.755877808596586, + "grad_norm": 0.3093149662017822, + "learning_rate": 8.04495806151255e-06, + "loss": 1.2181, + "step": 19325 + }, + { + "epoch": 5.756175654796254, + "grad_norm": 0.28253135085105896, + "learning_rate": 8.044012117609902e-06, + "loss": 1.2167, + "step": 19326 + }, + { + "epoch": 5.756473500995924, + "grad_norm": 0.29040855169296265, + "learning_rate": 8.043066191905536e-06, + "loss": 1.2022, + "step": 19327 + }, + { + "epoch": 5.756771347195592, + "grad_norm": 0.27360713481903076, + "learning_rate": 8.042120284408255e-06, + "loss": 1.2145, + "step": 19328 + }, + { + "epoch": 5.75706919339526, + "grad_norm": 0.3463855981826782, + "learning_rate": 8.041174395126857e-06, + "loss": 1.2137, + "step": 19329 + }, + { + "epoch": 5.7573670395949295, + "grad_norm": 0.2982986569404602, + "learning_rate": 8.04022852407014e-06, + "loss": 1.2015, + "step": 19330 + }, + { + "epoch": 5.757664885794598, + "grad_norm": 0.3009822964668274, + "learning_rate": 8.039282671246909e-06, + "loss": 1.2093, + "step": 19331 + }, + { + "epoch": 5.757962731994266, + "grad_norm": 0.2775547504425049, + "learning_rate": 8.03833683666596e-06, + "loss": 1.2097, + "step": 19332 + }, + { + "epoch": 5.758260578193935, + "grad_norm": 0.2594717741012573, + "learning_rate": 8.037391020336095e-06, + "loss": 1.2067, + "step": 19333 + }, + { + "epoch": 5.758558424393604, + "grad_norm": 0.4200427532196045, + "learning_rate": 8.036445222266119e-06, + "loss": 1.2209, + "step": 19334 + }, + { + "epoch": 5.758856270593272, + "grad_norm": 0.32719770073890686, + "learning_rate": 8.03549944246482e-06, + "loss": 1.2004, + "step": 19335 + }, + { + "epoch": 5.759154116792941, + "grad_norm": 0.36505675315856934, + "learning_rate": 8.034553680941007e-06, + "loss": 1.2191, + "step": 19336 + }, + { + "epoch": 5.7594519629926095, + "grad_norm": 0.42838454246520996, + "learning_rate": 8.033607937703475e-06, + "loss": 1.2267, + "step": 19337 + }, + { + "epoch": 5.759749809192279, + "grad_norm": 0.2745087146759033, + "learning_rate": 8.032662212761025e-06, + "loss": 1.2143, + "step": 19338 + }, + { + "epoch": 5.760047655391947, + "grad_norm": 0.4445969760417938, + "learning_rate": 8.031716506122454e-06, + "loss": 1.2037, + "step": 19339 + }, + { + "epoch": 5.760345501591615, + "grad_norm": 0.2586653530597687, + "learning_rate": 8.030770817796566e-06, + "loss": 1.2191, + "step": 19340 + }, + { + "epoch": 5.760643347791285, + "grad_norm": 0.40743568539619446, + "learning_rate": 8.029825147792149e-06, + "loss": 1.2105, + "step": 19341 + }, + { + "epoch": 5.760941193990953, + "grad_norm": 0.25817403197288513, + "learning_rate": 8.028879496118013e-06, + "loss": 1.2091, + "step": 19342 + }, + { + "epoch": 5.761239040190621, + "grad_norm": 0.3616870045661926, + "learning_rate": 8.027933862782949e-06, + "loss": 1.2226, + "step": 19343 + }, + { + "epoch": 5.7615368863902905, + "grad_norm": 0.35949820280075073, + "learning_rate": 8.026988247795753e-06, + "loss": 1.2155, + "step": 19344 + }, + { + "epoch": 5.761834732589959, + "grad_norm": 0.5290505290031433, + "learning_rate": 8.026042651165234e-06, + "loss": 1.2185, + "step": 19345 + }, + { + "epoch": 5.762132578789627, + "grad_norm": 0.3248412013053894, + "learning_rate": 8.025097072900181e-06, + "loss": 1.223, + "step": 19346 + }, + { + "epoch": 5.762430424989296, + "grad_norm": 0.4513121247291565, + "learning_rate": 8.02415151300939e-06, + "loss": 1.2118, + "step": 19347 + }, + { + "epoch": 5.762728271188965, + "grad_norm": 0.34418195486068726, + "learning_rate": 8.023205971501664e-06, + "loss": 1.2233, + "step": 19348 + }, + { + "epoch": 5.763026117388634, + "grad_norm": 0.3057760000228882, + "learning_rate": 8.022260448385795e-06, + "loss": 1.2173, + "step": 19349 + }, + { + "epoch": 5.763323963588302, + "grad_norm": 0.3657165765762329, + "learning_rate": 8.021314943670584e-06, + "loss": 1.2229, + "step": 19350 + }, + { + "epoch": 5.763621809787971, + "grad_norm": 0.3583250045776367, + "learning_rate": 8.020369457364832e-06, + "loss": 1.2071, + "step": 19351 + }, + { + "epoch": 5.763919655987639, + "grad_norm": 0.4996359944343567, + "learning_rate": 8.019423989477324e-06, + "loss": 1.2232, + "step": 19352 + }, + { + "epoch": 5.764217502187308, + "grad_norm": 0.30307087302207947, + "learning_rate": 8.018478540016866e-06, + "loss": 1.2144, + "step": 19353 + }, + { + "epoch": 5.764515348386976, + "grad_norm": 0.2732860743999481, + "learning_rate": 8.017533108992252e-06, + "loss": 1.22, + "step": 19354 + }, + { + "epoch": 5.764813194586646, + "grad_norm": 0.3784959614276886, + "learning_rate": 8.016587696412275e-06, + "loss": 1.22, + "step": 19355 + }, + { + "epoch": 5.765111040786314, + "grad_norm": 0.2700169086456299, + "learning_rate": 8.015642302285733e-06, + "loss": 1.208, + "step": 19356 + }, + { + "epoch": 5.765408886985982, + "grad_norm": 0.3230387568473816, + "learning_rate": 8.014696926621425e-06, + "loss": 1.2161, + "step": 19357 + }, + { + "epoch": 5.7657067331856515, + "grad_norm": 0.2916318476200104, + "learning_rate": 8.013751569428141e-06, + "loss": 1.2202, + "step": 19358 + }, + { + "epoch": 5.76600457938532, + "grad_norm": 0.2668834626674652, + "learning_rate": 8.012806230714684e-06, + "loss": 1.2045, + "step": 19359 + }, + { + "epoch": 5.766302425584988, + "grad_norm": 0.2626957595348358, + "learning_rate": 8.011860910489841e-06, + "loss": 1.2112, + "step": 19360 + }, + { + "epoch": 5.766600271784657, + "grad_norm": 0.25540390610694885, + "learning_rate": 8.010915608762412e-06, + "loss": 1.2169, + "step": 19361 + }, + { + "epoch": 5.766898117984326, + "grad_norm": 0.27726197242736816, + "learning_rate": 8.009970325541192e-06, + "loss": 1.2048, + "step": 19362 + }, + { + "epoch": 5.767195964183994, + "grad_norm": 0.3548414707183838, + "learning_rate": 8.009025060834974e-06, + "loss": 1.2143, + "step": 19363 + }, + { + "epoch": 5.767493810383663, + "grad_norm": 0.3036036789417267, + "learning_rate": 8.008079814652551e-06, + "loss": 1.2103, + "step": 19364 + }, + { + "epoch": 5.767791656583332, + "grad_norm": 0.3073996305465698, + "learning_rate": 8.007134587002724e-06, + "loss": 1.2019, + "step": 19365 + }, + { + "epoch": 5.768089502783001, + "grad_norm": 0.3572111427783966, + "learning_rate": 8.00618937789428e-06, + "loss": 1.2206, + "step": 19366 + }, + { + "epoch": 5.768387348982669, + "grad_norm": 0.2723371386528015, + "learning_rate": 8.005244187336018e-06, + "loss": 1.2139, + "step": 19367 + }, + { + "epoch": 5.7686851951823375, + "grad_norm": 0.3631337881088257, + "learning_rate": 8.004299015336729e-06, + "loss": 1.2168, + "step": 19368 + }, + { + "epoch": 5.768983041382007, + "grad_norm": 0.3015212416648865, + "learning_rate": 8.003353861905206e-06, + "loss": 1.1829, + "step": 19369 + }, + { + "epoch": 5.769280887581675, + "grad_norm": 0.3681679666042328, + "learning_rate": 8.002408727050248e-06, + "loss": 1.2057, + "step": 19370 + }, + { + "epoch": 5.769578733781343, + "grad_norm": 0.40260106325149536, + "learning_rate": 8.001463610780647e-06, + "loss": 1.2184, + "step": 19371 + }, + { + "epoch": 5.7698765799810126, + "grad_norm": 0.4095645844936371, + "learning_rate": 8.000518513105188e-06, + "loss": 1.2184, + "step": 19372 + }, + { + "epoch": 5.770174426180681, + "grad_norm": 0.35802316665649414, + "learning_rate": 7.999573434032672e-06, + "loss": 1.2027, + "step": 19373 + }, + { + "epoch": 5.770472272380349, + "grad_norm": 0.3471449613571167, + "learning_rate": 7.998628373571894e-06, + "loss": 1.2115, + "step": 19374 + }, + { + "epoch": 5.770770118580018, + "grad_norm": 0.39103150367736816, + "learning_rate": 7.997683331731638e-06, + "loss": 1.2126, + "step": 19375 + }, + { + "epoch": 5.771067964779687, + "grad_norm": 0.2856650948524475, + "learning_rate": 7.996738308520706e-06, + "loss": 1.202, + "step": 19376 + }, + { + "epoch": 5.771365810979356, + "grad_norm": 0.38072243332862854, + "learning_rate": 7.995793303947885e-06, + "loss": 1.2012, + "step": 19377 + }, + { + "epoch": 5.771663657179024, + "grad_norm": 0.28907880187034607, + "learning_rate": 7.994848318021966e-06, + "loss": 1.2067, + "step": 19378 + }, + { + "epoch": 5.771961503378693, + "grad_norm": 0.493081659078598, + "learning_rate": 7.993903350751742e-06, + "loss": 1.2209, + "step": 19379 + }, + { + "epoch": 5.772259349578362, + "grad_norm": 0.26373782753944397, + "learning_rate": 7.99295840214601e-06, + "loss": 1.2122, + "step": 19380 + }, + { + "epoch": 5.77255719577803, + "grad_norm": 0.4298916459083557, + "learning_rate": 7.992013472213553e-06, + "loss": 1.2122, + "step": 19381 + }, + { + "epoch": 5.7728550419776985, + "grad_norm": 0.3300621211528778, + "learning_rate": 7.991068560963173e-06, + "loss": 1.2254, + "step": 19382 + }, + { + "epoch": 5.773152888177368, + "grad_norm": 0.2928406894207001, + "learning_rate": 7.990123668403649e-06, + "loss": 1.2318, + "step": 19383 + }, + { + "epoch": 5.773450734377036, + "grad_norm": 0.3084755539894104, + "learning_rate": 7.989178794543784e-06, + "loss": 1.2125, + "step": 19384 + }, + { + "epoch": 5.773748580576704, + "grad_norm": 0.31379589438438416, + "learning_rate": 7.988233939392362e-06, + "loss": 1.2073, + "step": 19385 + }, + { + "epoch": 5.774046426776374, + "grad_norm": 0.2632942199707031, + "learning_rate": 7.987289102958172e-06, + "loss": 1.224, + "step": 19386 + }, + { + "epoch": 5.774344272976042, + "grad_norm": 0.44776707887649536, + "learning_rate": 7.986344285250014e-06, + "loss": 1.2063, + "step": 19387 + }, + { + "epoch": 5.774642119175711, + "grad_norm": 0.25434747338294983, + "learning_rate": 7.985399486276672e-06, + "loss": 1.2062, + "step": 19388 + }, + { + "epoch": 5.7749399653753795, + "grad_norm": 0.43667274713516235, + "learning_rate": 7.984454706046931e-06, + "loss": 1.2085, + "step": 19389 + }, + { + "epoch": 5.775237811575048, + "grad_norm": 0.3168714642524719, + "learning_rate": 7.983509944569594e-06, + "loss": 1.201, + "step": 19390 + }, + { + "epoch": 5.775535657774716, + "grad_norm": 0.4105963706970215, + "learning_rate": 7.98256520185344e-06, + "loss": 1.2145, + "step": 19391 + }, + { + "epoch": 5.775833503974385, + "grad_norm": 0.38376665115356445, + "learning_rate": 7.98162047790726e-06, + "loss": 1.2121, + "step": 19392 + }, + { + "epoch": 5.776131350174054, + "grad_norm": 0.28114792704582214, + "learning_rate": 7.980675772739855e-06, + "loss": 1.2052, + "step": 19393 + }, + { + "epoch": 5.776429196373723, + "grad_norm": 0.26987892389297485, + "learning_rate": 7.979731086360002e-06, + "loss": 1.2218, + "step": 19394 + }, + { + "epoch": 5.776727042573391, + "grad_norm": 0.30014467239379883, + "learning_rate": 7.978786418776491e-06, + "loss": 1.2092, + "step": 19395 + }, + { + "epoch": 5.7770248887730595, + "grad_norm": 0.3099175691604614, + "learning_rate": 7.977841769998115e-06, + "loss": 1.2346, + "step": 19396 + }, + { + "epoch": 5.777322734972729, + "grad_norm": 0.32902398705482483, + "learning_rate": 7.976897140033665e-06, + "loss": 1.2061, + "step": 19397 + }, + { + "epoch": 5.777620581172397, + "grad_norm": 0.3419867753982544, + "learning_rate": 7.975952528891924e-06, + "loss": 1.233, + "step": 19398 + }, + { + "epoch": 5.777918427372065, + "grad_norm": 0.34635451436042786, + "learning_rate": 7.975007936581685e-06, + "loss": 1.2125, + "step": 19399 + }, + { + "epoch": 5.778216273571735, + "grad_norm": 0.31191107630729675, + "learning_rate": 7.974063363111733e-06, + "loss": 1.2266, + "step": 19400 + }, + { + "epoch": 5.778514119771403, + "grad_norm": 0.3498908281326294, + "learning_rate": 7.97311880849086e-06, + "loss": 1.2317, + "step": 19401 + }, + { + "epoch": 5.778811965971071, + "grad_norm": 0.3947415351867676, + "learning_rate": 7.97217427272785e-06, + "loss": 1.2084, + "step": 19402 + }, + { + "epoch": 5.7791098121707405, + "grad_norm": 0.28250670433044434, + "learning_rate": 7.971229755831494e-06, + "loss": 1.2108, + "step": 19403 + }, + { + "epoch": 5.779407658370409, + "grad_norm": 0.2685490846633911, + "learning_rate": 7.970285257810577e-06, + "loss": 1.2109, + "step": 19404 + }, + { + "epoch": 5.779705504570078, + "grad_norm": 0.31267213821411133, + "learning_rate": 7.969340778673891e-06, + "loss": 1.226, + "step": 19405 + }, + { + "epoch": 5.780003350769746, + "grad_norm": 0.3549420237541199, + "learning_rate": 7.968396318430216e-06, + "loss": 1.2155, + "step": 19406 + }, + { + "epoch": 5.780301196969415, + "grad_norm": 0.2676822245121002, + "learning_rate": 7.967451877088348e-06, + "loss": 1.2141, + "step": 19407 + }, + { + "epoch": 5.780599043169084, + "grad_norm": 0.4018521010875702, + "learning_rate": 7.966507454657067e-06, + "loss": 1.2148, + "step": 19408 + }, + { + "epoch": 5.780896889368752, + "grad_norm": 0.349960058927536, + "learning_rate": 7.96556305114516e-06, + "loss": 1.225, + "step": 19409 + }, + { + "epoch": 5.781194735568421, + "grad_norm": 0.4603356122970581, + "learning_rate": 7.964618666561418e-06, + "loss": 1.2232, + "step": 19410 + }, + { + "epoch": 5.78149258176809, + "grad_norm": 0.2915216088294983, + "learning_rate": 7.96367430091463e-06, + "loss": 1.2269, + "step": 19411 + }, + { + "epoch": 5.781790427967758, + "grad_norm": 0.3176279366016388, + "learning_rate": 7.96272995421357e-06, + "loss": 1.221, + "step": 19412 + }, + { + "epoch": 5.782088274167426, + "grad_norm": 0.2789140045642853, + "learning_rate": 7.961785626467036e-06, + "loss": 1.2267, + "step": 19413 + }, + { + "epoch": 5.782386120367096, + "grad_norm": 0.2658219039440155, + "learning_rate": 7.960841317683805e-06, + "loss": 1.1986, + "step": 19414 + }, + { + "epoch": 5.782683966566764, + "grad_norm": 0.3000280559062958, + "learning_rate": 7.95989702787267e-06, + "loss": 1.222, + "step": 19415 + }, + { + "epoch": 5.782981812766433, + "grad_norm": 0.27580147981643677, + "learning_rate": 7.958952757042415e-06, + "loss": 1.2208, + "step": 19416 + }, + { + "epoch": 5.7832796589661015, + "grad_norm": 0.2718835175037384, + "learning_rate": 7.958008505201821e-06, + "loss": 1.2155, + "step": 19417 + }, + { + "epoch": 5.78357750516577, + "grad_norm": 0.25872963666915894, + "learning_rate": 7.957064272359682e-06, + "loss": 1.2101, + "step": 19418 + }, + { + "epoch": 5.783875351365438, + "grad_norm": 0.3069838583469391, + "learning_rate": 7.956120058524774e-06, + "loss": 1.2184, + "step": 19419 + }, + { + "epoch": 5.784173197565107, + "grad_norm": 0.4329480528831482, + "learning_rate": 7.955175863705884e-06, + "loss": 1.2139, + "step": 19420 + }, + { + "epoch": 5.784471043764776, + "grad_norm": 0.3012837767601013, + "learning_rate": 7.9542316879118e-06, + "loss": 1.1893, + "step": 19421 + }, + { + "epoch": 5.784768889964445, + "grad_norm": 0.4695422351360321, + "learning_rate": 7.953287531151305e-06, + "loss": 1.2073, + "step": 19422 + }, + { + "epoch": 5.785066736164113, + "grad_norm": 0.3115541934967041, + "learning_rate": 7.95234339343318e-06, + "loss": 1.2165, + "step": 19423 + }, + { + "epoch": 5.785364582363782, + "grad_norm": 0.49520808458328247, + "learning_rate": 7.951399274766214e-06, + "loss": 1.2114, + "step": 19424 + }, + { + "epoch": 5.785662428563451, + "grad_norm": 0.2969418466091156, + "learning_rate": 7.950455175159188e-06, + "loss": 1.2283, + "step": 19425 + }, + { + "epoch": 5.785960274763119, + "grad_norm": 0.3932274580001831, + "learning_rate": 7.949511094620888e-06, + "loss": 1.2163, + "step": 19426 + }, + { + "epoch": 5.7862581209627875, + "grad_norm": 0.27351394295692444, + "learning_rate": 7.948567033160095e-06, + "loss": 1.2033, + "step": 19427 + }, + { + "epoch": 5.786555967162457, + "grad_norm": 0.33573392033576965, + "learning_rate": 7.947622990785596e-06, + "loss": 1.2052, + "step": 19428 + }, + { + "epoch": 5.786853813362125, + "grad_norm": 0.32856041193008423, + "learning_rate": 7.946678967506167e-06, + "loss": 1.2176, + "step": 19429 + }, + { + "epoch": 5.787151659561793, + "grad_norm": 0.4364908039569855, + "learning_rate": 7.945734963330602e-06, + "loss": 1.2045, + "step": 19430 + }, + { + "epoch": 5.7874495057614626, + "grad_norm": 0.2859799265861511, + "learning_rate": 7.944790978267673e-06, + "loss": 1.2285, + "step": 19431 + }, + { + "epoch": 5.787747351961131, + "grad_norm": 0.268753319978714, + "learning_rate": 7.94384701232617e-06, + "loss": 1.2036, + "step": 19432 + }, + { + "epoch": 5.7880451981608, + "grad_norm": 0.2933937609195709, + "learning_rate": 7.942903065514876e-06, + "loss": 1.215, + "step": 19433 + }, + { + "epoch": 5.788343044360468, + "grad_norm": 0.3423647880554199, + "learning_rate": 7.941959137842565e-06, + "loss": 1.2122, + "step": 19434 + }, + { + "epoch": 5.788640890560137, + "grad_norm": 0.3489307761192322, + "learning_rate": 7.94101522931803e-06, + "loss": 1.2048, + "step": 19435 + }, + { + "epoch": 5.788938736759806, + "grad_norm": 0.3236292600631714, + "learning_rate": 7.940071339950047e-06, + "loss": 1.1957, + "step": 19436 + }, + { + "epoch": 5.789236582959474, + "grad_norm": 0.3881048262119293, + "learning_rate": 7.939127469747397e-06, + "loss": 1.2432, + "step": 19437 + }, + { + "epoch": 5.789534429159143, + "grad_norm": 0.29771867394447327, + "learning_rate": 7.938183618718862e-06, + "loss": 1.2061, + "step": 19438 + }, + { + "epoch": 5.789832275358812, + "grad_norm": 0.34279343485832214, + "learning_rate": 7.93723978687323e-06, + "loss": 1.2221, + "step": 19439 + }, + { + "epoch": 5.79013012155848, + "grad_norm": 0.27765029668807983, + "learning_rate": 7.93629597421927e-06, + "loss": 1.2094, + "step": 19440 + }, + { + "epoch": 5.7904279677581485, + "grad_norm": 0.2648593485355377, + "learning_rate": 7.935352180765776e-06, + "loss": 1.2163, + "step": 19441 + }, + { + "epoch": 5.790725813957818, + "grad_norm": 0.28902289271354675, + "learning_rate": 7.934408406521523e-06, + "loss": 1.2161, + "step": 19442 + }, + { + "epoch": 5.791023660157486, + "grad_norm": 0.3197891414165497, + "learning_rate": 7.933464651495289e-06, + "loss": 1.1946, + "step": 19443 + }, + { + "epoch": 5.791321506357155, + "grad_norm": 0.26033326983451843, + "learning_rate": 7.932520915695857e-06, + "loss": 1.2228, + "step": 19444 + }, + { + "epoch": 5.791619352556824, + "grad_norm": 0.2686390280723572, + "learning_rate": 7.931577199132012e-06, + "loss": 1.2038, + "step": 19445 + }, + { + "epoch": 5.791917198756492, + "grad_norm": 0.26463714241981506, + "learning_rate": 7.930633501812525e-06, + "loss": 1.2083, + "step": 19446 + }, + { + "epoch": 5.792215044956161, + "grad_norm": 0.28965914249420166, + "learning_rate": 7.929689823746187e-06, + "loss": 1.2215, + "step": 19447 + }, + { + "epoch": 5.7925128911558295, + "grad_norm": 0.34461528062820435, + "learning_rate": 7.928746164941767e-06, + "loss": 1.2092, + "step": 19448 + }, + { + "epoch": 5.792810737355498, + "grad_norm": 0.3160446286201477, + "learning_rate": 7.927802525408053e-06, + "loss": 1.2237, + "step": 19449 + }, + { + "epoch": 5.793108583555167, + "grad_norm": 0.4149305820465088, + "learning_rate": 7.926858905153821e-06, + "loss": 1.2123, + "step": 19450 + }, + { + "epoch": 5.793406429754835, + "grad_norm": 0.311864972114563, + "learning_rate": 7.925915304187847e-06, + "loss": 1.2251, + "step": 19451 + }, + { + "epoch": 5.793704275954504, + "grad_norm": 0.3452128469944, + "learning_rate": 7.924971722518918e-06, + "loss": 1.2047, + "step": 19452 + }, + { + "epoch": 5.794002122154173, + "grad_norm": 0.33326712250709534, + "learning_rate": 7.92402816015581e-06, + "loss": 1.2124, + "step": 19453 + }, + { + "epoch": 5.794299968353841, + "grad_norm": 0.32764118909835815, + "learning_rate": 7.923084617107295e-06, + "loss": 1.1997, + "step": 19454 + }, + { + "epoch": 5.79459781455351, + "grad_norm": 0.3002004027366638, + "learning_rate": 7.92214109338216e-06, + "loss": 1.2065, + "step": 19455 + }, + { + "epoch": 5.794895660753179, + "grad_norm": 0.3065284192562103, + "learning_rate": 7.921197588989183e-06, + "loss": 1.2075, + "step": 19456 + }, + { + "epoch": 5.795193506952847, + "grad_norm": 0.2906924784183502, + "learning_rate": 7.920254103937135e-06, + "loss": 1.2079, + "step": 19457 + }, + { + "epoch": 5.795491353152515, + "grad_norm": 0.27946940064430237, + "learning_rate": 7.919310638234804e-06, + "loss": 1.2122, + "step": 19458 + }, + { + "epoch": 5.795789199352185, + "grad_norm": 0.307682067155838, + "learning_rate": 7.918367191890962e-06, + "loss": 1.2315, + "step": 19459 + }, + { + "epoch": 5.796087045551853, + "grad_norm": 0.29961058497428894, + "learning_rate": 7.917423764914385e-06, + "loss": 1.2075, + "step": 19460 + }, + { + "epoch": 5.796384891751522, + "grad_norm": 0.31368833780288696, + "learning_rate": 7.916480357313852e-06, + "loss": 1.2183, + "step": 19461 + }, + { + "epoch": 5.7966827379511905, + "grad_norm": 0.27859219908714294, + "learning_rate": 7.915536969098146e-06, + "loss": 1.2106, + "step": 19462 + }, + { + "epoch": 5.796980584150859, + "grad_norm": 0.3880733251571655, + "learning_rate": 7.914593600276035e-06, + "loss": 1.2315, + "step": 19463 + }, + { + "epoch": 5.797278430350528, + "grad_norm": 0.26957470178604126, + "learning_rate": 7.913650250856304e-06, + "loss": 1.2171, + "step": 19464 + }, + { + "epoch": 5.797576276550196, + "grad_norm": 0.5684119462966919, + "learning_rate": 7.912706920847724e-06, + "loss": 1.2159, + "step": 19465 + }, + { + "epoch": 5.797874122749865, + "grad_norm": 0.2869011461734772, + "learning_rate": 7.911763610259076e-06, + "loss": 1.2193, + "step": 19466 + }, + { + "epoch": 5.798171968949534, + "grad_norm": 0.40862399339675903, + "learning_rate": 7.910820319099134e-06, + "loss": 1.2169, + "step": 19467 + }, + { + "epoch": 5.798469815149202, + "grad_norm": 0.26902005076408386, + "learning_rate": 7.909877047376671e-06, + "loss": 1.2057, + "step": 19468 + }, + { + "epoch": 5.798767661348871, + "grad_norm": 0.5130660533905029, + "learning_rate": 7.908933795100473e-06, + "loss": 1.2088, + "step": 19469 + }, + { + "epoch": 5.79906550754854, + "grad_norm": 0.2949145436286926, + "learning_rate": 7.907990562279309e-06, + "loss": 1.2215, + "step": 19470 + }, + { + "epoch": 5.799363353748208, + "grad_norm": 0.4076009690761566, + "learning_rate": 7.90704734892195e-06, + "loss": 1.2172, + "step": 19471 + }, + { + "epoch": 5.799661199947877, + "grad_norm": 0.37305235862731934, + "learning_rate": 7.906104155037183e-06, + "loss": 1.2247, + "step": 19472 + }, + { + "epoch": 5.799959046147546, + "grad_norm": 0.5357394218444824, + "learning_rate": 7.905160980633774e-06, + "loss": 1.2077, + "step": 19473 + }, + { + "epoch": 5.800256892347214, + "grad_norm": 0.3946038484573364, + "learning_rate": 7.904217825720502e-06, + "loss": 1.2238, + "step": 19474 + }, + { + "epoch": 5.800554738546883, + "grad_norm": 0.4440886378288269, + "learning_rate": 7.903274690306143e-06, + "loss": 1.2033, + "step": 19475 + }, + { + "epoch": 5.8008525847465515, + "grad_norm": 0.4744853079319, + "learning_rate": 7.902331574399471e-06, + "loss": 1.2139, + "step": 19476 + }, + { + "epoch": 5.80115043094622, + "grad_norm": 0.30114537477493286, + "learning_rate": 7.901388478009258e-06, + "loss": 1.2301, + "step": 19477 + }, + { + "epoch": 5.801448277145889, + "grad_norm": 0.5893943905830383, + "learning_rate": 7.900445401144282e-06, + "loss": 1.1888, + "step": 19478 + }, + { + "epoch": 5.801746123345557, + "grad_norm": 0.2603563964366913, + "learning_rate": 7.899502343813314e-06, + "loss": 1.2091, + "step": 19479 + }, + { + "epoch": 5.802043969545226, + "grad_norm": 0.5266931056976318, + "learning_rate": 7.898559306025129e-06, + "loss": 1.2035, + "step": 19480 + }, + { + "epoch": 5.802341815744895, + "grad_norm": 0.3926750421524048, + "learning_rate": 7.897616287788506e-06, + "loss": 1.1993, + "step": 19481 + }, + { + "epoch": 5.802639661944563, + "grad_norm": 0.5756117105484009, + "learning_rate": 7.896673289112209e-06, + "loss": 1.1926, + "step": 19482 + }, + { + "epoch": 5.8029375081442325, + "grad_norm": 0.4146365821361542, + "learning_rate": 7.895730310005023e-06, + "loss": 1.215, + "step": 19483 + }, + { + "epoch": 5.803235354343901, + "grad_norm": 0.37578409910202026, + "learning_rate": 7.894787350475712e-06, + "loss": 1.2171, + "step": 19484 + }, + { + "epoch": 5.803533200543569, + "grad_norm": 0.5479852557182312, + "learning_rate": 7.893844410533052e-06, + "loss": 1.2107, + "step": 19485 + }, + { + "epoch": 5.8038310467432375, + "grad_norm": 0.2593201994895935, + "learning_rate": 7.892901490185818e-06, + "loss": 1.2214, + "step": 19486 + }, + { + "epoch": 5.804128892942907, + "grad_norm": 0.6040177345275879, + "learning_rate": 7.891958589442783e-06, + "loss": 1.218, + "step": 19487 + }, + { + "epoch": 5.804426739142575, + "grad_norm": 0.26518529653549194, + "learning_rate": 7.891015708312714e-06, + "loss": 1.2083, + "step": 19488 + }, + { + "epoch": 5.804724585342244, + "grad_norm": 0.42248404026031494, + "learning_rate": 7.89007284680439e-06, + "loss": 1.2098, + "step": 19489 + }, + { + "epoch": 5.8050224315419126, + "grad_norm": 0.3890303075313568, + "learning_rate": 7.889130004926582e-06, + "loss": 1.2243, + "step": 19490 + }, + { + "epoch": 5.805320277741581, + "grad_norm": 0.3540903329849243, + "learning_rate": 7.888187182688057e-06, + "loss": 1.2221, + "step": 19491 + }, + { + "epoch": 5.80561812394125, + "grad_norm": 0.5370272397994995, + "learning_rate": 7.887244380097595e-06, + "loss": 1.2219, + "step": 19492 + }, + { + "epoch": 5.805915970140918, + "grad_norm": 0.3935164213180542, + "learning_rate": 7.886301597163964e-06, + "loss": 1.2015, + "step": 19493 + }, + { + "epoch": 5.806213816340587, + "grad_norm": 0.4174191355705261, + "learning_rate": 7.885358833895931e-06, + "loss": 1.2106, + "step": 19494 + }, + { + "epoch": 5.806511662540256, + "grad_norm": 0.322135865688324, + "learning_rate": 7.884416090302275e-06, + "loss": 1.2237, + "step": 19495 + }, + { + "epoch": 5.806809508739924, + "grad_norm": 0.38452598452568054, + "learning_rate": 7.883473366391761e-06, + "loss": 1.1945, + "step": 19496 + }, + { + "epoch": 5.807107354939593, + "grad_norm": 0.41428133845329285, + "learning_rate": 7.882530662173163e-06, + "loss": 1.2004, + "step": 19497 + }, + { + "epoch": 5.807405201139262, + "grad_norm": 0.2633223235607147, + "learning_rate": 7.881587977655254e-06, + "loss": 1.2087, + "step": 19498 + }, + { + "epoch": 5.80770304733893, + "grad_norm": 0.4220786392688751, + "learning_rate": 7.880645312846796e-06, + "loss": 1.2245, + "step": 19499 + }, + { + "epoch": 5.808000893538599, + "grad_norm": 0.3942488729953766, + "learning_rate": 7.879702667756573e-06, + "loss": 1.2119, + "step": 19500 + }, + { + "epoch": 5.808000893538599, + "eval_loss": 1.3142019510269165, + "eval_runtime": 24.4626, + "eval_samples_per_second": 70.884, + "eval_steps_per_second": 4.456, + "step": 19500 + }, + { + "epoch": 5.808298739738268, + "grad_norm": 0.32796359062194824, + "learning_rate": 7.878760042393346e-06, + "loss": 1.2214, + "step": 19501 + }, + { + "epoch": 5.808596585937936, + "grad_norm": 0.4206756353378296, + "learning_rate": 7.877817436765882e-06, + "loss": 1.2121, + "step": 19502 + }, + { + "epoch": 5.808894432137605, + "grad_norm": 0.30258476734161377, + "learning_rate": 7.876874850882959e-06, + "loss": 1.2147, + "step": 19503 + }, + { + "epoch": 5.809192278337274, + "grad_norm": 0.3711523413658142, + "learning_rate": 7.875932284753345e-06, + "loss": 1.2151, + "step": 19504 + }, + { + "epoch": 5.809490124536942, + "grad_norm": 0.3838064670562744, + "learning_rate": 7.874989738385803e-06, + "loss": 1.2135, + "step": 19505 + }, + { + "epoch": 5.809787970736611, + "grad_norm": 0.34310483932495117, + "learning_rate": 7.874047211789113e-06, + "loss": 1.1999, + "step": 19506 + }, + { + "epoch": 5.8100858169362795, + "grad_norm": 0.2482302486896515, + "learning_rate": 7.873104704972036e-06, + "loss": 1.2241, + "step": 19507 + }, + { + "epoch": 5.810383663135948, + "grad_norm": 0.3790917992591858, + "learning_rate": 7.872162217943343e-06, + "loss": 1.224, + "step": 19508 + }, + { + "epoch": 5.810681509335617, + "grad_norm": 0.34777647256851196, + "learning_rate": 7.871219750711803e-06, + "loss": 1.2231, + "step": 19509 + }, + { + "epoch": 5.810979355535285, + "grad_norm": 0.3179490864276886, + "learning_rate": 7.870277303286186e-06, + "loss": 1.2188, + "step": 19510 + }, + { + "epoch": 5.8112772017349545, + "grad_norm": 0.2821601331233978, + "learning_rate": 7.869334875675257e-06, + "loss": 1.209, + "step": 19511 + }, + { + "epoch": 5.811575047934623, + "grad_norm": 0.2729527950286865, + "learning_rate": 7.86839246788779e-06, + "loss": 1.2022, + "step": 19512 + }, + { + "epoch": 5.811872894134291, + "grad_norm": 0.29866573214530945, + "learning_rate": 7.867450079932548e-06, + "loss": 1.225, + "step": 19513 + }, + { + "epoch": 5.81217074033396, + "grad_norm": 0.29793581366539, + "learning_rate": 7.866507711818299e-06, + "loss": 1.2237, + "step": 19514 + }, + { + "epoch": 5.812468586533629, + "grad_norm": 0.26419177651405334, + "learning_rate": 7.865565363553813e-06, + "loss": 1.2181, + "step": 19515 + }, + { + "epoch": 5.812766432733297, + "grad_norm": 0.5411391854286194, + "learning_rate": 7.864623035147853e-06, + "loss": 1.2193, + "step": 19516 + }, + { + "epoch": 5.813064278932966, + "grad_norm": 0.32239797711372375, + "learning_rate": 7.863680726609197e-06, + "loss": 1.2162, + "step": 19517 + }, + { + "epoch": 5.813362125132635, + "grad_norm": 0.37849095463752747, + "learning_rate": 7.862738437946602e-06, + "loss": 1.2094, + "step": 19518 + }, + { + "epoch": 5.813659971332303, + "grad_norm": 0.27907320857048035, + "learning_rate": 7.861796169168836e-06, + "loss": 1.225, + "step": 19519 + }, + { + "epoch": 5.813957817531972, + "grad_norm": 0.38774049282073975, + "learning_rate": 7.860853920284668e-06, + "loss": 1.2262, + "step": 19520 + }, + { + "epoch": 5.8142556637316405, + "grad_norm": 0.27382105588912964, + "learning_rate": 7.859911691302868e-06, + "loss": 1.2235, + "step": 19521 + }, + { + "epoch": 5.81455350993131, + "grad_norm": 0.4559738337993622, + "learning_rate": 7.858969482232193e-06, + "loss": 1.2217, + "step": 19522 + }, + { + "epoch": 5.814851356130978, + "grad_norm": 0.363189160823822, + "learning_rate": 7.85802729308142e-06, + "loss": 1.1912, + "step": 19523 + }, + { + "epoch": 5.815149202330646, + "grad_norm": 0.6832521557807922, + "learning_rate": 7.857085123859309e-06, + "loss": 1.2023, + "step": 19524 + }, + { + "epoch": 5.815447048530315, + "grad_norm": 0.42572712898254395, + "learning_rate": 7.856142974574624e-06, + "loss": 1.2209, + "step": 19525 + }, + { + "epoch": 5.815744894729984, + "grad_norm": 0.6698419451713562, + "learning_rate": 7.855200845236135e-06, + "loss": 1.2232, + "step": 19526 + }, + { + "epoch": 5.816042740929652, + "grad_norm": 0.48179465532302856, + "learning_rate": 7.854258735852608e-06, + "loss": 1.2204, + "step": 19527 + }, + { + "epoch": 5.8163405871293214, + "grad_norm": 0.5433464646339417, + "learning_rate": 7.853316646432801e-06, + "loss": 1.2085, + "step": 19528 + }, + { + "epoch": 5.81663843332899, + "grad_norm": 0.41237905621528625, + "learning_rate": 7.85237457698549e-06, + "loss": 1.215, + "step": 19529 + }, + { + "epoch": 5.816936279528658, + "grad_norm": 0.30683696269989014, + "learning_rate": 7.851432527519428e-06, + "loss": 1.2046, + "step": 19530 + }, + { + "epoch": 5.817234125728327, + "grad_norm": 0.5190821886062622, + "learning_rate": 7.850490498043389e-06, + "loss": 1.1982, + "step": 19531 + }, + { + "epoch": 5.817531971927996, + "grad_norm": 0.29013538360595703, + "learning_rate": 7.849548488566134e-06, + "loss": 1.2062, + "step": 19532 + }, + { + "epoch": 5.817829818127664, + "grad_norm": 0.47048938274383545, + "learning_rate": 7.848606499096427e-06, + "loss": 1.2164, + "step": 19533 + }, + { + "epoch": 5.818127664327333, + "grad_norm": 0.39608705043792725, + "learning_rate": 7.847664529643035e-06, + "loss": 1.2188, + "step": 19534 + }, + { + "epoch": 5.8184255105270015, + "grad_norm": 0.48446598649024963, + "learning_rate": 7.84672258021472e-06, + "loss": 1.2065, + "step": 19535 + }, + { + "epoch": 5.81872335672667, + "grad_norm": 0.4329598546028137, + "learning_rate": 7.845780650820242e-06, + "loss": 1.1901, + "step": 19536 + }, + { + "epoch": 5.819021202926339, + "grad_norm": 0.3019412159919739, + "learning_rate": 7.844838741468372e-06, + "loss": 1.2078, + "step": 19537 + }, + { + "epoch": 5.819319049126007, + "grad_norm": 0.40126192569732666, + "learning_rate": 7.843896852167868e-06, + "loss": 1.217, + "step": 19538 + }, + { + "epoch": 5.819616895325677, + "grad_norm": 0.4770059287548065, + "learning_rate": 7.842954982927493e-06, + "loss": 1.2181, + "step": 19539 + }, + { + "epoch": 5.819914741525345, + "grad_norm": 0.29250651597976685, + "learning_rate": 7.842013133756014e-06, + "loss": 1.1887, + "step": 19540 + }, + { + "epoch": 5.820212587725013, + "grad_norm": 0.7883771657943726, + "learning_rate": 7.841071304662194e-06, + "loss": 1.21, + "step": 19541 + }, + { + "epoch": 5.8205104339246825, + "grad_norm": 0.258577436208725, + "learning_rate": 7.840129495654789e-06, + "loss": 1.2065, + "step": 19542 + }, + { + "epoch": 5.820808280124351, + "grad_norm": 0.4956992566585541, + "learning_rate": 7.839187706742569e-06, + "loss": 1.2114, + "step": 19543 + }, + { + "epoch": 5.821106126324019, + "grad_norm": 0.4127381443977356, + "learning_rate": 7.838245937934292e-06, + "loss": 1.2121, + "step": 19544 + }, + { + "epoch": 5.821403972523688, + "grad_norm": 0.33163201808929443, + "learning_rate": 7.837304189238719e-06, + "loss": 1.2069, + "step": 19545 + }, + { + "epoch": 5.821701818723357, + "grad_norm": 0.3702181875705719, + "learning_rate": 7.836362460664619e-06, + "loss": 1.2206, + "step": 19546 + }, + { + "epoch": 5.821999664923025, + "grad_norm": 0.4005001485347748, + "learning_rate": 7.835420752220744e-06, + "loss": 1.2168, + "step": 19547 + }, + { + "epoch": 5.822297511122694, + "grad_norm": 0.31537196040153503, + "learning_rate": 7.834479063915865e-06, + "loss": 1.2047, + "step": 19548 + }, + { + "epoch": 5.8225953573223626, + "grad_norm": 0.3628023862838745, + "learning_rate": 7.833537395758736e-06, + "loss": 1.199, + "step": 19549 + }, + { + "epoch": 5.822893203522032, + "grad_norm": 0.33250901103019714, + "learning_rate": 7.832595747758121e-06, + "loss": 1.2091, + "step": 19550 + }, + { + "epoch": 5.8231910497217, + "grad_norm": 0.361563116312027, + "learning_rate": 7.831654119922783e-06, + "loss": 1.2159, + "step": 19551 + }, + { + "epoch": 5.823488895921368, + "grad_norm": 0.3284603953361511, + "learning_rate": 7.83071251226148e-06, + "loss": 1.2137, + "step": 19552 + }, + { + "epoch": 5.823786742121037, + "grad_norm": 0.28391581773757935, + "learning_rate": 7.829770924782971e-06, + "loss": 1.208, + "step": 19553 + }, + { + "epoch": 5.824084588320706, + "grad_norm": 0.4764794707298279, + "learning_rate": 7.828829357496023e-06, + "loss": 1.2304, + "step": 19554 + }, + { + "epoch": 5.824382434520374, + "grad_norm": 0.25552383065223694, + "learning_rate": 7.827887810409388e-06, + "loss": 1.2166, + "step": 19555 + }, + { + "epoch": 5.8246802807200435, + "grad_norm": 0.4202573597431183, + "learning_rate": 7.826946283531831e-06, + "loss": 1.1973, + "step": 19556 + }, + { + "epoch": 5.824978126919712, + "grad_norm": 0.31306782364845276, + "learning_rate": 7.826004776872114e-06, + "loss": 1.2175, + "step": 19557 + }, + { + "epoch": 5.82527597311938, + "grad_norm": 0.3671114146709442, + "learning_rate": 7.825063290438993e-06, + "loss": 1.2223, + "step": 19558 + }, + { + "epoch": 5.825573819319049, + "grad_norm": 0.4436177611351013, + "learning_rate": 7.824121824241225e-06, + "loss": 1.2199, + "step": 19559 + }, + { + "epoch": 5.825871665518718, + "grad_norm": 0.6168495416641235, + "learning_rate": 7.823180378287576e-06, + "loss": 1.2144, + "step": 19560 + }, + { + "epoch": 5.826169511718386, + "grad_norm": 0.2953750193119049, + "learning_rate": 7.822238952586798e-06, + "loss": 1.2196, + "step": 19561 + }, + { + "epoch": 5.826467357918055, + "grad_norm": 0.4011472463607788, + "learning_rate": 7.821297547147652e-06, + "loss": 1.2033, + "step": 19562 + }, + { + "epoch": 5.826765204117724, + "grad_norm": 0.3375413417816162, + "learning_rate": 7.820356161978904e-06, + "loss": 1.2148, + "step": 19563 + }, + { + "epoch": 5.827063050317392, + "grad_norm": 0.30218881368637085, + "learning_rate": 7.819414797089301e-06, + "loss": 1.2278, + "step": 19564 + }, + { + "epoch": 5.827360896517061, + "grad_norm": 0.6413726806640625, + "learning_rate": 7.818473452487612e-06, + "loss": 1.2009, + "step": 19565 + }, + { + "epoch": 5.8276587427167295, + "grad_norm": 0.31588879227638245, + "learning_rate": 7.817532128182587e-06, + "loss": 1.2118, + "step": 19566 + }, + { + "epoch": 5.827956588916399, + "grad_norm": 0.4257681369781494, + "learning_rate": 7.816590824182988e-06, + "loss": 1.2133, + "step": 19567 + }, + { + "epoch": 5.828254435116067, + "grad_norm": 0.35621654987335205, + "learning_rate": 7.815649540497572e-06, + "loss": 1.1961, + "step": 19568 + }, + { + "epoch": 5.828552281315735, + "grad_norm": 0.408843070268631, + "learning_rate": 7.8147082771351e-06, + "loss": 1.2071, + "step": 19569 + }, + { + "epoch": 5.8288501275154045, + "grad_norm": 0.4065593481063843, + "learning_rate": 7.813767034104319e-06, + "loss": 1.2207, + "step": 19570 + }, + { + "epoch": 5.829147973715073, + "grad_norm": 0.27984321117401123, + "learning_rate": 7.812825811413998e-06, + "loss": 1.1934, + "step": 19571 + }, + { + "epoch": 5.829445819914741, + "grad_norm": 0.37439045310020447, + "learning_rate": 7.811884609072888e-06, + "loss": 1.2224, + "step": 19572 + }, + { + "epoch": 5.82974366611441, + "grad_norm": 0.2947523593902588, + "learning_rate": 7.810943427089746e-06, + "loss": 1.1902, + "step": 19573 + }, + { + "epoch": 5.830041512314079, + "grad_norm": 0.3579213619232178, + "learning_rate": 7.810002265473331e-06, + "loss": 1.2041, + "step": 19574 + }, + { + "epoch": 5.830339358513747, + "grad_norm": 0.3616520166397095, + "learning_rate": 7.809061124232399e-06, + "loss": 1.2165, + "step": 19575 + }, + { + "epoch": 5.830637204713416, + "grad_norm": 0.28629428148269653, + "learning_rate": 7.808120003375703e-06, + "loss": 1.1932, + "step": 19576 + }, + { + "epoch": 5.830935050913085, + "grad_norm": 0.42465123534202576, + "learning_rate": 7.807178902912002e-06, + "loss": 1.211, + "step": 19577 + }, + { + "epoch": 5.831232897112754, + "grad_norm": 0.3263692557811737, + "learning_rate": 7.806237822850053e-06, + "loss": 1.2122, + "step": 19578 + }, + { + "epoch": 5.831530743312422, + "grad_norm": 0.6494307518005371, + "learning_rate": 7.805296763198605e-06, + "loss": 1.2217, + "step": 19579 + }, + { + "epoch": 5.8318285895120905, + "grad_norm": 0.28989386558532715, + "learning_rate": 7.804355723966425e-06, + "loss": 1.2212, + "step": 19580 + }, + { + "epoch": 5.83212643571176, + "grad_norm": 0.4110584259033203, + "learning_rate": 7.803414705162256e-06, + "loss": 1.2119, + "step": 19581 + }, + { + "epoch": 5.832424281911428, + "grad_norm": 0.38376542925834656, + "learning_rate": 7.802473706794865e-06, + "loss": 1.2061, + "step": 19582 + }, + { + "epoch": 5.832722128111096, + "grad_norm": 0.3380451798439026, + "learning_rate": 7.801532728873e-06, + "loss": 1.2063, + "step": 19583 + }, + { + "epoch": 5.833019974310766, + "grad_norm": 0.3984583914279938, + "learning_rate": 7.800591771405412e-06, + "loss": 1.1853, + "step": 19584 + }, + { + "epoch": 5.833317820510434, + "grad_norm": 0.2838040888309479, + "learning_rate": 7.799650834400863e-06, + "loss": 1.2087, + "step": 19585 + }, + { + "epoch": 5.833615666710102, + "grad_norm": 0.38319075107574463, + "learning_rate": 7.798709917868107e-06, + "loss": 1.2206, + "step": 19586 + }, + { + "epoch": 5.8339135129097714, + "grad_norm": 0.2788871228694916, + "learning_rate": 7.797769021815891e-06, + "loss": 1.2148, + "step": 19587 + }, + { + "epoch": 5.83421135910944, + "grad_norm": 0.3699237108230591, + "learning_rate": 7.796828146252979e-06, + "loss": 1.221, + "step": 19588 + }, + { + "epoch": 5.834509205309109, + "grad_norm": 0.29569047689437866, + "learning_rate": 7.795887291188118e-06, + "loss": 1.1963, + "step": 19589 + }, + { + "epoch": 5.834807051508777, + "grad_norm": 0.3124069273471832, + "learning_rate": 7.79494645663006e-06, + "loss": 1.2016, + "step": 19590 + }, + { + "epoch": 5.835104897708446, + "grad_norm": 0.2546287775039673, + "learning_rate": 7.794005642587563e-06, + "loss": 1.1898, + "step": 19591 + }, + { + "epoch": 5.835402743908114, + "grad_norm": 0.40595853328704834, + "learning_rate": 7.79306484906938e-06, + "loss": 1.2096, + "step": 19592 + }, + { + "epoch": 5.835700590107783, + "grad_norm": 0.2685578167438507, + "learning_rate": 7.79212407608426e-06, + "loss": 1.2236, + "step": 19593 + }, + { + "epoch": 5.8359984363074515, + "grad_norm": 0.37070587277412415, + "learning_rate": 7.791183323640963e-06, + "loss": 1.2133, + "step": 19594 + }, + { + "epoch": 5.836296282507121, + "grad_norm": 0.2616555690765381, + "learning_rate": 7.790242591748235e-06, + "loss": 1.2164, + "step": 19595 + }, + { + "epoch": 5.836594128706789, + "grad_norm": 0.38597285747528076, + "learning_rate": 7.789301880414832e-06, + "loss": 1.2073, + "step": 19596 + }, + { + "epoch": 5.836891974906457, + "grad_norm": 0.2547573745250702, + "learning_rate": 7.788361189649503e-06, + "loss": 1.2062, + "step": 19597 + }, + { + "epoch": 5.837189821106127, + "grad_norm": 0.3700876533985138, + "learning_rate": 7.787420519461e-06, + "loss": 1.2249, + "step": 19598 + }, + { + "epoch": 5.837487667305795, + "grad_norm": 0.2842254340648651, + "learning_rate": 7.786479869858083e-06, + "loss": 1.2332, + "step": 19599 + }, + { + "epoch": 5.837785513505463, + "grad_norm": 0.2934574782848358, + "learning_rate": 7.785539240849497e-06, + "loss": 1.2046, + "step": 19600 + }, + { + "epoch": 5.8380833597051325, + "grad_norm": 0.30779892206192017, + "learning_rate": 7.784598632443989e-06, + "loss": 1.221, + "step": 19601 + }, + { + "epoch": 5.838381205904801, + "grad_norm": 0.27394330501556396, + "learning_rate": 7.78365804465032e-06, + "loss": 1.2167, + "step": 19602 + }, + { + "epoch": 5.838679052104469, + "grad_norm": 0.3246789574623108, + "learning_rate": 7.782717477477237e-06, + "loss": 1.2125, + "step": 19603 + }, + { + "epoch": 5.838976898304138, + "grad_norm": 0.33262526988983154, + "learning_rate": 7.781776930933487e-06, + "loss": 1.2117, + "step": 19604 + }, + { + "epoch": 5.839274744503807, + "grad_norm": 0.2643575370311737, + "learning_rate": 7.780836405027827e-06, + "loss": 1.2057, + "step": 19605 + }, + { + "epoch": 5.839572590703476, + "grad_norm": 0.26078900694847107, + "learning_rate": 7.779895899769006e-06, + "loss": 1.2248, + "step": 19606 + }, + { + "epoch": 5.839870436903144, + "grad_norm": 0.26408693194389343, + "learning_rate": 7.77895541516577e-06, + "loss": 1.2108, + "step": 19607 + }, + { + "epoch": 5.8401682831028126, + "grad_norm": 0.2535662353038788, + "learning_rate": 7.778014951226874e-06, + "loss": 1.2346, + "step": 19608 + }, + { + "epoch": 5.840466129302482, + "grad_norm": 0.2951176166534424, + "learning_rate": 7.777074507961067e-06, + "loss": 1.2101, + "step": 19609 + }, + { + "epoch": 5.84076397550215, + "grad_norm": 0.26551011204719543, + "learning_rate": 7.776134085377096e-06, + "loss": 1.2071, + "step": 19610 + }, + { + "epoch": 5.841061821701818, + "grad_norm": 0.348847359418869, + "learning_rate": 7.775193683483715e-06, + "loss": 1.2096, + "step": 19611 + }, + { + "epoch": 5.841359667901488, + "grad_norm": 0.27657821774482727, + "learning_rate": 7.774253302289671e-06, + "loss": 1.2165, + "step": 19612 + }, + { + "epoch": 5.841657514101156, + "grad_norm": 0.2629542648792267, + "learning_rate": 7.77331294180371e-06, + "loss": 1.2102, + "step": 19613 + }, + { + "epoch": 5.841955360300824, + "grad_norm": 0.331338107585907, + "learning_rate": 7.772372602034585e-06, + "loss": 1.2122, + "step": 19614 + }, + { + "epoch": 5.8422532065004935, + "grad_norm": 0.3670068681240082, + "learning_rate": 7.771432282991045e-06, + "loss": 1.2259, + "step": 19615 + }, + { + "epoch": 5.842551052700162, + "grad_norm": 0.2694351077079773, + "learning_rate": 7.770491984681839e-06, + "loss": 1.215, + "step": 19616 + }, + { + "epoch": 5.842848898899831, + "grad_norm": 0.2863113582134247, + "learning_rate": 7.769551707115715e-06, + "loss": 1.204, + "step": 19617 + }, + { + "epoch": 5.843146745099499, + "grad_norm": 0.3022710382938385, + "learning_rate": 7.768611450301418e-06, + "loss": 1.2147, + "step": 19618 + }, + { + "epoch": 5.843444591299168, + "grad_norm": 0.3116869032382965, + "learning_rate": 7.767671214247701e-06, + "loss": 1.2067, + "step": 19619 + }, + { + "epoch": 5.843742437498836, + "grad_norm": 0.262445867061615, + "learning_rate": 7.766730998963307e-06, + "loss": 1.205, + "step": 19620 + }, + { + "epoch": 5.844040283698505, + "grad_norm": 0.2657754421234131, + "learning_rate": 7.765790804456982e-06, + "loss": 1.2149, + "step": 19621 + }, + { + "epoch": 5.844338129898174, + "grad_norm": 0.38192668557167053, + "learning_rate": 7.764850630737484e-06, + "loss": 1.2054, + "step": 19622 + }, + { + "epoch": 5.844635976097843, + "grad_norm": 0.3063715994358063, + "learning_rate": 7.763910477813555e-06, + "loss": 1.2116, + "step": 19623 + }, + { + "epoch": 5.844933822297511, + "grad_norm": 0.38201138377189636, + "learning_rate": 7.762970345693934e-06, + "loss": 1.2045, + "step": 19624 + }, + { + "epoch": 5.8452316684971795, + "grad_norm": 0.272349089384079, + "learning_rate": 7.76203023438738e-06, + "loss": 1.2091, + "step": 19625 + }, + { + "epoch": 5.845529514696849, + "grad_norm": 0.6438816785812378, + "learning_rate": 7.761090143902631e-06, + "loss": 1.2123, + "step": 19626 + }, + { + "epoch": 5.845827360896517, + "grad_norm": 0.3567163944244385, + "learning_rate": 7.760150074248435e-06, + "loss": 1.2221, + "step": 19627 + }, + { + "epoch": 5.846125207096185, + "grad_norm": 0.5482204556465149, + "learning_rate": 7.759210025433544e-06, + "loss": 1.2137, + "step": 19628 + }, + { + "epoch": 5.8464230532958545, + "grad_norm": 0.3015652000904083, + "learning_rate": 7.7582699974667e-06, + "loss": 1.2034, + "step": 19629 + }, + { + "epoch": 5.846720899495523, + "grad_norm": 0.3942241668701172, + "learning_rate": 7.757329990356647e-06, + "loss": 1.2083, + "step": 19630 + }, + { + "epoch": 5.847018745695191, + "grad_norm": 0.34201639890670776, + "learning_rate": 7.756390004112133e-06, + "loss": 1.2009, + "step": 19631 + }, + { + "epoch": 5.84731659189486, + "grad_norm": 0.42357155680656433, + "learning_rate": 7.755450038741903e-06, + "loss": 1.2032, + "step": 19632 + }, + { + "epoch": 5.847614438094529, + "grad_norm": 0.4093886911869049, + "learning_rate": 7.754510094254703e-06, + "loss": 1.206, + "step": 19633 + }, + { + "epoch": 5.847912284294198, + "grad_norm": 0.39763838052749634, + "learning_rate": 7.753570170659281e-06, + "loss": 1.237, + "step": 19634 + }, + { + "epoch": 5.848210130493866, + "grad_norm": 0.3132558763027191, + "learning_rate": 7.752630267964373e-06, + "loss": 1.212, + "step": 19635 + }, + { + "epoch": 5.848507976693535, + "grad_norm": 0.2729364335536957, + "learning_rate": 7.751690386178734e-06, + "loss": 1.2014, + "step": 19636 + }, + { + "epoch": 5.848805822893204, + "grad_norm": 0.32816725969314575, + "learning_rate": 7.750750525311103e-06, + "loss": 1.2119, + "step": 19637 + }, + { + "epoch": 5.849103669092872, + "grad_norm": 0.25775423645973206, + "learning_rate": 7.749810685370223e-06, + "loss": 1.2138, + "step": 19638 + }, + { + "epoch": 5.8494015152925405, + "grad_norm": 0.3045441508293152, + "learning_rate": 7.748870866364844e-06, + "loss": 1.2212, + "step": 19639 + }, + { + "epoch": 5.84969936149221, + "grad_norm": 0.2741446793079376, + "learning_rate": 7.747931068303707e-06, + "loss": 1.2104, + "step": 19640 + }, + { + "epoch": 5.849997207691878, + "grad_norm": 0.2722054123878479, + "learning_rate": 7.74699129119555e-06, + "loss": 1.2135, + "step": 19641 + }, + { + "epoch": 5.850295053891546, + "grad_norm": 0.26878291368484497, + "learning_rate": 7.746051535049127e-06, + "loss": 1.2002, + "step": 19642 + }, + { + "epoch": 5.850592900091216, + "grad_norm": 0.3480394780635834, + "learning_rate": 7.745111799873173e-06, + "loss": 1.2237, + "step": 19643 + }, + { + "epoch": 5.850890746290884, + "grad_norm": 0.4168858528137207, + "learning_rate": 7.744172085676433e-06, + "loss": 1.1948, + "step": 19644 + }, + { + "epoch": 5.851188592490553, + "grad_norm": 0.26998400688171387, + "learning_rate": 7.743232392467657e-06, + "loss": 1.2154, + "step": 19645 + }, + { + "epoch": 5.8514864386902214, + "grad_norm": 0.2897645831108093, + "learning_rate": 7.742292720255576e-06, + "loss": 1.2206, + "step": 19646 + }, + { + "epoch": 5.85178428488989, + "grad_norm": 0.29250776767730713, + "learning_rate": 7.741353069048945e-06, + "loss": 1.2305, + "step": 19647 + }, + { + "epoch": 5.852082131089559, + "grad_norm": 0.2947573661804199, + "learning_rate": 7.740413438856498e-06, + "loss": 1.2079, + "step": 19648 + }, + { + "epoch": 5.852379977289227, + "grad_norm": 0.2997695207595825, + "learning_rate": 7.739473829686977e-06, + "loss": 1.2069, + "step": 19649 + }, + { + "epoch": 5.852677823488896, + "grad_norm": 0.2877860367298126, + "learning_rate": 7.738534241549128e-06, + "loss": 1.2138, + "step": 19650 + }, + { + "epoch": 5.852975669688565, + "grad_norm": 0.3143649995326996, + "learning_rate": 7.737594674451692e-06, + "loss": 1.2203, + "step": 19651 + }, + { + "epoch": 5.853273515888233, + "grad_norm": 0.2758687138557434, + "learning_rate": 7.736655128403407e-06, + "loss": 1.2242, + "step": 19652 + }, + { + "epoch": 5.8535713620879015, + "grad_norm": 0.2680954039096832, + "learning_rate": 7.735715603413021e-06, + "loss": 1.1969, + "step": 19653 + }, + { + "epoch": 5.853869208287571, + "grad_norm": 0.28473687171936035, + "learning_rate": 7.734776099489268e-06, + "loss": 1.2199, + "step": 19654 + }, + { + "epoch": 5.854167054487239, + "grad_norm": 0.30702653527259827, + "learning_rate": 7.733836616640896e-06, + "loss": 1.2171, + "step": 19655 + }, + { + "epoch": 5.854464900686908, + "grad_norm": 0.3919333219528198, + "learning_rate": 7.732897154876639e-06, + "loss": 1.2042, + "step": 19656 + }, + { + "epoch": 5.854762746886577, + "grad_norm": 0.29652485251426697, + "learning_rate": 7.731957714205244e-06, + "loss": 1.2252, + "step": 19657 + }, + { + "epoch": 5.855060593086245, + "grad_norm": 0.36527273058891296, + "learning_rate": 7.731018294635446e-06, + "loss": 1.2118, + "step": 19658 + }, + { + "epoch": 5.855358439285913, + "grad_norm": 0.27338355779647827, + "learning_rate": 7.73007889617599e-06, + "loss": 1.1971, + "step": 19659 + }, + { + "epoch": 5.8556562854855825, + "grad_norm": 0.26514914631843567, + "learning_rate": 7.729139518835612e-06, + "loss": 1.2009, + "step": 19660 + }, + { + "epoch": 5.855954131685251, + "grad_norm": 0.2890794575214386, + "learning_rate": 7.728200162623052e-06, + "loss": 1.2105, + "step": 19661 + }, + { + "epoch": 5.85625197788492, + "grad_norm": 0.2984280586242676, + "learning_rate": 7.727260827547055e-06, + "loss": 1.2434, + "step": 19662 + }, + { + "epoch": 5.856549824084588, + "grad_norm": 0.2639959454536438, + "learning_rate": 7.726321513616352e-06, + "loss": 1.2058, + "step": 19663 + }, + { + "epoch": 5.856847670284257, + "grad_norm": 0.2727898061275482, + "learning_rate": 7.725382220839693e-06, + "loss": 1.2095, + "step": 19664 + }, + { + "epoch": 5.857145516483926, + "grad_norm": 0.31113842129707336, + "learning_rate": 7.72444294922581e-06, + "loss": 1.2144, + "step": 19665 + }, + { + "epoch": 5.857443362683594, + "grad_norm": 0.3023146986961365, + "learning_rate": 7.723503698783438e-06, + "loss": 1.2103, + "step": 19666 + }, + { + "epoch": 5.8577412088832626, + "grad_norm": 0.2729348838329315, + "learning_rate": 7.722564469521324e-06, + "loss": 1.2076, + "step": 19667 + }, + { + "epoch": 5.858039055082932, + "grad_norm": 0.29502829909324646, + "learning_rate": 7.721625261448206e-06, + "loss": 1.1948, + "step": 19668 + }, + { + "epoch": 5.8583369012826, + "grad_norm": 0.5774202346801758, + "learning_rate": 7.720686074572813e-06, + "loss": 1.2138, + "step": 19669 + }, + { + "epoch": 5.858634747482268, + "grad_norm": 0.3133317530155182, + "learning_rate": 7.719746908903895e-06, + "loss": 1.2005, + "step": 19670 + }, + { + "epoch": 5.858932593681938, + "grad_norm": 0.4840753376483917, + "learning_rate": 7.718807764450185e-06, + "loss": 1.2186, + "step": 19671 + }, + { + "epoch": 5.859230439881606, + "grad_norm": 0.30928316712379456, + "learning_rate": 7.717868641220414e-06, + "loss": 1.2075, + "step": 19672 + }, + { + "epoch": 5.859528286081275, + "grad_norm": 0.5464794039726257, + "learning_rate": 7.716929539223329e-06, + "loss": 1.2111, + "step": 19673 + }, + { + "epoch": 5.8598261322809435, + "grad_norm": 0.4610711336135864, + "learning_rate": 7.715990458467667e-06, + "loss": 1.2153, + "step": 19674 + }, + { + "epoch": 5.860123978480612, + "grad_norm": 0.4352073073387146, + "learning_rate": 7.715051398962155e-06, + "loss": 1.2133, + "step": 19675 + }, + { + "epoch": 5.860421824680281, + "grad_norm": 0.37265628576278687, + "learning_rate": 7.714112360715542e-06, + "loss": 1.2327, + "step": 19676 + }, + { + "epoch": 5.860719670879949, + "grad_norm": 0.3591066598892212, + "learning_rate": 7.713173343736557e-06, + "loss": 1.2159, + "step": 19677 + }, + { + "epoch": 5.861017517079618, + "grad_norm": 0.29050585627555847, + "learning_rate": 7.71223434803394e-06, + "loss": 1.1929, + "step": 19678 + }, + { + "epoch": 5.861315363279287, + "grad_norm": 0.37478896975517273, + "learning_rate": 7.711295373616426e-06, + "loss": 1.2006, + "step": 19679 + }, + { + "epoch": 5.861613209478955, + "grad_norm": 0.338222473859787, + "learning_rate": 7.71035642049275e-06, + "loss": 1.2158, + "step": 19680 + }, + { + "epoch": 5.861911055678624, + "grad_norm": 0.27898043394088745, + "learning_rate": 7.709417488671653e-06, + "loss": 1.2014, + "step": 19681 + }, + { + "epoch": 5.862208901878293, + "grad_norm": 0.2537229359149933, + "learning_rate": 7.708478578161866e-06, + "loss": 1.2084, + "step": 19682 + }, + { + "epoch": 5.862506748077961, + "grad_norm": 0.28727245330810547, + "learning_rate": 7.707539688972124e-06, + "loss": 1.2165, + "step": 19683 + }, + { + "epoch": 5.86280459427763, + "grad_norm": 0.26053178310394287, + "learning_rate": 7.706600821111166e-06, + "loss": 1.2284, + "step": 19684 + }, + { + "epoch": 5.863102440477299, + "grad_norm": 0.28430110216140747, + "learning_rate": 7.705661974587723e-06, + "loss": 1.2194, + "step": 19685 + }, + { + "epoch": 5.863400286676967, + "grad_norm": 0.26683878898620605, + "learning_rate": 7.70472314941053e-06, + "loss": 1.2252, + "step": 19686 + }, + { + "epoch": 5.863698132876636, + "grad_norm": 0.33253082633018494, + "learning_rate": 7.70378434558833e-06, + "loss": 1.2143, + "step": 19687 + }, + { + "epoch": 5.8639959790763045, + "grad_norm": 0.2606544494628906, + "learning_rate": 7.702845563129848e-06, + "loss": 1.2307, + "step": 19688 + }, + { + "epoch": 5.864293825275973, + "grad_norm": 0.2861187160015106, + "learning_rate": 7.701906802043819e-06, + "loss": 1.2088, + "step": 19689 + }, + { + "epoch": 5.864591671475642, + "grad_norm": 0.2976442277431488, + "learning_rate": 7.700968062338981e-06, + "loss": 1.2102, + "step": 19690 + }, + { + "epoch": 5.86488951767531, + "grad_norm": 0.2559099793434143, + "learning_rate": 7.70002934402407e-06, + "loss": 1.1972, + "step": 19691 + }, + { + "epoch": 5.865187363874979, + "grad_norm": 0.4708477258682251, + "learning_rate": 7.699090647107811e-06, + "loss": 1.2245, + "step": 19692 + }, + { + "epoch": 5.865485210074648, + "grad_norm": 0.3552057445049286, + "learning_rate": 7.698151971598947e-06, + "loss": 1.2135, + "step": 19693 + }, + { + "epoch": 5.865783056274316, + "grad_norm": 0.4166930615901947, + "learning_rate": 7.697213317506208e-06, + "loss": 1.1927, + "step": 19694 + }, + { + "epoch": 5.866080902473985, + "grad_norm": 0.2781657874584198, + "learning_rate": 7.69627468483832e-06, + "loss": 1.1973, + "step": 19695 + }, + { + "epoch": 5.866378748673654, + "grad_norm": 0.6836292743682861, + "learning_rate": 7.695336073604027e-06, + "loss": 1.2058, + "step": 19696 + }, + { + "epoch": 5.866676594873322, + "grad_norm": 0.4408610761165619, + "learning_rate": 7.694397483812052e-06, + "loss": 1.1983, + "step": 19697 + }, + { + "epoch": 5.8669744410729905, + "grad_norm": 0.5116508603096008, + "learning_rate": 7.69345891547114e-06, + "loss": 1.2179, + "step": 19698 + }, + { + "epoch": 5.86727228727266, + "grad_norm": 0.2547590732574463, + "learning_rate": 7.692520368590013e-06, + "loss": 1.2052, + "step": 19699 + }, + { + "epoch": 5.867570133472328, + "grad_norm": 0.7557486295700073, + "learning_rate": 7.691581843177403e-06, + "loss": 1.2109, + "step": 19700 + }, + { + "epoch": 5.867867979671997, + "grad_norm": 0.29969343543052673, + "learning_rate": 7.690643339242048e-06, + "loss": 1.2232, + "step": 19701 + }, + { + "epoch": 5.868165825871666, + "grad_norm": 0.40185675024986267, + "learning_rate": 7.689704856792674e-06, + "loss": 1.2019, + "step": 19702 + }, + { + "epoch": 5.868463672071334, + "grad_norm": 0.31418320536613464, + "learning_rate": 7.688766395838016e-06, + "loss": 1.2, + "step": 19703 + }, + { + "epoch": 5.868761518271003, + "grad_norm": 0.3252184987068176, + "learning_rate": 7.687827956386807e-06, + "loss": 1.2026, + "step": 19704 + }, + { + "epoch": 5.8690593644706714, + "grad_norm": 0.4650043249130249, + "learning_rate": 7.686889538447775e-06, + "loss": 1.2149, + "step": 19705 + }, + { + "epoch": 5.86935721067034, + "grad_norm": 0.26629945635795593, + "learning_rate": 7.685951142029646e-06, + "loss": 1.2091, + "step": 19706 + }, + { + "epoch": 5.869655056870009, + "grad_norm": 0.5208491086959839, + "learning_rate": 7.685012767141162e-06, + "loss": 1.2003, + "step": 19707 + }, + { + "epoch": 5.869952903069677, + "grad_norm": 0.33644038438796997, + "learning_rate": 7.684074413791047e-06, + "loss": 1.2144, + "step": 19708 + }, + { + "epoch": 5.870250749269346, + "grad_norm": 0.4062998592853546, + "learning_rate": 7.683136081988027e-06, + "loss": 1.2199, + "step": 19709 + }, + { + "epoch": 5.870548595469015, + "grad_norm": 0.4381130039691925, + "learning_rate": 7.682197771740843e-06, + "loss": 1.2065, + "step": 19710 + }, + { + "epoch": 5.870846441668683, + "grad_norm": 0.25779780745506287, + "learning_rate": 7.68125948305822e-06, + "loss": 1.2159, + "step": 19711 + }, + { + "epoch": 5.871144287868352, + "grad_norm": 0.4768553376197815, + "learning_rate": 7.680321215948882e-06, + "loss": 1.2089, + "step": 19712 + }, + { + "epoch": 5.871442134068021, + "grad_norm": 0.302226185798645, + "learning_rate": 7.679382970421565e-06, + "loss": 1.2161, + "step": 19713 + }, + { + "epoch": 5.871739980267689, + "grad_norm": 0.3917502164840698, + "learning_rate": 7.678444746484996e-06, + "loss": 1.2202, + "step": 19714 + }, + { + "epoch": 5.872037826467358, + "grad_norm": 0.37382709980010986, + "learning_rate": 7.677506544147906e-06, + "loss": 1.2126, + "step": 19715 + }, + { + "epoch": 5.872335672667027, + "grad_norm": 0.28242412209510803, + "learning_rate": 7.676568363419026e-06, + "loss": 1.2133, + "step": 19716 + }, + { + "epoch": 5.872633518866695, + "grad_norm": 0.334397554397583, + "learning_rate": 7.675630204307075e-06, + "loss": 1.2013, + "step": 19717 + }, + { + "epoch": 5.872931365066364, + "grad_norm": 0.27919960021972656, + "learning_rate": 7.674692066820794e-06, + "loss": 1.2248, + "step": 19718 + }, + { + "epoch": 5.8732292112660325, + "grad_norm": 0.310494601726532, + "learning_rate": 7.673753950968902e-06, + "loss": 1.2175, + "step": 19719 + }, + { + "epoch": 5.873527057465701, + "grad_norm": 0.30901026725769043, + "learning_rate": 7.672815856760131e-06, + "loss": 1.2174, + "step": 19720 + }, + { + "epoch": 5.87382490366537, + "grad_norm": 0.2734905779361725, + "learning_rate": 7.671877784203208e-06, + "loss": 1.2095, + "step": 19721 + }, + { + "epoch": 5.874122749865038, + "grad_norm": 0.2951793074607849, + "learning_rate": 7.670939733306863e-06, + "loss": 1.2031, + "step": 19722 + }, + { + "epoch": 5.874420596064708, + "grad_norm": 0.2693183720111847, + "learning_rate": 7.670001704079817e-06, + "loss": 1.2172, + "step": 19723 + }, + { + "epoch": 5.874718442264376, + "grad_norm": 0.28688564896583557, + "learning_rate": 7.669063696530808e-06, + "loss": 1.2089, + "step": 19724 + }, + { + "epoch": 5.875016288464044, + "grad_norm": 0.2883380055427551, + "learning_rate": 7.668125710668554e-06, + "loss": 1.2284, + "step": 19725 + }, + { + "epoch": 5.8753141346637126, + "grad_norm": 0.35950028896331787, + "learning_rate": 7.667187746501783e-06, + "loss": 1.1866, + "step": 19726 + }, + { + "epoch": 5.875611980863382, + "grad_norm": 0.2990448474884033, + "learning_rate": 7.666249804039227e-06, + "loss": 1.2333, + "step": 19727 + }, + { + "epoch": 5.87590982706305, + "grad_norm": 0.40367069840431213, + "learning_rate": 7.665311883289611e-06, + "loss": 1.2208, + "step": 19728 + }, + { + "epoch": 5.876207673262719, + "grad_norm": 0.2846141755580902, + "learning_rate": 7.664373984261653e-06, + "loss": 1.2303, + "step": 19729 + }, + { + "epoch": 5.876505519462388, + "grad_norm": 0.4116760790348053, + "learning_rate": 7.66343610696409e-06, + "loss": 1.2218, + "step": 19730 + }, + { + "epoch": 5.876803365662056, + "grad_norm": 0.27822595834732056, + "learning_rate": 7.66249825140564e-06, + "loss": 1.2121, + "step": 19731 + }, + { + "epoch": 5.877101211861725, + "grad_norm": 0.3134772777557373, + "learning_rate": 7.661560417595034e-06, + "loss": 1.2094, + "step": 19732 + }, + { + "epoch": 5.8773990580613935, + "grad_norm": 0.2531176805496216, + "learning_rate": 7.660622605540999e-06, + "loss": 1.2039, + "step": 19733 + }, + { + "epoch": 5.877696904261062, + "grad_norm": 0.30348435044288635, + "learning_rate": 7.65968481525225e-06, + "loss": 1.2014, + "step": 19734 + }, + { + "epoch": 5.877994750460731, + "grad_norm": 0.26167163252830505, + "learning_rate": 7.658747046737525e-06, + "loss": 1.2089, + "step": 19735 + }, + { + "epoch": 5.878292596660399, + "grad_norm": 0.2847365438938141, + "learning_rate": 7.657809300005544e-06, + "loss": 1.2203, + "step": 19736 + }, + { + "epoch": 5.878590442860068, + "grad_norm": 0.3568847179412842, + "learning_rate": 7.656871575065025e-06, + "loss": 1.2316, + "step": 19737 + }, + { + "epoch": 5.878888289059737, + "grad_norm": 0.34484249353408813, + "learning_rate": 7.6559338719247e-06, + "loss": 1.2234, + "step": 19738 + }, + { + "epoch": 5.879186135259405, + "grad_norm": 0.40856823325157166, + "learning_rate": 7.654996190593295e-06, + "loss": 1.1945, + "step": 19739 + }, + { + "epoch": 5.8794839814590745, + "grad_norm": 0.3517288863658905, + "learning_rate": 7.654058531079524e-06, + "loss": 1.2309, + "step": 19740 + }, + { + "epoch": 5.879781827658743, + "grad_norm": 0.40801236033439636, + "learning_rate": 7.653120893392124e-06, + "loss": 1.2108, + "step": 19741 + }, + { + "epoch": 5.880079673858411, + "grad_norm": 0.2814236581325531, + "learning_rate": 7.65218327753981e-06, + "loss": 1.2134, + "step": 19742 + }, + { + "epoch": 5.88037752005808, + "grad_norm": 0.41062241792678833, + "learning_rate": 7.651245683531304e-06, + "loss": 1.207, + "step": 19743 + }, + { + "epoch": 5.880675366257749, + "grad_norm": 0.27050167322158813, + "learning_rate": 7.650308111375334e-06, + "loss": 1.2226, + "step": 19744 + }, + { + "epoch": 5.880973212457417, + "grad_norm": 0.31314024329185486, + "learning_rate": 7.649370561080628e-06, + "loss": 1.2091, + "step": 19745 + }, + { + "epoch": 5.881271058657086, + "grad_norm": 0.2666119337081909, + "learning_rate": 7.648433032655893e-06, + "loss": 1.1988, + "step": 19746 + }, + { + "epoch": 5.8815689048567545, + "grad_norm": 0.3014333248138428, + "learning_rate": 7.647495526109869e-06, + "loss": 1.2095, + "step": 19747 + }, + { + "epoch": 5.881866751056423, + "grad_norm": 0.2822328805923462, + "learning_rate": 7.646558041451264e-06, + "loss": 1.1923, + "step": 19748 + }, + { + "epoch": 5.882164597256092, + "grad_norm": 0.32284829020500183, + "learning_rate": 7.64562057868881e-06, + "loss": 1.2149, + "step": 19749 + }, + { + "epoch": 5.88246244345576, + "grad_norm": 0.27972474694252014, + "learning_rate": 7.644683137831229e-06, + "loss": 1.2031, + "step": 19750 + }, + { + "epoch": 5.88276028965543, + "grad_norm": 0.28591328859329224, + "learning_rate": 7.643745718887236e-06, + "loss": 1.2073, + "step": 19751 + }, + { + "epoch": 5.883058135855098, + "grad_norm": 0.2891368567943573, + "learning_rate": 7.64280832186556e-06, + "loss": 1.195, + "step": 19752 + }, + { + "epoch": 5.883355982054766, + "grad_norm": 0.2828793227672577, + "learning_rate": 7.641870946774919e-06, + "loss": 1.2132, + "step": 19753 + }, + { + "epoch": 5.8836538282544355, + "grad_norm": 0.3005000948905945, + "learning_rate": 7.640933593624029e-06, + "loss": 1.2077, + "step": 19754 + }, + { + "epoch": 5.883951674454104, + "grad_norm": 0.26643726229667664, + "learning_rate": 7.639996262421619e-06, + "loss": 1.2157, + "step": 19755 + }, + { + "epoch": 5.884249520653772, + "grad_norm": 0.3420386016368866, + "learning_rate": 7.63905895317641e-06, + "loss": 1.2136, + "step": 19756 + }, + { + "epoch": 5.884547366853441, + "grad_norm": 0.29570236802101135, + "learning_rate": 7.638121665897115e-06, + "loss": 1.2052, + "step": 19757 + }, + { + "epoch": 5.88484521305311, + "grad_norm": 0.33349350094795227, + "learning_rate": 7.637184400592462e-06, + "loss": 1.206, + "step": 19758 + }, + { + "epoch": 5.885143059252778, + "grad_norm": 0.36793774366378784, + "learning_rate": 7.63624715727117e-06, + "loss": 1.2179, + "step": 19759 + }, + { + "epoch": 5.885440905452447, + "grad_norm": 0.2907385528087616, + "learning_rate": 7.635309935941953e-06, + "loss": 1.2136, + "step": 19760 + }, + { + "epoch": 5.885738751652116, + "grad_norm": 0.28459247946739197, + "learning_rate": 7.634372736613535e-06, + "loss": 1.1965, + "step": 19761 + }, + { + "epoch": 5.886036597851784, + "grad_norm": 0.31736648082733154, + "learning_rate": 7.633435559294635e-06, + "loss": 1.2093, + "step": 19762 + }, + { + "epoch": 5.886334444051453, + "grad_norm": 0.280977725982666, + "learning_rate": 7.632498403993978e-06, + "loss": 1.2093, + "step": 19763 + }, + { + "epoch": 5.8866322902511214, + "grad_norm": 0.27929916977882385, + "learning_rate": 7.631561270720278e-06, + "loss": 1.2057, + "step": 19764 + }, + { + "epoch": 5.88693013645079, + "grad_norm": 0.2878957986831665, + "learning_rate": 7.63062415948225e-06, + "loss": 1.1996, + "step": 19765 + }, + { + "epoch": 5.887227982650459, + "grad_norm": 0.2803519666194916, + "learning_rate": 7.629687070288619e-06, + "loss": 1.2138, + "step": 19766 + }, + { + "epoch": 5.887525828850127, + "grad_norm": 0.3282645046710968, + "learning_rate": 7.628750003148102e-06, + "loss": 1.2081, + "step": 19767 + }, + { + "epoch": 5.8878236750497965, + "grad_norm": 0.2690000534057617, + "learning_rate": 7.627812958069414e-06, + "loss": 1.2241, + "step": 19768 + }, + { + "epoch": 5.888121521249465, + "grad_norm": 0.30320388078689575, + "learning_rate": 7.62687593506128e-06, + "loss": 1.2101, + "step": 19769 + }, + { + "epoch": 5.888419367449133, + "grad_norm": 0.33145949244499207, + "learning_rate": 7.625938934132414e-06, + "loss": 1.2162, + "step": 19770 + }, + { + "epoch": 5.888717213648802, + "grad_norm": 0.3910709619522095, + "learning_rate": 7.62500195529153e-06, + "loss": 1.2086, + "step": 19771 + }, + { + "epoch": 5.889015059848471, + "grad_norm": 0.28167808055877686, + "learning_rate": 7.624064998547352e-06, + "loss": 1.2151, + "step": 19772 + }, + { + "epoch": 5.889312906048139, + "grad_norm": 0.2926369309425354, + "learning_rate": 7.623128063908595e-06, + "loss": 1.2043, + "step": 19773 + }, + { + "epoch": 5.889610752247808, + "grad_norm": 0.2957414388656616, + "learning_rate": 7.622191151383972e-06, + "loss": 1.2069, + "step": 19774 + }, + { + "epoch": 5.889908598447477, + "grad_norm": 0.27410802245140076, + "learning_rate": 7.621254260982208e-06, + "loss": 1.1913, + "step": 19775 + }, + { + "epoch": 5.890206444647145, + "grad_norm": 0.28888213634490967, + "learning_rate": 7.620317392712017e-06, + "loss": 1.2156, + "step": 19776 + }, + { + "epoch": 5.890504290846814, + "grad_norm": 0.28191816806793213, + "learning_rate": 7.619380546582108e-06, + "loss": 1.2265, + "step": 19777 + }, + { + "epoch": 5.8908021370464825, + "grad_norm": 0.2632920444011688, + "learning_rate": 7.618443722601205e-06, + "loss": 1.2269, + "step": 19778 + }, + { + "epoch": 5.891099983246152, + "grad_norm": 0.29163745045661926, + "learning_rate": 7.6175069207780235e-06, + "loss": 1.2136, + "step": 19779 + }, + { + "epoch": 5.89139782944582, + "grad_norm": 0.29716843366622925, + "learning_rate": 7.616570141121277e-06, + "loss": 1.2261, + "step": 19780 + }, + { + "epoch": 5.891695675645488, + "grad_norm": 0.3333028256893158, + "learning_rate": 7.615633383639687e-06, + "loss": 1.2004, + "step": 19781 + }, + { + "epoch": 5.891993521845158, + "grad_norm": 0.25077876448631287, + "learning_rate": 7.614696648341958e-06, + "loss": 1.2159, + "step": 19782 + }, + { + "epoch": 5.892291368044826, + "grad_norm": 0.45058029890060425, + "learning_rate": 7.613759935236817e-06, + "loss": 1.2052, + "step": 19783 + }, + { + "epoch": 5.892589214244494, + "grad_norm": 0.2832079529762268, + "learning_rate": 7.612823244332971e-06, + "loss": 1.2084, + "step": 19784 + }, + { + "epoch": 5.892887060444163, + "grad_norm": 0.4495527744293213, + "learning_rate": 7.611886575639135e-06, + "loss": 1.2083, + "step": 19785 + }, + { + "epoch": 5.893184906643832, + "grad_norm": 0.2885785400867462, + "learning_rate": 7.610949929164031e-06, + "loss": 1.2109, + "step": 19786 + }, + { + "epoch": 5.8934827528435, + "grad_norm": 0.3337485194206238, + "learning_rate": 7.610013304916369e-06, + "loss": 1.2041, + "step": 19787 + }, + { + "epoch": 5.893780599043169, + "grad_norm": 0.29731377959251404, + "learning_rate": 7.609076702904858e-06, + "loss": 1.2176, + "step": 19788 + }, + { + "epoch": 5.894078445242838, + "grad_norm": 0.345464289188385, + "learning_rate": 7.608140123138222e-06, + "loss": 1.2265, + "step": 19789 + }, + { + "epoch": 5.894376291442507, + "grad_norm": 0.2939906418323517, + "learning_rate": 7.607203565625168e-06, + "loss": 1.2016, + "step": 19790 + }, + { + "epoch": 5.894674137642175, + "grad_norm": 0.489046186208725, + "learning_rate": 7.606267030374408e-06, + "loss": 1.2291, + "step": 19791 + }, + { + "epoch": 5.8949719838418435, + "grad_norm": 0.362615704536438, + "learning_rate": 7.605330517394664e-06, + "loss": 1.211, + "step": 19792 + }, + { + "epoch": 5.895269830041512, + "grad_norm": 0.2884233891963959, + "learning_rate": 7.604394026694644e-06, + "loss": 1.199, + "step": 19793 + }, + { + "epoch": 5.895567676241181, + "grad_norm": 0.4300801157951355, + "learning_rate": 7.603457558283056e-06, + "loss": 1.211, + "step": 19794 + }, + { + "epoch": 5.895865522440849, + "grad_norm": 0.2703288793563843, + "learning_rate": 7.602521112168624e-06, + "loss": 1.2148, + "step": 19795 + }, + { + "epoch": 5.896163368640519, + "grad_norm": 0.42262008786201477, + "learning_rate": 7.601584688360049e-06, + "loss": 1.2094, + "step": 19796 + }, + { + "epoch": 5.896461214840187, + "grad_norm": 0.26051944494247437, + "learning_rate": 7.60064828686605e-06, + "loss": 1.1958, + "step": 19797 + }, + { + "epoch": 5.896759061039855, + "grad_norm": 0.38392624258995056, + "learning_rate": 7.599711907695341e-06, + "loss": 1.2148, + "step": 19798 + }, + { + "epoch": 5.8970569072395245, + "grad_norm": 0.2545011341571808, + "learning_rate": 7.598775550856626e-06, + "loss": 1.2116, + "step": 19799 + }, + { + "epoch": 5.897354753439193, + "grad_norm": 0.2836942970752716, + "learning_rate": 7.597839216358626e-06, + "loss": 1.2115, + "step": 19800 + }, + { + "epoch": 5.897652599638861, + "grad_norm": 0.345489501953125, + "learning_rate": 7.5969029042100485e-06, + "loss": 1.2126, + "step": 19801 + }, + { + "epoch": 5.89795044583853, + "grad_norm": 0.29392674565315247, + "learning_rate": 7.595966614419602e-06, + "loss": 1.2323, + "step": 19802 + }, + { + "epoch": 5.898248292038199, + "grad_norm": 0.5595733523368835, + "learning_rate": 7.5950303469960005e-06, + "loss": 1.2012, + "step": 19803 + }, + { + "epoch": 5.898546138237867, + "grad_norm": 0.4478977918624878, + "learning_rate": 7.594094101947957e-06, + "loss": 1.1948, + "step": 19804 + }, + { + "epoch": 5.898843984437536, + "grad_norm": 0.4497094452381134, + "learning_rate": 7.593157879284177e-06, + "loss": 1.2032, + "step": 19805 + }, + { + "epoch": 5.8991418306372045, + "grad_norm": 0.2579902112483978, + "learning_rate": 7.592221679013377e-06, + "loss": 1.2131, + "step": 19806 + }, + { + "epoch": 5.899439676836874, + "grad_norm": 0.7427989840507507, + "learning_rate": 7.591285501144261e-06, + "loss": 1.2173, + "step": 19807 + }, + { + "epoch": 5.899737523036542, + "grad_norm": 0.25360074639320374, + "learning_rate": 7.590349345685542e-06, + "loss": 1.2181, + "step": 19808 + }, + { + "epoch": 5.90003536923621, + "grad_norm": 0.41214168071746826, + "learning_rate": 7.589413212645933e-06, + "loss": 1.2305, + "step": 19809 + }, + { + "epoch": 5.90033321543588, + "grad_norm": 0.4207543432712555, + "learning_rate": 7.588477102034142e-06, + "loss": 1.2187, + "step": 19810 + }, + { + "epoch": 5.900631061635548, + "grad_norm": 0.2610822916030884, + "learning_rate": 7.587541013858871e-06, + "loss": 1.2179, + "step": 19811 + }, + { + "epoch": 5.900928907835216, + "grad_norm": 0.5251509547233582, + "learning_rate": 7.586604948128841e-06, + "loss": 1.2047, + "step": 19812 + }, + { + "epoch": 5.9012267540348855, + "grad_norm": 0.2954830229282379, + "learning_rate": 7.585668904852752e-06, + "loss": 1.2049, + "step": 19813 + }, + { + "epoch": 5.901524600234554, + "grad_norm": 0.3921951651573181, + "learning_rate": 7.584732884039317e-06, + "loss": 1.202, + "step": 19814 + }, + { + "epoch": 5.901822446434222, + "grad_norm": 0.3233911693096161, + "learning_rate": 7.583796885697248e-06, + "loss": 1.199, + "step": 19815 + }, + { + "epoch": 5.902120292633891, + "grad_norm": 0.3623232841491699, + "learning_rate": 7.582860909835243e-06, + "loss": 1.2218, + "step": 19816 + }, + { + "epoch": 5.90241813883356, + "grad_norm": 0.345813512802124, + "learning_rate": 7.581924956462022e-06, + "loss": 1.1904, + "step": 19817 + }, + { + "epoch": 5.902715985033229, + "grad_norm": 0.3087981939315796, + "learning_rate": 7.5809890255862885e-06, + "loss": 1.2139, + "step": 19818 + }, + { + "epoch": 5.903013831232897, + "grad_norm": 0.34175044298171997, + "learning_rate": 7.580053117216744e-06, + "loss": 1.2182, + "step": 19819 + }, + { + "epoch": 5.903311677432566, + "grad_norm": 0.38446667790412903, + "learning_rate": 7.579117231362104e-06, + "loss": 1.23, + "step": 19820 + }, + { + "epoch": 5.903609523632235, + "grad_norm": 0.3683859705924988, + "learning_rate": 7.578181368031076e-06, + "loss": 1.2236, + "step": 19821 + }, + { + "epoch": 5.903907369831903, + "grad_norm": 0.2987349331378937, + "learning_rate": 7.5772455272323595e-06, + "loss": 1.1938, + "step": 19822 + }, + { + "epoch": 5.9042052160315714, + "grad_norm": 0.28819775581359863, + "learning_rate": 7.576309708974672e-06, + "loss": 1.2069, + "step": 19823 + }, + { + "epoch": 5.904503062231241, + "grad_norm": 0.3232839107513428, + "learning_rate": 7.57537391326671e-06, + "loss": 1.2079, + "step": 19824 + }, + { + "epoch": 5.904800908430909, + "grad_norm": 0.29022216796875, + "learning_rate": 7.5744381401171876e-06, + "loss": 1.211, + "step": 19825 + }, + { + "epoch": 5.905098754630577, + "grad_norm": 0.3549312949180603, + "learning_rate": 7.573502389534807e-06, + "loss": 1.1946, + "step": 19826 + }, + { + "epoch": 5.9053966008302465, + "grad_norm": 0.24365216493606567, + "learning_rate": 7.572566661528278e-06, + "loss": 1.2172, + "step": 19827 + }, + { + "epoch": 5.905694447029915, + "grad_norm": 0.423112154006958, + "learning_rate": 7.5716309561063e-06, + "loss": 1.2049, + "step": 19828 + }, + { + "epoch": 5.905992293229583, + "grad_norm": 0.28737735748291016, + "learning_rate": 7.5706952732775866e-06, + "loss": 1.2189, + "step": 19829 + }, + { + "epoch": 5.906290139429252, + "grad_norm": 0.39810413122177124, + "learning_rate": 7.569759613050835e-06, + "loss": 1.2068, + "step": 19830 + }, + { + "epoch": 5.906587985628921, + "grad_norm": 0.39153480529785156, + "learning_rate": 7.5688239754347605e-06, + "loss": 1.2071, + "step": 19831 + }, + { + "epoch": 5.906885831828589, + "grad_norm": 0.3402028977870941, + "learning_rate": 7.56788836043806e-06, + "loss": 1.2197, + "step": 19832 + }, + { + "epoch": 5.907183678028258, + "grad_norm": 0.45036885142326355, + "learning_rate": 7.566952768069438e-06, + "loss": 1.2309, + "step": 19833 + }, + { + "epoch": 5.907481524227927, + "grad_norm": 0.2677386999130249, + "learning_rate": 7.566017198337607e-06, + "loss": 1.2181, + "step": 19834 + }, + { + "epoch": 5.907779370427596, + "grad_norm": 0.7536735534667969, + "learning_rate": 7.565081651251267e-06, + "loss": 1.2099, + "step": 19835 + }, + { + "epoch": 5.908077216627264, + "grad_norm": 0.3469991981983185, + "learning_rate": 7.564146126819117e-06, + "loss": 1.2354, + "step": 19836 + }, + { + "epoch": 5.9083750628269325, + "grad_norm": 0.5384515523910522, + "learning_rate": 7.563210625049868e-06, + "loss": 1.1995, + "step": 19837 + }, + { + "epoch": 5.908672909026602, + "grad_norm": 0.257614403963089, + "learning_rate": 7.562275145952225e-06, + "loss": 1.2103, + "step": 19838 + }, + { + "epoch": 5.90897075522627, + "grad_norm": 0.5288355350494385, + "learning_rate": 7.561339689534881e-06, + "loss": 1.2221, + "step": 19839 + }, + { + "epoch": 5.909268601425938, + "grad_norm": 0.31223511695861816, + "learning_rate": 7.560404255806554e-06, + "loss": 1.211, + "step": 19840 + }, + { + "epoch": 5.909566447625608, + "grad_norm": 0.42010438442230225, + "learning_rate": 7.5594688447759375e-06, + "loss": 1.2126, + "step": 19841 + }, + { + "epoch": 5.909864293825276, + "grad_norm": 0.4113349914550781, + "learning_rate": 7.558533456451733e-06, + "loss": 1.2088, + "step": 19842 + }, + { + "epoch": 5.910162140024944, + "grad_norm": 0.29698100686073303, + "learning_rate": 7.5575980908426495e-06, + "loss": 1.2351, + "step": 19843 + }, + { + "epoch": 5.910459986224613, + "grad_norm": 0.5019407868385315, + "learning_rate": 7.556662747957388e-06, + "loss": 1.2101, + "step": 19844 + }, + { + "epoch": 5.910757832424282, + "grad_norm": 0.2548619508743286, + "learning_rate": 7.5557274278046466e-06, + "loss": 1.2037, + "step": 19845 + }, + { + "epoch": 5.911055678623951, + "grad_norm": 0.3295741379261017, + "learning_rate": 7.554792130393135e-06, + "loss": 1.2158, + "step": 19846 + }, + { + "epoch": 5.911353524823619, + "grad_norm": 0.33598634600639343, + "learning_rate": 7.5538568557315464e-06, + "loss": 1.2051, + "step": 19847 + }, + { + "epoch": 5.911651371023288, + "grad_norm": 0.3634411692619324, + "learning_rate": 7.55292160382859e-06, + "loss": 1.2389, + "step": 19848 + }, + { + "epoch": 5.911949217222957, + "grad_norm": 0.3242103159427643, + "learning_rate": 7.551986374692963e-06, + "loss": 1.1995, + "step": 19849 + }, + { + "epoch": 5.912247063422625, + "grad_norm": 0.3475632071495056, + "learning_rate": 7.551051168333365e-06, + "loss": 1.2008, + "step": 19850 + }, + { + "epoch": 5.9125449096222935, + "grad_norm": 0.28974246978759766, + "learning_rate": 7.550115984758504e-06, + "loss": 1.2093, + "step": 19851 + }, + { + "epoch": 5.912842755821963, + "grad_norm": 0.27499011158943176, + "learning_rate": 7.549180823977077e-06, + "loss": 1.2199, + "step": 19852 + }, + { + "epoch": 5.913140602021631, + "grad_norm": 0.3993379473686218, + "learning_rate": 7.548245685997779e-06, + "loss": 1.2215, + "step": 19853 + }, + { + "epoch": 5.913438448221299, + "grad_norm": 0.36304888129234314, + "learning_rate": 7.54731057082932e-06, + "loss": 1.2005, + "step": 19854 + }, + { + "epoch": 5.913736294420969, + "grad_norm": 0.5126381516456604, + "learning_rate": 7.546375478480394e-06, + "loss": 1.2229, + "step": 19855 + }, + { + "epoch": 5.914034140620637, + "grad_norm": 0.34435397386550903, + "learning_rate": 7.5454404089597026e-06, + "loss": 1.2157, + "step": 19856 + }, + { + "epoch": 5.914331986820306, + "grad_norm": 0.45478591322898865, + "learning_rate": 7.544505362275948e-06, + "loss": 1.1986, + "step": 19857 + }, + { + "epoch": 5.9146298330199745, + "grad_norm": 0.3183295428752899, + "learning_rate": 7.543570338437828e-06, + "loss": 1.2115, + "step": 19858 + }, + { + "epoch": 5.914927679219643, + "grad_norm": 0.25597986578941345, + "learning_rate": 7.542635337454037e-06, + "loss": 1.2072, + "step": 19859 + }, + { + "epoch": 5.915225525419311, + "grad_norm": 0.4365426003932953, + "learning_rate": 7.5417003593332815e-06, + "loss": 1.1927, + "step": 19860 + }, + { + "epoch": 5.91552337161898, + "grad_norm": 0.3271276652812958, + "learning_rate": 7.540765404084258e-06, + "loss": 1.2189, + "step": 19861 + }, + { + "epoch": 5.915821217818649, + "grad_norm": 0.44658908247947693, + "learning_rate": 7.539830471715663e-06, + "loss": 1.2112, + "step": 19862 + }, + { + "epoch": 5.916119064018318, + "grad_norm": 0.3291972577571869, + "learning_rate": 7.538895562236199e-06, + "loss": 1.2147, + "step": 19863 + }, + { + "epoch": 5.916416910217986, + "grad_norm": 0.3678570091724396, + "learning_rate": 7.5379606756545585e-06, + "loss": 1.2228, + "step": 19864 + }, + { + "epoch": 5.9167147564176545, + "grad_norm": 0.2758341431617737, + "learning_rate": 7.5370258119794485e-06, + "loss": 1.2055, + "step": 19865 + }, + { + "epoch": 5.917012602617324, + "grad_norm": 0.2678714394569397, + "learning_rate": 7.536090971219557e-06, + "loss": 1.2098, + "step": 19866 + }, + { + "epoch": 5.917310448816992, + "grad_norm": 0.27749374508857727, + "learning_rate": 7.535156153383588e-06, + "loss": 1.1947, + "step": 19867 + }, + { + "epoch": 5.91760829501666, + "grad_norm": 0.25734415650367737, + "learning_rate": 7.534221358480237e-06, + "loss": 1.2171, + "step": 19868 + }, + { + "epoch": 5.91790614121633, + "grad_norm": 0.29759058356285095, + "learning_rate": 7.5332865865182035e-06, + "loss": 1.2056, + "step": 19869 + }, + { + "epoch": 5.918203987415998, + "grad_norm": 0.30260103940963745, + "learning_rate": 7.532351837506177e-06, + "loss": 1.216, + "step": 19870 + }, + { + "epoch": 5.918501833615666, + "grad_norm": 0.2465955913066864, + "learning_rate": 7.531417111452866e-06, + "loss": 1.2082, + "step": 19871 + }, + { + "epoch": 5.9187996798153355, + "grad_norm": 0.28616493940353394, + "learning_rate": 7.530482408366957e-06, + "loss": 1.2153, + "step": 19872 + }, + { + "epoch": 5.919097526015004, + "grad_norm": 0.29654261469841003, + "learning_rate": 7.529547728257147e-06, + "loss": 1.2132, + "step": 19873 + }, + { + "epoch": 5.919395372214673, + "grad_norm": 0.28355711698532104, + "learning_rate": 7.528613071132141e-06, + "loss": 1.2141, + "step": 19874 + }, + { + "epoch": 5.919693218414341, + "grad_norm": 0.4401582181453705, + "learning_rate": 7.52767843700063e-06, + "loss": 1.2302, + "step": 19875 + }, + { + "epoch": 5.91999106461401, + "grad_norm": 0.404633104801178, + "learning_rate": 7.526743825871303e-06, + "loss": 1.2256, + "step": 19876 + }, + { + "epoch": 5.920288910813679, + "grad_norm": 0.2894860804080963, + "learning_rate": 7.525809237752866e-06, + "loss": 1.2043, + "step": 19877 + }, + { + "epoch": 5.920586757013347, + "grad_norm": 0.2649744749069214, + "learning_rate": 7.524874672654005e-06, + "loss": 1.2171, + "step": 19878 + }, + { + "epoch": 5.920884603213016, + "grad_norm": 0.3483593761920929, + "learning_rate": 7.5239401305834225e-06, + "loss": 1.2182, + "step": 19879 + }, + { + "epoch": 5.921182449412685, + "grad_norm": 0.2558915913105011, + "learning_rate": 7.523005611549813e-06, + "loss": 1.2175, + "step": 19880 + }, + { + "epoch": 5.921480295612353, + "grad_norm": 0.3790781795978546, + "learning_rate": 7.522071115561864e-06, + "loss": 1.217, + "step": 19881 + }, + { + "epoch": 5.9217781418120214, + "grad_norm": 0.26363861560821533, + "learning_rate": 7.5211366426282795e-06, + "loss": 1.2173, + "step": 19882 + }, + { + "epoch": 5.922075988011691, + "grad_norm": 0.2687417268753052, + "learning_rate": 7.520202192757747e-06, + "loss": 1.2006, + "step": 19883 + }, + { + "epoch": 5.922373834211359, + "grad_norm": 0.3512256443500519, + "learning_rate": 7.519267765958963e-06, + "loss": 1.2271, + "step": 19884 + }, + { + "epoch": 5.922671680411028, + "grad_norm": 0.28896406292915344, + "learning_rate": 7.518333362240619e-06, + "loss": 1.2374, + "step": 19885 + }, + { + "epoch": 5.9229695266106965, + "grad_norm": 0.29075995087623596, + "learning_rate": 7.5173989816114145e-06, + "loss": 1.2098, + "step": 19886 + }, + { + "epoch": 5.923267372810365, + "grad_norm": 0.2722116708755493, + "learning_rate": 7.516464624080033e-06, + "loss": 1.2059, + "step": 19887 + }, + { + "epoch": 5.923565219010034, + "grad_norm": 0.33740612864494324, + "learning_rate": 7.515530289655179e-06, + "loss": 1.2112, + "step": 19888 + }, + { + "epoch": 5.923863065209702, + "grad_norm": 0.29584792256355286, + "learning_rate": 7.5145959783455365e-06, + "loss": 1.2078, + "step": 19889 + }, + { + "epoch": 5.924160911409371, + "grad_norm": 0.3502522110939026, + "learning_rate": 7.513661690159802e-06, + "loss": 1.1741, + "step": 19890 + }, + { + "epoch": 5.92445875760904, + "grad_norm": 0.2546685039997101, + "learning_rate": 7.512727425106668e-06, + "loss": 1.216, + "step": 19891 + }, + { + "epoch": 5.924756603808708, + "grad_norm": 0.45806533098220825, + "learning_rate": 7.511793183194829e-06, + "loss": 1.2215, + "step": 19892 + }, + { + "epoch": 5.925054450008377, + "grad_norm": 0.28287121653556824, + "learning_rate": 7.510858964432969e-06, + "loss": 1.1988, + "step": 19893 + }, + { + "epoch": 5.925352296208046, + "grad_norm": 0.5181441307067871, + "learning_rate": 7.509924768829789e-06, + "loss": 1.2286, + "step": 19894 + }, + { + "epoch": 5.925650142407714, + "grad_norm": 0.2704135775566101, + "learning_rate": 7.5089905963939734e-06, + "loss": 1.2006, + "step": 19895 + }, + { + "epoch": 5.9259479886073825, + "grad_norm": 0.36298900842666626, + "learning_rate": 7.50805644713422e-06, + "loss": 1.2174, + "step": 19896 + }, + { + "epoch": 5.926245834807052, + "grad_norm": 0.27507761120796204, + "learning_rate": 7.507122321059219e-06, + "loss": 1.2071, + "step": 19897 + }, + { + "epoch": 5.92654368100672, + "grad_norm": 0.2856441140174866, + "learning_rate": 7.506188218177655e-06, + "loss": 1.1948, + "step": 19898 + }, + { + "epoch": 5.926841527206388, + "grad_norm": 0.2776634693145752, + "learning_rate": 7.505254138498228e-06, + "loss": 1.2182, + "step": 19899 + }, + { + "epoch": 5.927139373406058, + "grad_norm": 0.2945333421230316, + "learning_rate": 7.504320082029623e-06, + "loss": 1.2086, + "step": 19900 + }, + { + "epoch": 5.927437219605726, + "grad_norm": 0.26102709770202637, + "learning_rate": 7.5033860487805275e-06, + "loss": 1.2187, + "step": 19901 + }, + { + "epoch": 5.927735065805395, + "grad_norm": 0.3198640048503876, + "learning_rate": 7.502452038759637e-06, + "loss": 1.2311, + "step": 19902 + }, + { + "epoch": 5.928032912005063, + "grad_norm": 0.2610388994216919, + "learning_rate": 7.501518051975644e-06, + "loss": 1.2136, + "step": 19903 + }, + { + "epoch": 5.928330758204732, + "grad_norm": 0.3690877854824066, + "learning_rate": 7.500584088437228e-06, + "loss": 1.2181, + "step": 19904 + }, + { + "epoch": 5.928628604404401, + "grad_norm": 0.2494429498910904, + "learning_rate": 7.499650148153091e-06, + "loss": 1.1962, + "step": 19905 + }, + { + "epoch": 5.928926450604069, + "grad_norm": 0.3058006167411804, + "learning_rate": 7.498716231131913e-06, + "loss": 1.207, + "step": 19906 + }, + { + "epoch": 5.929224296803738, + "grad_norm": 0.2546834945678711, + "learning_rate": 7.497782337382383e-06, + "loss": 1.2112, + "step": 19907 + }, + { + "epoch": 5.929522143003407, + "grad_norm": 0.3980565369129181, + "learning_rate": 7.496848466913195e-06, + "loss": 1.2146, + "step": 19908 + }, + { + "epoch": 5.929819989203075, + "grad_norm": 0.2976199686527252, + "learning_rate": 7.495914619733037e-06, + "loss": 1.2112, + "step": 19909 + }, + { + "epoch": 5.9301178354027435, + "grad_norm": 0.4909135401248932, + "learning_rate": 7.4949807958505915e-06, + "loss": 1.2305, + "step": 19910 + }, + { + "epoch": 5.930415681602413, + "grad_norm": 0.4183238744735718, + "learning_rate": 7.494046995274556e-06, + "loss": 1.2058, + "step": 19911 + }, + { + "epoch": 5.930713527802081, + "grad_norm": 0.3827158510684967, + "learning_rate": 7.493113218013608e-06, + "loss": 1.2073, + "step": 19912 + }, + { + "epoch": 5.93101137400175, + "grad_norm": 0.2779160439968109, + "learning_rate": 7.492179464076446e-06, + "loss": 1.1992, + "step": 19913 + }, + { + "epoch": 5.931309220201419, + "grad_norm": 0.4791818857192993, + "learning_rate": 7.491245733471748e-06, + "loss": 1.2239, + "step": 19914 + }, + { + "epoch": 5.931607066401087, + "grad_norm": 0.2737295627593994, + "learning_rate": 7.490312026208205e-06, + "loss": 1.2189, + "step": 19915 + }, + { + "epoch": 5.931904912600756, + "grad_norm": 0.41740521788597107, + "learning_rate": 7.4893783422945085e-06, + "loss": 1.2203, + "step": 19916 + }, + { + "epoch": 5.9322027588004245, + "grad_norm": 0.2548010051250458, + "learning_rate": 7.488444681739342e-06, + "loss": 1.2161, + "step": 19917 + }, + { + "epoch": 5.932500605000093, + "grad_norm": 0.3685665428638458, + "learning_rate": 7.487511044551388e-06, + "loss": 1.2135, + "step": 19918 + }, + { + "epoch": 5.932798451199762, + "grad_norm": 0.33215752243995667, + "learning_rate": 7.486577430739337e-06, + "loss": 1.2023, + "step": 19919 + }, + { + "epoch": 5.93309629739943, + "grad_norm": 0.319754034280777, + "learning_rate": 7.485643840311878e-06, + "loss": 1.2233, + "step": 19920 + }, + { + "epoch": 5.933394143599099, + "grad_norm": 0.3260960876941681, + "learning_rate": 7.484710273277689e-06, + "loss": 1.1984, + "step": 19921 + }, + { + "epoch": 5.933691989798768, + "grad_norm": 0.2729150354862213, + "learning_rate": 7.4837767296454645e-06, + "loss": 1.2185, + "step": 19922 + }, + { + "epoch": 5.933989835998436, + "grad_norm": 0.29566365480422974, + "learning_rate": 7.482843209423887e-06, + "loss": 1.2045, + "step": 19923 + }, + { + "epoch": 5.934287682198105, + "grad_norm": 0.30384641885757446, + "learning_rate": 7.4819097126216374e-06, + "loss": 1.2194, + "step": 19924 + }, + { + "epoch": 5.934585528397774, + "grad_norm": 0.28885459899902344, + "learning_rate": 7.480976239247406e-06, + "loss": 1.2152, + "step": 19925 + }, + { + "epoch": 5.934883374597442, + "grad_norm": 0.31526216864585876, + "learning_rate": 7.480042789309878e-06, + "loss": 1.2103, + "step": 19926 + }, + { + "epoch": 5.93518122079711, + "grad_norm": 0.3096555173397064, + "learning_rate": 7.479109362817733e-06, + "loss": 1.2158, + "step": 19927 + }, + { + "epoch": 5.93547906699678, + "grad_norm": 0.278586745262146, + "learning_rate": 7.478175959779663e-06, + "loss": 1.1954, + "step": 19928 + }, + { + "epoch": 5.935776913196448, + "grad_norm": 0.29671400785446167, + "learning_rate": 7.477242580204343e-06, + "loss": 1.2012, + "step": 19929 + }, + { + "epoch": 5.936074759396117, + "grad_norm": 0.29190900921821594, + "learning_rate": 7.476309224100468e-06, + "loss": 1.2245, + "step": 19930 + }, + { + "epoch": 5.9363726055957855, + "grad_norm": 0.3125079870223999, + "learning_rate": 7.475375891476713e-06, + "loss": 1.2131, + "step": 19931 + }, + { + "epoch": 5.936670451795454, + "grad_norm": 0.2816937267780304, + "learning_rate": 7.474442582341762e-06, + "loss": 1.2136, + "step": 19932 + }, + { + "epoch": 5.936968297995123, + "grad_norm": 0.2571212351322174, + "learning_rate": 7.4735092967043066e-06, + "loss": 1.2086, + "step": 19933 + }, + { + "epoch": 5.937266144194791, + "grad_norm": 0.47276771068573, + "learning_rate": 7.472576034573023e-06, + "loss": 1.2021, + "step": 19934 + }, + { + "epoch": 5.93756399039446, + "grad_norm": 0.36985161900520325, + "learning_rate": 7.4716427959565925e-06, + "loss": 1.2058, + "step": 19935 + }, + { + "epoch": 5.937861836594129, + "grad_norm": 0.31467387080192566, + "learning_rate": 7.470709580863704e-06, + "loss": 1.2008, + "step": 19936 + }, + { + "epoch": 5.938159682793797, + "grad_norm": 0.28341153264045715, + "learning_rate": 7.469776389303036e-06, + "loss": 1.2057, + "step": 19937 + }, + { + "epoch": 5.938457528993466, + "grad_norm": 0.43463417887687683, + "learning_rate": 7.468843221283269e-06, + "loss": 1.2076, + "step": 19938 + }, + { + "epoch": 5.938755375193135, + "grad_norm": 0.27061885595321655, + "learning_rate": 7.467910076813092e-06, + "loss": 1.2038, + "step": 19939 + }, + { + "epoch": 5.939053221392803, + "grad_norm": 0.33500590920448303, + "learning_rate": 7.466976955901184e-06, + "loss": 1.206, + "step": 19940 + }, + { + "epoch": 5.939351067592472, + "grad_norm": 0.31054073572158813, + "learning_rate": 7.46604385855622e-06, + "loss": 1.203, + "step": 19941 + }, + { + "epoch": 5.939648913792141, + "grad_norm": 0.2667149007320404, + "learning_rate": 7.465110784786891e-06, + "loss": 1.2102, + "step": 19942 + }, + { + "epoch": 5.939946759991809, + "grad_norm": 0.2642519772052765, + "learning_rate": 7.4641777346018715e-06, + "loss": 1.2058, + "step": 19943 + }, + { + "epoch": 5.940244606191478, + "grad_norm": 0.36663520336151123, + "learning_rate": 7.463244708009845e-06, + "loss": 1.2281, + "step": 19944 + }, + { + "epoch": 5.9405424523911465, + "grad_norm": 0.270404577255249, + "learning_rate": 7.462311705019494e-06, + "loss": 1.209, + "step": 19945 + }, + { + "epoch": 5.940840298590815, + "grad_norm": 0.27171462774276733, + "learning_rate": 7.461378725639495e-06, + "loss": 1.2081, + "step": 19946 + }, + { + "epoch": 5.941138144790484, + "grad_norm": 0.34753552079200745, + "learning_rate": 7.4604457698785325e-06, + "loss": 1.2037, + "step": 19947 + }, + { + "epoch": 5.941435990990152, + "grad_norm": 0.3226587176322937, + "learning_rate": 7.459512837745284e-06, + "loss": 1.2135, + "step": 19948 + }, + { + "epoch": 5.941733837189821, + "grad_norm": 0.3982117772102356, + "learning_rate": 7.458579929248431e-06, + "loss": 1.2064, + "step": 19949 + }, + { + "epoch": 5.94203168338949, + "grad_norm": 0.5194597840309143, + "learning_rate": 7.457647044396651e-06, + "loss": 1.2028, + "step": 19950 + }, + { + "epoch": 5.942329529589158, + "grad_norm": 0.27021604776382446, + "learning_rate": 7.456714183198628e-06, + "loss": 1.2288, + "step": 19951 + }, + { + "epoch": 5.9426273757888275, + "grad_norm": 0.400470107793808, + "learning_rate": 7.455781345663032e-06, + "loss": 1.2179, + "step": 19952 + }, + { + "epoch": 5.942925221988496, + "grad_norm": 0.32334771752357483, + "learning_rate": 7.454848531798553e-06, + "loss": 1.1957, + "step": 19953 + }, + { + "epoch": 5.943223068188164, + "grad_norm": 0.48630544543266296, + "learning_rate": 7.453915741613864e-06, + "loss": 1.2048, + "step": 19954 + }, + { + "epoch": 5.943520914387833, + "grad_norm": 0.3211387097835541, + "learning_rate": 7.452982975117642e-06, + "loss": 1.2081, + "step": 19955 + }, + { + "epoch": 5.943818760587502, + "grad_norm": 0.42263099551200867, + "learning_rate": 7.452050232318571e-06, + "loss": 1.1995, + "step": 19956 + }, + { + "epoch": 5.94411660678717, + "grad_norm": 0.4230688214302063, + "learning_rate": 7.451117513225325e-06, + "loss": 1.2047, + "step": 19957 + }, + { + "epoch": 5.944414452986839, + "grad_norm": 0.304991751909256, + "learning_rate": 7.4501848178465795e-06, + "loss": 1.1918, + "step": 19958 + }, + { + "epoch": 5.944712299186508, + "grad_norm": 0.49376922845840454, + "learning_rate": 7.44925214619102e-06, + "loss": 1.2028, + "step": 19959 + }, + { + "epoch": 5.945010145386176, + "grad_norm": 0.2916540205478668, + "learning_rate": 7.448319498267318e-06, + "loss": 1.1948, + "step": 19960 + }, + { + "epoch": 5.945307991585845, + "grad_norm": 0.2906514108181, + "learning_rate": 7.447386874084148e-06, + "loss": 1.2075, + "step": 19961 + }, + { + "epoch": 5.945605837785513, + "grad_norm": 0.28675130009651184, + "learning_rate": 7.446454273650198e-06, + "loss": 1.2037, + "step": 19962 + }, + { + "epoch": 5.945903683985182, + "grad_norm": 0.3520452678203583, + "learning_rate": 7.445521696974132e-06, + "loss": 1.216, + "step": 19963 + }, + { + "epoch": 5.946201530184851, + "grad_norm": 0.2449936717748642, + "learning_rate": 7.4445891440646365e-06, + "loss": 1.2141, + "step": 19964 + }, + { + "epoch": 5.946499376384519, + "grad_norm": 0.26454365253448486, + "learning_rate": 7.443656614930386e-06, + "loss": 1.2042, + "step": 19965 + }, + { + "epoch": 5.946797222584188, + "grad_norm": 0.3150053322315216, + "learning_rate": 7.442724109580048e-06, + "loss": 1.2032, + "step": 19966 + }, + { + "epoch": 5.947095068783857, + "grad_norm": 0.2718449532985687, + "learning_rate": 7.441791628022308e-06, + "loss": 1.1929, + "step": 19967 + }, + { + "epoch": 5.947392914983525, + "grad_norm": 0.2793262302875519, + "learning_rate": 7.440859170265841e-06, + "loss": 1.2268, + "step": 19968 + }, + { + "epoch": 5.947690761183194, + "grad_norm": 0.2563835680484772, + "learning_rate": 7.439926736319316e-06, + "loss": 1.2244, + "step": 19969 + }, + { + "epoch": 5.947988607382863, + "grad_norm": 0.3162914216518402, + "learning_rate": 7.438994326191417e-06, + "loss": 1.2076, + "step": 19970 + }, + { + "epoch": 5.948286453582531, + "grad_norm": 0.4131770730018616, + "learning_rate": 7.43806193989081e-06, + "loss": 1.2097, + "step": 19971 + }, + { + "epoch": 5.9485842997822, + "grad_norm": 0.2641558349132538, + "learning_rate": 7.437129577426177e-06, + "loss": 1.2121, + "step": 19972 + }, + { + "epoch": 5.948882145981869, + "grad_norm": 0.40105366706848145, + "learning_rate": 7.436197238806189e-06, + "loss": 1.2045, + "step": 19973 + }, + { + "epoch": 5.949179992181537, + "grad_norm": 0.312832236289978, + "learning_rate": 7.435264924039523e-06, + "loss": 1.2291, + "step": 19974 + }, + { + "epoch": 5.949477838381206, + "grad_norm": 0.27298468351364136, + "learning_rate": 7.434332633134849e-06, + "loss": 1.211, + "step": 19975 + }, + { + "epoch": 5.9497756845808745, + "grad_norm": 0.28135013580322266, + "learning_rate": 7.433400366100845e-06, + "loss": 1.2132, + "step": 19976 + }, + { + "epoch": 5.950073530780543, + "grad_norm": 0.2630771994590759, + "learning_rate": 7.432468122946183e-06, + "loss": 1.2169, + "step": 19977 + }, + { + "epoch": 5.950371376980212, + "grad_norm": 0.4849890470504761, + "learning_rate": 7.431535903679534e-06, + "loss": 1.2166, + "step": 19978 + }, + { + "epoch": 5.95066922317988, + "grad_norm": 0.2566927373409271, + "learning_rate": 7.430603708309576e-06, + "loss": 1.2213, + "step": 19979 + }, + { + "epoch": 5.95096706937955, + "grad_norm": 0.3909648060798645, + "learning_rate": 7.4296715368449775e-06, + "loss": 1.2254, + "step": 19980 + }, + { + "epoch": 5.951264915579218, + "grad_norm": 0.26811352372169495, + "learning_rate": 7.428739389294417e-06, + "loss": 1.1927, + "step": 19981 + }, + { + "epoch": 5.951562761778886, + "grad_norm": 0.317093163728714, + "learning_rate": 7.427807265666565e-06, + "loss": 1.2083, + "step": 19982 + }, + { + "epoch": 5.951860607978555, + "grad_norm": 0.2598755657672882, + "learning_rate": 7.426875165970087e-06, + "loss": 1.2277, + "step": 19983 + }, + { + "epoch": 5.952158454178224, + "grad_norm": 0.24861247837543488, + "learning_rate": 7.425943090213663e-06, + "loss": 1.2053, + "step": 19984 + }, + { + "epoch": 5.952456300377892, + "grad_norm": 0.2631685733795166, + "learning_rate": 7.425011038405965e-06, + "loss": 1.2027, + "step": 19985 + }, + { + "epoch": 5.952754146577561, + "grad_norm": 0.27041855454444885, + "learning_rate": 7.424079010555657e-06, + "loss": 1.2099, + "step": 19986 + }, + { + "epoch": 5.95305199277723, + "grad_norm": 0.25152724981307983, + "learning_rate": 7.423147006671421e-06, + "loss": 1.1967, + "step": 19987 + }, + { + "epoch": 5.953349838976898, + "grad_norm": 0.29649677872657776, + "learning_rate": 7.422215026761923e-06, + "loss": 1.206, + "step": 19988 + }, + { + "epoch": 5.953647685176567, + "grad_norm": 0.2471916675567627, + "learning_rate": 7.421283070835831e-06, + "loss": 1.2142, + "step": 19989 + }, + { + "epoch": 5.9539455313762355, + "grad_norm": 0.25260427594184875, + "learning_rate": 7.4203511389018204e-06, + "loss": 1.1993, + "step": 19990 + }, + { + "epoch": 5.954243377575905, + "grad_norm": 0.42111966013908386, + "learning_rate": 7.419419230968561e-06, + "loss": 1.2141, + "step": 19991 + }, + { + "epoch": 5.954541223775573, + "grad_norm": 0.44608354568481445, + "learning_rate": 7.418487347044719e-06, + "loss": 1.2041, + "step": 19992 + }, + { + "epoch": 5.954839069975241, + "grad_norm": 0.35318055748939514, + "learning_rate": 7.4175554871389735e-06, + "loss": 1.2191, + "step": 19993 + }, + { + "epoch": 5.95513691617491, + "grad_norm": 0.3805428445339203, + "learning_rate": 7.416623651259983e-06, + "loss": 1.1964, + "step": 19994 + }, + { + "epoch": 5.955434762374579, + "grad_norm": 0.41821813583374023, + "learning_rate": 7.415691839416428e-06, + "loss": 1.2229, + "step": 19995 + }, + { + "epoch": 5.955732608574247, + "grad_norm": 0.3168116509914398, + "learning_rate": 7.414760051616972e-06, + "loss": 1.2023, + "step": 19996 + }, + { + "epoch": 5.9560304547739165, + "grad_norm": 0.3952611982822418, + "learning_rate": 7.413828287870281e-06, + "loss": 1.2217, + "step": 19997 + }, + { + "epoch": 5.956328300973585, + "grad_norm": 0.3050205409526825, + "learning_rate": 7.412896548185033e-06, + "loss": 1.1859, + "step": 19998 + }, + { + "epoch": 5.956626147173253, + "grad_norm": 0.3769543170928955, + "learning_rate": 7.411964832569894e-06, + "loss": 1.2154, + "step": 19999 + }, + { + "epoch": 5.956923993372922, + "grad_norm": 0.45296552777290344, + "learning_rate": 7.411033141033525e-06, + "loss": 1.2171, + "step": 20000 + }, + { + "epoch": 5.956923993372922, + "eval_loss": 1.3239740133285522, + "eval_runtime": 23.6264, + "eval_samples_per_second": 73.392, + "eval_steps_per_second": 4.613, + "step": 20000 + }, + { + "epoch": 5.957221839572591, + "grad_norm": 0.354196697473526, + "learning_rate": 7.410101473584605e-06, + "loss": 1.2021, + "step": 20001 + }, + { + "epoch": 5.957519685772259, + "grad_norm": 0.4095867872238159, + "learning_rate": 7.409169830231795e-06, + "loss": 1.205, + "step": 20002 + }, + { + "epoch": 5.957817531971928, + "grad_norm": 0.2558134198188782, + "learning_rate": 7.408238210983763e-06, + "loss": 1.2169, + "step": 20003 + }, + { + "epoch": 5.9581153781715965, + "grad_norm": 0.31956395506858826, + "learning_rate": 7.407306615849182e-06, + "loss": 1.2084, + "step": 20004 + }, + { + "epoch": 5.958413224371265, + "grad_norm": 0.41249266266822815, + "learning_rate": 7.406375044836717e-06, + "loss": 1.2099, + "step": 20005 + }, + { + "epoch": 5.958711070570934, + "grad_norm": 0.3225463926792145, + "learning_rate": 7.4054434979550305e-06, + "loss": 1.2103, + "step": 20006 + }, + { + "epoch": 5.959008916770602, + "grad_norm": 0.3751731812953949, + "learning_rate": 7.404511975212796e-06, + "loss": 1.2268, + "step": 20007 + }, + { + "epoch": 5.959306762970272, + "grad_norm": 0.28736716508865356, + "learning_rate": 7.403580476618678e-06, + "loss": 1.1916, + "step": 20008 + }, + { + "epoch": 5.95960460916994, + "grad_norm": 0.44467705488204956, + "learning_rate": 7.402649002181339e-06, + "loss": 1.2004, + "step": 20009 + }, + { + "epoch": 5.959902455369608, + "grad_norm": 0.31485164165496826, + "learning_rate": 7.4017175519094555e-06, + "loss": 1.2189, + "step": 20010 + }, + { + "epoch": 5.9602003015692775, + "grad_norm": 0.37998661398887634, + "learning_rate": 7.400786125811681e-06, + "loss": 1.2049, + "step": 20011 + }, + { + "epoch": 5.960498147768946, + "grad_norm": 0.363171249628067, + "learning_rate": 7.399854723896691e-06, + "loss": 1.1948, + "step": 20012 + }, + { + "epoch": 5.960795993968614, + "grad_norm": 0.268888384103775, + "learning_rate": 7.398923346173148e-06, + "loss": 1.2134, + "step": 20013 + }, + { + "epoch": 5.961093840168283, + "grad_norm": 0.3031296133995056, + "learning_rate": 7.397991992649713e-06, + "loss": 1.2232, + "step": 20014 + }, + { + "epoch": 5.961391686367952, + "grad_norm": 0.2481214851140976, + "learning_rate": 7.397060663335061e-06, + "loss": 1.2176, + "step": 20015 + }, + { + "epoch": 5.96168953256762, + "grad_norm": 0.30856403708457947, + "learning_rate": 7.396129358237851e-06, + "loss": 1.219, + "step": 20016 + }, + { + "epoch": 5.961987378767289, + "grad_norm": 0.3058025538921356, + "learning_rate": 7.395198077366743e-06, + "loss": 1.2061, + "step": 20017 + }, + { + "epoch": 5.962285224966958, + "grad_norm": 0.3234337568283081, + "learning_rate": 7.394266820730412e-06, + "loss": 1.2101, + "step": 20018 + }, + { + "epoch": 5.962583071166627, + "grad_norm": 0.25915324687957764, + "learning_rate": 7.393335588337514e-06, + "loss": 1.2086, + "step": 20019 + }, + { + "epoch": 5.962880917366295, + "grad_norm": 0.3813258707523346, + "learning_rate": 7.392404380196714e-06, + "loss": 1.1911, + "step": 20020 + }, + { + "epoch": 5.963178763565963, + "grad_norm": 0.326995313167572, + "learning_rate": 7.391473196316683e-06, + "loss": 1.2214, + "step": 20021 + }, + { + "epoch": 5.963476609765633, + "grad_norm": 0.370278924703598, + "learning_rate": 7.3905420367060784e-06, + "loss": 1.2223, + "step": 20022 + }, + { + "epoch": 5.963774455965301, + "grad_norm": 0.4342767894268036, + "learning_rate": 7.389610901373561e-06, + "loss": 1.2211, + "step": 20023 + }, + { + "epoch": 5.964072302164969, + "grad_norm": 0.32407841086387634, + "learning_rate": 7.388679790327802e-06, + "loss": 1.2157, + "step": 20024 + }, + { + "epoch": 5.9643701483646385, + "grad_norm": 0.4559040367603302, + "learning_rate": 7.387748703577458e-06, + "loss": 1.218, + "step": 20025 + }, + { + "epoch": 5.964667994564307, + "grad_norm": 0.2781195640563965, + "learning_rate": 7.386817641131191e-06, + "loss": 1.2271, + "step": 20026 + }, + { + "epoch": 5.964965840763975, + "grad_norm": 0.38680049777030945, + "learning_rate": 7.385886602997672e-06, + "loss": 1.1975, + "step": 20027 + }, + { + "epoch": 5.965263686963644, + "grad_norm": 0.3065037131309509, + "learning_rate": 7.384955589185552e-06, + "loss": 1.2029, + "step": 20028 + }, + { + "epoch": 5.965561533163313, + "grad_norm": 0.482476145029068, + "learning_rate": 7.384024599703502e-06, + "loss": 1.2281, + "step": 20029 + }, + { + "epoch": 5.965859379362981, + "grad_norm": 0.3051374554634094, + "learning_rate": 7.3830936345601785e-06, + "loss": 1.2111, + "step": 20030 + }, + { + "epoch": 5.96615722556265, + "grad_norm": 0.4405565857887268, + "learning_rate": 7.382162693764245e-06, + "loss": 1.2121, + "step": 20031 + }, + { + "epoch": 5.966455071762319, + "grad_norm": 0.3059985339641571, + "learning_rate": 7.381231777324365e-06, + "loss": 1.2155, + "step": 20032 + }, + { + "epoch": 5.966752917961987, + "grad_norm": 0.33760663866996765, + "learning_rate": 7.380300885249197e-06, + "loss": 1.208, + "step": 20033 + }, + { + "epoch": 5.967050764161656, + "grad_norm": 0.28422605991363525, + "learning_rate": 7.379370017547399e-06, + "loss": 1.2118, + "step": 20034 + }, + { + "epoch": 5.9673486103613245, + "grad_norm": 0.262935608625412, + "learning_rate": 7.37843917422764e-06, + "loss": 1.2182, + "step": 20035 + }, + { + "epoch": 5.967646456560994, + "grad_norm": 0.30111443996429443, + "learning_rate": 7.377508355298572e-06, + "loss": 1.2026, + "step": 20036 + }, + { + "epoch": 5.967944302760662, + "grad_norm": 0.3197158873081207, + "learning_rate": 7.376577560768861e-06, + "loss": 1.2114, + "step": 20037 + }, + { + "epoch": 5.96824214896033, + "grad_norm": 0.2820962965488434, + "learning_rate": 7.375646790647162e-06, + "loss": 1.2015, + "step": 20038 + }, + { + "epoch": 5.96853999516, + "grad_norm": 0.3418351709842682, + "learning_rate": 7.37471604494214e-06, + "loss": 1.2303, + "step": 20039 + }, + { + "epoch": 5.968837841359668, + "grad_norm": 0.2564753592014313, + "learning_rate": 7.37378532366245e-06, + "loss": 1.2147, + "step": 20040 + }, + { + "epoch": 5.969135687559336, + "grad_norm": 0.2663310170173645, + "learning_rate": 7.3728546268167554e-06, + "loss": 1.2172, + "step": 20041 + }, + { + "epoch": 5.969433533759005, + "grad_norm": 0.44179773330688477, + "learning_rate": 7.371923954413712e-06, + "loss": 1.2004, + "step": 20042 + }, + { + "epoch": 5.969731379958674, + "grad_norm": 0.3131563663482666, + "learning_rate": 7.370993306461978e-06, + "loss": 1.2097, + "step": 20043 + }, + { + "epoch": 5.970029226158342, + "grad_norm": 0.409889817237854, + "learning_rate": 7.3700626829702185e-06, + "loss": 1.2083, + "step": 20044 + }, + { + "epoch": 5.970327072358011, + "grad_norm": 0.2620203495025635, + "learning_rate": 7.369132083947083e-06, + "loss": 1.2062, + "step": 20045 + }, + { + "epoch": 5.97062491855768, + "grad_norm": 0.34795284271240234, + "learning_rate": 7.368201509401238e-06, + "loss": 1.2023, + "step": 20046 + }, + { + "epoch": 5.970922764757349, + "grad_norm": 0.45827654004096985, + "learning_rate": 7.367270959341337e-06, + "loss": 1.2206, + "step": 20047 + }, + { + "epoch": 5.971220610957017, + "grad_norm": 0.6986866593360901, + "learning_rate": 7.366340433776034e-06, + "loss": 1.2059, + "step": 20048 + }, + { + "epoch": 5.9715184571566855, + "grad_norm": 0.28741317987442017, + "learning_rate": 7.3654099327139936e-06, + "loss": 1.2188, + "step": 20049 + }, + { + "epoch": 5.971816303356355, + "grad_norm": 0.40813660621643066, + "learning_rate": 7.364479456163873e-06, + "loss": 1.2007, + "step": 20050 + }, + { + "epoch": 5.972114149556023, + "grad_norm": 0.4476781189441681, + "learning_rate": 7.363549004134322e-06, + "loss": 1.2138, + "step": 20051 + }, + { + "epoch": 5.972411995755691, + "grad_norm": 0.27198249101638794, + "learning_rate": 7.362618576634005e-06, + "loss": 1.2125, + "step": 20052 + }, + { + "epoch": 5.972709841955361, + "grad_norm": 0.5791362524032593, + "learning_rate": 7.361688173671576e-06, + "loss": 1.2073, + "step": 20053 + }, + { + "epoch": 5.973007688155029, + "grad_norm": 0.30177468061447144, + "learning_rate": 7.3607577952556864e-06, + "loss": 1.2159, + "step": 20054 + }, + { + "epoch": 5.973305534354697, + "grad_norm": 0.35324525833129883, + "learning_rate": 7.3598274413949996e-06, + "loss": 1.2069, + "step": 20055 + }, + { + "epoch": 5.9736033805543665, + "grad_norm": 0.4216848313808441, + "learning_rate": 7.358897112098171e-06, + "loss": 1.2163, + "step": 20056 + }, + { + "epoch": 5.973901226754035, + "grad_norm": 0.2792426347732544, + "learning_rate": 7.357966807373848e-06, + "loss": 1.2211, + "step": 20057 + }, + { + "epoch": 5.974199072953704, + "grad_norm": 0.367888867855072, + "learning_rate": 7.357036527230699e-06, + "loss": 1.2098, + "step": 20058 + }, + { + "epoch": 5.974496919153372, + "grad_norm": 0.2614375054836273, + "learning_rate": 7.356106271677368e-06, + "loss": 1.1958, + "step": 20059 + }, + { + "epoch": 5.974794765353041, + "grad_norm": 0.2798803150653839, + "learning_rate": 7.355176040722514e-06, + "loss": 1.2257, + "step": 20060 + }, + { + "epoch": 5.975092611552709, + "grad_norm": 0.2662627696990967, + "learning_rate": 7.354245834374793e-06, + "loss": 1.2105, + "step": 20061 + }, + { + "epoch": 5.975390457752378, + "grad_norm": 0.2591420114040375, + "learning_rate": 7.3533156526428565e-06, + "loss": 1.222, + "step": 20062 + }, + { + "epoch": 5.9756883039520465, + "grad_norm": 0.3019726574420929, + "learning_rate": 7.3523854955353635e-06, + "loss": 1.2012, + "step": 20063 + }, + { + "epoch": 5.975986150151716, + "grad_norm": 0.2563430368900299, + "learning_rate": 7.351455363060967e-06, + "loss": 1.217, + "step": 20064 + }, + { + "epoch": 5.976283996351384, + "grad_norm": 0.2991671562194824, + "learning_rate": 7.350525255228314e-06, + "loss": 1.21, + "step": 20065 + }, + { + "epoch": 5.976581842551052, + "grad_norm": 0.28286856412887573, + "learning_rate": 7.3495951720460665e-06, + "loss": 1.1913, + "step": 20066 + }, + { + "epoch": 5.976879688750722, + "grad_norm": 0.25297608971595764, + "learning_rate": 7.348665113522877e-06, + "loss": 1.2024, + "step": 20067 + }, + { + "epoch": 5.97717753495039, + "grad_norm": 0.42464059591293335, + "learning_rate": 7.347735079667391e-06, + "loss": 1.2023, + "step": 20068 + }, + { + "epoch": 5.977475381150058, + "grad_norm": 0.32270076870918274, + "learning_rate": 7.346805070488271e-06, + "loss": 1.2263, + "step": 20069 + }, + { + "epoch": 5.9777732273497275, + "grad_norm": 0.30218422412872314, + "learning_rate": 7.3458750859941655e-06, + "loss": 1.222, + "step": 20070 + }, + { + "epoch": 5.978071073549396, + "grad_norm": 0.2741331160068512, + "learning_rate": 7.344945126193723e-06, + "loss": 1.2006, + "step": 20071 + }, + { + "epoch": 5.978368919749064, + "grad_norm": 0.4426378309726715, + "learning_rate": 7.344015191095603e-06, + "loss": 1.1973, + "step": 20072 + }, + { + "epoch": 5.978666765948733, + "grad_norm": 0.260190486907959, + "learning_rate": 7.343085280708455e-06, + "loss": 1.2037, + "step": 20073 + }, + { + "epoch": 5.978964612148402, + "grad_norm": 0.37847578525543213, + "learning_rate": 7.342155395040926e-06, + "loss": 1.2049, + "step": 20074 + }, + { + "epoch": 5.979262458348071, + "grad_norm": 0.26997873187065125, + "learning_rate": 7.341225534101676e-06, + "loss": 1.2282, + "step": 20075 + }, + { + "epoch": 5.979560304547739, + "grad_norm": 0.4619077444076538, + "learning_rate": 7.3402956978993514e-06, + "loss": 1.2095, + "step": 20076 + }, + { + "epoch": 5.979858150747408, + "grad_norm": 0.28734198212623596, + "learning_rate": 7.3393658864426e-06, + "loss": 1.2076, + "step": 20077 + }, + { + "epoch": 5.980155996947077, + "grad_norm": 0.28254473209381104, + "learning_rate": 7.338436099740079e-06, + "loss": 1.2234, + "step": 20078 + }, + { + "epoch": 5.980453843146745, + "grad_norm": 0.27819713950157166, + "learning_rate": 7.337506337800433e-06, + "loss": 1.2155, + "step": 20079 + }, + { + "epoch": 5.980751689346413, + "grad_norm": 0.2774719297885895, + "learning_rate": 7.33657660063232e-06, + "loss": 1.212, + "step": 20080 + }, + { + "epoch": 5.981049535546083, + "grad_norm": 0.29275059700012207, + "learning_rate": 7.335646888244386e-06, + "loss": 1.1926, + "step": 20081 + }, + { + "epoch": 5.981347381745751, + "grad_norm": 0.26836857199668884, + "learning_rate": 7.334717200645278e-06, + "loss": 1.2126, + "step": 20082 + }, + { + "epoch": 5.981645227945419, + "grad_norm": 0.2950221300125122, + "learning_rate": 7.333787537843652e-06, + "loss": 1.2217, + "step": 20083 + }, + { + "epoch": 5.9819430741450885, + "grad_norm": 0.2860493063926697, + "learning_rate": 7.3328578998481514e-06, + "loss": 1.2034, + "step": 20084 + }, + { + "epoch": 5.982240920344757, + "grad_norm": 0.2634318470954895, + "learning_rate": 7.331928286667427e-06, + "loss": 1.2034, + "step": 20085 + }, + { + "epoch": 5.982538766544426, + "grad_norm": 0.2731645405292511, + "learning_rate": 7.330998698310133e-06, + "loss": 1.1952, + "step": 20086 + }, + { + "epoch": 5.982836612744094, + "grad_norm": 0.29142895340919495, + "learning_rate": 7.330069134784914e-06, + "loss": 1.218, + "step": 20087 + }, + { + "epoch": 5.983134458943763, + "grad_norm": 0.275729775428772, + "learning_rate": 7.329139596100414e-06, + "loss": 1.2227, + "step": 20088 + }, + { + "epoch": 5.983432305143432, + "grad_norm": 0.38682979345321655, + "learning_rate": 7.328210082265289e-06, + "loss": 1.2105, + "step": 20089 + }, + { + "epoch": 5.9837301513431, + "grad_norm": 0.3280767798423767, + "learning_rate": 7.327280593288186e-06, + "loss": 1.2086, + "step": 20090 + }, + { + "epoch": 5.984027997542769, + "grad_norm": 0.3915572762489319, + "learning_rate": 7.326351129177746e-06, + "loss": 1.2036, + "step": 20091 + }, + { + "epoch": 5.984325843742438, + "grad_norm": 0.25731709599494934, + "learning_rate": 7.325421689942627e-06, + "loss": 1.2025, + "step": 20092 + }, + { + "epoch": 5.984623689942106, + "grad_norm": 0.7086104154586792, + "learning_rate": 7.32449227559147e-06, + "loss": 1.1935, + "step": 20093 + }, + { + "epoch": 5.9849215361417745, + "grad_norm": 0.3194904923439026, + "learning_rate": 7.323562886132919e-06, + "loss": 1.2096, + "step": 20094 + }, + { + "epoch": 5.985219382341444, + "grad_norm": 0.4881739318370819, + "learning_rate": 7.3226335215756274e-06, + "loss": 1.2136, + "step": 20095 + }, + { + "epoch": 5.985517228541112, + "grad_norm": 0.2634449601173401, + "learning_rate": 7.32170418192824e-06, + "loss": 1.2184, + "step": 20096 + }, + { + "epoch": 5.985815074740781, + "grad_norm": 0.4980546832084656, + "learning_rate": 7.3207748671994016e-06, + "loss": 1.2074, + "step": 20097 + }, + { + "epoch": 5.9861129209404496, + "grad_norm": 0.2874886691570282, + "learning_rate": 7.319845577397763e-06, + "loss": 1.2205, + "step": 20098 + }, + { + "epoch": 5.986410767140118, + "grad_norm": 0.342190682888031, + "learning_rate": 7.3189163125319615e-06, + "loss": 1.2136, + "step": 20099 + }, + { + "epoch": 5.986708613339786, + "grad_norm": 0.4686349630355835, + "learning_rate": 7.317987072610653e-06, + "loss": 1.2031, + "step": 20100 + }, + { + "epoch": 5.987006459539455, + "grad_norm": 0.29796168208122253, + "learning_rate": 7.317057857642476e-06, + "loss": 1.2036, + "step": 20101 + }, + { + "epoch": 5.987304305739124, + "grad_norm": 0.5794475674629211, + "learning_rate": 7.316128667636077e-06, + "loss": 1.2193, + "step": 20102 + }, + { + "epoch": 5.987602151938793, + "grad_norm": 0.36775219440460205, + "learning_rate": 7.315199502600106e-06, + "loss": 1.2163, + "step": 20103 + }, + { + "epoch": 5.987899998138461, + "grad_norm": 0.380149245262146, + "learning_rate": 7.314270362543203e-06, + "loss": 1.2146, + "step": 20104 + }, + { + "epoch": 5.98819784433813, + "grad_norm": 0.34766989946365356, + "learning_rate": 7.313341247474009e-06, + "loss": 1.2176, + "step": 20105 + }, + { + "epoch": 5.988495690537799, + "grad_norm": 0.7177759408950806, + "learning_rate": 7.312412157401179e-06, + "loss": 1.2095, + "step": 20106 + }, + { + "epoch": 5.988793536737467, + "grad_norm": 0.274689257144928, + "learning_rate": 7.311483092333347e-06, + "loss": 1.2226, + "step": 20107 + }, + { + "epoch": 5.9890913829371355, + "grad_norm": 0.4082862138748169, + "learning_rate": 7.310554052279161e-06, + "loss": 1.2127, + "step": 20108 + }, + { + "epoch": 5.989389229136805, + "grad_norm": 0.3884943425655365, + "learning_rate": 7.309625037247268e-06, + "loss": 1.2079, + "step": 20109 + }, + { + "epoch": 5.989687075336473, + "grad_norm": 0.5350891351699829, + "learning_rate": 7.308696047246304e-06, + "loss": 1.2075, + "step": 20110 + }, + { + "epoch": 5.989984921536141, + "grad_norm": 0.4405856430530548, + "learning_rate": 7.30776708228492e-06, + "loss": 1.2106, + "step": 20111 + }, + { + "epoch": 5.990282767735811, + "grad_norm": 0.4748976528644562, + "learning_rate": 7.306838142371756e-06, + "loss": 1.2141, + "step": 20112 + }, + { + "epoch": 5.990580613935479, + "grad_norm": 0.4101448655128479, + "learning_rate": 7.3059092275154495e-06, + "loss": 1.1996, + "step": 20113 + }, + { + "epoch": 5.990878460135148, + "grad_norm": 0.31333962082862854, + "learning_rate": 7.30498033772465e-06, + "loss": 1.2052, + "step": 20114 + }, + { + "epoch": 5.9911763063348165, + "grad_norm": 0.3447631895542145, + "learning_rate": 7.304051473008e-06, + "loss": 1.2148, + "step": 20115 + }, + { + "epoch": 5.991474152534485, + "grad_norm": 0.37121906876564026, + "learning_rate": 7.3031226333741334e-06, + "loss": 1.2133, + "step": 20116 + }, + { + "epoch": 5.991771998734154, + "grad_norm": 0.3757255971431732, + "learning_rate": 7.302193818831703e-06, + "loss": 1.2047, + "step": 20117 + }, + { + "epoch": 5.992069844933822, + "grad_norm": 0.3267693817615509, + "learning_rate": 7.301265029389342e-06, + "loss": 1.1951, + "step": 20118 + }, + { + "epoch": 5.992367691133491, + "grad_norm": 0.287718802690506, + "learning_rate": 7.300336265055697e-06, + "loss": 1.2135, + "step": 20119 + }, + { + "epoch": 5.99266553733316, + "grad_norm": 0.4105055034160614, + "learning_rate": 7.299407525839404e-06, + "loss": 1.2068, + "step": 20120 + }, + { + "epoch": 5.992963383532828, + "grad_norm": 0.2679644227027893, + "learning_rate": 7.29847881174911e-06, + "loss": 1.2249, + "step": 20121 + }, + { + "epoch": 5.9932612297324965, + "grad_norm": 0.2722683846950531, + "learning_rate": 7.297550122793447e-06, + "loss": 1.1984, + "step": 20122 + }, + { + "epoch": 5.993559075932166, + "grad_norm": 0.37727558612823486, + "learning_rate": 7.296621458981066e-06, + "loss": 1.2037, + "step": 20123 + }, + { + "epoch": 5.993856922131834, + "grad_norm": 0.2851454019546509, + "learning_rate": 7.295692820320599e-06, + "loss": 1.222, + "step": 20124 + }, + { + "epoch": 5.994154768331503, + "grad_norm": 0.346204936504364, + "learning_rate": 7.294764206820688e-06, + "loss": 1.2093, + "step": 20125 + }, + { + "epoch": 5.994452614531172, + "grad_norm": 0.29175764322280884, + "learning_rate": 7.293835618489977e-06, + "loss": 1.2222, + "step": 20126 + }, + { + "epoch": 5.99475046073084, + "grad_norm": 0.2784067690372467, + "learning_rate": 7.292907055337099e-06, + "loss": 1.1932, + "step": 20127 + }, + { + "epoch": 5.995048306930508, + "grad_norm": 0.2747276723384857, + "learning_rate": 7.2919785173706994e-06, + "loss": 1.209, + "step": 20128 + }, + { + "epoch": 5.9953461531301775, + "grad_norm": 0.47183743119239807, + "learning_rate": 7.291050004599414e-06, + "loss": 1.2141, + "step": 20129 + }, + { + "epoch": 5.995643999329846, + "grad_norm": 0.38707008957862854, + "learning_rate": 7.290121517031879e-06, + "loss": 1.2008, + "step": 20130 + }, + { + "epoch": 5.995941845529515, + "grad_norm": 0.26040273904800415, + "learning_rate": 7.289193054676735e-06, + "loss": 1.2055, + "step": 20131 + }, + { + "epoch": 5.996239691729183, + "grad_norm": 0.2804533839225769, + "learning_rate": 7.2882646175426254e-06, + "loss": 1.2178, + "step": 20132 + }, + { + "epoch": 5.996537537928852, + "grad_norm": 0.2642832100391388, + "learning_rate": 7.287336205638178e-06, + "loss": 1.2115, + "step": 20133 + }, + { + "epoch": 5.996835384128521, + "grad_norm": 0.26809191703796387, + "learning_rate": 7.286407818972042e-06, + "loss": 1.2166, + "step": 20134 + }, + { + "epoch": 5.997133230328189, + "grad_norm": 0.31540343165397644, + "learning_rate": 7.2854794575528485e-06, + "loss": 1.2273, + "step": 20135 + }, + { + "epoch": 5.997431076527858, + "grad_norm": 0.31573593616485596, + "learning_rate": 7.284551121389232e-06, + "loss": 1.2178, + "step": 20136 + }, + { + "epoch": 5.997728922727527, + "grad_norm": 0.2828293442726135, + "learning_rate": 7.283622810489836e-06, + "loss": 1.2003, + "step": 20137 + }, + { + "epoch": 5.998026768927195, + "grad_norm": 0.3221212923526764, + "learning_rate": 7.282694524863297e-06, + "loss": 1.2245, + "step": 20138 + }, + { + "epoch": 5.998324615126863, + "grad_norm": 0.32015347480773926, + "learning_rate": 7.281766264518244e-06, + "loss": 1.2138, + "step": 20139 + }, + { + "epoch": 5.998622461326533, + "grad_norm": 0.28515100479125977, + "learning_rate": 7.280838029463324e-06, + "loss": 1.2293, + "step": 20140 + }, + { + "epoch": 5.998920307526201, + "grad_norm": 0.2944874167442322, + "learning_rate": 7.279909819707166e-06, + "loss": 1.1981, + "step": 20141 + }, + { + "epoch": 5.99921815372587, + "grad_norm": 0.26799100637435913, + "learning_rate": 7.278981635258408e-06, + "loss": 1.2172, + "step": 20142 + }, + { + "epoch": 5.9995159999255385, + "grad_norm": 0.41549232602119446, + "learning_rate": 7.278053476125686e-06, + "loss": 1.1884, + "step": 20143 + }, + { + "epoch": 5.999813846125207, + "grad_norm": 0.26328161358833313, + "learning_rate": 7.277125342317632e-06, + "loss": 1.2016, + "step": 20144 + }, + { + "epoch": 6.000111692324876, + "grad_norm": 0.43518418073654175, + "learning_rate": 7.27619723384289e-06, + "loss": 1.2256, + "step": 20145 + }, + { + "epoch": 6.000409538524544, + "grad_norm": 0.32351791858673096, + "learning_rate": 7.2752691507100895e-06, + "loss": 1.2045, + "step": 20146 + }, + { + "epoch": 6.000707384724213, + "grad_norm": 0.4505084156990051, + "learning_rate": 7.274341092927861e-06, + "loss": 1.2168, + "step": 20147 + }, + { + "epoch": 6.001005230923882, + "grad_norm": 0.302665114402771, + "learning_rate": 7.273413060504846e-06, + "loss": 1.2008, + "step": 20148 + }, + { + "epoch": 6.00130307712355, + "grad_norm": 0.43544793128967285, + "learning_rate": 7.272485053449676e-06, + "loss": 1.2138, + "step": 20149 + }, + { + "epoch": 6.001600923323219, + "grad_norm": 0.3466516435146332, + "learning_rate": 7.271557071770983e-06, + "loss": 1.213, + "step": 20150 + }, + { + "epoch": 6.001898769522888, + "grad_norm": 0.40594616532325745, + "learning_rate": 7.270629115477406e-06, + "loss": 1.2088, + "step": 20151 + }, + { + "epoch": 6.002196615722556, + "grad_norm": 0.3146449029445648, + "learning_rate": 7.269701184577577e-06, + "loss": 1.2139, + "step": 20152 + }, + { + "epoch": 6.002494461922225, + "grad_norm": 0.29757311940193176, + "learning_rate": 7.2687732790801225e-06, + "loss": 1.192, + "step": 20153 + }, + { + "epoch": 6.002792308121894, + "grad_norm": 0.2778853476047516, + "learning_rate": 7.267845398993685e-06, + "loss": 1.2113, + "step": 20154 + }, + { + "epoch": 6.003090154321562, + "grad_norm": 0.33108317852020264, + "learning_rate": 7.266917544326894e-06, + "loss": 1.2155, + "step": 20155 + }, + { + "epoch": 6.003388000521231, + "grad_norm": 0.26440855860710144, + "learning_rate": 7.265989715088377e-06, + "loss": 1.2172, + "step": 20156 + }, + { + "epoch": 6.0036858467208996, + "grad_norm": 0.3025246560573578, + "learning_rate": 7.265061911286777e-06, + "loss": 1.2307, + "step": 20157 + }, + { + "epoch": 6.003983692920568, + "grad_norm": 0.2890378534793854, + "learning_rate": 7.264134132930719e-06, + "loss": 1.1996, + "step": 20158 + }, + { + "epoch": 6.004281539120237, + "grad_norm": 0.32154908776283264, + "learning_rate": 7.263206380028833e-06, + "loss": 1.2276, + "step": 20159 + }, + { + "epoch": 6.004579385319905, + "grad_norm": 0.2817894220352173, + "learning_rate": 7.262278652589756e-06, + "loss": 1.1953, + "step": 20160 + }, + { + "epoch": 6.004877231519574, + "grad_norm": 0.3327573835849762, + "learning_rate": 7.261350950622115e-06, + "loss": 1.2209, + "step": 20161 + }, + { + "epoch": 6.005175077719243, + "grad_norm": 0.2718318700790405, + "learning_rate": 7.260423274134547e-06, + "loss": 1.1992, + "step": 20162 + }, + { + "epoch": 6.005472923918911, + "grad_norm": 0.41871002316474915, + "learning_rate": 7.25949562313568e-06, + "loss": 1.2104, + "step": 20163 + }, + { + "epoch": 6.00577077011858, + "grad_norm": 0.5034345984458923, + "learning_rate": 7.258567997634141e-06, + "loss": 1.2115, + "step": 20164 + }, + { + "epoch": 6.006068616318249, + "grad_norm": 0.25733083486557007, + "learning_rate": 7.257640397638567e-06, + "loss": 1.2109, + "step": 20165 + }, + { + "epoch": 6.006366462517917, + "grad_norm": 0.33275723457336426, + "learning_rate": 7.256712823157584e-06, + "loss": 1.2166, + "step": 20166 + }, + { + "epoch": 6.006664308717586, + "grad_norm": 0.4351447820663452, + "learning_rate": 7.25578527419982e-06, + "loss": 1.2281, + "step": 20167 + }, + { + "epoch": 6.006962154917255, + "grad_norm": 0.31585893034935, + "learning_rate": 7.254857750773913e-06, + "loss": 1.1977, + "step": 20168 + }, + { + "epoch": 6.007260001116923, + "grad_norm": 0.3916688561439514, + "learning_rate": 7.253930252888487e-06, + "loss": 1.203, + "step": 20169 + }, + { + "epoch": 6.007557847316592, + "grad_norm": 0.3016495108604431, + "learning_rate": 7.253002780552167e-06, + "loss": 1.1975, + "step": 20170 + }, + { + "epoch": 6.007855693516261, + "grad_norm": 0.3804759383201599, + "learning_rate": 7.252075333773594e-06, + "loss": 1.205, + "step": 20171 + }, + { + "epoch": 6.008153539715929, + "grad_norm": 0.2956993281841278, + "learning_rate": 7.251147912561385e-06, + "loss": 1.1952, + "step": 20172 + }, + { + "epoch": 6.008451385915598, + "grad_norm": 0.40233540534973145, + "learning_rate": 7.250220516924174e-06, + "loss": 1.2127, + "step": 20173 + }, + { + "epoch": 6.0087492321152665, + "grad_norm": 0.30694687366485596, + "learning_rate": 7.24929314687059e-06, + "loss": 1.2116, + "step": 20174 + }, + { + "epoch": 6.009047078314935, + "grad_norm": 0.3779052495956421, + "learning_rate": 7.248365802409262e-06, + "loss": 1.1989, + "step": 20175 + }, + { + "epoch": 6.009344924514604, + "grad_norm": 0.4814597964286804, + "learning_rate": 7.247438483548811e-06, + "loss": 1.2011, + "step": 20176 + }, + { + "epoch": 6.009642770714272, + "grad_norm": 0.43889567255973816, + "learning_rate": 7.246511190297871e-06, + "loss": 1.2116, + "step": 20177 + }, + { + "epoch": 6.009940616913941, + "grad_norm": 0.5229358673095703, + "learning_rate": 7.2455839226650696e-06, + "loss": 1.2141, + "step": 20178 + }, + { + "epoch": 6.01023846311361, + "grad_norm": 0.32213860750198364, + "learning_rate": 7.244656680659032e-06, + "loss": 1.195, + "step": 20179 + }, + { + "epoch": 6.010536309313278, + "grad_norm": 0.9367316961288452, + "learning_rate": 7.243729464288387e-06, + "loss": 1.2101, + "step": 20180 + }, + { + "epoch": 6.010834155512947, + "grad_norm": 0.5796099901199341, + "learning_rate": 7.242802273561755e-06, + "loss": 1.204, + "step": 20181 + }, + { + "epoch": 6.011132001712616, + "grad_norm": 0.596804141998291, + "learning_rate": 7.241875108487773e-06, + "loss": 1.2033, + "step": 20182 + }, + { + "epoch": 6.011429847912284, + "grad_norm": 0.43341487646102905, + "learning_rate": 7.240947969075059e-06, + "loss": 1.2018, + "step": 20183 + }, + { + "epoch": 6.011727694111953, + "grad_norm": 0.49243173003196716, + "learning_rate": 7.240020855332241e-06, + "loss": 1.2303, + "step": 20184 + }, + { + "epoch": 6.012025540311622, + "grad_norm": 0.30709803104400635, + "learning_rate": 7.239093767267946e-06, + "loss": 1.2088, + "step": 20185 + }, + { + "epoch": 6.01232338651129, + "grad_norm": 0.3742426037788391, + "learning_rate": 7.238166704890801e-06, + "loss": 1.1972, + "step": 20186 + }, + { + "epoch": 6.012621232710959, + "grad_norm": 0.3649977743625641, + "learning_rate": 7.237239668209425e-06, + "loss": 1.2157, + "step": 20187 + }, + { + "epoch": 6.0129190789106275, + "grad_norm": 0.26593589782714844, + "learning_rate": 7.2363126572324505e-06, + "loss": 1.2058, + "step": 20188 + }, + { + "epoch": 6.013216925110296, + "grad_norm": 0.5346266031265259, + "learning_rate": 7.235385671968498e-06, + "loss": 1.2205, + "step": 20189 + }, + { + "epoch": 6.013514771309965, + "grad_norm": 0.3156408965587616, + "learning_rate": 7.234458712426189e-06, + "loss": 1.2021, + "step": 20190 + }, + { + "epoch": 6.013812617509633, + "grad_norm": 0.45501449704170227, + "learning_rate": 7.2335317786141575e-06, + "loss": 1.1988, + "step": 20191 + }, + { + "epoch": 6.014110463709302, + "grad_norm": 0.35815557837486267, + "learning_rate": 7.232604870541022e-06, + "loss": 1.1927, + "step": 20192 + }, + { + "epoch": 6.014408309908971, + "grad_norm": 0.30453506112098694, + "learning_rate": 7.231677988215402e-06, + "loss": 1.2149, + "step": 20193 + }, + { + "epoch": 6.014706156108639, + "grad_norm": 0.4766293466091156, + "learning_rate": 7.230751131645929e-06, + "loss": 1.2094, + "step": 20194 + }, + { + "epoch": 6.0150040023083085, + "grad_norm": 0.29360222816467285, + "learning_rate": 7.229824300841219e-06, + "loss": 1.2275, + "step": 20195 + }, + { + "epoch": 6.015301848507977, + "grad_norm": 0.3880390226840973, + "learning_rate": 7.228897495809901e-06, + "loss": 1.2104, + "step": 20196 + }, + { + "epoch": 6.015599694707645, + "grad_norm": 0.45414289832115173, + "learning_rate": 7.227970716560597e-06, + "loss": 1.2102, + "step": 20197 + }, + { + "epoch": 6.015897540907314, + "grad_norm": 0.35279580950737, + "learning_rate": 7.227043963101925e-06, + "loss": 1.2236, + "step": 20198 + }, + { + "epoch": 6.016195387106983, + "grad_norm": 0.48844024538993835, + "learning_rate": 7.226117235442515e-06, + "loss": 1.2091, + "step": 20199 + }, + { + "epoch": 6.016493233306651, + "grad_norm": 0.2899201512336731, + "learning_rate": 7.225190533590982e-06, + "loss": 1.2038, + "step": 20200 + }, + { + "epoch": 6.01679107950632, + "grad_norm": 0.3832801282405853, + "learning_rate": 7.224263857555952e-06, + "loss": 1.2166, + "step": 20201 + }, + { + "epoch": 6.0170889257059885, + "grad_norm": 0.33199718594551086, + "learning_rate": 7.223337207346045e-06, + "loss": 1.2086, + "step": 20202 + }, + { + "epoch": 6.017386771905657, + "grad_norm": 0.4868021607398987, + "learning_rate": 7.222410582969885e-06, + "loss": 1.2091, + "step": 20203 + }, + { + "epoch": 6.017684618105326, + "grad_norm": 0.31164202094078064, + "learning_rate": 7.221483984436089e-06, + "loss": 1.2012, + "step": 20204 + }, + { + "epoch": 6.017982464304994, + "grad_norm": 0.33542874455451965, + "learning_rate": 7.220557411753281e-06, + "loss": 1.1989, + "step": 20205 + }, + { + "epoch": 6.018280310504663, + "grad_norm": 0.3539286255836487, + "learning_rate": 7.219630864930081e-06, + "loss": 1.2085, + "step": 20206 + }, + { + "epoch": 6.018578156704332, + "grad_norm": 0.2705722749233246, + "learning_rate": 7.21870434397511e-06, + "loss": 1.2092, + "step": 20207 + }, + { + "epoch": 6.018876002904, + "grad_norm": 0.3076128363609314, + "learning_rate": 7.217777848896985e-06, + "loss": 1.1942, + "step": 20208 + }, + { + "epoch": 6.0191738491036695, + "grad_norm": 0.2987601161003113, + "learning_rate": 7.216851379704333e-06, + "loss": 1.2177, + "step": 20209 + }, + { + "epoch": 6.019471695303338, + "grad_norm": 0.2667655050754547, + "learning_rate": 7.215924936405764e-06, + "loss": 1.2322, + "step": 20210 + }, + { + "epoch": 6.019769541503006, + "grad_norm": 0.27794232964515686, + "learning_rate": 7.214998519009908e-06, + "loss": 1.2056, + "step": 20211 + }, + { + "epoch": 6.020067387702675, + "grad_norm": 0.30565345287323, + "learning_rate": 7.214072127525373e-06, + "loss": 1.2197, + "step": 20212 + }, + { + "epoch": 6.020365233902344, + "grad_norm": 0.32783243060112, + "learning_rate": 7.213145761960789e-06, + "loss": 1.209, + "step": 20213 + }, + { + "epoch": 6.020663080102012, + "grad_norm": 0.2744307816028595, + "learning_rate": 7.212219422324771e-06, + "loss": 1.2017, + "step": 20214 + }, + { + "epoch": 6.020960926301681, + "grad_norm": 0.2722924053668976, + "learning_rate": 7.211293108625932e-06, + "loss": 1.2091, + "step": 20215 + }, + { + "epoch": 6.0212587725013496, + "grad_norm": 0.31360188126564026, + "learning_rate": 7.2103668208729e-06, + "loss": 1.2202, + "step": 20216 + }, + { + "epoch": 6.021556618701018, + "grad_norm": 0.37984272837638855, + "learning_rate": 7.209440559074287e-06, + "loss": 1.1931, + "step": 20217 + }, + { + "epoch": 6.021854464900687, + "grad_norm": 0.29541170597076416, + "learning_rate": 7.2085143232387075e-06, + "loss": 1.2097, + "step": 20218 + }, + { + "epoch": 6.022152311100355, + "grad_norm": 0.45290622115135193, + "learning_rate": 7.207588113374786e-06, + "loss": 1.2064, + "step": 20219 + }, + { + "epoch": 6.022450157300025, + "grad_norm": 0.32291334867477417, + "learning_rate": 7.20666192949114e-06, + "loss": 1.1922, + "step": 20220 + }, + { + "epoch": 6.022748003499693, + "grad_norm": 0.46814969182014465, + "learning_rate": 7.205735771596379e-06, + "loss": 1.2011, + "step": 20221 + }, + { + "epoch": 6.023045849699361, + "grad_norm": 0.2647963762283325, + "learning_rate": 7.204809639699128e-06, + "loss": 1.2162, + "step": 20222 + }, + { + "epoch": 6.0233436958990305, + "grad_norm": 0.2693672776222229, + "learning_rate": 7.2038835338080026e-06, + "loss": 1.2214, + "step": 20223 + }, + { + "epoch": 6.023641542098699, + "grad_norm": 0.3999970555305481, + "learning_rate": 7.202957453931611e-06, + "loss": 1.2086, + "step": 20224 + }, + { + "epoch": 6.023939388298367, + "grad_norm": 0.41179734468460083, + "learning_rate": 7.202031400078578e-06, + "loss": 1.1991, + "step": 20225 + }, + { + "epoch": 6.024237234498036, + "grad_norm": 0.26541635394096375, + "learning_rate": 7.201105372257513e-06, + "loss": 1.2184, + "step": 20226 + }, + { + "epoch": 6.024535080697705, + "grad_norm": 0.2938902676105499, + "learning_rate": 7.200179370477042e-06, + "loss": 1.2234, + "step": 20227 + }, + { + "epoch": 6.024832926897373, + "grad_norm": 0.27487459778785706, + "learning_rate": 7.199253394745772e-06, + "loss": 1.2094, + "step": 20228 + }, + { + "epoch": 6.025130773097042, + "grad_norm": 0.3359871506690979, + "learning_rate": 7.198327445072318e-06, + "loss": 1.2182, + "step": 20229 + }, + { + "epoch": 6.025428619296711, + "grad_norm": 0.3092283606529236, + "learning_rate": 7.1974015214652996e-06, + "loss": 1.2182, + "step": 20230 + }, + { + "epoch": 6.025726465496379, + "grad_norm": 0.28621119260787964, + "learning_rate": 7.196475623933326e-06, + "loss": 1.2198, + "step": 20231 + }, + { + "epoch": 6.026024311696048, + "grad_norm": 0.7144689559936523, + "learning_rate": 7.195549752485013e-06, + "loss": 1.2126, + "step": 20232 + }, + { + "epoch": 6.0263221578957165, + "grad_norm": 0.41839155554771423, + "learning_rate": 7.19462390712898e-06, + "loss": 1.2127, + "step": 20233 + }, + { + "epoch": 6.026620004095386, + "grad_norm": 0.5097765922546387, + "learning_rate": 7.193698087873838e-06, + "loss": 1.2038, + "step": 20234 + }, + { + "epoch": 6.026917850295054, + "grad_norm": 0.3441077470779419, + "learning_rate": 7.1927722947281956e-06, + "loss": 1.2117, + "step": 20235 + }, + { + "epoch": 6.027215696494722, + "grad_norm": 0.4326217770576477, + "learning_rate": 7.191846527700672e-06, + "loss": 1.2149, + "step": 20236 + }, + { + "epoch": 6.0275135426943915, + "grad_norm": 0.260591596364975, + "learning_rate": 7.190920786799882e-06, + "loss": 1.2043, + "step": 20237 + }, + { + "epoch": 6.02781138889406, + "grad_norm": 0.3409421741962433, + "learning_rate": 7.18999507203443e-06, + "loss": 1.2193, + "step": 20238 + }, + { + "epoch": 6.028109235093728, + "grad_norm": 0.35025498270988464, + "learning_rate": 7.189069383412941e-06, + "loss": 1.2049, + "step": 20239 + }, + { + "epoch": 6.028407081293397, + "grad_norm": 0.2884001135826111, + "learning_rate": 7.188143720944018e-06, + "loss": 1.2022, + "step": 20240 + }, + { + "epoch": 6.028704927493066, + "grad_norm": 0.2892345190048218, + "learning_rate": 7.187218084636274e-06, + "loss": 1.2, + "step": 20241 + }, + { + "epoch": 6.029002773692734, + "grad_norm": 0.3615533411502838, + "learning_rate": 7.1862924744983244e-06, + "loss": 1.2287, + "step": 20242 + }, + { + "epoch": 6.029300619892403, + "grad_norm": 0.27343472838401794, + "learning_rate": 7.18536689053878e-06, + "loss": 1.2045, + "step": 20243 + }, + { + "epoch": 6.029598466092072, + "grad_norm": 0.2636178135871887, + "learning_rate": 7.184441332766252e-06, + "loss": 1.1812, + "step": 20244 + }, + { + "epoch": 6.02989631229174, + "grad_norm": 0.3013664782047272, + "learning_rate": 7.183515801189353e-06, + "loss": 1.222, + "step": 20245 + }, + { + "epoch": 6.030194158491409, + "grad_norm": 0.2612849175930023, + "learning_rate": 7.182590295816689e-06, + "loss": 1.2062, + "step": 20246 + }, + { + "epoch": 6.0304920046910775, + "grad_norm": 0.3013571798801422, + "learning_rate": 7.18166481665688e-06, + "loss": 1.2186, + "step": 20247 + }, + { + "epoch": 6.030789850890747, + "grad_norm": 0.27074506878852844, + "learning_rate": 7.1807393637185265e-06, + "loss": 1.1998, + "step": 20248 + }, + { + "epoch": 6.031087697090415, + "grad_norm": 0.4318067133426666, + "learning_rate": 7.179813937010243e-06, + "loss": 1.2042, + "step": 20249 + }, + { + "epoch": 6.031385543290083, + "grad_norm": 0.31180885434150696, + "learning_rate": 7.178888536540642e-06, + "loss": 1.1954, + "step": 20250 + }, + { + "epoch": 6.031683389489753, + "grad_norm": 0.3415888845920563, + "learning_rate": 7.177963162318333e-06, + "loss": 1.1973, + "step": 20251 + }, + { + "epoch": 6.031981235689421, + "grad_norm": 0.3148844540119171, + "learning_rate": 7.1770378143519195e-06, + "loss": 1.2013, + "step": 20252 + }, + { + "epoch": 6.032279081889089, + "grad_norm": 0.45775938034057617, + "learning_rate": 7.1761124926500205e-06, + "loss": 1.1995, + "step": 20253 + }, + { + "epoch": 6.0325769280887585, + "grad_norm": 0.3277266323566437, + "learning_rate": 7.175187197221236e-06, + "loss": 1.2031, + "step": 20254 + }, + { + "epoch": 6.032874774288427, + "grad_norm": 0.4142463505268097, + "learning_rate": 7.1742619280741756e-06, + "loss": 1.2072, + "step": 20255 + }, + { + "epoch": 6.033172620488095, + "grad_norm": 0.36327052116394043, + "learning_rate": 7.173336685217457e-06, + "loss": 1.2076, + "step": 20256 + }, + { + "epoch": 6.033470466687764, + "grad_norm": 0.2709522247314453, + "learning_rate": 7.172411468659681e-06, + "loss": 1.1982, + "step": 20257 + }, + { + "epoch": 6.033768312887433, + "grad_norm": 0.45761629939079285, + "learning_rate": 7.171486278409454e-06, + "loss": 1.1959, + "step": 20258 + }, + { + "epoch": 6.034066159087101, + "grad_norm": 0.2904505133628845, + "learning_rate": 7.17056111447539e-06, + "loss": 1.2153, + "step": 20259 + }, + { + "epoch": 6.03436400528677, + "grad_norm": 0.5569671988487244, + "learning_rate": 7.16963597686609e-06, + "loss": 1.2045, + "step": 20260 + }, + { + "epoch": 6.0346618514864385, + "grad_norm": 0.3583231270313263, + "learning_rate": 7.168710865590168e-06, + "loss": 1.2145, + "step": 20261 + }, + { + "epoch": 6.034959697686108, + "grad_norm": 0.5513421297073364, + "learning_rate": 7.167785780656229e-06, + "loss": 1.2144, + "step": 20262 + }, + { + "epoch": 6.035257543885776, + "grad_norm": 0.4723437428474426, + "learning_rate": 7.166860722072876e-06, + "loss": 1.2052, + "step": 20263 + }, + { + "epoch": 6.035555390085444, + "grad_norm": 0.3246673047542572, + "learning_rate": 7.165935689848722e-06, + "loss": 1.2164, + "step": 20264 + }, + { + "epoch": 6.035853236285114, + "grad_norm": 0.49947765469551086, + "learning_rate": 7.1650106839923685e-06, + "loss": 1.1882, + "step": 20265 + }, + { + "epoch": 6.036151082484782, + "grad_norm": 0.2703419029712677, + "learning_rate": 7.1640857045124214e-06, + "loss": 1.2139, + "step": 20266 + }, + { + "epoch": 6.03644892868445, + "grad_norm": 0.49116799235343933, + "learning_rate": 7.16316075141749e-06, + "loss": 1.2194, + "step": 20267 + }, + { + "epoch": 6.0367467748841195, + "grad_norm": 0.31984731554985046, + "learning_rate": 7.162235824716179e-06, + "loss": 1.2122, + "step": 20268 + }, + { + "epoch": 6.037044621083788, + "grad_norm": 0.42808830738067627, + "learning_rate": 7.161310924417091e-06, + "loss": 1.2005, + "step": 20269 + }, + { + "epoch": 6.037342467283456, + "grad_norm": 0.35784581303596497, + "learning_rate": 7.160386050528836e-06, + "loss": 1.2119, + "step": 20270 + }, + { + "epoch": 6.037640313483125, + "grad_norm": 0.2613573372364044, + "learning_rate": 7.159461203060014e-06, + "loss": 1.2133, + "step": 20271 + }, + { + "epoch": 6.037938159682794, + "grad_norm": 0.5055890679359436, + "learning_rate": 7.1585363820192295e-06, + "loss": 1.2098, + "step": 20272 + }, + { + "epoch": 6.038236005882462, + "grad_norm": 0.28298699855804443, + "learning_rate": 7.157611587415094e-06, + "loss": 1.1979, + "step": 20273 + }, + { + "epoch": 6.038533852082131, + "grad_norm": 0.3474966287612915, + "learning_rate": 7.156686819256208e-06, + "loss": 1.1902, + "step": 20274 + }, + { + "epoch": 6.0388316982817996, + "grad_norm": 0.36944159865379333, + "learning_rate": 7.155762077551169e-06, + "loss": 1.2224, + "step": 20275 + }, + { + "epoch": 6.039129544481469, + "grad_norm": 0.2859373390674591, + "learning_rate": 7.154837362308591e-06, + "loss": 1.2096, + "step": 20276 + }, + { + "epoch": 6.039427390681137, + "grad_norm": 0.4036981463432312, + "learning_rate": 7.153912673537068e-06, + "loss": 1.1936, + "step": 20277 + }, + { + "epoch": 6.039725236880805, + "grad_norm": 0.26265665888786316, + "learning_rate": 7.15298801124521e-06, + "loss": 1.2139, + "step": 20278 + }, + { + "epoch": 6.040023083080475, + "grad_norm": 0.2937547266483307, + "learning_rate": 7.152063375441619e-06, + "loss": 1.2144, + "step": 20279 + }, + { + "epoch": 6.040320929280143, + "grad_norm": 0.3020278513431549, + "learning_rate": 7.151138766134893e-06, + "loss": 1.2127, + "step": 20280 + }, + { + "epoch": 6.040618775479811, + "grad_norm": 0.29109111428260803, + "learning_rate": 7.150214183333641e-06, + "loss": 1.2049, + "step": 20281 + }, + { + "epoch": 6.0409166216794805, + "grad_norm": 0.3542512059211731, + "learning_rate": 7.149289627046463e-06, + "loss": 1.2086, + "step": 20282 + }, + { + "epoch": 6.041214467879149, + "grad_norm": 0.2816535532474518, + "learning_rate": 7.148365097281956e-06, + "loss": 1.2058, + "step": 20283 + }, + { + "epoch": 6.041512314078817, + "grad_norm": 0.35247406363487244, + "learning_rate": 7.147440594048728e-06, + "loss": 1.2184, + "step": 20284 + }, + { + "epoch": 6.041810160278486, + "grad_norm": 0.31707391142845154, + "learning_rate": 7.1465161173553795e-06, + "loss": 1.2087, + "step": 20285 + }, + { + "epoch": 6.042108006478155, + "grad_norm": 0.36135584115982056, + "learning_rate": 7.145591667210507e-06, + "loss": 1.2004, + "step": 20286 + }, + { + "epoch": 6.042405852677824, + "grad_norm": 0.2838849723339081, + "learning_rate": 7.1446672436227185e-06, + "loss": 1.2278, + "step": 20287 + }, + { + "epoch": 6.042703698877492, + "grad_norm": 0.31617432832717896, + "learning_rate": 7.1437428466006096e-06, + "loss": 1.2062, + "step": 20288 + }, + { + "epoch": 6.043001545077161, + "grad_norm": 0.24839657545089722, + "learning_rate": 7.142818476152782e-06, + "loss": 1.1983, + "step": 20289 + }, + { + "epoch": 6.04329939127683, + "grad_norm": 0.2619687616825104, + "learning_rate": 7.141894132287836e-06, + "loss": 1.2022, + "step": 20290 + }, + { + "epoch": 6.043597237476498, + "grad_norm": 0.27380067110061646, + "learning_rate": 7.1409698150143735e-06, + "loss": 1.1957, + "step": 20291 + }, + { + "epoch": 6.0438950836761665, + "grad_norm": 0.24459058046340942, + "learning_rate": 7.140045524340989e-06, + "loss": 1.1973, + "step": 20292 + }, + { + "epoch": 6.044192929875836, + "grad_norm": 0.3787658214569092, + "learning_rate": 7.139121260276289e-06, + "loss": 1.1999, + "step": 20293 + }, + { + "epoch": 6.044490776075504, + "grad_norm": 0.2831189036369324, + "learning_rate": 7.138197022828867e-06, + "loss": 1.2056, + "step": 20294 + }, + { + "epoch": 6.044788622275172, + "grad_norm": 0.37250658869743347, + "learning_rate": 7.137272812007328e-06, + "loss": 1.2121, + "step": 20295 + }, + { + "epoch": 6.0450864684748415, + "grad_norm": 0.2655556797981262, + "learning_rate": 7.136348627820264e-06, + "loss": 1.1904, + "step": 20296 + }, + { + "epoch": 6.04538431467451, + "grad_norm": 0.3377954959869385, + "learning_rate": 7.1354244702762746e-06, + "loss": 1.2136, + "step": 20297 + }, + { + "epoch": 6.045682160874178, + "grad_norm": 0.25669988989830017, + "learning_rate": 7.134500339383964e-06, + "loss": 1.224, + "step": 20298 + }, + { + "epoch": 6.045980007073847, + "grad_norm": 0.289655864238739, + "learning_rate": 7.1335762351519264e-06, + "loss": 1.2171, + "step": 20299 + }, + { + "epoch": 6.046277853273516, + "grad_norm": 0.37964531779289246, + "learning_rate": 7.132652157588755e-06, + "loss": 1.2089, + "step": 20300 + }, + { + "epoch": 6.046575699473185, + "grad_norm": 0.3691795766353607, + "learning_rate": 7.131728106703055e-06, + "loss": 1.1989, + "step": 20301 + }, + { + "epoch": 6.046873545672853, + "grad_norm": 0.2852831184864044, + "learning_rate": 7.130804082503422e-06, + "loss": 1.2121, + "step": 20302 + }, + { + "epoch": 6.047171391872522, + "grad_norm": 0.25786951184272766, + "learning_rate": 7.129880084998446e-06, + "loss": 1.2259, + "step": 20303 + }, + { + "epoch": 6.047469238072191, + "grad_norm": 0.2877507209777832, + "learning_rate": 7.128956114196734e-06, + "loss": 1.2264, + "step": 20304 + }, + { + "epoch": 6.047767084271859, + "grad_norm": 0.26803675293922424, + "learning_rate": 7.1280321701068776e-06, + "loss": 1.2099, + "step": 20305 + }, + { + "epoch": 6.0480649304715275, + "grad_norm": 0.294336199760437, + "learning_rate": 7.127108252737469e-06, + "loss": 1.2089, + "step": 20306 + }, + { + "epoch": 6.048362776671197, + "grad_norm": 0.41252401471138, + "learning_rate": 7.126184362097111e-06, + "loss": 1.2175, + "step": 20307 + }, + { + "epoch": 6.048660622870865, + "grad_norm": 0.2663376033306122, + "learning_rate": 7.125260498194397e-06, + "loss": 1.2103, + "step": 20308 + }, + { + "epoch": 6.048958469070533, + "grad_norm": 0.444261372089386, + "learning_rate": 7.12433666103792e-06, + "loss": 1.2047, + "step": 20309 + }, + { + "epoch": 6.049256315270203, + "grad_norm": 0.3058907091617584, + "learning_rate": 7.123412850636282e-06, + "loss": 1.2129, + "step": 20310 + }, + { + "epoch": 6.049554161469871, + "grad_norm": 0.48705607652664185, + "learning_rate": 7.122489066998067e-06, + "loss": 1.2101, + "step": 20311 + }, + { + "epoch": 6.049852007669539, + "grad_norm": 0.38099536299705505, + "learning_rate": 7.121565310131882e-06, + "loss": 1.2085, + "step": 20312 + }, + { + "epoch": 6.0501498538692084, + "grad_norm": 0.3769145905971527, + "learning_rate": 7.120641580046314e-06, + "loss": 1.2155, + "step": 20313 + }, + { + "epoch": 6.050447700068877, + "grad_norm": 0.46924084424972534, + "learning_rate": 7.119717876749956e-06, + "loss": 1.2256, + "step": 20314 + }, + { + "epoch": 6.050745546268546, + "grad_norm": 0.28301212191581726, + "learning_rate": 7.11879420025141e-06, + "loss": 1.1889, + "step": 20315 + }, + { + "epoch": 6.051043392468214, + "grad_norm": 0.3728954792022705, + "learning_rate": 7.117870550559265e-06, + "loss": 1.2152, + "step": 20316 + }, + { + "epoch": 6.051341238667883, + "grad_norm": 0.2818300127983093, + "learning_rate": 7.116946927682109e-06, + "loss": 1.2075, + "step": 20317 + }, + { + "epoch": 6.051639084867552, + "grad_norm": 0.29196274280548096, + "learning_rate": 7.116023331628547e-06, + "loss": 1.2033, + "step": 20318 + }, + { + "epoch": 6.05193693106722, + "grad_norm": 0.26948052644729614, + "learning_rate": 7.115099762407162e-06, + "loss": 1.2144, + "step": 20319 + }, + { + "epoch": 6.0522347772668885, + "grad_norm": 0.3444169759750366, + "learning_rate": 7.114176220026549e-06, + "loss": 1.2132, + "step": 20320 + }, + { + "epoch": 6.052532623466558, + "grad_norm": 0.275411456823349, + "learning_rate": 7.113252704495306e-06, + "loss": 1.2135, + "step": 20321 + }, + { + "epoch": 6.052830469666226, + "grad_norm": 0.29189392924308777, + "learning_rate": 7.11232921582202e-06, + "loss": 1.2255, + "step": 20322 + }, + { + "epoch": 6.053128315865894, + "grad_norm": 0.37070831656455994, + "learning_rate": 7.111405754015283e-06, + "loss": 1.2175, + "step": 20323 + }, + { + "epoch": 6.053426162065564, + "grad_norm": 0.4555090367794037, + "learning_rate": 7.110482319083689e-06, + "loss": 1.2174, + "step": 20324 + }, + { + "epoch": 6.053724008265232, + "grad_norm": 0.3041447401046753, + "learning_rate": 7.109558911035828e-06, + "loss": 1.23, + "step": 20325 + }, + { + "epoch": 6.0540218544649, + "grad_norm": 0.446586936712265, + "learning_rate": 7.108635529880292e-06, + "loss": 1.2094, + "step": 20326 + }, + { + "epoch": 6.0543197006645695, + "grad_norm": 0.4064498841762543, + "learning_rate": 7.107712175625674e-06, + "loss": 1.1954, + "step": 20327 + }, + { + "epoch": 6.054617546864238, + "grad_norm": 0.44321534037590027, + "learning_rate": 7.106788848280559e-06, + "loss": 1.2123, + "step": 20328 + }, + { + "epoch": 6.054915393063907, + "grad_norm": 0.35596713423728943, + "learning_rate": 7.1058655478535454e-06, + "loss": 1.1995, + "step": 20329 + }, + { + "epoch": 6.055213239263575, + "grad_norm": 0.31254100799560547, + "learning_rate": 7.104942274353218e-06, + "loss": 1.2104, + "step": 20330 + }, + { + "epoch": 6.055511085463244, + "grad_norm": 0.39108315110206604, + "learning_rate": 7.104019027788165e-06, + "loss": 1.1914, + "step": 20331 + }, + { + "epoch": 6.055808931662913, + "grad_norm": 0.30670154094696045, + "learning_rate": 7.103095808166984e-06, + "loss": 1.2292, + "step": 20332 + }, + { + "epoch": 6.056106777862581, + "grad_norm": 0.26854562759399414, + "learning_rate": 7.102172615498261e-06, + "loss": 1.2124, + "step": 20333 + }, + { + "epoch": 6.0564046240622496, + "grad_norm": 0.3315681219100952, + "learning_rate": 7.101249449790578e-06, + "loss": 1.1954, + "step": 20334 + }, + { + "epoch": 6.056702470261919, + "grad_norm": 0.36479222774505615, + "learning_rate": 7.100326311052535e-06, + "loss": 1.2224, + "step": 20335 + }, + { + "epoch": 6.057000316461587, + "grad_norm": 0.32198062539100647, + "learning_rate": 7.099403199292715e-06, + "loss": 1.2185, + "step": 20336 + }, + { + "epoch": 6.057298162661255, + "grad_norm": 0.2780730724334717, + "learning_rate": 7.098480114519703e-06, + "loss": 1.2169, + "step": 20337 + }, + { + "epoch": 6.057596008860925, + "grad_norm": 0.36179402470588684, + "learning_rate": 7.0975570567421e-06, + "loss": 1.2094, + "step": 20338 + }, + { + "epoch": 6.057893855060593, + "grad_norm": 0.2977309226989746, + "learning_rate": 7.096634025968482e-06, + "loss": 1.2253, + "step": 20339 + }, + { + "epoch": 6.058191701260261, + "grad_norm": 0.2938660681247711, + "learning_rate": 7.095711022207438e-06, + "loss": 1.2012, + "step": 20340 + }, + { + "epoch": 6.0584895474599305, + "grad_norm": 0.48642587661743164, + "learning_rate": 7.094788045467562e-06, + "loss": 1.2172, + "step": 20341 + }, + { + "epoch": 6.058787393659599, + "grad_norm": 0.3143618106842041, + "learning_rate": 7.093865095757433e-06, + "loss": 1.2014, + "step": 20342 + }, + { + "epoch": 6.059085239859268, + "grad_norm": 0.31239813566207886, + "learning_rate": 7.092942173085644e-06, + "loss": 1.1942, + "step": 20343 + }, + { + "epoch": 6.059383086058936, + "grad_norm": 0.25440940260887146, + "learning_rate": 7.092019277460782e-06, + "loss": 1.2137, + "step": 20344 + }, + { + "epoch": 6.059680932258605, + "grad_norm": 0.3276274502277374, + "learning_rate": 7.091096408891429e-06, + "loss": 1.2169, + "step": 20345 + }, + { + "epoch": 6.059978778458274, + "grad_norm": 0.268621027469635, + "learning_rate": 7.090173567386177e-06, + "loss": 1.2079, + "step": 20346 + }, + { + "epoch": 6.060276624657942, + "grad_norm": 0.27536237239837646, + "learning_rate": 7.089250752953607e-06, + "loss": 1.232, + "step": 20347 + }, + { + "epoch": 6.060574470857611, + "grad_norm": 0.37485766410827637, + "learning_rate": 7.0883279656023066e-06, + "loss": 1.2133, + "step": 20348 + }, + { + "epoch": 6.06087231705728, + "grad_norm": 0.25602367520332336, + "learning_rate": 7.0874052053408616e-06, + "loss": 1.1843, + "step": 20349 + }, + { + "epoch": 6.061170163256948, + "grad_norm": 0.40816983580589294, + "learning_rate": 7.086482472177859e-06, + "loss": 1.2167, + "step": 20350 + }, + { + "epoch": 6.0614680094566165, + "grad_norm": 0.28543126583099365, + "learning_rate": 7.085559766121877e-06, + "loss": 1.2146, + "step": 20351 + }, + { + "epoch": 6.061765855656286, + "grad_norm": 0.38882529735565186, + "learning_rate": 7.084637087181511e-06, + "loss": 1.2167, + "step": 20352 + }, + { + "epoch": 6.062063701855954, + "grad_norm": 0.3593398928642273, + "learning_rate": 7.083714435365337e-06, + "loss": 1.211, + "step": 20353 + }, + { + "epoch": 6.062361548055623, + "grad_norm": 0.3053756356239319, + "learning_rate": 7.082791810681942e-06, + "loss": 1.2014, + "step": 20354 + }, + { + "epoch": 6.0626593942552915, + "grad_norm": 0.5559249520301819, + "learning_rate": 7.081869213139908e-06, + "loss": 1.2064, + "step": 20355 + }, + { + "epoch": 6.06295724045496, + "grad_norm": 0.36054715514183044, + "learning_rate": 7.080946642747825e-06, + "loss": 1.2183, + "step": 20356 + }, + { + "epoch": 6.063255086654629, + "grad_norm": 0.4525575637817383, + "learning_rate": 7.080024099514267e-06, + "loss": 1.2028, + "step": 20357 + }, + { + "epoch": 6.063552932854297, + "grad_norm": 0.2647133469581604, + "learning_rate": 7.079101583447826e-06, + "loss": 1.2196, + "step": 20358 + }, + { + "epoch": 6.063850779053966, + "grad_norm": 0.5323600172996521, + "learning_rate": 7.078179094557078e-06, + "loss": 1.2211, + "step": 20359 + }, + { + "epoch": 6.064148625253635, + "grad_norm": 0.34941011667251587, + "learning_rate": 7.077256632850611e-06, + "loss": 1.2192, + "step": 20360 + }, + { + "epoch": 6.064446471453303, + "grad_norm": 0.5039055943489075, + "learning_rate": 7.076334198337007e-06, + "loss": 1.2113, + "step": 20361 + }, + { + "epoch": 6.064744317652972, + "grad_norm": 0.3495168089866638, + "learning_rate": 7.0754117910248445e-06, + "loss": 1.2015, + "step": 20362 + }, + { + "epoch": 6.065042163852641, + "grad_norm": 0.2851726710796356, + "learning_rate": 7.0744894109227095e-06, + "loss": 1.2005, + "step": 20363 + }, + { + "epoch": 6.065340010052309, + "grad_norm": 0.47293710708618164, + "learning_rate": 7.073567058039184e-06, + "loss": 1.1926, + "step": 20364 + }, + { + "epoch": 6.0656378562519775, + "grad_norm": 0.2833225429058075, + "learning_rate": 7.072644732382842e-06, + "loss": 1.2157, + "step": 20365 + }, + { + "epoch": 6.065935702451647, + "grad_norm": 0.36753979325294495, + "learning_rate": 7.071722433962272e-06, + "loss": 1.2129, + "step": 20366 + }, + { + "epoch": 6.066233548651315, + "grad_norm": 0.45584869384765625, + "learning_rate": 7.0708001627860556e-06, + "loss": 1.215, + "step": 20367 + }, + { + "epoch": 6.066531394850984, + "grad_norm": 0.2690370976924896, + "learning_rate": 7.069877918862766e-06, + "loss": 1.1947, + "step": 20368 + }, + { + "epoch": 6.066829241050653, + "grad_norm": 0.5671363472938538, + "learning_rate": 7.068955702200995e-06, + "loss": 1.2114, + "step": 20369 + }, + { + "epoch": 6.067127087250321, + "grad_norm": 0.2720038890838623, + "learning_rate": 7.068033512809316e-06, + "loss": 1.2191, + "step": 20370 + }, + { + "epoch": 6.06742493344999, + "grad_norm": 0.5115370154380798, + "learning_rate": 7.067111350696303e-06, + "loss": 1.2105, + "step": 20371 + }, + { + "epoch": 6.0677227796496584, + "grad_norm": 0.31888440251350403, + "learning_rate": 7.0661892158705455e-06, + "loss": 1.2116, + "step": 20372 + }, + { + "epoch": 6.068020625849327, + "grad_norm": 0.36731308698654175, + "learning_rate": 7.065267108340622e-06, + "loss": 1.217, + "step": 20373 + }, + { + "epoch": 6.068318472048996, + "grad_norm": 0.6055471301078796, + "learning_rate": 7.064345028115105e-06, + "loss": 1.2088, + "step": 20374 + }, + { + "epoch": 6.068616318248664, + "grad_norm": 0.28859513998031616, + "learning_rate": 7.063422975202581e-06, + "loss": 1.2283, + "step": 20375 + }, + { + "epoch": 6.068914164448333, + "grad_norm": 0.6174281239509583, + "learning_rate": 7.062500949611622e-06, + "loss": 1.2135, + "step": 20376 + }, + { + "epoch": 6.069212010648002, + "grad_norm": 0.33535486459732056, + "learning_rate": 7.061578951350814e-06, + "loss": 1.2116, + "step": 20377 + }, + { + "epoch": 6.06950985684767, + "grad_norm": 0.5999237895011902, + "learning_rate": 7.060656980428728e-06, + "loss": 1.1964, + "step": 20378 + }, + { + "epoch": 6.0698077030473385, + "grad_norm": 0.3785956799983978, + "learning_rate": 7.059735036853942e-06, + "loss": 1.2025, + "step": 20379 + }, + { + "epoch": 6.070105549247008, + "grad_norm": 0.37391769886016846, + "learning_rate": 7.058813120635042e-06, + "loss": 1.2212, + "step": 20380 + }, + { + "epoch": 6.070403395446676, + "grad_norm": 0.4204009473323822, + "learning_rate": 7.057891231780598e-06, + "loss": 1.2008, + "step": 20381 + }, + { + "epoch": 6.070701241646345, + "grad_norm": 0.2564176619052887, + "learning_rate": 7.056969370299187e-06, + "loss": 1.224, + "step": 20382 + }, + { + "epoch": 6.070999087846014, + "grad_norm": 0.35885563492774963, + "learning_rate": 7.0560475361993885e-06, + "loss": 1.2079, + "step": 20383 + }, + { + "epoch": 6.071296934045682, + "grad_norm": 0.35639089345932007, + "learning_rate": 7.055125729489782e-06, + "loss": 1.1945, + "step": 20384 + }, + { + "epoch": 6.071594780245351, + "grad_norm": 0.25001823902130127, + "learning_rate": 7.054203950178935e-06, + "loss": 1.2165, + "step": 20385 + }, + { + "epoch": 6.0718926264450195, + "grad_norm": 0.41291582584381104, + "learning_rate": 7.053282198275433e-06, + "loss": 1.2268, + "step": 20386 + }, + { + "epoch": 6.072190472644688, + "grad_norm": 0.26988789439201355, + "learning_rate": 7.05236047378785e-06, + "loss": 1.218, + "step": 20387 + }, + { + "epoch": 6.072488318844357, + "grad_norm": 0.28253674507141113, + "learning_rate": 7.051438776724753e-06, + "loss": 1.2005, + "step": 20388 + }, + { + "epoch": 6.072786165044025, + "grad_norm": 0.33627164363861084, + "learning_rate": 7.050517107094725e-06, + "loss": 1.2046, + "step": 20389 + }, + { + "epoch": 6.073084011243694, + "grad_norm": 0.3070752024650574, + "learning_rate": 7.049595464906345e-06, + "loss": 1.1961, + "step": 20390 + }, + { + "epoch": 6.073381857443363, + "grad_norm": 0.3663565516471863, + "learning_rate": 7.048673850168177e-06, + "loss": 1.2106, + "step": 20391 + }, + { + "epoch": 6.073679703643031, + "grad_norm": 0.31832611560821533, + "learning_rate": 7.047752262888805e-06, + "loss": 1.2044, + "step": 20392 + }, + { + "epoch": 6.0739775498426996, + "grad_norm": 0.27179139852523804, + "learning_rate": 7.046830703076797e-06, + "loss": 1.1984, + "step": 20393 + }, + { + "epoch": 6.074275396042369, + "grad_norm": 0.2872277796268463, + "learning_rate": 7.0459091707407335e-06, + "loss": 1.2197, + "step": 20394 + }, + { + "epoch": 6.074573242242037, + "grad_norm": 0.3771313726902008, + "learning_rate": 7.044987665889182e-06, + "loss": 1.1935, + "step": 20395 + }, + { + "epoch": 6.074871088441706, + "grad_norm": 0.275558739900589, + "learning_rate": 7.0440661885307184e-06, + "loss": 1.2022, + "step": 20396 + }, + { + "epoch": 6.075168934641375, + "grad_norm": 0.43085530400276184, + "learning_rate": 7.0431447386739196e-06, + "loss": 1.2004, + "step": 20397 + }, + { + "epoch": 6.075466780841043, + "grad_norm": 0.3149186074733734, + "learning_rate": 7.042223316327355e-06, + "loss": 1.2117, + "step": 20398 + }, + { + "epoch": 6.075764627040712, + "grad_norm": 0.3658163547515869, + "learning_rate": 7.041301921499592e-06, + "loss": 1.2057, + "step": 20399 + }, + { + "epoch": 6.0760624732403805, + "grad_norm": 0.38250505924224854, + "learning_rate": 7.040380554199215e-06, + "loss": 1.1872, + "step": 20400 + }, + { + "epoch": 6.076360319440049, + "grad_norm": 0.26535093784332275, + "learning_rate": 7.039459214434789e-06, + "loss": 1.2137, + "step": 20401 + }, + { + "epoch": 6.076658165639718, + "grad_norm": 0.43134433031082153, + "learning_rate": 7.038537902214884e-06, + "loss": 1.2043, + "step": 20402 + }, + { + "epoch": 6.076956011839386, + "grad_norm": 0.25832346081733704, + "learning_rate": 7.03761661754808e-06, + "loss": 1.194, + "step": 20403 + }, + { + "epoch": 6.077253858039055, + "grad_norm": 0.37801945209503174, + "learning_rate": 7.036695360442942e-06, + "loss": 1.1986, + "step": 20404 + }, + { + "epoch": 6.077551704238724, + "grad_norm": 0.2915472388267517, + "learning_rate": 7.035774130908041e-06, + "loss": 1.2123, + "step": 20405 + }, + { + "epoch": 6.077849550438392, + "grad_norm": 0.30703651905059814, + "learning_rate": 7.034852928951953e-06, + "loss": 1.2081, + "step": 20406 + }, + { + "epoch": 6.078147396638061, + "grad_norm": 0.47424939274787903, + "learning_rate": 7.0339317545832435e-06, + "loss": 1.2096, + "step": 20407 + }, + { + "epoch": 6.07844524283773, + "grad_norm": 0.41901594400405884, + "learning_rate": 7.033010607810482e-06, + "loss": 1.2081, + "step": 20408 + }, + { + "epoch": 6.078743089037398, + "grad_norm": 0.27409788966178894, + "learning_rate": 7.0320894886422474e-06, + "loss": 1.1911, + "step": 20409 + }, + { + "epoch": 6.079040935237067, + "grad_norm": 0.29361626505851746, + "learning_rate": 7.0311683970871005e-06, + "loss": 1.1985, + "step": 20410 + }, + { + "epoch": 6.079338781436736, + "grad_norm": 0.2925852835178375, + "learning_rate": 7.030247333153617e-06, + "loss": 1.2284, + "step": 20411 + }, + { + "epoch": 6.079636627636404, + "grad_norm": 0.26330602169036865, + "learning_rate": 7.029326296850363e-06, + "loss": 1.2141, + "step": 20412 + }, + { + "epoch": 6.079934473836073, + "grad_norm": 0.27293527126312256, + "learning_rate": 7.0284052881859085e-06, + "loss": 1.2333, + "step": 20413 + }, + { + "epoch": 6.0802323200357415, + "grad_norm": 0.2640949487686157, + "learning_rate": 7.027484307168824e-06, + "loss": 1.2126, + "step": 20414 + }, + { + "epoch": 6.08053016623541, + "grad_norm": 0.26671817898750305, + "learning_rate": 7.026563353807678e-06, + "loss": 1.1931, + "step": 20415 + }, + { + "epoch": 6.080828012435079, + "grad_norm": 0.2652629613876343, + "learning_rate": 7.025642428111034e-06, + "loss": 1.1916, + "step": 20416 + }, + { + "epoch": 6.081125858634747, + "grad_norm": 0.3108772337436676, + "learning_rate": 7.024721530087469e-06, + "loss": 1.2156, + "step": 20417 + }, + { + "epoch": 6.081423704834416, + "grad_norm": 0.2715778648853302, + "learning_rate": 7.023800659745541e-06, + "loss": 1.216, + "step": 20418 + }, + { + "epoch": 6.081721551034085, + "grad_norm": 0.26113471388816833, + "learning_rate": 7.0228798170938225e-06, + "loss": 1.1955, + "step": 20419 + }, + { + "epoch": 6.082019397233753, + "grad_norm": 0.32121750712394714, + "learning_rate": 7.021959002140885e-06, + "loss": 1.2216, + "step": 20420 + }, + { + "epoch": 6.0823172434334225, + "grad_norm": 0.30098626017570496, + "learning_rate": 7.021038214895293e-06, + "loss": 1.2005, + "step": 20421 + }, + { + "epoch": 6.082615089633091, + "grad_norm": 0.27612149715423584, + "learning_rate": 7.020117455365606e-06, + "loss": 1.2129, + "step": 20422 + }, + { + "epoch": 6.082912935832759, + "grad_norm": 0.2973646819591522, + "learning_rate": 7.019196723560402e-06, + "loss": 1.2121, + "step": 20423 + }, + { + "epoch": 6.083210782032428, + "grad_norm": 0.26949751377105713, + "learning_rate": 7.01827601948824e-06, + "loss": 1.2028, + "step": 20424 + }, + { + "epoch": 6.083508628232097, + "grad_norm": 0.4442526400089264, + "learning_rate": 7.017355343157686e-06, + "loss": 1.2217, + "step": 20425 + }, + { + "epoch": 6.083806474431765, + "grad_norm": 0.2635462284088135, + "learning_rate": 7.016434694577312e-06, + "loss": 1.2113, + "step": 20426 + }, + { + "epoch": 6.084104320631434, + "grad_norm": 0.34377890825271606, + "learning_rate": 7.015514073755676e-06, + "loss": 1.2154, + "step": 20427 + }, + { + "epoch": 6.084402166831103, + "grad_norm": 0.30079883337020874, + "learning_rate": 7.014593480701351e-06, + "loss": 1.2039, + "step": 20428 + }, + { + "epoch": 6.084700013030771, + "grad_norm": 0.3327597975730896, + "learning_rate": 7.013672915422898e-06, + "loss": 1.2197, + "step": 20429 + }, + { + "epoch": 6.08499785923044, + "grad_norm": 0.2588975727558136, + "learning_rate": 7.012752377928878e-06, + "loss": 1.204, + "step": 20430 + }, + { + "epoch": 6.0852957054301084, + "grad_norm": 0.35409826040267944, + "learning_rate": 7.0118318682278615e-06, + "loss": 1.2015, + "step": 20431 + }, + { + "epoch": 6.085593551629777, + "grad_norm": 0.4417869448661804, + "learning_rate": 7.010911386328414e-06, + "loss": 1.2115, + "step": 20432 + }, + { + "epoch": 6.085891397829446, + "grad_norm": 0.2775195837020874, + "learning_rate": 7.0099909322390915e-06, + "loss": 1.2191, + "step": 20433 + }, + { + "epoch": 6.086189244029114, + "grad_norm": 0.3130930960178375, + "learning_rate": 7.009070505968467e-06, + "loss": 1.2218, + "step": 20434 + }, + { + "epoch": 6.0864870902287835, + "grad_norm": 0.25491204857826233, + "learning_rate": 7.008150107525098e-06, + "loss": 1.2002, + "step": 20435 + }, + { + "epoch": 6.086784936428452, + "grad_norm": 0.25688236951828003, + "learning_rate": 7.007229736917549e-06, + "loss": 1.2051, + "step": 20436 + }, + { + "epoch": 6.08708278262812, + "grad_norm": 0.28710290789604187, + "learning_rate": 7.006309394154383e-06, + "loss": 1.2025, + "step": 20437 + }, + { + "epoch": 6.087380628827789, + "grad_norm": 0.2777743637561798, + "learning_rate": 7.005389079244167e-06, + "loss": 1.2218, + "step": 20438 + }, + { + "epoch": 6.087678475027458, + "grad_norm": 0.4726444482803345, + "learning_rate": 7.004468792195454e-06, + "loss": 1.2149, + "step": 20439 + }, + { + "epoch": 6.087976321227126, + "grad_norm": 0.29624661803245544, + "learning_rate": 7.0035485330168175e-06, + "loss": 1.2336, + "step": 20440 + }, + { + "epoch": 6.088274167426795, + "grad_norm": 0.5459936857223511, + "learning_rate": 7.00262830171681e-06, + "loss": 1.209, + "step": 20441 + }, + { + "epoch": 6.088572013626464, + "grad_norm": 0.2904707193374634, + "learning_rate": 7.001708098303999e-06, + "loss": 1.2282, + "step": 20442 + }, + { + "epoch": 6.088869859826132, + "grad_norm": 0.3669629991054535, + "learning_rate": 7.000787922786944e-06, + "loss": 1.2053, + "step": 20443 + }, + { + "epoch": 6.089167706025801, + "grad_norm": 0.4363369941711426, + "learning_rate": 6.999867775174205e-06, + "loss": 1.2076, + "step": 20444 + }, + { + "epoch": 6.0894655522254695, + "grad_norm": 0.2777957320213318, + "learning_rate": 6.998947655474348e-06, + "loss": 1.2063, + "step": 20445 + }, + { + "epoch": 6.089763398425138, + "grad_norm": 0.3113929033279419, + "learning_rate": 6.998027563695931e-06, + "loss": 1.2066, + "step": 20446 + }, + { + "epoch": 6.090061244624807, + "grad_norm": 0.3149489164352417, + "learning_rate": 6.997107499847509e-06, + "loss": 1.1957, + "step": 20447 + }, + { + "epoch": 6.090359090824475, + "grad_norm": 0.27551475167274475, + "learning_rate": 6.996187463937647e-06, + "loss": 1.2202, + "step": 20448 + }, + { + "epoch": 6.090656937024145, + "grad_norm": 0.330971896648407, + "learning_rate": 6.995267455974909e-06, + "loss": 1.2157, + "step": 20449 + }, + { + "epoch": 6.090954783223813, + "grad_norm": 0.28980836272239685, + "learning_rate": 6.994347475967844e-06, + "loss": 1.2022, + "step": 20450 + }, + { + "epoch": 6.091252629423481, + "grad_norm": 0.401186466217041, + "learning_rate": 6.993427523925024e-06, + "loss": 1.1996, + "step": 20451 + }, + { + "epoch": 6.09155047562315, + "grad_norm": 0.2572411596775055, + "learning_rate": 6.992507599855001e-06, + "loss": 1.2126, + "step": 20452 + }, + { + "epoch": 6.091848321822819, + "grad_norm": 0.4092616140842438, + "learning_rate": 6.991587703766329e-06, + "loss": 1.2134, + "step": 20453 + }, + { + "epoch": 6.092146168022487, + "grad_norm": 0.3409324288368225, + "learning_rate": 6.990667835667575e-06, + "loss": 1.1989, + "step": 20454 + }, + { + "epoch": 6.092444014222156, + "grad_norm": 0.3348819613456726, + "learning_rate": 6.989747995567298e-06, + "loss": 1.2099, + "step": 20455 + }, + { + "epoch": 6.092741860421825, + "grad_norm": 0.3193581700325012, + "learning_rate": 6.988828183474046e-06, + "loss": 1.2028, + "step": 20456 + }, + { + "epoch": 6.093039706621493, + "grad_norm": 0.38945794105529785, + "learning_rate": 6.98790839939639e-06, + "loss": 1.2102, + "step": 20457 + }, + { + "epoch": 6.093337552821162, + "grad_norm": 0.2539597451686859, + "learning_rate": 6.986988643342876e-06, + "loss": 1.1944, + "step": 20458 + }, + { + "epoch": 6.0936353990208305, + "grad_norm": 0.4171285927295685, + "learning_rate": 6.986068915322071e-06, + "loss": 1.2103, + "step": 20459 + }, + { + "epoch": 6.093933245220499, + "grad_norm": 0.28877657651901245, + "learning_rate": 6.985149215342524e-06, + "loss": 1.204, + "step": 20460 + }, + { + "epoch": 6.094231091420168, + "grad_norm": 0.3398493230342865, + "learning_rate": 6.984229543412794e-06, + "loss": 1.1847, + "step": 20461 + }, + { + "epoch": 6.094528937619836, + "grad_norm": 0.2786409854888916, + "learning_rate": 6.983309899541443e-06, + "loss": 1.218, + "step": 20462 + }, + { + "epoch": 6.094826783819506, + "grad_norm": 0.6020640134811401, + "learning_rate": 6.982390283737022e-06, + "loss": 1.2103, + "step": 20463 + }, + { + "epoch": 6.095124630019174, + "grad_norm": 0.3121355473995209, + "learning_rate": 6.9814706960080845e-06, + "loss": 1.2303, + "step": 20464 + }, + { + "epoch": 6.095422476218842, + "grad_norm": 0.5204094648361206, + "learning_rate": 6.980551136363194e-06, + "loss": 1.2075, + "step": 20465 + }, + { + "epoch": 6.0957203224185115, + "grad_norm": 0.27732616662979126, + "learning_rate": 6.979631604810899e-06, + "loss": 1.2063, + "step": 20466 + }, + { + "epoch": 6.09601816861818, + "grad_norm": 0.4900452494621277, + "learning_rate": 6.978712101359756e-06, + "loss": 1.2041, + "step": 20467 + }, + { + "epoch": 6.096316014817848, + "grad_norm": 0.28665292263031006, + "learning_rate": 6.977792626018325e-06, + "loss": 1.2165, + "step": 20468 + }, + { + "epoch": 6.096613861017517, + "grad_norm": 0.45302802324295044, + "learning_rate": 6.976873178795157e-06, + "loss": 1.211, + "step": 20469 + }, + { + "epoch": 6.096911707217186, + "grad_norm": 0.26088371872901917, + "learning_rate": 6.9759537596988025e-06, + "loss": 1.2011, + "step": 20470 + }, + { + "epoch": 6.097209553416854, + "grad_norm": 0.36315658688545227, + "learning_rate": 6.975034368737821e-06, + "loss": 1.2057, + "step": 20471 + }, + { + "epoch": 6.097507399616523, + "grad_norm": 0.24717873334884644, + "learning_rate": 6.974115005920767e-06, + "loss": 1.2098, + "step": 20472 + }, + { + "epoch": 6.0978052458161915, + "grad_norm": 0.26071131229400635, + "learning_rate": 6.97319567125619e-06, + "loss": 1.2086, + "step": 20473 + }, + { + "epoch": 6.09810309201586, + "grad_norm": 0.27413856983184814, + "learning_rate": 6.9722763647526484e-06, + "loss": 1.2052, + "step": 20474 + }, + { + "epoch": 6.098400938215529, + "grad_norm": 0.33768430352211, + "learning_rate": 6.971357086418688e-06, + "loss": 1.1973, + "step": 20475 + }, + { + "epoch": 6.098698784415197, + "grad_norm": 0.3707341253757477, + "learning_rate": 6.97043783626287e-06, + "loss": 1.2011, + "step": 20476 + }, + { + "epoch": 6.098996630614867, + "grad_norm": 0.2891037166118622, + "learning_rate": 6.9695186142937425e-06, + "loss": 1.2058, + "step": 20477 + }, + { + "epoch": 6.099294476814535, + "grad_norm": 0.277849018573761, + "learning_rate": 6.968599420519855e-06, + "loss": 1.2151, + "step": 20478 + }, + { + "epoch": 6.099592323014203, + "grad_norm": 0.37085142731666565, + "learning_rate": 6.967680254949767e-06, + "loss": 1.1967, + "step": 20479 + }, + { + "epoch": 6.0998901692138725, + "grad_norm": 0.2706451416015625, + "learning_rate": 6.966761117592027e-06, + "loss": 1.2112, + "step": 20480 + }, + { + "epoch": 6.100188015413541, + "grad_norm": 0.40303710103034973, + "learning_rate": 6.965842008455181e-06, + "loss": 1.2084, + "step": 20481 + }, + { + "epoch": 6.100485861613209, + "grad_norm": 0.3119714558124542, + "learning_rate": 6.9649229275477905e-06, + "loss": 1.204, + "step": 20482 + }, + { + "epoch": 6.100783707812878, + "grad_norm": 0.36651402711868286, + "learning_rate": 6.964003874878398e-06, + "loss": 1.2106, + "step": 20483 + }, + { + "epoch": 6.101081554012547, + "grad_norm": 0.4344756603240967, + "learning_rate": 6.963084850455556e-06, + "loss": 1.2067, + "step": 20484 + }, + { + "epoch": 6.101379400212215, + "grad_norm": 0.27097341418266296, + "learning_rate": 6.96216585428782e-06, + "loss": 1.2034, + "step": 20485 + }, + { + "epoch": 6.101677246411884, + "grad_norm": 0.6429668664932251, + "learning_rate": 6.961246886383737e-06, + "loss": 1.2117, + "step": 20486 + }, + { + "epoch": 6.101975092611553, + "grad_norm": 0.29374146461486816, + "learning_rate": 6.960327946751852e-06, + "loss": 1.1867, + "step": 20487 + }, + { + "epoch": 6.102272938811222, + "grad_norm": 0.47270968556404114, + "learning_rate": 6.959409035400725e-06, + "loss": 1.2342, + "step": 20488 + }, + { + "epoch": 6.10257078501089, + "grad_norm": 0.2763921022415161, + "learning_rate": 6.958490152338897e-06, + "loss": 1.2199, + "step": 20489 + }, + { + "epoch": 6.1028686312105584, + "grad_norm": 0.3743135333061218, + "learning_rate": 6.9575712975749165e-06, + "loss": 1.2095, + "step": 20490 + }, + { + "epoch": 6.103166477410228, + "grad_norm": 0.28013962507247925, + "learning_rate": 6.956652471117342e-06, + "loss": 1.2001, + "step": 20491 + }, + { + "epoch": 6.103464323609896, + "grad_norm": 0.26570209860801697, + "learning_rate": 6.955733672974712e-06, + "loss": 1.204, + "step": 20492 + }, + { + "epoch": 6.103762169809564, + "grad_norm": 0.39894533157348633, + "learning_rate": 6.95481490315558e-06, + "loss": 1.2119, + "step": 20493 + }, + { + "epoch": 6.1040600160092335, + "grad_norm": 0.29802659153938293, + "learning_rate": 6.953896161668494e-06, + "loss": 1.2141, + "step": 20494 + }, + { + "epoch": 6.104357862208902, + "grad_norm": 0.27953803539276123, + "learning_rate": 6.952977448522001e-06, + "loss": 1.2163, + "step": 20495 + }, + { + "epoch": 6.10465570840857, + "grad_norm": 0.35821402072906494, + "learning_rate": 6.952058763724646e-06, + "loss": 1.1876, + "step": 20496 + }, + { + "epoch": 6.104953554608239, + "grad_norm": 0.2715325653553009, + "learning_rate": 6.951140107284983e-06, + "loss": 1.2025, + "step": 20497 + }, + { + "epoch": 6.105251400807908, + "grad_norm": 0.4369836151599884, + "learning_rate": 6.9502214792115495e-06, + "loss": 1.2112, + "step": 20498 + }, + { + "epoch": 6.105549247007576, + "grad_norm": 0.3524416983127594, + "learning_rate": 6.949302879512903e-06, + "loss": 1.2078, + "step": 20499 + }, + { + "epoch": 6.105847093207245, + "grad_norm": 0.850420355796814, + "learning_rate": 6.948384308197582e-06, + "loss": 1.21, + "step": 20500 + }, + { + "epoch": 6.105847093207245, + "eval_loss": 1.3234530687332153, + "eval_runtime": 23.9568, + "eval_samples_per_second": 72.38, + "eval_steps_per_second": 4.55, + "step": 20500 + }, + { + "epoch": 6.106144939406914, + "grad_norm": 0.3084619641304016, + "learning_rate": 6.947465765274135e-06, + "loss": 1.2032, + "step": 20501 + }, + { + "epoch": 6.106442785606583, + "grad_norm": 0.4244225323200226, + "learning_rate": 6.94654725075111e-06, + "loss": 1.1966, + "step": 20502 + }, + { + "epoch": 6.106740631806251, + "grad_norm": 0.3955024182796478, + "learning_rate": 6.945628764637053e-06, + "loss": 1.2156, + "step": 20503 + }, + { + "epoch": 6.1070384780059195, + "grad_norm": 0.26083919405937195, + "learning_rate": 6.944710306940503e-06, + "loss": 1.2078, + "step": 20504 + }, + { + "epoch": 6.107336324205589, + "grad_norm": 0.6535545587539673, + "learning_rate": 6.9437918776700145e-06, + "loss": 1.2169, + "step": 20505 + }, + { + "epoch": 6.107634170405257, + "grad_norm": 0.2741965353488922, + "learning_rate": 6.942873476834127e-06, + "loss": 1.2111, + "step": 20506 + }, + { + "epoch": 6.107932016604925, + "grad_norm": 0.37694886326789856, + "learning_rate": 6.9419551044413825e-06, + "loss": 1.2263, + "step": 20507 + }, + { + "epoch": 6.108229862804595, + "grad_norm": 0.36882150173187256, + "learning_rate": 6.941036760500333e-06, + "loss": 1.1988, + "step": 20508 + }, + { + "epoch": 6.108527709004263, + "grad_norm": 0.3224135637283325, + "learning_rate": 6.940118445019516e-06, + "loss": 1.1923, + "step": 20509 + }, + { + "epoch": 6.108825555203931, + "grad_norm": 0.46624529361724854, + "learning_rate": 6.939200158007482e-06, + "loss": 1.2059, + "step": 20510 + }, + { + "epoch": 6.1091234014036, + "grad_norm": 0.30174317955970764, + "learning_rate": 6.9382818994727695e-06, + "loss": 1.1994, + "step": 20511 + }, + { + "epoch": 6.109421247603269, + "grad_norm": 0.28202491998672485, + "learning_rate": 6.93736366942392e-06, + "loss": 1.2058, + "step": 20512 + }, + { + "epoch": 6.109719093802937, + "grad_norm": 0.3995566666126251, + "learning_rate": 6.936445467869481e-06, + "loss": 1.2134, + "step": 20513 + }, + { + "epoch": 6.110016940002606, + "grad_norm": 0.3717820346355438, + "learning_rate": 6.935527294817998e-06, + "loss": 1.2148, + "step": 20514 + }, + { + "epoch": 6.110314786202275, + "grad_norm": 0.44415083527565, + "learning_rate": 6.934609150278005e-06, + "loss": 1.1948, + "step": 20515 + }, + { + "epoch": 6.110612632401944, + "grad_norm": 0.453581839799881, + "learning_rate": 6.9336910342580525e-06, + "loss": 1.1998, + "step": 20516 + }, + { + "epoch": 6.110910478601612, + "grad_norm": 0.2977471947669983, + "learning_rate": 6.932772946766679e-06, + "loss": 1.2202, + "step": 20517 + }, + { + "epoch": 6.1112083248012805, + "grad_norm": 0.479152649641037, + "learning_rate": 6.931854887812424e-06, + "loss": 1.2124, + "step": 20518 + }, + { + "epoch": 6.11150617100095, + "grad_norm": 0.3845822513103485, + "learning_rate": 6.930936857403832e-06, + "loss": 1.2205, + "step": 20519 + }, + { + "epoch": 6.111804017200618, + "grad_norm": 0.483164519071579, + "learning_rate": 6.930018855549445e-06, + "loss": 1.2135, + "step": 20520 + }, + { + "epoch": 6.112101863400286, + "grad_norm": 0.3645203411579132, + "learning_rate": 6.929100882257798e-06, + "loss": 1.2102, + "step": 20521 + }, + { + "epoch": 6.112399709599956, + "grad_norm": 0.3148750364780426, + "learning_rate": 6.928182937537442e-06, + "loss": 1.2293, + "step": 20522 + }, + { + "epoch": 6.112697555799624, + "grad_norm": 0.3886222541332245, + "learning_rate": 6.927265021396909e-06, + "loss": 1.213, + "step": 20523 + }, + { + "epoch": 6.112995401999292, + "grad_norm": 0.287148118019104, + "learning_rate": 6.9263471338447416e-06, + "loss": 1.2011, + "step": 20524 + }, + { + "epoch": 6.1132932481989615, + "grad_norm": 0.26246532797813416, + "learning_rate": 6.925429274889481e-06, + "loss": 1.1939, + "step": 20525 + }, + { + "epoch": 6.11359109439863, + "grad_norm": 0.47627633810043335, + "learning_rate": 6.924511444539663e-06, + "loss": 1.1948, + "step": 20526 + }, + { + "epoch": 6.113888940598298, + "grad_norm": 0.35837310552597046, + "learning_rate": 6.923593642803834e-06, + "loss": 1.2005, + "step": 20527 + }, + { + "epoch": 6.114186786797967, + "grad_norm": 0.42851340770721436, + "learning_rate": 6.9226758696905295e-06, + "loss": 1.2183, + "step": 20528 + }, + { + "epoch": 6.114484632997636, + "grad_norm": 0.5166046023368835, + "learning_rate": 6.921758125208284e-06, + "loss": 1.2096, + "step": 20529 + }, + { + "epoch": 6.114782479197305, + "grad_norm": 0.3526485562324524, + "learning_rate": 6.9208404093656415e-06, + "loss": 1.2049, + "step": 20530 + }, + { + "epoch": 6.115080325396973, + "grad_norm": 0.6175981760025024, + "learning_rate": 6.919922722171139e-06, + "loss": 1.2096, + "step": 20531 + }, + { + "epoch": 6.1153781715966415, + "grad_norm": 0.2990829050540924, + "learning_rate": 6.9190050636333126e-06, + "loss": 1.2116, + "step": 20532 + }, + { + "epoch": 6.115676017796311, + "grad_norm": 0.8120248913764954, + "learning_rate": 6.918087433760705e-06, + "loss": 1.2086, + "step": 20533 + }, + { + "epoch": 6.115973863995979, + "grad_norm": 0.2923959195613861, + "learning_rate": 6.917169832561852e-06, + "loss": 1.2189, + "step": 20534 + }, + { + "epoch": 6.116271710195647, + "grad_norm": 0.5978054404258728, + "learning_rate": 6.9162522600452834e-06, + "loss": 1.196, + "step": 20535 + }, + { + "epoch": 6.116569556395317, + "grad_norm": 0.3772621154785156, + "learning_rate": 6.915334716219544e-06, + "loss": 1.2195, + "step": 20536 + }, + { + "epoch": 6.116867402594985, + "grad_norm": 0.3038269579410553, + "learning_rate": 6.914417201093172e-06, + "loss": 1.222, + "step": 20537 + }, + { + "epoch": 6.117165248794653, + "grad_norm": 0.555637001991272, + "learning_rate": 6.913499714674696e-06, + "loss": 1.2295, + "step": 20538 + }, + { + "epoch": 6.1174630949943225, + "grad_norm": 0.31136155128479004, + "learning_rate": 6.912582256972661e-06, + "loss": 1.2051, + "step": 20539 + }, + { + "epoch": 6.117760941193991, + "grad_norm": 0.5317781567573547, + "learning_rate": 6.911664827995599e-06, + "loss": 1.2104, + "step": 20540 + }, + { + "epoch": 6.118058787393659, + "grad_norm": 0.48772385716438293, + "learning_rate": 6.910747427752041e-06, + "loss": 1.2161, + "step": 20541 + }, + { + "epoch": 6.118356633593328, + "grad_norm": 0.375875324010849, + "learning_rate": 6.909830056250527e-06, + "loss": 1.2091, + "step": 20542 + }, + { + "epoch": 6.118654479792997, + "grad_norm": 0.5589781403541565, + "learning_rate": 6.908912713499592e-06, + "loss": 1.2111, + "step": 20543 + }, + { + "epoch": 6.118952325992666, + "grad_norm": 0.28380241990089417, + "learning_rate": 6.907995399507773e-06, + "loss": 1.2244, + "step": 20544 + }, + { + "epoch": 6.119250172192334, + "grad_norm": 0.6299400925636292, + "learning_rate": 6.907078114283603e-06, + "loss": 1.1996, + "step": 20545 + }, + { + "epoch": 6.119548018392003, + "grad_norm": 0.28528720140457153, + "learning_rate": 6.906160857835611e-06, + "loss": 1.2237, + "step": 20546 + }, + { + "epoch": 6.119845864591672, + "grad_norm": 0.3071301579475403, + "learning_rate": 6.90524363017234e-06, + "loss": 1.2006, + "step": 20547 + }, + { + "epoch": 6.12014371079134, + "grad_norm": 0.40062451362609863, + "learning_rate": 6.904326431302317e-06, + "loss": 1.2103, + "step": 20548 + }, + { + "epoch": 6.1204415569910084, + "grad_norm": 0.324465274810791, + "learning_rate": 6.903409261234078e-06, + "loss": 1.2091, + "step": 20549 + }, + { + "epoch": 6.120739403190678, + "grad_norm": 0.3610185384750366, + "learning_rate": 6.9024921199761575e-06, + "loss": 1.2172, + "step": 20550 + }, + { + "epoch": 6.121037249390346, + "grad_norm": 0.40855681896209717, + "learning_rate": 6.901575007537088e-06, + "loss": 1.2067, + "step": 20551 + }, + { + "epoch": 6.121335095590014, + "grad_norm": 0.2959303557872772, + "learning_rate": 6.900657923925398e-06, + "loss": 1.2246, + "step": 20552 + }, + { + "epoch": 6.1216329417896835, + "grad_norm": 0.28657275438308716, + "learning_rate": 6.899740869149623e-06, + "loss": 1.225, + "step": 20553 + }, + { + "epoch": 6.121930787989352, + "grad_norm": 0.4831201434135437, + "learning_rate": 6.8988238432183e-06, + "loss": 1.2139, + "step": 20554 + }, + { + "epoch": 6.122228634189021, + "grad_norm": 0.33637285232543945, + "learning_rate": 6.89790684613995e-06, + "loss": 1.2026, + "step": 20555 + }, + { + "epoch": 6.122526480388689, + "grad_norm": 0.4028896689414978, + "learning_rate": 6.896989877923117e-06, + "loss": 1.2057, + "step": 20556 + }, + { + "epoch": 6.122824326588358, + "grad_norm": 0.3351418375968933, + "learning_rate": 6.896072938576321e-06, + "loss": 1.2118, + "step": 20557 + }, + { + "epoch": 6.123122172788027, + "grad_norm": 0.4150068163871765, + "learning_rate": 6.895156028108103e-06, + "loss": 1.2179, + "step": 20558 + }, + { + "epoch": 6.123420018987695, + "grad_norm": 0.32761695981025696, + "learning_rate": 6.894239146526986e-06, + "loss": 1.2099, + "step": 20559 + }, + { + "epoch": 6.123717865187364, + "grad_norm": 0.3515329957008362, + "learning_rate": 6.893322293841505e-06, + "loss": 1.2013, + "step": 20560 + }, + { + "epoch": 6.124015711387033, + "grad_norm": 0.3351740837097168, + "learning_rate": 6.89240547006019e-06, + "loss": 1.2028, + "step": 20561 + }, + { + "epoch": 6.124313557586701, + "grad_norm": 0.28326913714408875, + "learning_rate": 6.8914886751915714e-06, + "loss": 1.2216, + "step": 20562 + }, + { + "epoch": 6.1246114037863695, + "grad_norm": 0.29817336797714233, + "learning_rate": 6.890571909244173e-06, + "loss": 1.2059, + "step": 20563 + }, + { + "epoch": 6.124909249986039, + "grad_norm": 0.30810728669166565, + "learning_rate": 6.889655172226533e-06, + "loss": 1.2098, + "step": 20564 + }, + { + "epoch": 6.125207096185707, + "grad_norm": 0.3400174677371979, + "learning_rate": 6.888738464147174e-06, + "loss": 1.2043, + "step": 20565 + }, + { + "epoch": 6.125504942385375, + "grad_norm": 0.41957220435142517, + "learning_rate": 6.887821785014627e-06, + "loss": 1.2104, + "step": 20566 + }, + { + "epoch": 6.125802788585045, + "grad_norm": 0.3656337559223175, + "learning_rate": 6.886905134837424e-06, + "loss": 1.2127, + "step": 20567 + }, + { + "epoch": 6.126100634784713, + "grad_norm": 0.3066710829734802, + "learning_rate": 6.885988513624091e-06, + "loss": 1.203, + "step": 20568 + }, + { + "epoch": 6.126398480984382, + "grad_norm": 0.2674976587295532, + "learning_rate": 6.885071921383151e-06, + "loss": 1.2173, + "step": 20569 + }, + { + "epoch": 6.12669632718405, + "grad_norm": 0.2625259459018707, + "learning_rate": 6.884155358123141e-06, + "loss": 1.219, + "step": 20570 + }, + { + "epoch": 6.126994173383719, + "grad_norm": 0.46240806579589844, + "learning_rate": 6.883238823852583e-06, + "loss": 1.2165, + "step": 20571 + }, + { + "epoch": 6.127292019583388, + "grad_norm": 0.26764822006225586, + "learning_rate": 6.8823223185800024e-06, + "loss": 1.2049, + "step": 20572 + }, + { + "epoch": 6.127589865783056, + "grad_norm": 0.5365241765975952, + "learning_rate": 6.881405842313934e-06, + "loss": 1.2085, + "step": 20573 + }, + { + "epoch": 6.127887711982725, + "grad_norm": 0.3085898756980896, + "learning_rate": 6.880489395062896e-06, + "loss": 1.2161, + "step": 20574 + }, + { + "epoch": 6.128185558182394, + "grad_norm": 0.6672071218490601, + "learning_rate": 6.879572976835422e-06, + "loss": 1.2084, + "step": 20575 + }, + { + "epoch": 6.128483404382062, + "grad_norm": 0.2635343372821808, + "learning_rate": 6.878656587640036e-06, + "loss": 1.1941, + "step": 20576 + }, + { + "epoch": 6.1287812505817305, + "grad_norm": 0.3361892104148865, + "learning_rate": 6.877740227485259e-06, + "loss": 1.2068, + "step": 20577 + }, + { + "epoch": 6.1290790967814, + "grad_norm": 0.42709881067276, + "learning_rate": 6.876823896379623e-06, + "loss": 1.2112, + "step": 20578 + }, + { + "epoch": 6.129376942981068, + "grad_norm": 0.2991598844528198, + "learning_rate": 6.875907594331652e-06, + "loss": 1.2122, + "step": 20579 + }, + { + "epoch": 6.129674789180736, + "grad_norm": 0.47507810592651367, + "learning_rate": 6.874991321349867e-06, + "loss": 1.2162, + "step": 20580 + }, + { + "epoch": 6.129972635380406, + "grad_norm": 0.2950620651245117, + "learning_rate": 6.874075077442801e-06, + "loss": 1.1809, + "step": 20581 + }, + { + "epoch": 6.130270481580074, + "grad_norm": 0.3419857323169708, + "learning_rate": 6.873158862618972e-06, + "loss": 1.2024, + "step": 20582 + }, + { + "epoch": 6.130568327779743, + "grad_norm": 0.2745928466320038, + "learning_rate": 6.872242676886905e-06, + "loss": 1.2026, + "step": 20583 + }, + { + "epoch": 6.1308661739794115, + "grad_norm": 0.2627103328704834, + "learning_rate": 6.871326520255126e-06, + "loss": 1.1981, + "step": 20584 + }, + { + "epoch": 6.13116402017908, + "grad_norm": 0.27292487025260925, + "learning_rate": 6.870410392732161e-06, + "loss": 1.204, + "step": 20585 + }, + { + "epoch": 6.131461866378749, + "grad_norm": 0.3463563919067383, + "learning_rate": 6.869494294326524e-06, + "loss": 1.2092, + "step": 20586 + }, + { + "epoch": 6.131759712578417, + "grad_norm": 0.265866756439209, + "learning_rate": 6.868578225046751e-06, + "loss": 1.2171, + "step": 20587 + }, + { + "epoch": 6.132057558778086, + "grad_norm": 0.34672361612319946, + "learning_rate": 6.867662184901358e-06, + "loss": 1.1981, + "step": 20588 + }, + { + "epoch": 6.132355404977755, + "grad_norm": 0.358896404504776, + "learning_rate": 6.866746173898865e-06, + "loss": 1.2177, + "step": 20589 + }, + { + "epoch": 6.132653251177423, + "grad_norm": 0.3443050682544708, + "learning_rate": 6.865830192047803e-06, + "loss": 1.2018, + "step": 20590 + }, + { + "epoch": 6.1329510973770915, + "grad_norm": 0.3751683533191681, + "learning_rate": 6.8649142393566845e-06, + "loss": 1.2089, + "step": 20591 + }, + { + "epoch": 6.133248943576761, + "grad_norm": 0.3259476125240326, + "learning_rate": 6.863998315834042e-06, + "loss": 1.2082, + "step": 20592 + }, + { + "epoch": 6.133546789776429, + "grad_norm": 0.35596439242362976, + "learning_rate": 6.86308242148839e-06, + "loss": 1.2041, + "step": 20593 + }, + { + "epoch": 6.133844635976097, + "grad_norm": 0.2538476288318634, + "learning_rate": 6.862166556328247e-06, + "loss": 1.2044, + "step": 20594 + }, + { + "epoch": 6.134142482175767, + "grad_norm": 0.5332903861999512, + "learning_rate": 6.861250720362141e-06, + "loss": 1.2126, + "step": 20595 + }, + { + "epoch": 6.134440328375435, + "grad_norm": 0.41133761405944824, + "learning_rate": 6.8603349135985915e-06, + "loss": 1.2158, + "step": 20596 + }, + { + "epoch": 6.134738174575104, + "grad_norm": 0.37557145953178406, + "learning_rate": 6.859419136046115e-06, + "loss": 1.2136, + "step": 20597 + }, + { + "epoch": 6.1350360207747725, + "grad_norm": 0.29362809658050537, + "learning_rate": 6.858503387713239e-06, + "loss": 1.2349, + "step": 20598 + }, + { + "epoch": 6.135333866974441, + "grad_norm": 0.5136938095092773, + "learning_rate": 6.857587668608479e-06, + "loss": 1.2078, + "step": 20599 + }, + { + "epoch": 6.13563171317411, + "grad_norm": 0.3157677948474884, + "learning_rate": 6.856671978740349e-06, + "loss": 1.2042, + "step": 20600 + }, + { + "epoch": 6.135929559373778, + "grad_norm": 0.4371340870857239, + "learning_rate": 6.8557563181173765e-06, + "loss": 1.2106, + "step": 20601 + }, + { + "epoch": 6.136227405573447, + "grad_norm": 0.2592601776123047, + "learning_rate": 6.854840686748082e-06, + "loss": 1.1945, + "step": 20602 + }, + { + "epoch": 6.136525251773116, + "grad_norm": 0.5364843606948853, + "learning_rate": 6.853925084640977e-06, + "loss": 1.1995, + "step": 20603 + }, + { + "epoch": 6.136823097972784, + "grad_norm": 0.2640324831008911, + "learning_rate": 6.853009511804587e-06, + "loss": 1.2213, + "step": 20604 + }, + { + "epoch": 6.137120944172453, + "grad_norm": 0.39366430044174194, + "learning_rate": 6.852093968247426e-06, + "loss": 1.212, + "step": 20605 + }, + { + "epoch": 6.137418790372122, + "grad_norm": 0.36515429615974426, + "learning_rate": 6.851178453978013e-06, + "loss": 1.1881, + "step": 20606 + }, + { + "epoch": 6.13771663657179, + "grad_norm": 0.3126497268676758, + "learning_rate": 6.850262969004866e-06, + "loss": 1.2115, + "step": 20607 + }, + { + "epoch": 6.1380144827714584, + "grad_norm": 0.3741849660873413, + "learning_rate": 6.849347513336501e-06, + "loss": 1.2258, + "step": 20608 + }, + { + "epoch": 6.138312328971128, + "grad_norm": 0.2869343161582947, + "learning_rate": 6.848432086981444e-06, + "loss": 1.204, + "step": 20609 + }, + { + "epoch": 6.138610175170796, + "grad_norm": 0.3380416929721832, + "learning_rate": 6.847516689948203e-06, + "loss": 1.2106, + "step": 20610 + }, + { + "epoch": 6.138908021370465, + "grad_norm": 0.32794860005378723, + "learning_rate": 6.846601322245293e-06, + "loss": 1.214, + "step": 20611 + }, + { + "epoch": 6.1392058675701335, + "grad_norm": 0.26346683502197266, + "learning_rate": 6.8456859838812396e-06, + "loss": 1.1959, + "step": 20612 + }, + { + "epoch": 6.139503713769802, + "grad_norm": 0.31447237730026245, + "learning_rate": 6.8447706748645515e-06, + "loss": 1.2156, + "step": 20613 + }, + { + "epoch": 6.139801559969471, + "grad_norm": 0.26520559191703796, + "learning_rate": 6.843855395203744e-06, + "loss": 1.2159, + "step": 20614 + }, + { + "epoch": 6.140099406169139, + "grad_norm": 0.25677022337913513, + "learning_rate": 6.842940144907341e-06, + "loss": 1.1995, + "step": 20615 + }, + { + "epoch": 6.140397252368808, + "grad_norm": 0.2796150743961334, + "learning_rate": 6.842024923983852e-06, + "loss": 1.2141, + "step": 20616 + }, + { + "epoch": 6.140695098568477, + "grad_norm": 0.27832838892936707, + "learning_rate": 6.841109732441788e-06, + "loss": 1.1975, + "step": 20617 + }, + { + "epoch": 6.140992944768145, + "grad_norm": 0.31540924310684204, + "learning_rate": 6.840194570289673e-06, + "loss": 1.2007, + "step": 20618 + }, + { + "epoch": 6.141290790967814, + "grad_norm": 0.3285495638847351, + "learning_rate": 6.8392794375360174e-06, + "loss": 1.2299, + "step": 20619 + }, + { + "epoch": 6.141588637167483, + "grad_norm": 0.3270295262336731, + "learning_rate": 6.838364334189331e-06, + "loss": 1.2279, + "step": 20620 + }, + { + "epoch": 6.141886483367151, + "grad_norm": 0.3397824764251709, + "learning_rate": 6.837449260258138e-06, + "loss": 1.2167, + "step": 20621 + }, + { + "epoch": 6.14218432956682, + "grad_norm": 0.3192901015281677, + "learning_rate": 6.836534215750945e-06, + "loss": 1.1943, + "step": 20622 + }, + { + "epoch": 6.142482175766489, + "grad_norm": 0.2750777006149292, + "learning_rate": 6.835619200676262e-06, + "loss": 1.2089, + "step": 20623 + }, + { + "epoch": 6.142780021966157, + "grad_norm": 0.3177313506603241, + "learning_rate": 6.834704215042609e-06, + "loss": 1.206, + "step": 20624 + }, + { + "epoch": 6.143077868165826, + "grad_norm": 0.2615659534931183, + "learning_rate": 6.833789258858496e-06, + "loss": 1.2098, + "step": 20625 + }, + { + "epoch": 6.143375714365495, + "grad_norm": 0.33818912506103516, + "learning_rate": 6.8328743321324394e-06, + "loss": 1.2055, + "step": 20626 + }, + { + "epoch": 6.143673560565163, + "grad_norm": 0.256344735622406, + "learning_rate": 6.8319594348729505e-06, + "loss": 1.2114, + "step": 20627 + }, + { + "epoch": 6.143971406764832, + "grad_norm": 0.2722576856613159, + "learning_rate": 6.831044567088533e-06, + "loss": 1.1923, + "step": 20628 + }, + { + "epoch": 6.1442692529645, + "grad_norm": 0.2654699683189392, + "learning_rate": 6.830129728787711e-06, + "loss": 1.2197, + "step": 20629 + }, + { + "epoch": 6.144567099164169, + "grad_norm": 0.2782585620880127, + "learning_rate": 6.829214919978987e-06, + "loss": 1.1987, + "step": 20630 + }, + { + "epoch": 6.144864945363838, + "grad_norm": 0.3233979344367981, + "learning_rate": 6.828300140670873e-06, + "loss": 1.204, + "step": 20631 + }, + { + "epoch": 6.145162791563506, + "grad_norm": 0.2728368937969208, + "learning_rate": 6.827385390871888e-06, + "loss": 1.1921, + "step": 20632 + }, + { + "epoch": 6.145460637763175, + "grad_norm": 0.2728160619735718, + "learning_rate": 6.826470670590537e-06, + "loss": 1.2112, + "step": 20633 + }, + { + "epoch": 6.145758483962844, + "grad_norm": 0.3183210790157318, + "learning_rate": 6.825555979835328e-06, + "loss": 1.21, + "step": 20634 + }, + { + "epoch": 6.146056330162512, + "grad_norm": 0.28341755270957947, + "learning_rate": 6.824641318614776e-06, + "loss": 1.2078, + "step": 20635 + }, + { + "epoch": 6.146354176362181, + "grad_norm": 0.2619353234767914, + "learning_rate": 6.823726686937388e-06, + "loss": 1.222, + "step": 20636 + }, + { + "epoch": 6.14665202256185, + "grad_norm": 0.28459495306015015, + "learning_rate": 6.822812084811672e-06, + "loss": 1.2158, + "step": 20637 + }, + { + "epoch": 6.146949868761518, + "grad_norm": 0.35960063338279724, + "learning_rate": 6.821897512246143e-06, + "loss": 1.2057, + "step": 20638 + }, + { + "epoch": 6.147247714961187, + "grad_norm": 0.2918333411216736, + "learning_rate": 6.820982969249308e-06, + "loss": 1.211, + "step": 20639 + }, + { + "epoch": 6.147545561160856, + "grad_norm": 0.27319374680519104, + "learning_rate": 6.820068455829669e-06, + "loss": 1.2031, + "step": 20640 + }, + { + "epoch": 6.147843407360524, + "grad_norm": 0.2744782269001007, + "learning_rate": 6.819153971995745e-06, + "loss": 1.2069, + "step": 20641 + }, + { + "epoch": 6.148141253560193, + "grad_norm": 0.3396849036216736, + "learning_rate": 6.8182395177560365e-06, + "loss": 1.2267, + "step": 20642 + }, + { + "epoch": 6.1484390997598615, + "grad_norm": 0.24592465162277222, + "learning_rate": 6.817325093119056e-06, + "loss": 1.2042, + "step": 20643 + }, + { + "epoch": 6.14873694595953, + "grad_norm": 0.36841118335723877, + "learning_rate": 6.816410698093312e-06, + "loss": 1.2175, + "step": 20644 + }, + { + "epoch": 6.149034792159199, + "grad_norm": 0.2556819021701813, + "learning_rate": 6.815496332687304e-06, + "loss": 1.213, + "step": 20645 + }, + { + "epoch": 6.149332638358867, + "grad_norm": 0.3732451796531677, + "learning_rate": 6.81458199690955e-06, + "loss": 1.2075, + "step": 20646 + }, + { + "epoch": 6.149630484558536, + "grad_norm": 0.30606967210769653, + "learning_rate": 6.8136676907685486e-06, + "loss": 1.2126, + "step": 20647 + }, + { + "epoch": 6.149928330758205, + "grad_norm": 0.3908883035182953, + "learning_rate": 6.812753414272808e-06, + "loss": 1.2158, + "step": 20648 + }, + { + "epoch": 6.150226176957873, + "grad_norm": 0.38985610008239746, + "learning_rate": 6.811839167430841e-06, + "loss": 1.2004, + "step": 20649 + }, + { + "epoch": 6.150524023157542, + "grad_norm": 0.42454275488853455, + "learning_rate": 6.8109249502511465e-06, + "loss": 1.219, + "step": 20650 + }, + { + "epoch": 6.150821869357211, + "grad_norm": 0.30039963126182556, + "learning_rate": 6.810010762742229e-06, + "loss": 1.2115, + "step": 20651 + }, + { + "epoch": 6.151119715556879, + "grad_norm": 0.30101659893989563, + "learning_rate": 6.8090966049126015e-06, + "loss": 1.1975, + "step": 20652 + }, + { + "epoch": 6.151417561756548, + "grad_norm": 0.3211324214935303, + "learning_rate": 6.8081824767707626e-06, + "loss": 1.2071, + "step": 20653 + }, + { + "epoch": 6.151715407956217, + "grad_norm": 0.30729320645332336, + "learning_rate": 6.807268378325218e-06, + "loss": 1.2117, + "step": 20654 + }, + { + "epoch": 6.152013254155885, + "grad_norm": 0.3042641282081604, + "learning_rate": 6.806354309584477e-06, + "loss": 1.216, + "step": 20655 + }, + { + "epoch": 6.152311100355554, + "grad_norm": 0.24978935718536377, + "learning_rate": 6.805440270557042e-06, + "loss": 1.2189, + "step": 20656 + }, + { + "epoch": 6.1526089465552225, + "grad_norm": 0.31557783484458923, + "learning_rate": 6.804526261251413e-06, + "loss": 1.2128, + "step": 20657 + }, + { + "epoch": 6.152906792754891, + "grad_norm": 0.2657829523086548, + "learning_rate": 6.8036122816760995e-06, + "loss": 1.2245, + "step": 20658 + }, + { + "epoch": 6.15320463895456, + "grad_norm": 0.2974882125854492, + "learning_rate": 6.802698331839599e-06, + "loss": 1.1951, + "step": 20659 + }, + { + "epoch": 6.153502485154228, + "grad_norm": 0.28717219829559326, + "learning_rate": 6.8017844117504205e-06, + "loss": 1.2357, + "step": 20660 + }, + { + "epoch": 6.153800331353897, + "grad_norm": 0.33680206537246704, + "learning_rate": 6.800870521417067e-06, + "loss": 1.2107, + "step": 20661 + }, + { + "epoch": 6.154098177553566, + "grad_norm": 0.2575487494468689, + "learning_rate": 6.799956660848034e-06, + "loss": 1.213, + "step": 20662 + }, + { + "epoch": 6.154396023753234, + "grad_norm": 0.43166589736938477, + "learning_rate": 6.799042830051834e-06, + "loss": 1.1901, + "step": 20663 + }, + { + "epoch": 6.1546938699529035, + "grad_norm": 0.2783490717411041, + "learning_rate": 6.7981290290369615e-06, + "loss": 1.2104, + "step": 20664 + }, + { + "epoch": 6.154991716152572, + "grad_norm": 0.5312780737876892, + "learning_rate": 6.797215257811921e-06, + "loss": 1.2145, + "step": 20665 + }, + { + "epoch": 6.15528956235224, + "grad_norm": 0.25795263051986694, + "learning_rate": 6.796301516385214e-06, + "loss": 1.1946, + "step": 20666 + }, + { + "epoch": 6.155587408551909, + "grad_norm": 0.28236448764801025, + "learning_rate": 6.7953878047653455e-06, + "loss": 1.2198, + "step": 20667 + }, + { + "epoch": 6.155885254751578, + "grad_norm": 0.36620813608169556, + "learning_rate": 6.794474122960808e-06, + "loss": 1.1893, + "step": 20668 + }, + { + "epoch": 6.156183100951246, + "grad_norm": 0.2770450711250305, + "learning_rate": 6.793560470980111e-06, + "loss": 1.2108, + "step": 20669 + }, + { + "epoch": 6.156480947150915, + "grad_norm": 0.3194955587387085, + "learning_rate": 6.79264684883175e-06, + "loss": 1.2147, + "step": 20670 + }, + { + "epoch": 6.1567787933505835, + "grad_norm": 0.24974024295806885, + "learning_rate": 6.791733256524225e-06, + "loss": 1.2192, + "step": 20671 + }, + { + "epoch": 6.157076639550252, + "grad_norm": 0.28228843212127686, + "learning_rate": 6.79081969406604e-06, + "loss": 1.2072, + "step": 20672 + }, + { + "epoch": 6.157374485749921, + "grad_norm": 0.26873913407325745, + "learning_rate": 6.789906161465689e-06, + "loss": 1.2217, + "step": 20673 + }, + { + "epoch": 6.157672331949589, + "grad_norm": 0.26520776748657227, + "learning_rate": 6.788992658731679e-06, + "loss": 1.2107, + "step": 20674 + }, + { + "epoch": 6.157970178149258, + "grad_norm": 0.32572123408317566, + "learning_rate": 6.7880791858725046e-06, + "loss": 1.1901, + "step": 20675 + }, + { + "epoch": 6.158268024348927, + "grad_norm": 0.26272010803222656, + "learning_rate": 6.787165742896659e-06, + "loss": 1.2036, + "step": 20676 + }, + { + "epoch": 6.158565870548595, + "grad_norm": 0.34784695506095886, + "learning_rate": 6.786252329812652e-06, + "loss": 1.2168, + "step": 20677 + }, + { + "epoch": 6.1588637167482645, + "grad_norm": 0.26512646675109863, + "learning_rate": 6.785338946628977e-06, + "loss": 1.2073, + "step": 20678 + }, + { + "epoch": 6.159161562947933, + "grad_norm": 0.33080342411994934, + "learning_rate": 6.784425593354127e-06, + "loss": 1.206, + "step": 20679 + }, + { + "epoch": 6.159459409147601, + "grad_norm": 0.2957414388656616, + "learning_rate": 6.783512269996609e-06, + "loss": 1.2076, + "step": 20680 + }, + { + "epoch": 6.15975725534727, + "grad_norm": 0.2581091523170471, + "learning_rate": 6.782598976564914e-06, + "loss": 1.2217, + "step": 20681 + }, + { + "epoch": 6.160055101546939, + "grad_norm": 0.27238890528678894, + "learning_rate": 6.781685713067538e-06, + "loss": 1.2195, + "step": 20682 + }, + { + "epoch": 6.160352947746607, + "grad_norm": 0.41715824604034424, + "learning_rate": 6.780772479512983e-06, + "loss": 1.2128, + "step": 20683 + }, + { + "epoch": 6.160650793946276, + "grad_norm": 0.27436164021492004, + "learning_rate": 6.7798592759097446e-06, + "loss": 1.2188, + "step": 20684 + }, + { + "epoch": 6.160948640145945, + "grad_norm": 0.5577759146690369, + "learning_rate": 6.778946102266314e-06, + "loss": 1.2183, + "step": 20685 + }, + { + "epoch": 6.161246486345613, + "grad_norm": 0.2873111069202423, + "learning_rate": 6.7780329585911965e-06, + "loss": 1.2105, + "step": 20686 + }, + { + "epoch": 6.161544332545282, + "grad_norm": 0.36976149678230286, + "learning_rate": 6.777119844892881e-06, + "loss": 1.205, + "step": 20687 + }, + { + "epoch": 6.16184217874495, + "grad_norm": 0.2958317697048187, + "learning_rate": 6.77620676117986e-06, + "loss": 1.1901, + "step": 20688 + }, + { + "epoch": 6.16214002494462, + "grad_norm": 0.3038024604320526, + "learning_rate": 6.775293707460637e-06, + "loss": 1.2088, + "step": 20689 + }, + { + "epoch": 6.162437871144288, + "grad_norm": 0.3150775730609894, + "learning_rate": 6.7743806837437e-06, + "loss": 1.2174, + "step": 20690 + }, + { + "epoch": 6.162735717343956, + "grad_norm": 0.4162593483924866, + "learning_rate": 6.7734676900375514e-06, + "loss": 1.2216, + "step": 20691 + }, + { + "epoch": 6.1630335635436255, + "grad_norm": 0.27144870162010193, + "learning_rate": 6.772554726350682e-06, + "loss": 1.2097, + "step": 20692 + }, + { + "epoch": 6.163331409743294, + "grad_norm": 0.32726457715034485, + "learning_rate": 6.771641792691579e-06, + "loss": 1.2005, + "step": 20693 + }, + { + "epoch": 6.163629255942962, + "grad_norm": 0.2827392518520355, + "learning_rate": 6.770728889068747e-06, + "loss": 1.2072, + "step": 20694 + }, + { + "epoch": 6.163927102142631, + "grad_norm": 0.33994659781455994, + "learning_rate": 6.769816015490674e-06, + "loss": 1.2264, + "step": 20695 + }, + { + "epoch": 6.1642249483423, + "grad_norm": 0.2557581663131714, + "learning_rate": 6.768903171965849e-06, + "loss": 1.2027, + "step": 20696 + }, + { + "epoch": 6.164522794541968, + "grad_norm": 0.2889869213104248, + "learning_rate": 6.767990358502776e-06, + "loss": 1.1965, + "step": 20697 + }, + { + "epoch": 6.164820640741637, + "grad_norm": 0.44061148166656494, + "learning_rate": 6.767077575109942e-06, + "loss": 1.2048, + "step": 20698 + }, + { + "epoch": 6.165118486941306, + "grad_norm": 0.33885693550109863, + "learning_rate": 6.766164821795836e-06, + "loss": 1.2063, + "step": 20699 + }, + { + "epoch": 6.165416333140974, + "grad_norm": 0.2890945076942444, + "learning_rate": 6.765252098568953e-06, + "loss": 1.207, + "step": 20700 + }, + { + "epoch": 6.165714179340643, + "grad_norm": 0.296283096075058, + "learning_rate": 6.764339405437788e-06, + "loss": 1.2044, + "step": 20701 + }, + { + "epoch": 6.1660120255403115, + "grad_norm": 0.3410748243331909, + "learning_rate": 6.7634267424108255e-06, + "loss": 1.1972, + "step": 20702 + }, + { + "epoch": 6.166309871739981, + "grad_norm": 0.3115783929824829, + "learning_rate": 6.762514109496565e-06, + "loss": 1.2059, + "step": 20703 + }, + { + "epoch": 6.166607717939649, + "grad_norm": 0.2976948618888855, + "learning_rate": 6.761601506703495e-06, + "loss": 1.2224, + "step": 20704 + }, + { + "epoch": 6.166905564139317, + "grad_norm": 0.35263922810554504, + "learning_rate": 6.760688934040098e-06, + "loss": 1.2024, + "step": 20705 + }, + { + "epoch": 6.167203410338987, + "grad_norm": 0.3121910095214844, + "learning_rate": 6.759776391514876e-06, + "loss": 1.1983, + "step": 20706 + }, + { + "epoch": 6.167501256538655, + "grad_norm": 0.3365488052368164, + "learning_rate": 6.758863879136314e-06, + "loss": 1.2094, + "step": 20707 + }, + { + "epoch": 6.167799102738323, + "grad_norm": 0.2783413827419281, + "learning_rate": 6.7579513969129006e-06, + "loss": 1.209, + "step": 20708 + }, + { + "epoch": 6.168096948937992, + "grad_norm": 0.3695756196975708, + "learning_rate": 6.757038944853131e-06, + "loss": 1.1872, + "step": 20709 + }, + { + "epoch": 6.168394795137661, + "grad_norm": 0.33527496457099915, + "learning_rate": 6.756126522965487e-06, + "loss": 1.2287, + "step": 20710 + }, + { + "epoch": 6.168692641337329, + "grad_norm": 0.36470115184783936, + "learning_rate": 6.755214131258465e-06, + "loss": 1.1932, + "step": 20711 + }, + { + "epoch": 6.168990487536998, + "grad_norm": 0.4481731057167053, + "learning_rate": 6.754301769740548e-06, + "loss": 1.2142, + "step": 20712 + }, + { + "epoch": 6.169288333736667, + "grad_norm": 0.33711037039756775, + "learning_rate": 6.753389438420224e-06, + "loss": 1.2162, + "step": 20713 + }, + { + "epoch": 6.169586179936335, + "grad_norm": 0.44622817635536194, + "learning_rate": 6.752477137305989e-06, + "loss": 1.2085, + "step": 20714 + }, + { + "epoch": 6.169884026136004, + "grad_norm": 0.274313360452652, + "learning_rate": 6.751564866406326e-06, + "loss": 1.2047, + "step": 20715 + }, + { + "epoch": 6.1701818723356725, + "grad_norm": 0.4805906116962433, + "learning_rate": 6.750652625729718e-06, + "loss": 1.2282, + "step": 20716 + }, + { + "epoch": 6.170479718535342, + "grad_norm": 0.2658785283565521, + "learning_rate": 6.74974041528466e-06, + "loss": 1.2098, + "step": 20717 + }, + { + "epoch": 6.17077756473501, + "grad_norm": 0.39714986085891724, + "learning_rate": 6.748828235079636e-06, + "loss": 1.2139, + "step": 20718 + }, + { + "epoch": 6.171075410934678, + "grad_norm": 0.30077528953552246, + "learning_rate": 6.74791608512313e-06, + "loss": 1.2116, + "step": 20719 + }, + { + "epoch": 6.171373257134348, + "grad_norm": 0.26328808069229126, + "learning_rate": 6.747003965423635e-06, + "loss": 1.2043, + "step": 20720 + }, + { + "epoch": 6.171671103334016, + "grad_norm": 0.3597334325313568, + "learning_rate": 6.7460918759896334e-06, + "loss": 1.183, + "step": 20721 + }, + { + "epoch": 6.171968949533684, + "grad_norm": 0.25994551181793213, + "learning_rate": 6.745179816829608e-06, + "loss": 1.2058, + "step": 20722 + }, + { + "epoch": 6.1722667957333535, + "grad_norm": 0.2922576665878296, + "learning_rate": 6.7442677879520524e-06, + "loss": 1.2153, + "step": 20723 + }, + { + "epoch": 6.172564641933022, + "grad_norm": 0.27364155650138855, + "learning_rate": 6.743355789365442e-06, + "loss": 1.2084, + "step": 20724 + }, + { + "epoch": 6.17286248813269, + "grad_norm": 0.26237180829048157, + "learning_rate": 6.7424438210782706e-06, + "loss": 1.2111, + "step": 20725 + }, + { + "epoch": 6.173160334332359, + "grad_norm": 0.2967134714126587, + "learning_rate": 6.741531883099022e-06, + "loss": 1.1999, + "step": 20726 + }, + { + "epoch": 6.173458180532028, + "grad_norm": 0.29727038741111755, + "learning_rate": 6.740619975436174e-06, + "loss": 1.2067, + "step": 20727 + }, + { + "epoch": 6.173756026731697, + "grad_norm": 0.39990097284317017, + "learning_rate": 6.739708098098221e-06, + "loss": 1.1846, + "step": 20728 + }, + { + "epoch": 6.174053872931365, + "grad_norm": 0.44286075234413147, + "learning_rate": 6.738796251093638e-06, + "loss": 1.2186, + "step": 20729 + }, + { + "epoch": 6.1743517191310335, + "grad_norm": 0.48298612236976624, + "learning_rate": 6.737884434430912e-06, + "loss": 1.1968, + "step": 20730 + }, + { + "epoch": 6.174649565330703, + "grad_norm": 0.36918771266937256, + "learning_rate": 6.736972648118529e-06, + "loss": 1.1972, + "step": 20731 + }, + { + "epoch": 6.174947411530371, + "grad_norm": 0.2885284125804901, + "learning_rate": 6.73606089216497e-06, + "loss": 1.2213, + "step": 20732 + }, + { + "epoch": 6.175245257730039, + "grad_norm": 0.43381965160369873, + "learning_rate": 6.735149166578714e-06, + "loss": 1.2092, + "step": 20733 + }, + { + "epoch": 6.175543103929709, + "grad_norm": 0.2930415868759155, + "learning_rate": 6.73423747136825e-06, + "loss": 1.2052, + "step": 20734 + }, + { + "epoch": 6.175840950129377, + "grad_norm": 0.3443886339664459, + "learning_rate": 6.733325806542057e-06, + "loss": 1.2113, + "step": 20735 + }, + { + "epoch": 6.176138796329045, + "grad_norm": 0.2660033404827118, + "learning_rate": 6.732414172108615e-06, + "loss": 1.2069, + "step": 20736 + }, + { + "epoch": 6.1764366425287145, + "grad_norm": 0.35309046506881714, + "learning_rate": 6.731502568076413e-06, + "loss": 1.2118, + "step": 20737 + }, + { + "epoch": 6.176734488728383, + "grad_norm": 0.25800788402557373, + "learning_rate": 6.730590994453928e-06, + "loss": 1.1957, + "step": 20738 + }, + { + "epoch": 6.177032334928051, + "grad_norm": 0.3657929301261902, + "learning_rate": 6.729679451249638e-06, + "loss": 1.2179, + "step": 20739 + }, + { + "epoch": 6.17733018112772, + "grad_norm": 0.28903043270111084, + "learning_rate": 6.728767938472028e-06, + "loss": 1.1813, + "step": 20740 + }, + { + "epoch": 6.177628027327389, + "grad_norm": 0.29322391748428345, + "learning_rate": 6.727856456129576e-06, + "loss": 1.215, + "step": 20741 + }, + { + "epoch": 6.177925873527057, + "grad_norm": 0.2867197096347809, + "learning_rate": 6.726945004230765e-06, + "loss": 1.2243, + "step": 20742 + }, + { + "epoch": 6.178223719726726, + "grad_norm": 0.3531189262866974, + "learning_rate": 6.726033582784075e-06, + "loss": 1.2009, + "step": 20743 + }, + { + "epoch": 6.178521565926395, + "grad_norm": 0.2790989279747009, + "learning_rate": 6.725122191797981e-06, + "loss": 1.2087, + "step": 20744 + }, + { + "epoch": 6.178819412126064, + "grad_norm": 0.27653998136520386, + "learning_rate": 6.724210831280971e-06, + "loss": 1.2423, + "step": 20745 + }, + { + "epoch": 6.179117258325732, + "grad_norm": 0.37640488147735596, + "learning_rate": 6.723299501241518e-06, + "loss": 1.2186, + "step": 20746 + }, + { + "epoch": 6.1794151045254, + "grad_norm": 0.2798153758049011, + "learning_rate": 6.722388201688099e-06, + "loss": 1.2068, + "step": 20747 + }, + { + "epoch": 6.17971295072507, + "grad_norm": 0.2653747498989105, + "learning_rate": 6.721476932629199e-06, + "loss": 1.2119, + "step": 20748 + }, + { + "epoch": 6.180010796924738, + "grad_norm": 0.33432847261428833, + "learning_rate": 6.720565694073294e-06, + "loss": 1.2048, + "step": 20749 + }, + { + "epoch": 6.180308643124406, + "grad_norm": 0.26739978790283203, + "learning_rate": 6.7196544860288555e-06, + "loss": 1.2073, + "step": 20750 + }, + { + "epoch": 6.1806064893240755, + "grad_norm": 0.3965112566947937, + "learning_rate": 6.718743308504374e-06, + "loss": 1.2218, + "step": 20751 + }, + { + "epoch": 6.180904335523744, + "grad_norm": 0.25039783120155334, + "learning_rate": 6.717832161508315e-06, + "loss": 1.2079, + "step": 20752 + }, + { + "epoch": 6.181202181723412, + "grad_norm": 0.4873012602329254, + "learning_rate": 6.716921045049163e-06, + "loss": 1.2127, + "step": 20753 + }, + { + "epoch": 6.181500027923081, + "grad_norm": 0.28977301716804504, + "learning_rate": 6.716009959135392e-06, + "loss": 1.2049, + "step": 20754 + }, + { + "epoch": 6.18179787412275, + "grad_norm": 0.2853916585445404, + "learning_rate": 6.715098903775481e-06, + "loss": 1.2306, + "step": 20755 + }, + { + "epoch": 6.182095720322419, + "grad_norm": 0.27583566308021545, + "learning_rate": 6.7141878789779e-06, + "loss": 1.2248, + "step": 20756 + }, + { + "epoch": 6.182393566522087, + "grad_norm": 0.3771075904369354, + "learning_rate": 6.7132768847511354e-06, + "loss": 1.1998, + "step": 20757 + }, + { + "epoch": 6.182691412721756, + "grad_norm": 0.2786421477794647, + "learning_rate": 6.712365921103652e-06, + "loss": 1.1987, + "step": 20758 + }, + { + "epoch": 6.182989258921425, + "grad_norm": 0.36081111431121826, + "learning_rate": 6.711454988043933e-06, + "loss": 1.2124, + "step": 20759 + }, + { + "epoch": 6.183287105121093, + "grad_norm": 0.2855590283870697, + "learning_rate": 6.710544085580452e-06, + "loss": 1.2283, + "step": 20760 + }, + { + "epoch": 6.1835849513207615, + "grad_norm": 0.26320427656173706, + "learning_rate": 6.70963321372168e-06, + "loss": 1.2275, + "step": 20761 + }, + { + "epoch": 6.183882797520431, + "grad_norm": 0.29471349716186523, + "learning_rate": 6.7087223724760995e-06, + "loss": 1.212, + "step": 20762 + }, + { + "epoch": 6.184180643720099, + "grad_norm": 0.33346307277679443, + "learning_rate": 6.70781156185218e-06, + "loss": 1.2134, + "step": 20763 + }, + { + "epoch": 6.184478489919767, + "grad_norm": 0.31599733233451843, + "learning_rate": 6.706900781858389e-06, + "loss": 1.2096, + "step": 20764 + }, + { + "epoch": 6.184776336119437, + "grad_norm": 0.2737686038017273, + "learning_rate": 6.705990032503211e-06, + "loss": 1.1987, + "step": 20765 + }, + { + "epoch": 6.185074182319105, + "grad_norm": 0.3117977976799011, + "learning_rate": 6.7050793137951185e-06, + "loss": 1.2282, + "step": 20766 + }, + { + "epoch": 6.185372028518773, + "grad_norm": 0.29927635192871094, + "learning_rate": 6.704168625742576e-06, + "loss": 1.2034, + "step": 20767 + }, + { + "epoch": 6.185669874718442, + "grad_norm": 0.28418171405792236, + "learning_rate": 6.703257968354066e-06, + "loss": 1.2164, + "step": 20768 + }, + { + "epoch": 6.185967720918111, + "grad_norm": 0.28196296095848083, + "learning_rate": 6.702347341638059e-06, + "loss": 1.2081, + "step": 20769 + }, + { + "epoch": 6.18626556711778, + "grad_norm": 0.3697756230831146, + "learning_rate": 6.70143674560302e-06, + "loss": 1.207, + "step": 20770 + }, + { + "epoch": 6.186563413317448, + "grad_norm": 0.2549345791339874, + "learning_rate": 6.7005261802574296e-06, + "loss": 1.2085, + "step": 20771 + }, + { + "epoch": 6.186861259517117, + "grad_norm": 0.327949196100235, + "learning_rate": 6.699615645609758e-06, + "loss": 1.2056, + "step": 20772 + }, + { + "epoch": 6.187159105716786, + "grad_norm": 0.2568923532962799, + "learning_rate": 6.698705141668473e-06, + "loss": 1.2151, + "step": 20773 + }, + { + "epoch": 6.187456951916454, + "grad_norm": 0.3360218405723572, + "learning_rate": 6.697794668442051e-06, + "loss": 1.1999, + "step": 20774 + }, + { + "epoch": 6.1877547981161225, + "grad_norm": 0.2906973958015442, + "learning_rate": 6.6968842259389556e-06, + "loss": 1.2088, + "step": 20775 + }, + { + "epoch": 6.188052644315792, + "grad_norm": 0.45060616731643677, + "learning_rate": 6.695973814167667e-06, + "loss": 1.233, + "step": 20776 + }, + { + "epoch": 6.18835049051546, + "grad_norm": 0.266433447599411, + "learning_rate": 6.695063433136648e-06, + "loss": 1.2036, + "step": 20777 + }, + { + "epoch": 6.188648336715128, + "grad_norm": 0.2839943468570709, + "learning_rate": 6.694153082854369e-06, + "loss": 1.2134, + "step": 20778 + }, + { + "epoch": 6.188946182914798, + "grad_norm": 0.4276580214500427, + "learning_rate": 6.693242763329308e-06, + "loss": 1.2076, + "step": 20779 + }, + { + "epoch": 6.189244029114466, + "grad_norm": 0.36513426899909973, + "learning_rate": 6.692332474569927e-06, + "loss": 1.1947, + "step": 20780 + }, + { + "epoch": 6.189541875314134, + "grad_norm": 0.5294678807258606, + "learning_rate": 6.691422216584692e-06, + "loss": 1.2127, + "step": 20781 + }, + { + "epoch": 6.1898397215138035, + "grad_norm": 0.5186043977737427, + "learning_rate": 6.690511989382082e-06, + "loss": 1.2116, + "step": 20782 + }, + { + "epoch": 6.190137567713472, + "grad_norm": 0.4168766140937805, + "learning_rate": 6.689601792970558e-06, + "loss": 1.2145, + "step": 20783 + }, + { + "epoch": 6.190435413913141, + "grad_norm": 0.34949246048927307, + "learning_rate": 6.688691627358587e-06, + "loss": 1.2037, + "step": 20784 + }, + { + "epoch": 6.190733260112809, + "grad_norm": 0.4944717586040497, + "learning_rate": 6.687781492554648e-06, + "loss": 1.2222, + "step": 20785 + }, + { + "epoch": 6.191031106312478, + "grad_norm": 0.29145970940589905, + "learning_rate": 6.6868713885672e-06, + "loss": 1.2034, + "step": 20786 + }, + { + "epoch": 6.191328952512147, + "grad_norm": 0.6446349620819092, + "learning_rate": 6.685961315404708e-06, + "loss": 1.2172, + "step": 20787 + }, + { + "epoch": 6.191626798711815, + "grad_norm": 0.2638005018234253, + "learning_rate": 6.6850512730756455e-06, + "loss": 1.2138, + "step": 20788 + }, + { + "epoch": 6.1919246449114835, + "grad_norm": 0.34286072850227356, + "learning_rate": 6.684141261588477e-06, + "loss": 1.2174, + "step": 20789 + }, + { + "epoch": 6.192222491111153, + "grad_norm": 0.5390883684158325, + "learning_rate": 6.68323128095167e-06, + "loss": 1.1962, + "step": 20790 + }, + { + "epoch": 6.192520337310821, + "grad_norm": 0.33346831798553467, + "learning_rate": 6.682321331173691e-06, + "loss": 1.2121, + "step": 20791 + }, + { + "epoch": 6.192818183510489, + "grad_norm": 0.6277686357498169, + "learning_rate": 6.6814114122630025e-06, + "loss": 1.2138, + "step": 20792 + }, + { + "epoch": 6.193116029710159, + "grad_norm": 0.31872302293777466, + "learning_rate": 6.680501524228077e-06, + "loss": 1.2071, + "step": 20793 + }, + { + "epoch": 6.193413875909827, + "grad_norm": 0.4405876398086548, + "learning_rate": 6.679591667077374e-06, + "loss": 1.2147, + "step": 20794 + }, + { + "epoch": 6.193711722109496, + "grad_norm": 0.29229310154914856, + "learning_rate": 6.678681840819357e-06, + "loss": 1.2191, + "step": 20795 + }, + { + "epoch": 6.1940095683091645, + "grad_norm": 0.3992125391960144, + "learning_rate": 6.6777720454625e-06, + "loss": 1.219, + "step": 20796 + }, + { + "epoch": 6.194307414508833, + "grad_norm": 0.4436438977718353, + "learning_rate": 6.676862281015264e-06, + "loss": 1.2215, + "step": 20797 + }, + { + "epoch": 6.194605260708502, + "grad_norm": 0.28378355503082275, + "learning_rate": 6.6759525474861055e-06, + "loss": 1.2069, + "step": 20798 + }, + { + "epoch": 6.19490310690817, + "grad_norm": 0.44195660948753357, + "learning_rate": 6.675042844883499e-06, + "loss": 1.2166, + "step": 20799 + }, + { + "epoch": 6.195200953107839, + "grad_norm": 0.335060179233551, + "learning_rate": 6.674133173215902e-06, + "loss": 1.2144, + "step": 20800 + }, + { + "epoch": 6.195498799307508, + "grad_norm": 0.4112803041934967, + "learning_rate": 6.673223532491778e-06, + "loss": 1.2142, + "step": 20801 + }, + { + "epoch": 6.195796645507176, + "grad_norm": 0.37522804737091064, + "learning_rate": 6.672313922719597e-06, + "loss": 1.1878, + "step": 20802 + }, + { + "epoch": 6.196094491706845, + "grad_norm": 0.2578239142894745, + "learning_rate": 6.671404343907817e-06, + "loss": 1.2158, + "step": 20803 + }, + { + "epoch": 6.196392337906514, + "grad_norm": 0.5630843043327332, + "learning_rate": 6.670494796064895e-06, + "loss": 1.2154, + "step": 20804 + }, + { + "epoch": 6.196690184106182, + "grad_norm": 0.2554631233215332, + "learning_rate": 6.6695852791993044e-06, + "loss": 1.2053, + "step": 20805 + }, + { + "epoch": 6.19698803030585, + "grad_norm": 0.3429364264011383, + "learning_rate": 6.6686757933194965e-06, + "loss": 1.2028, + "step": 20806 + }, + { + "epoch": 6.19728587650552, + "grad_norm": 0.47954127192497253, + "learning_rate": 6.6677663384339405e-06, + "loss": 1.2335, + "step": 20807 + }, + { + "epoch": 6.197583722705188, + "grad_norm": 0.26083940267562866, + "learning_rate": 6.6668569145511e-06, + "loss": 1.2032, + "step": 20808 + }, + { + "epoch": 6.197881568904856, + "grad_norm": 0.4446563720703125, + "learning_rate": 6.665947521679425e-06, + "loss": 1.2124, + "step": 20809 + }, + { + "epoch": 6.1981794151045255, + "grad_norm": 0.41570764780044556, + "learning_rate": 6.665038159827391e-06, + "loss": 1.2264, + "step": 20810 + }, + { + "epoch": 6.198477261304194, + "grad_norm": 0.3224899172782898, + "learning_rate": 6.664128829003445e-06, + "loss": 1.2037, + "step": 20811 + }, + { + "epoch": 6.198775107503863, + "grad_norm": 0.4793171286582947, + "learning_rate": 6.663219529216055e-06, + "loss": 1.1908, + "step": 20812 + }, + { + "epoch": 6.199072953703531, + "grad_norm": 0.32333850860595703, + "learning_rate": 6.662310260473679e-06, + "loss": 1.2095, + "step": 20813 + }, + { + "epoch": 6.1993707999032, + "grad_norm": 0.5674697160720825, + "learning_rate": 6.661401022784779e-06, + "loss": 1.1854, + "step": 20814 + }, + { + "epoch": 6.199668646102869, + "grad_norm": 0.3417503237724304, + "learning_rate": 6.660491816157808e-06, + "loss": 1.1886, + "step": 20815 + }, + { + "epoch": 6.199966492302537, + "grad_norm": 0.36498722434043884, + "learning_rate": 6.6595826406012344e-06, + "loss": 1.2062, + "step": 20816 + }, + { + "epoch": 6.200264338502206, + "grad_norm": 0.3774951100349426, + "learning_rate": 6.658673496123509e-06, + "loss": 1.2164, + "step": 20817 + }, + { + "epoch": 6.200562184701875, + "grad_norm": 0.26584306359291077, + "learning_rate": 6.657764382733094e-06, + "loss": 1.2104, + "step": 20818 + }, + { + "epoch": 6.200860030901543, + "grad_norm": 0.31986191868782043, + "learning_rate": 6.656855300438447e-06, + "loss": 1.212, + "step": 20819 + }, + { + "epoch": 6.2011578771012115, + "grad_norm": 0.39728841185569763, + "learning_rate": 6.6559462492480295e-06, + "loss": 1.2232, + "step": 20820 + }, + { + "epoch": 6.201455723300881, + "grad_norm": 0.2903613746166229, + "learning_rate": 6.655037229170291e-06, + "loss": 1.2213, + "step": 20821 + }, + { + "epoch": 6.201753569500549, + "grad_norm": 0.36729517579078674, + "learning_rate": 6.654128240213697e-06, + "loss": 1.2195, + "step": 20822 + }, + { + "epoch": 6.202051415700218, + "grad_norm": 0.29853418469429016, + "learning_rate": 6.6532192823867006e-06, + "loss": 1.1987, + "step": 20823 + }, + { + "epoch": 6.202349261899887, + "grad_norm": 0.3352545201778412, + "learning_rate": 6.652310355697759e-06, + "loss": 1.206, + "step": 20824 + }, + { + "epoch": 6.202647108099555, + "grad_norm": 0.39215201139450073, + "learning_rate": 6.651401460155331e-06, + "loss": 1.2086, + "step": 20825 + }, + { + "epoch": 6.202944954299224, + "grad_norm": 0.24427476525306702, + "learning_rate": 6.650492595767868e-06, + "loss": 1.2051, + "step": 20826 + }, + { + "epoch": 6.203242800498892, + "grad_norm": 0.3072798550128937, + "learning_rate": 6.649583762543833e-06, + "loss": 1.2118, + "step": 20827 + }, + { + "epoch": 6.203540646698561, + "grad_norm": 0.37357524037361145, + "learning_rate": 6.648674960491677e-06, + "loss": 1.211, + "step": 20828 + }, + { + "epoch": 6.20383849289823, + "grad_norm": 0.40315720438957214, + "learning_rate": 6.647766189619853e-06, + "loss": 1.2026, + "step": 20829 + }, + { + "epoch": 6.204136339097898, + "grad_norm": 0.2604370415210724, + "learning_rate": 6.646857449936821e-06, + "loss": 1.2015, + "step": 20830 + }, + { + "epoch": 6.204434185297567, + "grad_norm": 0.2628321945667267, + "learning_rate": 6.645948741451037e-06, + "loss": 1.2076, + "step": 20831 + }, + { + "epoch": 6.204732031497236, + "grad_norm": 0.3860684633255005, + "learning_rate": 6.645040064170948e-06, + "loss": 1.1966, + "step": 20832 + }, + { + "epoch": 6.205029877696904, + "grad_norm": 0.31014302372932434, + "learning_rate": 6.644131418105017e-06, + "loss": 1.2074, + "step": 20833 + }, + { + "epoch": 6.2053277238965725, + "grad_norm": 0.314895898103714, + "learning_rate": 6.643222803261693e-06, + "loss": 1.198, + "step": 20834 + }, + { + "epoch": 6.205625570096242, + "grad_norm": 0.34027209877967834, + "learning_rate": 6.642314219649426e-06, + "loss": 1.222, + "step": 20835 + }, + { + "epoch": 6.20592341629591, + "grad_norm": 0.29028600454330444, + "learning_rate": 6.6414056672766765e-06, + "loss": 1.1941, + "step": 20836 + }, + { + "epoch": 6.206221262495579, + "grad_norm": 0.302869588136673, + "learning_rate": 6.640497146151898e-06, + "loss": 1.2186, + "step": 20837 + }, + { + "epoch": 6.206519108695248, + "grad_norm": 0.34623581171035767, + "learning_rate": 6.639588656283535e-06, + "loss": 1.2018, + "step": 20838 + }, + { + "epoch": 6.206816954894916, + "grad_norm": 0.27471330761909485, + "learning_rate": 6.638680197680049e-06, + "loss": 1.1964, + "step": 20839 + }, + { + "epoch": 6.207114801094585, + "grad_norm": 0.36948472261428833, + "learning_rate": 6.637771770349883e-06, + "loss": 1.1875, + "step": 20840 + }, + { + "epoch": 6.2074126472942535, + "grad_norm": 0.3009609282016754, + "learning_rate": 6.636863374301501e-06, + "loss": 1.2173, + "step": 20841 + }, + { + "epoch": 6.207710493493922, + "grad_norm": 0.3019651174545288, + "learning_rate": 6.635955009543345e-06, + "loss": 1.1965, + "step": 20842 + }, + { + "epoch": 6.208008339693591, + "grad_norm": 0.3131799101829529, + "learning_rate": 6.635046676083865e-06, + "loss": 1.2021, + "step": 20843 + }, + { + "epoch": 6.208306185893259, + "grad_norm": 0.42689722776412964, + "learning_rate": 6.634138373931523e-06, + "loss": 1.2077, + "step": 20844 + }, + { + "epoch": 6.208604032092928, + "grad_norm": 0.37339693307876587, + "learning_rate": 6.633230103094761e-06, + "loss": 1.2147, + "step": 20845 + }, + { + "epoch": 6.208901878292597, + "grad_norm": 0.4136165678501129, + "learning_rate": 6.632321863582027e-06, + "loss": 1.2157, + "step": 20846 + }, + { + "epoch": 6.209199724492265, + "grad_norm": 0.32157769799232483, + "learning_rate": 6.631413655401778e-06, + "loss": 1.2094, + "step": 20847 + }, + { + "epoch": 6.2094975706919335, + "grad_norm": 0.3838278353214264, + "learning_rate": 6.630505478562464e-06, + "loss": 1.2036, + "step": 20848 + }, + { + "epoch": 6.209795416891603, + "grad_norm": 0.28281065821647644, + "learning_rate": 6.629597333072527e-06, + "loss": 1.2064, + "step": 20849 + }, + { + "epoch": 6.210093263091271, + "grad_norm": 0.3274846076965332, + "learning_rate": 6.628689218940426e-06, + "loss": 1.2194, + "step": 20850 + }, + { + "epoch": 6.21039110929094, + "grad_norm": 0.3020499646663666, + "learning_rate": 6.6277811361746045e-06, + "loss": 1.1887, + "step": 20851 + }, + { + "epoch": 6.210688955490609, + "grad_norm": 0.3482782244682312, + "learning_rate": 6.626873084783508e-06, + "loss": 1.1968, + "step": 20852 + }, + { + "epoch": 6.210986801690277, + "grad_norm": 0.3389839231967926, + "learning_rate": 6.625965064775592e-06, + "loss": 1.2093, + "step": 20853 + }, + { + "epoch": 6.211284647889946, + "grad_norm": 0.26609256863594055, + "learning_rate": 6.625057076159302e-06, + "loss": 1.2055, + "step": 20854 + }, + { + "epoch": 6.2115824940896145, + "grad_norm": 0.3010624647140503, + "learning_rate": 6.624149118943082e-06, + "loss": 1.2023, + "step": 20855 + }, + { + "epoch": 6.211880340289283, + "grad_norm": 0.33901360630989075, + "learning_rate": 6.623241193135386e-06, + "loss": 1.2017, + "step": 20856 + }, + { + "epoch": 6.212178186488952, + "grad_norm": 0.2817125618457794, + "learning_rate": 6.622333298744654e-06, + "loss": 1.1984, + "step": 20857 + }, + { + "epoch": 6.21247603268862, + "grad_norm": 0.5507752895355225, + "learning_rate": 6.6214254357793405e-06, + "loss": 1.2059, + "step": 20858 + }, + { + "epoch": 6.212773878888289, + "grad_norm": 0.31725063920021057, + "learning_rate": 6.620517604247887e-06, + "loss": 1.2163, + "step": 20859 + }, + { + "epoch": 6.213071725087958, + "grad_norm": 0.438908189535141, + "learning_rate": 6.61960980415874e-06, + "loss": 1.204, + "step": 20860 + }, + { + "epoch": 6.213369571287626, + "grad_norm": 0.28866636753082275, + "learning_rate": 6.61870203552035e-06, + "loss": 1.1937, + "step": 20861 + }, + { + "epoch": 6.2136674174872955, + "grad_norm": 0.35296639800071716, + "learning_rate": 6.61779429834116e-06, + "loss": 1.2051, + "step": 20862 + }, + { + "epoch": 6.213965263686964, + "grad_norm": 0.35828477144241333, + "learning_rate": 6.616886592629612e-06, + "loss": 1.2092, + "step": 20863 + }, + { + "epoch": 6.214263109886632, + "grad_norm": 0.2809810936450958, + "learning_rate": 6.615978918394158e-06, + "loss": 1.2016, + "step": 20864 + }, + { + "epoch": 6.214560956086301, + "grad_norm": 0.5067834258079529, + "learning_rate": 6.6150712756432365e-06, + "loss": 1.2066, + "step": 20865 + }, + { + "epoch": 6.21485880228597, + "grad_norm": 0.3311060667037964, + "learning_rate": 6.6141636643852946e-06, + "loss": 1.2053, + "step": 20866 + }, + { + "epoch": 6.215156648485638, + "grad_norm": 0.4814979135990143, + "learning_rate": 6.61325608462878e-06, + "loss": 1.2064, + "step": 20867 + }, + { + "epoch": 6.215454494685307, + "grad_norm": 0.3755452334880829, + "learning_rate": 6.612348536382134e-06, + "loss": 1.2187, + "step": 20868 + }, + { + "epoch": 6.2157523408849755, + "grad_norm": 0.35468029975891113, + "learning_rate": 6.611441019653795e-06, + "loss": 1.2054, + "step": 20869 + }, + { + "epoch": 6.216050187084644, + "grad_norm": 0.32278144359588623, + "learning_rate": 6.610533534452215e-06, + "loss": 1.1867, + "step": 20870 + }, + { + "epoch": 6.216348033284313, + "grad_norm": 0.2916470170021057, + "learning_rate": 6.609626080785834e-06, + "loss": 1.2176, + "step": 20871 + }, + { + "epoch": 6.216645879483981, + "grad_norm": 0.28612467646598816, + "learning_rate": 6.60871865866309e-06, + "loss": 1.2096, + "step": 20872 + }, + { + "epoch": 6.21694372568365, + "grad_norm": 0.362280011177063, + "learning_rate": 6.6078112680924345e-06, + "loss": 1.2175, + "step": 20873 + }, + { + "epoch": 6.217241571883319, + "grad_norm": 0.2870917022228241, + "learning_rate": 6.606903909082303e-06, + "loss": 1.2104, + "step": 20874 + }, + { + "epoch": 6.217539418082987, + "grad_norm": 0.3332632780075073, + "learning_rate": 6.6059965816411406e-06, + "loss": 1.1964, + "step": 20875 + }, + { + "epoch": 6.217837264282656, + "grad_norm": 0.286295622587204, + "learning_rate": 6.605089285777389e-06, + "loss": 1.2181, + "step": 20876 + }, + { + "epoch": 6.218135110482325, + "grad_norm": 0.27101877331733704, + "learning_rate": 6.604182021499485e-06, + "loss": 1.2033, + "step": 20877 + }, + { + "epoch": 6.218432956681993, + "grad_norm": 0.31318768858909607, + "learning_rate": 6.603274788815877e-06, + "loss": 1.2013, + "step": 20878 + }, + { + "epoch": 6.218730802881662, + "grad_norm": 0.3703415095806122, + "learning_rate": 6.602367587735001e-06, + "loss": 1.205, + "step": 20879 + }, + { + "epoch": 6.219028649081331, + "grad_norm": 0.332447350025177, + "learning_rate": 6.601460418265297e-06, + "loss": 1.2155, + "step": 20880 + }, + { + "epoch": 6.219326495280999, + "grad_norm": 0.31926429271698, + "learning_rate": 6.6005532804152095e-06, + "loss": 1.2102, + "step": 20881 + }, + { + "epoch": 6.219624341480668, + "grad_norm": 0.3711044490337372, + "learning_rate": 6.599646174193174e-06, + "loss": 1.2148, + "step": 20882 + }, + { + "epoch": 6.219922187680337, + "grad_norm": 0.3418666422367096, + "learning_rate": 6.59873909960763e-06, + "loss": 1.2011, + "step": 20883 + }, + { + "epoch": 6.220220033880005, + "grad_norm": 0.5140612721443176, + "learning_rate": 6.5978320566670215e-06, + "loss": 1.2048, + "step": 20884 + }, + { + "epoch": 6.220517880079674, + "grad_norm": 0.2515263855457306, + "learning_rate": 6.5969250453797854e-06, + "loss": 1.1976, + "step": 20885 + }, + { + "epoch": 6.220815726279342, + "grad_norm": 0.4952830970287323, + "learning_rate": 6.596018065754355e-06, + "loss": 1.2112, + "step": 20886 + }, + { + "epoch": 6.221113572479011, + "grad_norm": 0.3173068165779114, + "learning_rate": 6.5951111177991774e-06, + "loss": 1.2106, + "step": 20887 + }, + { + "epoch": 6.22141141867868, + "grad_norm": 0.28146082162857056, + "learning_rate": 6.594204201522685e-06, + "loss": 1.2105, + "step": 20888 + }, + { + "epoch": 6.221709264878348, + "grad_norm": 0.44297167658805847, + "learning_rate": 6.593297316933316e-06, + "loss": 1.2002, + "step": 20889 + }, + { + "epoch": 6.2220071110780175, + "grad_norm": 0.2924647927284241, + "learning_rate": 6.592390464039513e-06, + "loss": 1.2033, + "step": 20890 + }, + { + "epoch": 6.222304957277686, + "grad_norm": 0.31262731552124023, + "learning_rate": 6.591483642849705e-06, + "loss": 1.2185, + "step": 20891 + }, + { + "epoch": 6.222602803477354, + "grad_norm": 0.27535152435302734, + "learning_rate": 6.590576853372337e-06, + "loss": 1.2154, + "step": 20892 + }, + { + "epoch": 6.222900649677023, + "grad_norm": 0.2639579772949219, + "learning_rate": 6.589670095615843e-06, + "loss": 1.1925, + "step": 20893 + }, + { + "epoch": 6.223198495876692, + "grad_norm": 0.31021997332572937, + "learning_rate": 6.588763369588655e-06, + "loss": 1.2049, + "step": 20894 + }, + { + "epoch": 6.22349634207636, + "grad_norm": 0.2987977862358093, + "learning_rate": 6.587856675299213e-06, + "loss": 1.2202, + "step": 20895 + }, + { + "epoch": 6.223794188276029, + "grad_norm": 0.2946363687515259, + "learning_rate": 6.586950012755955e-06, + "loss": 1.2014, + "step": 20896 + }, + { + "epoch": 6.224092034475698, + "grad_norm": 0.28557130694389343, + "learning_rate": 6.586043381967311e-06, + "loss": 1.2024, + "step": 20897 + }, + { + "epoch": 6.224389880675366, + "grad_norm": 0.28428199887275696, + "learning_rate": 6.5851367829417216e-06, + "loss": 1.2079, + "step": 20898 + }, + { + "epoch": 6.224687726875035, + "grad_norm": 0.2734763026237488, + "learning_rate": 6.584230215687618e-06, + "loss": 1.2088, + "step": 20899 + }, + { + "epoch": 6.2249855730747035, + "grad_norm": 0.2596544921398163, + "learning_rate": 6.583323680213436e-06, + "loss": 1.2098, + "step": 20900 + }, + { + "epoch": 6.225283419274372, + "grad_norm": 0.30075424909591675, + "learning_rate": 6.582417176527609e-06, + "loss": 1.1872, + "step": 20901 + }, + { + "epoch": 6.225581265474041, + "grad_norm": 0.2578868269920349, + "learning_rate": 6.581510704638574e-06, + "loss": 1.2195, + "step": 20902 + }, + { + "epoch": 6.225879111673709, + "grad_norm": 0.38626062870025635, + "learning_rate": 6.5806042645547595e-06, + "loss": 1.2001, + "step": 20903 + }, + { + "epoch": 6.2261769578733785, + "grad_norm": 0.32577237486839294, + "learning_rate": 6.5796978562846045e-06, + "loss": 1.2029, + "step": 20904 + }, + { + "epoch": 6.226474804073047, + "grad_norm": 0.27227717638015747, + "learning_rate": 6.578791479836537e-06, + "loss": 1.2117, + "step": 20905 + }, + { + "epoch": 6.226772650272715, + "grad_norm": 0.2720992863178253, + "learning_rate": 6.577885135218993e-06, + "loss": 1.2015, + "step": 20906 + }, + { + "epoch": 6.227070496472384, + "grad_norm": 0.2652950584888458, + "learning_rate": 6.5769788224404075e-06, + "loss": 1.2018, + "step": 20907 + }, + { + "epoch": 6.227368342672053, + "grad_norm": 0.29024332761764526, + "learning_rate": 6.576072541509204e-06, + "loss": 1.1911, + "step": 20908 + }, + { + "epoch": 6.227666188871721, + "grad_norm": 0.2996639013290405, + "learning_rate": 6.575166292433825e-06, + "loss": 1.2186, + "step": 20909 + }, + { + "epoch": 6.22796403507139, + "grad_norm": 0.250465452671051, + "learning_rate": 6.5742600752226985e-06, + "loss": 1.212, + "step": 20910 + }, + { + "epoch": 6.228261881271059, + "grad_norm": 0.3431493043899536, + "learning_rate": 6.573353889884249e-06, + "loss": 1.2106, + "step": 20911 + }, + { + "epoch": 6.228559727470727, + "grad_norm": 0.28145286440849304, + "learning_rate": 6.572447736426917e-06, + "loss": 1.2023, + "step": 20912 + }, + { + "epoch": 6.228857573670396, + "grad_norm": 0.2633451521396637, + "learning_rate": 6.5715416148591295e-06, + "loss": 1.2007, + "step": 20913 + }, + { + "epoch": 6.2291554198700645, + "grad_norm": 0.37861987948417664, + "learning_rate": 6.570635525189313e-06, + "loss": 1.2104, + "step": 20914 + }, + { + "epoch": 6.229453266069733, + "grad_norm": 0.26000311970710754, + "learning_rate": 6.5697294674259045e-06, + "loss": 1.2368, + "step": 20915 + }, + { + "epoch": 6.229751112269402, + "grad_norm": 0.43028923869132996, + "learning_rate": 6.568823441577332e-06, + "loss": 1.2118, + "step": 20916 + }, + { + "epoch": 6.23004895846907, + "grad_norm": 0.25162479281425476, + "learning_rate": 6.56791744765202e-06, + "loss": 1.1923, + "step": 20917 + }, + { + "epoch": 6.23034680466874, + "grad_norm": 0.43839678168296814, + "learning_rate": 6.567011485658403e-06, + "loss": 1.2002, + "step": 20918 + }, + { + "epoch": 6.230644650868408, + "grad_norm": 0.28519049286842346, + "learning_rate": 6.566105555604912e-06, + "loss": 1.1909, + "step": 20919 + }, + { + "epoch": 6.230942497068076, + "grad_norm": 0.4051321744918823, + "learning_rate": 6.5651996574999665e-06, + "loss": 1.1975, + "step": 20920 + }, + { + "epoch": 6.2312403432677455, + "grad_norm": 0.27308428287506104, + "learning_rate": 6.564293791352006e-06, + "loss": 1.2234, + "step": 20921 + }, + { + "epoch": 6.231538189467414, + "grad_norm": 0.3390137851238251, + "learning_rate": 6.563387957169447e-06, + "loss": 1.2159, + "step": 20922 + }, + { + "epoch": 6.231836035667082, + "grad_norm": 0.32019802927970886, + "learning_rate": 6.56248215496073e-06, + "loss": 1.2053, + "step": 20923 + }, + { + "epoch": 6.232133881866751, + "grad_norm": 0.27889537811279297, + "learning_rate": 6.5615763847342716e-06, + "loss": 1.1981, + "step": 20924 + }, + { + "epoch": 6.23243172806642, + "grad_norm": 0.46111857891082764, + "learning_rate": 6.560670646498504e-06, + "loss": 1.2231, + "step": 20925 + }, + { + "epoch": 6.232729574266088, + "grad_norm": 0.27917757630348206, + "learning_rate": 6.559764940261855e-06, + "loss": 1.2033, + "step": 20926 + }, + { + "epoch": 6.233027420465757, + "grad_norm": 0.4410606026649475, + "learning_rate": 6.558859266032751e-06, + "loss": 1.2231, + "step": 20927 + }, + { + "epoch": 6.2333252666654255, + "grad_norm": 0.32204869389533997, + "learning_rate": 6.557953623819612e-06, + "loss": 1.2021, + "step": 20928 + }, + { + "epoch": 6.233623112865095, + "grad_norm": 0.33063867688179016, + "learning_rate": 6.557048013630873e-06, + "loss": 1.214, + "step": 20929 + }, + { + "epoch": 6.233920959064763, + "grad_norm": 0.42502111196517944, + "learning_rate": 6.556142435474954e-06, + "loss": 1.2174, + "step": 20930 + }, + { + "epoch": 6.234218805264431, + "grad_norm": 0.25938206911087036, + "learning_rate": 6.555236889360279e-06, + "loss": 1.2172, + "step": 20931 + }, + { + "epoch": 6.234516651464101, + "grad_norm": 0.42170843482017517, + "learning_rate": 6.554331375295281e-06, + "loss": 1.2116, + "step": 20932 + }, + { + "epoch": 6.234814497663769, + "grad_norm": 0.3091077208518982, + "learning_rate": 6.553425893288379e-06, + "loss": 1.2093, + "step": 20933 + }, + { + "epoch": 6.235112343863437, + "grad_norm": 0.2852857708930969, + "learning_rate": 6.552520443347995e-06, + "loss": 1.2077, + "step": 20934 + }, + { + "epoch": 6.2354101900631065, + "grad_norm": 0.46792224049568176, + "learning_rate": 6.551615025482559e-06, + "loss": 1.1973, + "step": 20935 + }, + { + "epoch": 6.235708036262775, + "grad_norm": 0.2661101818084717, + "learning_rate": 6.5507096397004934e-06, + "loss": 1.2205, + "step": 20936 + }, + { + "epoch": 6.236005882462443, + "grad_norm": 0.3596484661102295, + "learning_rate": 6.549804286010217e-06, + "loss": 1.2045, + "step": 20937 + }, + { + "epoch": 6.236303728662112, + "grad_norm": 0.36223822832107544, + "learning_rate": 6.548898964420161e-06, + "loss": 1.2203, + "step": 20938 + }, + { + "epoch": 6.236601574861781, + "grad_norm": 0.2536521553993225, + "learning_rate": 6.547993674938741e-06, + "loss": 1.2012, + "step": 20939 + }, + { + "epoch": 6.236899421061449, + "grad_norm": 0.4647725820541382, + "learning_rate": 6.547088417574385e-06, + "loss": 1.2219, + "step": 20940 + }, + { + "epoch": 6.237197267261118, + "grad_norm": 0.32785943150520325, + "learning_rate": 6.546183192335513e-06, + "loss": 1.2062, + "step": 20941 + }, + { + "epoch": 6.2374951134607866, + "grad_norm": 0.33601462841033936, + "learning_rate": 6.545277999230546e-06, + "loss": 1.2073, + "step": 20942 + }, + { + "epoch": 6.237792959660455, + "grad_norm": 0.3900188207626343, + "learning_rate": 6.544372838267912e-06, + "loss": 1.2045, + "step": 20943 + }, + { + "epoch": 6.238090805860124, + "grad_norm": 0.29292941093444824, + "learning_rate": 6.543467709456026e-06, + "loss": 1.2137, + "step": 20944 + }, + { + "epoch": 6.238388652059792, + "grad_norm": 0.27695026993751526, + "learning_rate": 6.542562612803308e-06, + "loss": 1.1888, + "step": 20945 + }, + { + "epoch": 6.238686498259462, + "grad_norm": 0.4734032154083252, + "learning_rate": 6.5416575483181855e-06, + "loss": 1.2062, + "step": 20946 + }, + { + "epoch": 6.23898434445913, + "grad_norm": 0.2380705028772354, + "learning_rate": 6.5407525160090745e-06, + "loss": 1.2037, + "step": 20947 + }, + { + "epoch": 6.239282190658798, + "grad_norm": 0.3729903995990753, + "learning_rate": 6.539847515884394e-06, + "loss": 1.2075, + "step": 20948 + }, + { + "epoch": 6.2395800368584675, + "grad_norm": 0.3799933195114136, + "learning_rate": 6.53894254795257e-06, + "loss": 1.2212, + "step": 20949 + }, + { + "epoch": 6.239877883058136, + "grad_norm": 0.37151652574539185, + "learning_rate": 6.538037612222019e-06, + "loss": 1.1912, + "step": 20950 + }, + { + "epoch": 6.240175729257804, + "grad_norm": 0.32603856921195984, + "learning_rate": 6.537132708701157e-06, + "loss": 1.213, + "step": 20951 + }, + { + "epoch": 6.240473575457473, + "grad_norm": 0.2977096736431122, + "learning_rate": 6.536227837398409e-06, + "loss": 1.2204, + "step": 20952 + }, + { + "epoch": 6.240771421657142, + "grad_norm": 0.2917332351207733, + "learning_rate": 6.535322998322189e-06, + "loss": 1.1913, + "step": 20953 + }, + { + "epoch": 6.24106926785681, + "grad_norm": 0.2621777057647705, + "learning_rate": 6.534418191480916e-06, + "loss": 1.1929, + "step": 20954 + }, + { + "epoch": 6.241367114056479, + "grad_norm": 0.3363119065761566, + "learning_rate": 6.533513416883014e-06, + "loss": 1.2104, + "step": 20955 + }, + { + "epoch": 6.241664960256148, + "grad_norm": 0.27684512734413147, + "learning_rate": 6.5326086745368925e-06, + "loss": 1.2169, + "step": 20956 + }, + { + "epoch": 6.241962806455817, + "grad_norm": 0.34454455971717834, + "learning_rate": 6.531703964450977e-06, + "loss": 1.201, + "step": 20957 + }, + { + "epoch": 6.242260652655485, + "grad_norm": 0.29072514176368713, + "learning_rate": 6.530799286633679e-06, + "loss": 1.1902, + "step": 20958 + }, + { + "epoch": 6.2425584988551535, + "grad_norm": 0.2903517186641693, + "learning_rate": 6.529894641093417e-06, + "loss": 1.2001, + "step": 20959 + }, + { + "epoch": 6.242856345054823, + "grad_norm": 0.3266454339027405, + "learning_rate": 6.52899002783861e-06, + "loss": 1.2088, + "step": 20960 + }, + { + "epoch": 6.243154191254491, + "grad_norm": 0.25888121128082275, + "learning_rate": 6.5280854468776745e-06, + "loss": 1.2099, + "step": 20961 + }, + { + "epoch": 6.243452037454159, + "grad_norm": 0.35930338501930237, + "learning_rate": 6.52718089821902e-06, + "loss": 1.2153, + "step": 20962 + }, + { + "epoch": 6.2437498836538285, + "grad_norm": 0.24772289395332336, + "learning_rate": 6.52627638187107e-06, + "loss": 1.1967, + "step": 20963 + }, + { + "epoch": 6.244047729853497, + "grad_norm": 0.2683066129684448, + "learning_rate": 6.525371897842239e-06, + "loss": 1.1901, + "step": 20964 + }, + { + "epoch": 6.244345576053165, + "grad_norm": 0.31842073798179626, + "learning_rate": 6.524467446140935e-06, + "loss": 1.2013, + "step": 20965 + }, + { + "epoch": 6.244643422252834, + "grad_norm": 0.25482967495918274, + "learning_rate": 6.523563026775583e-06, + "loss": 1.1959, + "step": 20966 + }, + { + "epoch": 6.244941268452503, + "grad_norm": 0.41225674748420715, + "learning_rate": 6.5226586397545935e-06, + "loss": 1.2045, + "step": 20967 + }, + { + "epoch": 6.245239114652171, + "grad_norm": 0.26212966442108154, + "learning_rate": 6.521754285086377e-06, + "loss": 1.2079, + "step": 20968 + }, + { + "epoch": 6.24553696085184, + "grad_norm": 0.30973610281944275, + "learning_rate": 6.520849962779353e-06, + "loss": 1.2186, + "step": 20969 + }, + { + "epoch": 6.245834807051509, + "grad_norm": 0.4781585931777954, + "learning_rate": 6.519945672841932e-06, + "loss": 1.21, + "step": 20970 + }, + { + "epoch": 6.246132653251178, + "grad_norm": 0.3428117334842682, + "learning_rate": 6.519041415282525e-06, + "loss": 1.2135, + "step": 20971 + }, + { + "epoch": 6.246430499450846, + "grad_norm": 0.38383954763412476, + "learning_rate": 6.518137190109554e-06, + "loss": 1.2013, + "step": 20972 + }, + { + "epoch": 6.2467283456505145, + "grad_norm": 0.2761189043521881, + "learning_rate": 6.517232997331422e-06, + "loss": 1.2122, + "step": 20973 + }, + { + "epoch": 6.247026191850184, + "grad_norm": 0.34829890727996826, + "learning_rate": 6.516328836956551e-06, + "loss": 1.2013, + "step": 20974 + }, + { + "epoch": 6.247324038049852, + "grad_norm": 0.5543403625488281, + "learning_rate": 6.515424708993345e-06, + "loss": 1.2089, + "step": 20975 + }, + { + "epoch": 6.24762188424952, + "grad_norm": 0.4369713366031647, + "learning_rate": 6.514520613450217e-06, + "loss": 1.1956, + "step": 20976 + }, + { + "epoch": 6.24791973044919, + "grad_norm": 0.4368632435798645, + "learning_rate": 6.513616550335582e-06, + "loss": 1.1939, + "step": 20977 + }, + { + "epoch": 6.248217576648858, + "grad_norm": 0.296641081571579, + "learning_rate": 6.512712519657852e-06, + "loss": 1.2088, + "step": 20978 + }, + { + "epoch": 6.248515422848526, + "grad_norm": 0.5300294160842896, + "learning_rate": 6.5118085214254316e-06, + "loss": 1.2153, + "step": 20979 + }, + { + "epoch": 6.2488132690481955, + "grad_norm": 0.5507962107658386, + "learning_rate": 6.51090455564674e-06, + "loss": 1.2076, + "step": 20980 + }, + { + "epoch": 6.249111115247864, + "grad_norm": 0.45971810817718506, + "learning_rate": 6.510000622330181e-06, + "loss": 1.2169, + "step": 20981 + }, + { + "epoch": 6.249408961447532, + "grad_norm": 0.6372692584991455, + "learning_rate": 6.509096721484167e-06, + "loss": 1.1966, + "step": 20982 + }, + { + "epoch": 6.249706807647201, + "grad_norm": 0.3024199903011322, + "learning_rate": 6.508192853117108e-06, + "loss": 1.2039, + "step": 20983 + }, + { + "epoch": 6.25000465384687, + "grad_norm": 0.5083332061767578, + "learning_rate": 6.507289017237415e-06, + "loss": 1.2128, + "step": 20984 + }, + { + "epoch": 6.250302500046539, + "grad_norm": 0.2918650507926941, + "learning_rate": 6.50638521385349e-06, + "loss": 1.2117, + "step": 20985 + }, + { + "epoch": 6.250600346246207, + "grad_norm": 0.31115829944610596, + "learning_rate": 6.505481442973753e-06, + "loss": 1.215, + "step": 20986 + }, + { + "epoch": 6.2508981924458755, + "grad_norm": 0.35847991704940796, + "learning_rate": 6.504577704606605e-06, + "loss": 1.2101, + "step": 20987 + }, + { + "epoch": 6.251196038645545, + "grad_norm": 0.3025646507740021, + "learning_rate": 6.503673998760456e-06, + "loss": 1.2197, + "step": 20988 + }, + { + "epoch": 6.251493884845213, + "grad_norm": 0.3011787533760071, + "learning_rate": 6.502770325443713e-06, + "loss": 1.2, + "step": 20989 + }, + { + "epoch": 6.251791731044881, + "grad_norm": 0.31051358580589294, + "learning_rate": 6.501866684664784e-06, + "loss": 1.2001, + "step": 20990 + }, + { + "epoch": 6.252089577244551, + "grad_norm": 0.2914946675300598, + "learning_rate": 6.5009630764320795e-06, + "loss": 1.2113, + "step": 20991 + }, + { + "epoch": 6.252387423444219, + "grad_norm": 0.26340430974960327, + "learning_rate": 6.500059500754006e-06, + "loss": 1.2135, + "step": 20992 + }, + { + "epoch": 6.252685269643887, + "grad_norm": 0.3000192642211914, + "learning_rate": 6.499155957638963e-06, + "loss": 1.2107, + "step": 20993 + }, + { + "epoch": 6.2529831158435565, + "grad_norm": 0.2933465838432312, + "learning_rate": 6.498252447095366e-06, + "loss": 1.2105, + "step": 20994 + }, + { + "epoch": 6.253280962043225, + "grad_norm": 0.3004761338233948, + "learning_rate": 6.4973489691316184e-06, + "loss": 1.2072, + "step": 20995 + }, + { + "epoch": 6.253578808242894, + "grad_norm": 0.3727788031101227, + "learning_rate": 6.496445523756122e-06, + "loss": 1.217, + "step": 20996 + }, + { + "epoch": 6.253876654442562, + "grad_norm": 0.29946908354759216, + "learning_rate": 6.495542110977288e-06, + "loss": 1.2101, + "step": 20997 + }, + { + "epoch": 6.254174500642231, + "grad_norm": 0.2809601426124573, + "learning_rate": 6.494638730803522e-06, + "loss": 1.2028, + "step": 20998 + }, + { + "epoch": 6.2544723468419, + "grad_norm": 0.3155931234359741, + "learning_rate": 6.493735383243221e-06, + "loss": 1.2228, + "step": 20999 + }, + { + "epoch": 6.254770193041568, + "grad_norm": 0.2671355605125427, + "learning_rate": 6.492832068304796e-06, + "loss": 1.19, + "step": 21000 + }, + { + "epoch": 6.254770193041568, + "eval_loss": 1.3171136379241943, + "eval_runtime": 25.842, + "eval_samples_per_second": 67.1, + "eval_steps_per_second": 4.218, + "step": 21000 + }, + { + "epoch": 6.254770193041568, + "step": 21000, + "total_flos": 2.2989868385333726e+20, + "train_loss": 1.263435139360882, + "train_runtime": 585194.7226, + "train_samples_per_second": 29.375, + "train_steps_per_second": 0.057 + } + ], + "logging_steps": 1, + "max_steps": 33570, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.2989868385333726e+20, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}