diff --git "a/checkpoint-6102/trainer_state.json" "b/checkpoint-6102/trainer_state.json" deleted file mode 100644--- "a/checkpoint-6102/trainer_state.json" +++ /dev/null @@ -1,42803 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 2.9964976958525344, - "eval_steps": 1017, - "global_step": 6102, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0004918839153959665, - "grad_norm": 3.2944885419116114, - "learning_rate": 2.0000000000000002e-07, - "loss": 1.3754, - "step": 1 - }, - { - "epoch": 0.0004918839153959665, - "eval_loss": 1.7429497241973877, - "eval_runtime": 6757.8165, - "eval_samples_per_second": 4.217, - "eval_steps_per_second": 2.109, - "step": 1 - }, - { - "epoch": 0.000983767830791933, - "grad_norm": 4.387903167670078, - "learning_rate": 4.0000000000000003e-07, - "loss": 1.4888, - "step": 2 - }, - { - "epoch": 0.0014756517461878996, - "grad_norm": 3.702486400269614, - "learning_rate": 6.000000000000001e-07, - "loss": 1.42, - "step": 3 - }, - { - "epoch": 0.001967535661583866, - "grad_norm": 3.6873356554967187, - "learning_rate": 8.000000000000001e-07, - "loss": 1.4333, - "step": 4 - }, - { - "epoch": 0.002459419576979833, - "grad_norm": 3.4722776683329206, - "learning_rate": 1.0000000000000002e-06, - "loss": 1.4012, - "step": 5 - }, - { - "epoch": 0.002951303492375799, - "grad_norm": 3.501303887226817, - "learning_rate": 1.2000000000000002e-06, - "loss": 1.4415, - "step": 6 - }, - { - "epoch": 0.003443187407771766, - "grad_norm": 3.7234243820409483, - "learning_rate": 1.4000000000000001e-06, - "loss": 1.393, - "step": 7 - }, - { - "epoch": 0.003935071323167732, - "grad_norm": 4.291923849986159, - "learning_rate": 1.6000000000000001e-06, - "loss": 1.4528, - "step": 8 - }, - { - "epoch": 0.004426955238563699, - "grad_norm": 3.551363047162373, - "learning_rate": 1.8e-06, - "loss": 1.5339, - "step": 9 - }, - { - "epoch": 0.004918839153959666, - "grad_norm": 3.2095887045357925, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.4037, - "step": 10 - }, - { - "epoch": 0.0054107230693556324, - "grad_norm": 3.2136633682112525, - "learning_rate": 2.2e-06, - "loss": 1.4643, - "step": 11 - }, - { - "epoch": 0.005902606984751598, - "grad_norm": 3.2755879900685083, - "learning_rate": 2.4000000000000003e-06, - "loss": 1.4317, - "step": 12 - }, - { - "epoch": 0.006394490900147565, - "grad_norm": 3.0262191829479654, - "learning_rate": 2.6e-06, - "loss": 1.385, - "step": 13 - }, - { - "epoch": 0.006886374815543532, - "grad_norm": 2.829759375766714, - "learning_rate": 2.8000000000000003e-06, - "loss": 1.4019, - "step": 14 - }, - { - "epoch": 0.0073782587309394985, - "grad_norm": 3.3734116571047603, - "learning_rate": 3e-06, - "loss": 1.5036, - "step": 15 - }, - { - "epoch": 0.007870142646335464, - "grad_norm": 2.2223043949951697, - "learning_rate": 3.2000000000000003e-06, - "loss": 1.4211, - "step": 16 - }, - { - "epoch": 0.008362026561731432, - "grad_norm": 2.150664036408096, - "learning_rate": 3.4000000000000005e-06, - "loss": 1.421, - "step": 17 - }, - { - "epoch": 0.008853910477127398, - "grad_norm": 2.167189487369783, - "learning_rate": 3.6e-06, - "loss": 1.412, - "step": 18 - }, - { - "epoch": 0.009345794392523364, - "grad_norm": 1.9248200012962078, - "learning_rate": 3.8e-06, - "loss": 1.3272, - "step": 19 - }, - { - "epoch": 0.009837678307919331, - "grad_norm": 1.4357536891109497, - "learning_rate": 4.000000000000001e-06, - "loss": 1.2681, - "step": 20 - }, - { - "epoch": 0.010329562223315297, - "grad_norm": 1.4074010741692329, - "learning_rate": 4.2000000000000004e-06, - "loss": 1.3999, - "step": 21 - }, - { - "epoch": 0.010821446138711265, - "grad_norm": 1.3417126612322257, - "learning_rate": 4.4e-06, - "loss": 1.3597, - "step": 22 - }, - { - "epoch": 0.01131333005410723, - "grad_norm": 1.2085318394846978, - "learning_rate": 4.6e-06, - "loss": 1.2399, - "step": 23 - }, - { - "epoch": 0.011805213969503197, - "grad_norm": 1.2314668941371152, - "learning_rate": 4.800000000000001e-06, - "loss": 1.2098, - "step": 24 - }, - { - "epoch": 0.012297097884899164, - "grad_norm": 1.1065431950366, - "learning_rate": 5e-06, - "loss": 1.1893, - "step": 25 - }, - { - "epoch": 0.01278898180029513, - "grad_norm": 1.2923599312074627, - "learning_rate": 5.2e-06, - "loss": 1.4414, - "step": 26 - }, - { - "epoch": 0.013280865715691098, - "grad_norm": 1.4380339902364112, - "learning_rate": 5.4e-06, - "loss": 1.3331, - "step": 27 - }, - { - "epoch": 0.013772749631087064, - "grad_norm": 1.0684373704524008, - "learning_rate": 5.600000000000001e-06, - "loss": 1.2566, - "step": 28 - }, - { - "epoch": 0.01426463354648303, - "grad_norm": 1.9974303492749381, - "learning_rate": 5.8e-06, - "loss": 1.392, - "step": 29 - }, - { - "epoch": 0.014756517461878997, - "grad_norm": 1.0635238293595115, - "learning_rate": 6e-06, - "loss": 1.2207, - "step": 30 - }, - { - "epoch": 0.015248401377274963, - "grad_norm": 1.144610170563909, - "learning_rate": 6.2e-06, - "loss": 1.1776, - "step": 31 - }, - { - "epoch": 0.01574028529267093, - "grad_norm": 1.0215236404064907, - "learning_rate": 6.4000000000000006e-06, - "loss": 1.2454, - "step": 32 - }, - { - "epoch": 0.016232169208066895, - "grad_norm": 1.0978098723676712, - "learning_rate": 6.6e-06, - "loss": 1.3075, - "step": 33 - }, - { - "epoch": 0.016724053123462864, - "grad_norm": 1.011060190230393, - "learning_rate": 6.800000000000001e-06, - "loss": 1.3027, - "step": 34 - }, - { - "epoch": 0.01721593703885883, - "grad_norm": 0.8804453134798843, - "learning_rate": 7.000000000000001e-06, - "loss": 1.2454, - "step": 35 - }, - { - "epoch": 0.017707820954254796, - "grad_norm": 1.0044730642582949, - "learning_rate": 7.2e-06, - "loss": 1.1784, - "step": 36 - }, - { - "epoch": 0.01819970486965076, - "grad_norm": 1.0003621918042445, - "learning_rate": 7.4e-06, - "loss": 1.1357, - "step": 37 - }, - { - "epoch": 0.018691588785046728, - "grad_norm": 0.902655380789519, - "learning_rate": 7.6e-06, - "loss": 1.2277, - "step": 38 - }, - { - "epoch": 0.019183472700442697, - "grad_norm": 0.8620586528524807, - "learning_rate": 7.8e-06, - "loss": 1.1915, - "step": 39 - }, - { - "epoch": 0.019675356615838663, - "grad_norm": 0.8137353744254079, - "learning_rate": 8.000000000000001e-06, - "loss": 1.2105, - "step": 40 - }, - { - "epoch": 0.02016724053123463, - "grad_norm": 0.8498537846473997, - "learning_rate": 8.200000000000001e-06, - "loss": 1.2529, - "step": 41 - }, - { - "epoch": 0.020659124446630595, - "grad_norm": 0.7727069062986878, - "learning_rate": 8.400000000000001e-06, - "loss": 1.1524, - "step": 42 - }, - { - "epoch": 0.02115100836202656, - "grad_norm": 0.7866519624230008, - "learning_rate": 8.599999999999999e-06, - "loss": 1.1283, - "step": 43 - }, - { - "epoch": 0.02164289227742253, - "grad_norm": 0.8400864244937326, - "learning_rate": 8.8e-06, - "loss": 1.1706, - "step": 44 - }, - { - "epoch": 0.022134776192818496, - "grad_norm": 0.7547645574164925, - "learning_rate": 9e-06, - "loss": 1.1232, - "step": 45 - }, - { - "epoch": 0.02262666010821446, - "grad_norm": 0.8158199982108677, - "learning_rate": 9.2e-06, - "loss": 1.1204, - "step": 46 - }, - { - "epoch": 0.023118544023610427, - "grad_norm": 1.3611423960474989, - "learning_rate": 9.4e-06, - "loss": 1.2444, - "step": 47 - }, - { - "epoch": 0.023610427939006393, - "grad_norm": 2.665278252165078, - "learning_rate": 9.600000000000001e-06, - "loss": 1.245, - "step": 48 - }, - { - "epoch": 0.024102311854402363, - "grad_norm": 0.7473276580820656, - "learning_rate": 9.800000000000001e-06, - "loss": 1.1831, - "step": 49 - }, - { - "epoch": 0.02459419576979833, - "grad_norm": 0.867805055812748, - "learning_rate": 1e-05, - "loss": 1.1472, - "step": 50 - }, - { - "epoch": 0.025086079685194294, - "grad_norm": 0.7651704029177023, - "learning_rate": 1.02e-05, - "loss": 1.1587, - "step": 51 - }, - { - "epoch": 0.02557796360059026, - "grad_norm": 0.8363948308926066, - "learning_rate": 1.04e-05, - "loss": 1.0964, - "step": 52 - }, - { - "epoch": 0.026069847515986226, - "grad_norm": 0.7505930186457537, - "learning_rate": 1.06e-05, - "loss": 1.078, - "step": 53 - }, - { - "epoch": 0.026561731431382195, - "grad_norm": 0.8102793549862889, - "learning_rate": 1.08e-05, - "loss": 1.2748, - "step": 54 - }, - { - "epoch": 0.02705361534677816, - "grad_norm": 0.7249072559891252, - "learning_rate": 1.1000000000000001e-05, - "loss": 1.0854, - "step": 55 - }, - { - "epoch": 0.027545499262174127, - "grad_norm": 3.1671466786532454, - "learning_rate": 1.1200000000000001e-05, - "loss": 1.124, - "step": 56 - }, - { - "epoch": 0.028037383177570093, - "grad_norm": 0.824497189137833, - "learning_rate": 1.1400000000000001e-05, - "loss": 1.1202, - "step": 57 - }, - { - "epoch": 0.02852926709296606, - "grad_norm": 0.7272267118712353, - "learning_rate": 1.16e-05, - "loss": 1.1422, - "step": 58 - }, - { - "epoch": 0.029021151008362025, - "grad_norm": 0.6765223066810455, - "learning_rate": 1.18e-05, - "loss": 1.2085, - "step": 59 - }, - { - "epoch": 0.029513034923757994, - "grad_norm": 0.8119323091496039, - "learning_rate": 1.2e-05, - "loss": 1.1647, - "step": 60 - }, - { - "epoch": 0.03000491883915396, - "grad_norm": 0.808311334107385, - "learning_rate": 1.22e-05, - "loss": 1.1931, - "step": 61 - }, - { - "epoch": 0.030496802754549926, - "grad_norm": 0.7333493427730208, - "learning_rate": 1.24e-05, - "loss": 1.1925, - "step": 62 - }, - { - "epoch": 0.030988686669945892, - "grad_norm": 0.6744329214605398, - "learning_rate": 1.2600000000000001e-05, - "loss": 1.0442, - "step": 63 - }, - { - "epoch": 0.03148057058534186, - "grad_norm": 0.6926905933941976, - "learning_rate": 1.2800000000000001e-05, - "loss": 1.0809, - "step": 64 - }, - { - "epoch": 0.031972454500737824, - "grad_norm": 1.3668458130847467, - "learning_rate": 1.3000000000000001e-05, - "loss": 1.0877, - "step": 65 - }, - { - "epoch": 0.03246433841613379, - "grad_norm": 0.8289258812132557, - "learning_rate": 1.32e-05, - "loss": 1.1428, - "step": 66 - }, - { - "epoch": 0.03295622233152976, - "grad_norm": 0.6980913641316498, - "learning_rate": 1.3400000000000002e-05, - "loss": 1.0939, - "step": 67 - }, - { - "epoch": 0.03344810624692573, - "grad_norm": 0.6979371882691208, - "learning_rate": 1.3600000000000002e-05, - "loss": 1.0171, - "step": 68 - }, - { - "epoch": 0.033939990162321694, - "grad_norm": 0.7772969332213725, - "learning_rate": 1.3800000000000002e-05, - "loss": 1.193, - "step": 69 - }, - { - "epoch": 0.03443187407771766, - "grad_norm": 0.7028342002384361, - "learning_rate": 1.4000000000000001e-05, - "loss": 1.1459, - "step": 70 - }, - { - "epoch": 0.034923757993113626, - "grad_norm": 0.6968410058706372, - "learning_rate": 1.42e-05, - "loss": 1.0484, - "step": 71 - }, - { - "epoch": 0.03541564190850959, - "grad_norm": 0.695112229906698, - "learning_rate": 1.44e-05, - "loss": 1.0834, - "step": 72 - }, - { - "epoch": 0.03590752582390556, - "grad_norm": 0.6971728236831181, - "learning_rate": 1.4599999999999999e-05, - "loss": 0.9687, - "step": 73 - }, - { - "epoch": 0.03639940973930152, - "grad_norm": 0.7529709591379297, - "learning_rate": 1.48e-05, - "loss": 1.0345, - "step": 74 - }, - { - "epoch": 0.03689129365469749, - "grad_norm": 0.7219911603689171, - "learning_rate": 1.5e-05, - "loss": 1.127, - "step": 75 - }, - { - "epoch": 0.037383177570093455, - "grad_norm": 2.3000076928278923, - "learning_rate": 1.52e-05, - "loss": 1.1482, - "step": 76 - }, - { - "epoch": 0.03787506148548942, - "grad_norm": 0.7809471483200394, - "learning_rate": 1.54e-05, - "loss": 1.1334, - "step": 77 - }, - { - "epoch": 0.038366945400885394, - "grad_norm": 0.6682890681656087, - "learning_rate": 1.56e-05, - "loss": 1.0889, - "step": 78 - }, - { - "epoch": 0.03885882931628136, - "grad_norm": 0.6544825794684948, - "learning_rate": 1.58e-05, - "loss": 1.0849, - "step": 79 - }, - { - "epoch": 0.039350713231677326, - "grad_norm": 0.7193471748761666, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.0591, - "step": 80 - }, - { - "epoch": 0.03984259714707329, - "grad_norm": 0.7047661426725917, - "learning_rate": 1.62e-05, - "loss": 1.1813, - "step": 81 - }, - { - "epoch": 0.04033448106246926, - "grad_norm": 0.7908701188113396, - "learning_rate": 1.6400000000000002e-05, - "loss": 1.1497, - "step": 82 - }, - { - "epoch": 0.04082636497786522, - "grad_norm": 0.7708018644658138, - "learning_rate": 1.66e-05, - "loss": 1.1542, - "step": 83 - }, - { - "epoch": 0.04131824889326119, - "grad_norm": 0.7903847098228288, - "learning_rate": 1.6800000000000002e-05, - "loss": 1.0924, - "step": 84 - }, - { - "epoch": 0.041810132808657155, - "grad_norm": 0.7175005138594569, - "learning_rate": 1.7000000000000003e-05, - "loss": 1.0804, - "step": 85 - }, - { - "epoch": 0.04230201672405312, - "grad_norm": 0.6586142525496985, - "learning_rate": 1.7199999999999998e-05, - "loss": 1.1146, - "step": 86 - }, - { - "epoch": 0.04279390063944909, - "grad_norm": 0.653452996418673, - "learning_rate": 1.74e-05, - "loss": 1.0572, - "step": 87 - }, - { - "epoch": 0.04328578455484506, - "grad_norm": 0.6977335177281804, - "learning_rate": 1.76e-05, - "loss": 1.0954, - "step": 88 - }, - { - "epoch": 0.043777668470241025, - "grad_norm": 0.7965568824498416, - "learning_rate": 1.78e-05, - "loss": 1.1306, - "step": 89 - }, - { - "epoch": 0.04426955238563699, - "grad_norm": 0.7702049866863492, - "learning_rate": 1.8e-05, - "loss": 1.0624, - "step": 90 - }, - { - "epoch": 0.04476143630103296, - "grad_norm": 0.833380055984766, - "learning_rate": 1.8200000000000002e-05, - "loss": 1.1674, - "step": 91 - }, - { - "epoch": 0.04525332021642892, - "grad_norm": 0.7292333702665459, - "learning_rate": 1.84e-05, - "loss": 1.119, - "step": 92 - }, - { - "epoch": 0.04574520413182489, - "grad_norm": 1.085524626858829, - "learning_rate": 1.86e-05, - "loss": 1.1564, - "step": 93 - }, - { - "epoch": 0.046237088047220855, - "grad_norm": 0.7419165242760628, - "learning_rate": 1.88e-05, - "loss": 1.0865, - "step": 94 - }, - { - "epoch": 0.04672897196261682, - "grad_norm": 0.7521136841453322, - "learning_rate": 1.9e-05, - "loss": 1.0778, - "step": 95 - }, - { - "epoch": 0.04722085587801279, - "grad_norm": 0.8250024974505459, - "learning_rate": 1.9200000000000003e-05, - "loss": 1.0457, - "step": 96 - }, - { - "epoch": 0.04771273979340875, - "grad_norm": 0.6965644981293239, - "learning_rate": 1.94e-05, - "loss": 1.1203, - "step": 97 - }, - { - "epoch": 0.048204623708804725, - "grad_norm": 0.7725116537341091, - "learning_rate": 1.9600000000000002e-05, - "loss": 1.1314, - "step": 98 - }, - { - "epoch": 0.04869650762420069, - "grad_norm": 0.8954655800907969, - "learning_rate": 1.9800000000000004e-05, - "loss": 1.1697, - "step": 99 - }, - { - "epoch": 0.04918839153959666, - "grad_norm": 0.7099715953824428, - "learning_rate": 2e-05, - "loss": 1.0653, - "step": 100 - }, - { - "epoch": 0.04968027545499262, - "grad_norm": 0.6979198082373752, - "learning_rate": 2.0200000000000003e-05, - "loss": 1.0092, - "step": 101 - }, - { - "epoch": 0.05017215937038859, - "grad_norm": 0.7542658397206478, - "learning_rate": 2.04e-05, - "loss": 0.9632, - "step": 102 - }, - { - "epoch": 0.050664043285784555, - "grad_norm": 0.6828660555164732, - "learning_rate": 2.06e-05, - "loss": 1.0614, - "step": 103 - }, - { - "epoch": 0.05115592720118052, - "grad_norm": 0.6721539069619723, - "learning_rate": 2.08e-05, - "loss": 1.0285, - "step": 104 - }, - { - "epoch": 0.051647811116576486, - "grad_norm": 0.7159114501125087, - "learning_rate": 2.1e-05, - "loss": 1.0312, - "step": 105 - }, - { - "epoch": 0.05213969503197245, - "grad_norm": 0.6545850965584485, - "learning_rate": 2.12e-05, - "loss": 1.0698, - "step": 106 - }, - { - "epoch": 0.05263157894736842, - "grad_norm": 0.658675131654431, - "learning_rate": 2.1400000000000002e-05, - "loss": 1.1183, - "step": 107 - }, - { - "epoch": 0.05312346286276439, - "grad_norm": 0.6765638401749785, - "learning_rate": 2.16e-05, - "loss": 1.0923, - "step": 108 - }, - { - "epoch": 0.05361534677816036, - "grad_norm": 0.6774305162394292, - "learning_rate": 2.18e-05, - "loss": 1.0696, - "step": 109 - }, - { - "epoch": 0.05410723069355632, - "grad_norm": 0.6941450033816315, - "learning_rate": 2.2000000000000003e-05, - "loss": 1.0278, - "step": 110 - }, - { - "epoch": 0.05459911460895229, - "grad_norm": 0.7298702161046116, - "learning_rate": 2.22e-05, - "loss": 0.9942, - "step": 111 - }, - { - "epoch": 0.055090998524348254, - "grad_norm": 0.78359048411605, - "learning_rate": 2.2400000000000002e-05, - "loss": 1.0722, - "step": 112 - }, - { - "epoch": 0.05558288243974422, - "grad_norm": 0.7234222510303335, - "learning_rate": 2.26e-05, - "loss": 1.0851, - "step": 113 - }, - { - "epoch": 0.056074766355140186, - "grad_norm": 0.7517380728841061, - "learning_rate": 2.2800000000000002e-05, - "loss": 0.9929, - "step": 114 - }, - { - "epoch": 0.05656665027053615, - "grad_norm": 0.688888491301893, - "learning_rate": 2.3000000000000003e-05, - "loss": 1.0505, - "step": 115 - }, - { - "epoch": 0.05705853418593212, - "grad_norm": 0.815491863848212, - "learning_rate": 2.32e-05, - "loss": 1.1339, - "step": 116 - }, - { - "epoch": 0.057550418101328084, - "grad_norm": 0.75531131045591, - "learning_rate": 2.3400000000000003e-05, - "loss": 1.0716, - "step": 117 - }, - { - "epoch": 0.05804230201672405, - "grad_norm": 0.7625830133131772, - "learning_rate": 2.36e-05, - "loss": 1.0524, - "step": 118 - }, - { - "epoch": 0.05853418593212002, - "grad_norm": 0.7566379286598796, - "learning_rate": 2.38e-05, - "loss": 1.0334, - "step": 119 - }, - { - "epoch": 0.05902606984751599, - "grad_norm": 0.7028004568598516, - "learning_rate": 2.4e-05, - "loss": 1.0436, - "step": 120 - }, - { - "epoch": 0.059517953762911954, - "grad_norm": 0.7070567970425724, - "learning_rate": 2.4200000000000002e-05, - "loss": 1.0651, - "step": 121 - }, - { - "epoch": 0.06000983767830792, - "grad_norm": 0.7098752497716972, - "learning_rate": 2.44e-05, - "loss": 1.0092, - "step": 122 - }, - { - "epoch": 0.060501721593703886, - "grad_norm": 0.6923985079217133, - "learning_rate": 2.46e-05, - "loss": 0.9777, - "step": 123 - }, - { - "epoch": 0.06099360550909985, - "grad_norm": 0.736397162563532, - "learning_rate": 2.48e-05, - "loss": 1.0494, - "step": 124 - }, - { - "epoch": 0.06148548942449582, - "grad_norm": 0.6717044407705316, - "learning_rate": 2.5e-05, - "loss": 1.0805, - "step": 125 - }, - { - "epoch": 0.061977373339891784, - "grad_norm": 0.6918045884854129, - "learning_rate": 2.5200000000000003e-05, - "loss": 1.0736, - "step": 126 - }, - { - "epoch": 0.06246925725528775, - "grad_norm": 0.7451366287208874, - "learning_rate": 2.54e-05, - "loss": 1.1339, - "step": 127 - }, - { - "epoch": 0.06296114117068372, - "grad_norm": 0.7280019983557976, - "learning_rate": 2.5600000000000002e-05, - "loss": 1.0258, - "step": 128 - }, - { - "epoch": 0.06345302508607968, - "grad_norm": 0.8510806423225098, - "learning_rate": 2.58e-05, - "loss": 0.9742, - "step": 129 - }, - { - "epoch": 0.06394490900147565, - "grad_norm": 0.6722285390124173, - "learning_rate": 2.6000000000000002e-05, - "loss": 1.091, - "step": 130 - }, - { - "epoch": 0.06443679291687161, - "grad_norm": 0.7882995585151922, - "learning_rate": 2.6200000000000003e-05, - "loss": 1.1108, - "step": 131 - }, - { - "epoch": 0.06492867683226758, - "grad_norm": 0.7124573821253081, - "learning_rate": 2.64e-05, - "loss": 1.094, - "step": 132 - }, - { - "epoch": 0.06542056074766354, - "grad_norm": 0.7173359737653685, - "learning_rate": 2.6600000000000003e-05, - "loss": 0.9962, - "step": 133 - }, - { - "epoch": 0.06591244466305952, - "grad_norm": 0.7834358909342277, - "learning_rate": 2.6800000000000004e-05, - "loss": 1.1099, - "step": 134 - }, - { - "epoch": 0.06640432857845549, - "grad_norm": 0.7008793922142209, - "learning_rate": 2.7000000000000002e-05, - "loss": 1.0077, - "step": 135 - }, - { - "epoch": 0.06689621249385146, - "grad_norm": 0.7634356340968927, - "learning_rate": 2.7200000000000004e-05, - "loss": 1.1082, - "step": 136 - }, - { - "epoch": 0.06738809640924742, - "grad_norm": 0.7378999561528924, - "learning_rate": 2.7400000000000002e-05, - "loss": 1.1178, - "step": 137 - }, - { - "epoch": 0.06787998032464339, - "grad_norm": 0.700386136828009, - "learning_rate": 2.7600000000000003e-05, - "loss": 1.1726, - "step": 138 - }, - { - "epoch": 0.06837186424003935, - "grad_norm": 0.68094249806194, - "learning_rate": 2.7800000000000005e-05, - "loss": 1.0505, - "step": 139 - }, - { - "epoch": 0.06886374815543532, - "grad_norm": 0.6844926094012175, - "learning_rate": 2.8000000000000003e-05, - "loss": 1.0492, - "step": 140 - }, - { - "epoch": 0.06935563207083129, - "grad_norm": 0.6903824887607274, - "learning_rate": 2.8199999999999998e-05, - "loss": 1.0364, - "step": 141 - }, - { - "epoch": 0.06984751598622725, - "grad_norm": 0.7116078298920068, - "learning_rate": 2.84e-05, - "loss": 1.1036, - "step": 142 - }, - { - "epoch": 0.07033939990162322, - "grad_norm": 0.6756583405629968, - "learning_rate": 2.86e-05, - "loss": 0.9928, - "step": 143 - }, - { - "epoch": 0.07083128381701918, - "grad_norm": 0.7394965661508411, - "learning_rate": 2.88e-05, - "loss": 1.102, - "step": 144 - }, - { - "epoch": 0.07132316773241515, - "grad_norm": 0.7541310822808502, - "learning_rate": 2.9e-05, - "loss": 1.0452, - "step": 145 - }, - { - "epoch": 0.07181505164781112, - "grad_norm": 0.7239753447013636, - "learning_rate": 2.9199999999999998e-05, - "loss": 1.0378, - "step": 146 - }, - { - "epoch": 0.07230693556320708, - "grad_norm": 0.6864664784282436, - "learning_rate": 2.94e-05, - "loss": 1.0373, - "step": 147 - }, - { - "epoch": 0.07279881947860305, - "grad_norm": 0.6457006672999801, - "learning_rate": 2.96e-05, - "loss": 1.0271, - "step": 148 - }, - { - "epoch": 0.07329070339399901, - "grad_norm": 0.7029187156226748, - "learning_rate": 2.98e-05, - "loss": 1.1029, - "step": 149 - }, - { - "epoch": 0.07378258730939498, - "grad_norm": 0.6923306808409229, - "learning_rate": 3e-05, - "loss": 1.0088, - "step": 150 - }, - { - "epoch": 0.07427447122479094, - "grad_norm": 0.6863830617495799, - "learning_rate": 3.02e-05, - "loss": 1.0504, - "step": 151 - }, - { - "epoch": 0.07476635514018691, - "grad_norm": 0.6593531065686705, - "learning_rate": 3.04e-05, - "loss": 1.0617, - "step": 152 - }, - { - "epoch": 0.07525823905558288, - "grad_norm": 0.7881517346051917, - "learning_rate": 3.06e-05, - "loss": 1.0784, - "step": 153 - }, - { - "epoch": 0.07575012297097884, - "grad_norm": 0.6915901349067198, - "learning_rate": 3.08e-05, - "loss": 1.0124, - "step": 154 - }, - { - "epoch": 0.07624200688637482, - "grad_norm": 0.7266366648143068, - "learning_rate": 3.1e-05, - "loss": 1.1224, - "step": 155 - }, - { - "epoch": 0.07673389080177079, - "grad_norm": 0.7162444071348569, - "learning_rate": 3.12e-05, - "loss": 1.149, - "step": 156 - }, - { - "epoch": 0.07722577471716675, - "grad_norm": 0.7237776602931134, - "learning_rate": 3.1400000000000004e-05, - "loss": 1.0799, - "step": 157 - }, - { - "epoch": 0.07771765863256272, - "grad_norm": 0.8258091547112008, - "learning_rate": 3.16e-05, - "loss": 1.0523, - "step": 158 - }, - { - "epoch": 0.07820954254795869, - "grad_norm": 0.6852296673700097, - "learning_rate": 3.18e-05, - "loss": 1.0213, - "step": 159 - }, - { - "epoch": 0.07870142646335465, - "grad_norm": 0.6350465704546835, - "learning_rate": 3.2000000000000005e-05, - "loss": 0.9624, - "step": 160 - }, - { - "epoch": 0.07919331037875062, - "grad_norm": 1.4962584316561658, - "learning_rate": 3.2200000000000003e-05, - "loss": 1.0782, - "step": 161 - }, - { - "epoch": 0.07968519429414658, - "grad_norm": 0.7353035875932009, - "learning_rate": 3.24e-05, - "loss": 0.9777, - "step": 162 - }, - { - "epoch": 0.08017707820954255, - "grad_norm": 0.8083481878994074, - "learning_rate": 3.26e-05, - "loss": 1.0283, - "step": 163 - }, - { - "epoch": 0.08066896212493851, - "grad_norm": 0.7376876744622659, - "learning_rate": 3.2800000000000004e-05, - "loss": 1.0049, - "step": 164 - }, - { - "epoch": 0.08116084604033448, - "grad_norm": 0.7684385423867578, - "learning_rate": 3.3e-05, - "loss": 1.0919, - "step": 165 - }, - { - "epoch": 0.08165272995573045, - "grad_norm": 0.6828848133442723, - "learning_rate": 3.32e-05, - "loss": 1.0262, - "step": 166 - }, - { - "epoch": 0.08214461387112641, - "grad_norm": 0.6435240309407694, - "learning_rate": 3.3400000000000005e-05, - "loss": 1.0324, - "step": 167 - }, - { - "epoch": 0.08263649778652238, - "grad_norm": 0.720314282911722, - "learning_rate": 3.3600000000000004e-05, - "loss": 1.0566, - "step": 168 - }, - { - "epoch": 0.08312838170191834, - "grad_norm": 0.7727153801819339, - "learning_rate": 3.38e-05, - "loss": 0.9888, - "step": 169 - }, - { - "epoch": 0.08362026561731431, - "grad_norm": 0.6505013326914232, - "learning_rate": 3.4000000000000007e-05, - "loss": 1.0101, - "step": 170 - }, - { - "epoch": 0.08411214953271028, - "grad_norm": 0.6728550885628142, - "learning_rate": 3.4200000000000005e-05, - "loss": 1.0934, - "step": 171 - }, - { - "epoch": 0.08460403344810624, - "grad_norm": 0.6665239884376515, - "learning_rate": 3.4399999999999996e-05, - "loss": 1.0947, - "step": 172 - }, - { - "epoch": 0.08509591736350221, - "grad_norm": 0.6959342104513762, - "learning_rate": 3.46e-05, - "loss": 1.0673, - "step": 173 - }, - { - "epoch": 0.08558780127889817, - "grad_norm": 0.7429585913136677, - "learning_rate": 3.48e-05, - "loss": 0.9795, - "step": 174 - }, - { - "epoch": 0.08607968519429415, - "grad_norm": 0.6739954355198908, - "learning_rate": 3.5e-05, - "loss": 1.0087, - "step": 175 - }, - { - "epoch": 0.08657156910969012, - "grad_norm": 0.7196630795576944, - "learning_rate": 3.52e-05, - "loss": 1.042, - "step": 176 - }, - { - "epoch": 0.08706345302508608, - "grad_norm": 0.674245145598711, - "learning_rate": 3.54e-05, - "loss": 0.9691, - "step": 177 - }, - { - "epoch": 0.08755533694048205, - "grad_norm": 0.6237692105197545, - "learning_rate": 3.56e-05, - "loss": 1.0078, - "step": 178 - }, - { - "epoch": 0.08804722085587802, - "grad_norm": 0.7338926542494251, - "learning_rate": 3.58e-05, - "loss": 1.0325, - "step": 179 - }, - { - "epoch": 0.08853910477127398, - "grad_norm": 0.6797934842299513, - "learning_rate": 3.6e-05, - "loss": 1.0204, - "step": 180 - }, - { - "epoch": 0.08903098868666995, - "grad_norm": 0.7299173452740791, - "learning_rate": 3.62e-05, - "loss": 1.047, - "step": 181 - }, - { - "epoch": 0.08952287260206591, - "grad_norm": 0.7045963197748994, - "learning_rate": 3.6400000000000004e-05, - "loss": 1.0916, - "step": 182 - }, - { - "epoch": 0.09001475651746188, - "grad_norm": 0.7436237348213588, - "learning_rate": 3.66e-05, - "loss": 1.0694, - "step": 183 - }, - { - "epoch": 0.09050664043285785, - "grad_norm": 0.7114595640434511, - "learning_rate": 3.68e-05, - "loss": 1.0079, - "step": 184 - }, - { - "epoch": 0.09099852434825381, - "grad_norm": 0.6846091209296655, - "learning_rate": 3.7e-05, - "loss": 0.9537, - "step": 185 - }, - { - "epoch": 0.09149040826364978, - "grad_norm": 0.685900598420304, - "learning_rate": 3.72e-05, - "loss": 1.0499, - "step": 186 - }, - { - "epoch": 0.09198229217904574, - "grad_norm": 0.6801716444252568, - "learning_rate": 3.74e-05, - "loss": 1.0011, - "step": 187 - }, - { - "epoch": 0.09247417609444171, - "grad_norm": 0.6959834732767167, - "learning_rate": 3.76e-05, - "loss": 1.0543, - "step": 188 - }, - { - "epoch": 0.09296606000983768, - "grad_norm": 0.6526864408010312, - "learning_rate": 3.7800000000000004e-05, - "loss": 1.0366, - "step": 189 - }, - { - "epoch": 0.09345794392523364, - "grad_norm": 0.6982655777808178, - "learning_rate": 3.8e-05, - "loss": 1.0264, - "step": 190 - }, - { - "epoch": 0.09394982784062961, - "grad_norm": 0.6967368126765116, - "learning_rate": 3.82e-05, - "loss": 1.005, - "step": 191 - }, - { - "epoch": 0.09444171175602557, - "grad_norm": 0.6571475095270259, - "learning_rate": 3.8400000000000005e-05, - "loss": 0.9507, - "step": 192 - }, - { - "epoch": 0.09493359567142154, - "grad_norm": 0.7893579623123786, - "learning_rate": 3.86e-05, - "loss": 1.0194, - "step": 193 - }, - { - "epoch": 0.0954254795868175, - "grad_norm": 0.6747872226437224, - "learning_rate": 3.88e-05, - "loss": 0.9702, - "step": 194 - }, - { - "epoch": 0.09591736350221347, - "grad_norm": 0.6951769070722532, - "learning_rate": 3.9000000000000006e-05, - "loss": 1.0154, - "step": 195 - }, - { - "epoch": 0.09640924741760945, - "grad_norm": 0.6729600273811861, - "learning_rate": 3.9200000000000004e-05, - "loss": 0.9737, - "step": 196 - }, - { - "epoch": 0.09690113133300542, - "grad_norm": 0.646289442356639, - "learning_rate": 3.94e-05, - "loss": 0.9964, - "step": 197 - }, - { - "epoch": 0.09739301524840138, - "grad_norm": 0.7111022014767776, - "learning_rate": 3.960000000000001e-05, - "loss": 0.9115, - "step": 198 - }, - { - "epoch": 0.09788489916379735, - "grad_norm": 0.7852036200596197, - "learning_rate": 3.9800000000000005e-05, - "loss": 1.05, - "step": 199 - }, - { - "epoch": 0.09837678307919331, - "grad_norm": 0.7088405268610213, - "learning_rate": 4e-05, - "loss": 1.0435, - "step": 200 - }, - { - "epoch": 0.09886866699458928, - "grad_norm": 0.7777103623163572, - "learning_rate": 4.02e-05, - "loss": 1.0491, - "step": 201 - }, - { - "epoch": 0.09936055090998525, - "grad_norm": 0.6579744250873005, - "learning_rate": 4.0400000000000006e-05, - "loss": 1.0442, - "step": 202 - }, - { - "epoch": 0.09985243482538121, - "grad_norm": 0.6543156891174083, - "learning_rate": 4.0600000000000004e-05, - "loss": 0.975, - "step": 203 - }, - { - "epoch": 0.10034431874077718, - "grad_norm": 0.7020232902689696, - "learning_rate": 4.08e-05, - "loss": 1.0331, - "step": 204 - }, - { - "epoch": 0.10083620265617314, - "grad_norm": 0.6556250690855042, - "learning_rate": 4.1e-05, - "loss": 0.9965, - "step": 205 - }, - { - "epoch": 0.10132808657156911, - "grad_norm": 0.6496179732912809, - "learning_rate": 4.12e-05, - "loss": 0.9488, - "step": 206 - }, - { - "epoch": 0.10181997048696508, - "grad_norm": 0.6590304062714621, - "learning_rate": 4.14e-05, - "loss": 1.01, - "step": 207 - }, - { - "epoch": 0.10231185440236104, - "grad_norm": 0.6984098889798658, - "learning_rate": 4.16e-05, - "loss": 1.0673, - "step": 208 - }, - { - "epoch": 0.102803738317757, - "grad_norm": 0.6751148224917966, - "learning_rate": 4.18e-05, - "loss": 1.0013, - "step": 209 - }, - { - "epoch": 0.10329562223315297, - "grad_norm": 0.6661081975498573, - "learning_rate": 4.2e-05, - "loss": 1.0373, - "step": 210 - }, - { - "epoch": 0.10378750614854894, - "grad_norm": 0.7460179270770505, - "learning_rate": 4.22e-05, - "loss": 1.0583, - "step": 211 - }, - { - "epoch": 0.1042793900639449, - "grad_norm": 0.6823917332904713, - "learning_rate": 4.24e-05, - "loss": 0.9444, - "step": 212 - }, - { - "epoch": 0.10477127397934087, - "grad_norm": 0.6308296487006928, - "learning_rate": 4.26e-05, - "loss": 0.9662, - "step": 213 - }, - { - "epoch": 0.10526315789473684, - "grad_norm": 0.6405975064168019, - "learning_rate": 4.2800000000000004e-05, - "loss": 0.964, - "step": 214 - }, - { - "epoch": 0.1057550418101328, - "grad_norm": 0.6332265604039056, - "learning_rate": 4.3e-05, - "loss": 1.0117, - "step": 215 - }, - { - "epoch": 0.10624692572552878, - "grad_norm": 0.6234050429175113, - "learning_rate": 4.32e-05, - "loss": 0.8902, - "step": 216 - }, - { - "epoch": 0.10673880964092475, - "grad_norm": 0.6873878864046749, - "learning_rate": 4.3400000000000005e-05, - "loss": 1.0024, - "step": 217 - }, - { - "epoch": 0.10723069355632071, - "grad_norm": 0.6497392131207163, - "learning_rate": 4.36e-05, - "loss": 0.9846, - "step": 218 - }, - { - "epoch": 0.10772257747171668, - "grad_norm": 0.6821587564436689, - "learning_rate": 4.38e-05, - "loss": 0.9138, - "step": 219 - }, - { - "epoch": 0.10821446138711265, - "grad_norm": 0.7071003711939765, - "learning_rate": 4.4000000000000006e-05, - "loss": 1.0503, - "step": 220 - }, - { - "epoch": 0.10870634530250861, - "grad_norm": 0.6535807129084836, - "learning_rate": 4.4200000000000004e-05, - "loss": 1.0347, - "step": 221 - }, - { - "epoch": 0.10919822921790458, - "grad_norm": 0.6675568978536691, - "learning_rate": 4.44e-05, - "loss": 1.0242, - "step": 222 - }, - { - "epoch": 0.10969011313330054, - "grad_norm": 0.6433035642719022, - "learning_rate": 4.46e-05, - "loss": 0.9678, - "step": 223 - }, - { - "epoch": 0.11018199704869651, - "grad_norm": 0.734775071588386, - "learning_rate": 4.4800000000000005e-05, - "loss": 0.9407, - "step": 224 - }, - { - "epoch": 0.11067388096409247, - "grad_norm": 0.6463459120325686, - "learning_rate": 4.5e-05, - "loss": 0.9922, - "step": 225 - }, - { - "epoch": 0.11116576487948844, - "grad_norm": 0.699785016750569, - "learning_rate": 4.52e-05, - "loss": 0.9674, - "step": 226 - }, - { - "epoch": 0.1116576487948844, - "grad_norm": 0.6187364073010676, - "learning_rate": 4.5400000000000006e-05, - "loss": 0.9845, - "step": 227 - }, - { - "epoch": 0.11214953271028037, - "grad_norm": 0.6506296005006268, - "learning_rate": 4.5600000000000004e-05, - "loss": 1.0033, - "step": 228 - }, - { - "epoch": 0.11264141662567634, - "grad_norm": 0.6769159452757093, - "learning_rate": 4.58e-05, - "loss": 1.0231, - "step": 229 - }, - { - "epoch": 0.1131333005410723, - "grad_norm": 0.6581875106445415, - "learning_rate": 4.600000000000001e-05, - "loss": 0.9595, - "step": 230 - }, - { - "epoch": 0.11362518445646827, - "grad_norm": 0.7094660326901466, - "learning_rate": 4.6200000000000005e-05, - "loss": 0.9939, - "step": 231 - }, - { - "epoch": 0.11411706837186424, - "grad_norm": 0.6298366474174654, - "learning_rate": 4.64e-05, - "loss": 0.9612, - "step": 232 - }, - { - "epoch": 0.1146089522872602, - "grad_norm": 0.6700127579688815, - "learning_rate": 4.660000000000001e-05, - "loss": 1.0655, - "step": 233 - }, - { - "epoch": 0.11510083620265617, - "grad_norm": 0.6908235779413228, - "learning_rate": 4.6800000000000006e-05, - "loss": 1.0331, - "step": 234 - }, - { - "epoch": 0.11559272011805213, - "grad_norm": 0.7278533721042479, - "learning_rate": 4.7e-05, - "loss": 1.0268, - "step": 235 - }, - { - "epoch": 0.1160846040334481, - "grad_norm": 0.6561769300017106, - "learning_rate": 4.72e-05, - "loss": 0.9314, - "step": 236 - }, - { - "epoch": 0.11657648794884408, - "grad_norm": 0.6838436557643636, - "learning_rate": 4.74e-05, - "loss": 1.0146, - "step": 237 - }, - { - "epoch": 0.11706837186424005, - "grad_norm": 0.7077889514774452, - "learning_rate": 4.76e-05, - "loss": 1.0245, - "step": 238 - }, - { - "epoch": 0.11756025577963601, - "grad_norm": 0.7519909427913024, - "learning_rate": 4.78e-05, - "loss": 1.0359, - "step": 239 - }, - { - "epoch": 0.11805213969503198, - "grad_norm": 0.672648659379634, - "learning_rate": 4.8e-05, - "loss": 1.0815, - "step": 240 - }, - { - "epoch": 0.11854402361042794, - "grad_norm": 0.6810172690834393, - "learning_rate": 4.82e-05, - "loss": 1.0053, - "step": 241 - }, - { - "epoch": 0.11903590752582391, - "grad_norm": 0.6708663972642332, - "learning_rate": 4.8400000000000004e-05, - "loss": 1.0072, - "step": 242 - }, - { - "epoch": 0.11952779144121987, - "grad_norm": 0.7168031253779362, - "learning_rate": 4.86e-05, - "loss": 0.9631, - "step": 243 - }, - { - "epoch": 0.12001967535661584, - "grad_norm": 0.6583600524485893, - "learning_rate": 4.88e-05, - "loss": 1.0597, - "step": 244 - }, - { - "epoch": 0.1205115592720118, - "grad_norm": 0.6753110485508403, - "learning_rate": 4.9e-05, - "loss": 1.0366, - "step": 245 - }, - { - "epoch": 0.12100344318740777, - "grad_norm": 0.7026537939048871, - "learning_rate": 4.92e-05, - "loss": 1.0412, - "step": 246 - }, - { - "epoch": 0.12149532710280374, - "grad_norm": 0.6840738596363672, - "learning_rate": 4.94e-05, - "loss": 0.8955, - "step": 247 - }, - { - "epoch": 0.1219872110181997, - "grad_norm": 0.6787167522875908, - "learning_rate": 4.96e-05, - "loss": 1.0285, - "step": 248 - }, - { - "epoch": 0.12247909493359567, - "grad_norm": 0.6447409625317669, - "learning_rate": 4.9800000000000004e-05, - "loss": 1.0053, - "step": 249 - }, - { - "epoch": 0.12297097884899164, - "grad_norm": 0.6540923109167576, - "learning_rate": 5e-05, - "loss": 0.9068, - "step": 250 - }, - { - "epoch": 0.1234628627643876, - "grad_norm": 0.6565019553654861, - "learning_rate": 5.02e-05, - "loss": 1.0011, - "step": 251 - }, - { - "epoch": 0.12395474667978357, - "grad_norm": 1.909933657652513, - "learning_rate": 5.0400000000000005e-05, - "loss": 1.0145, - "step": 252 - }, - { - "epoch": 0.12444663059517953, - "grad_norm": 0.66278666736166, - "learning_rate": 5.0600000000000003e-05, - "loss": 0.9843, - "step": 253 - }, - { - "epoch": 0.1249385145105755, - "grad_norm": 0.6836835743581384, - "learning_rate": 5.08e-05, - "loss": 0.9417, - "step": 254 - }, - { - "epoch": 0.12543039842597148, - "grad_norm": 0.6863239949032678, - "learning_rate": 5.1000000000000006e-05, - "loss": 0.933, - "step": 255 - }, - { - "epoch": 0.12592228234136743, - "grad_norm": 0.6904796237804453, - "learning_rate": 5.1200000000000004e-05, - "loss": 0.9765, - "step": 256 - }, - { - "epoch": 0.1264141662567634, - "grad_norm": 0.685186929661333, - "learning_rate": 5.14e-05, - "loss": 0.9448, - "step": 257 - }, - { - "epoch": 0.12690605017215936, - "grad_norm": 0.6414016224391275, - "learning_rate": 5.16e-05, - "loss": 1.0642, - "step": 258 - }, - { - "epoch": 0.12739793408755534, - "grad_norm": 0.6389222696585697, - "learning_rate": 5.1800000000000005e-05, - "loss": 0.9642, - "step": 259 - }, - { - "epoch": 0.1278898180029513, - "grad_norm": 0.6741485892556393, - "learning_rate": 5.2000000000000004e-05, - "loss": 1.0645, - "step": 260 - }, - { - "epoch": 0.12838170191834727, - "grad_norm": 0.6229141794339108, - "learning_rate": 5.22e-05, - "loss": 1.013, - "step": 261 - }, - { - "epoch": 0.12887358583374323, - "grad_norm": 0.6907834356049275, - "learning_rate": 5.2400000000000007e-05, - "loss": 0.9936, - "step": 262 - }, - { - "epoch": 0.1293654697491392, - "grad_norm": 0.7038791661126186, - "learning_rate": 5.2600000000000005e-05, - "loss": 1.0136, - "step": 263 - }, - { - "epoch": 0.12985735366453516, - "grad_norm": 0.640725198404905, - "learning_rate": 5.28e-05, - "loss": 0.9825, - "step": 264 - }, - { - "epoch": 0.13034923757993114, - "grad_norm": 0.5842738998671183, - "learning_rate": 5.300000000000001e-05, - "loss": 0.8905, - "step": 265 - }, - { - "epoch": 0.1308411214953271, - "grad_norm": 0.7302625337561353, - "learning_rate": 5.3200000000000006e-05, - "loss": 0.9707, - "step": 266 - }, - { - "epoch": 0.13133300541072307, - "grad_norm": 0.640591046196343, - "learning_rate": 5.3400000000000004e-05, - "loss": 0.9923, - "step": 267 - }, - { - "epoch": 0.13182488932611905, - "grad_norm": 0.6401230104389162, - "learning_rate": 5.360000000000001e-05, - "loss": 1.0509, - "step": 268 - }, - { - "epoch": 0.132316773241515, - "grad_norm": 0.6168517912433212, - "learning_rate": 5.380000000000001e-05, - "loss": 1.0207, - "step": 269 - }, - { - "epoch": 0.13280865715691098, - "grad_norm": 0.6477707857028956, - "learning_rate": 5.4000000000000005e-05, - "loss": 1.0227, - "step": 270 - }, - { - "epoch": 0.13330054107230693, - "grad_norm": 0.6832697192287229, - "learning_rate": 5.420000000000001e-05, - "loss": 1.0356, - "step": 271 - }, - { - "epoch": 0.1337924249877029, - "grad_norm": 0.6243425458645113, - "learning_rate": 5.440000000000001e-05, - "loss": 0.9723, - "step": 272 - }, - { - "epoch": 0.13428430890309886, - "grad_norm": 0.6659761821502914, - "learning_rate": 5.4600000000000006e-05, - "loss": 0.9364, - "step": 273 - }, - { - "epoch": 0.13477619281849484, - "grad_norm": 0.655424441774203, - "learning_rate": 5.4800000000000004e-05, - "loss": 0.9311, - "step": 274 - }, - { - "epoch": 0.1352680767338908, - "grad_norm": 0.6658295218882189, - "learning_rate": 5.500000000000001e-05, - "loss": 0.9409, - "step": 275 - }, - { - "epoch": 0.13575996064928678, - "grad_norm": 4.3427010379394915, - "learning_rate": 5.520000000000001e-05, - "loss": 1.1593, - "step": 276 - }, - { - "epoch": 0.13625184456468273, - "grad_norm": 0.651728430891512, - "learning_rate": 5.5400000000000005e-05, - "loss": 0.9168, - "step": 277 - }, - { - "epoch": 0.1367437284800787, - "grad_norm": 0.6611257694815104, - "learning_rate": 5.560000000000001e-05, - "loss": 0.9477, - "step": 278 - }, - { - "epoch": 0.13723561239547466, - "grad_norm": 0.677143186101974, - "learning_rate": 5.580000000000001e-05, - "loss": 0.9808, - "step": 279 - }, - { - "epoch": 0.13772749631087064, - "grad_norm": 0.6757312948008557, - "learning_rate": 5.6000000000000006e-05, - "loss": 1.0211, - "step": 280 - }, - { - "epoch": 0.1382193802262666, - "grad_norm": 0.6204458845897312, - "learning_rate": 5.620000000000001e-05, - "loss": 0.9787, - "step": 281 - }, - { - "epoch": 0.13871126414166257, - "grad_norm": 0.6301827758158104, - "learning_rate": 5.6399999999999995e-05, - "loss": 0.9815, - "step": 282 - }, - { - "epoch": 0.13920314805705852, - "grad_norm": 0.8061051866889022, - "learning_rate": 5.66e-05, - "loss": 1.0097, - "step": 283 - }, - { - "epoch": 0.1396950319724545, - "grad_norm": 0.6337588057022924, - "learning_rate": 5.68e-05, - "loss": 0.9373, - "step": 284 - }, - { - "epoch": 0.14018691588785046, - "grad_norm": 0.6629923742300748, - "learning_rate": 5.6999999999999996e-05, - "loss": 1.105, - "step": 285 - }, - { - "epoch": 0.14067879980324643, - "grad_norm": 0.6202099379314775, - "learning_rate": 5.72e-05, - "loss": 1.0256, - "step": 286 - }, - { - "epoch": 0.1411706837186424, - "grad_norm": 0.6364234732947897, - "learning_rate": 5.74e-05, - "loss": 0.996, - "step": 287 - }, - { - "epoch": 0.14166256763403837, - "grad_norm": 0.6345000851230562, - "learning_rate": 5.76e-05, - "loss": 0.9256, - "step": 288 - }, - { - "epoch": 0.14215445154943435, - "grad_norm": 0.6356287411447527, - "learning_rate": 5.7799999999999995e-05, - "loss": 0.9884, - "step": 289 - }, - { - "epoch": 0.1426463354648303, - "grad_norm": 0.7056133469436392, - "learning_rate": 5.8e-05, - "loss": 0.981, - "step": 290 - }, - { - "epoch": 0.14313821938022628, - "grad_norm": 0.6850636267710177, - "learning_rate": 5.82e-05, - "loss": 0.9667, - "step": 291 - }, - { - "epoch": 0.14363010329562223, - "grad_norm": 0.6282739333968476, - "learning_rate": 5.8399999999999997e-05, - "loss": 0.9947, - "step": 292 - }, - { - "epoch": 0.1441219872110182, - "grad_norm": 0.6418596479252349, - "learning_rate": 5.86e-05, - "loss": 1.0141, - "step": 293 - }, - { - "epoch": 0.14461387112641416, - "grad_norm": 0.6595528178588359, - "learning_rate": 5.88e-05, - "loss": 1.0151, - "step": 294 - }, - { - "epoch": 0.14510575504181014, - "grad_norm": 0.6920906659420407, - "learning_rate": 5.9e-05, - "loss": 0.9947, - "step": 295 - }, - { - "epoch": 0.1455976389572061, - "grad_norm": 0.6518087338599203, - "learning_rate": 5.92e-05, - "loss": 0.9352, - "step": 296 - }, - { - "epoch": 0.14608952287260207, - "grad_norm": 0.611941517521001, - "learning_rate": 5.94e-05, - "loss": 0.9139, - "step": 297 - }, - { - "epoch": 0.14658140678799803, - "grad_norm": 0.6447816591904986, - "learning_rate": 5.96e-05, - "loss": 0.9703, - "step": 298 - }, - { - "epoch": 0.147073290703394, - "grad_norm": 0.6306005109194481, - "learning_rate": 5.9800000000000003e-05, - "loss": 0.9748, - "step": 299 - }, - { - "epoch": 0.14756517461878996, - "grad_norm": 0.6632237582326308, - "learning_rate": 6e-05, - "loss": 1.0416, - "step": 300 - }, - { - "epoch": 0.14805705853418594, - "grad_norm": 0.6517410541130536, - "learning_rate": 6.02e-05, - "loss": 0.9308, - "step": 301 - }, - { - "epoch": 0.1485489424495819, - "grad_norm": 0.7062565571754538, - "learning_rate": 6.04e-05, - "loss": 1.0257, - "step": 302 - }, - { - "epoch": 0.14904082636497787, - "grad_norm": 0.6459974349047483, - "learning_rate": 6.06e-05, - "loss": 0.9867, - "step": 303 - }, - { - "epoch": 0.14953271028037382, - "grad_norm": 0.6671134833116954, - "learning_rate": 6.08e-05, - "loss": 1.0088, - "step": 304 - }, - { - "epoch": 0.1500245941957698, - "grad_norm": 0.6682947966042925, - "learning_rate": 6.1e-05, - "loss": 1.0053, - "step": 305 - }, - { - "epoch": 0.15051647811116575, - "grad_norm": 0.6132659312827933, - "learning_rate": 6.12e-05, - "loss": 0.9845, - "step": 306 - }, - { - "epoch": 0.15100836202656173, - "grad_norm": 0.7433304321908412, - "learning_rate": 6.14e-05, - "loss": 1.0108, - "step": 307 - }, - { - "epoch": 0.15150024594195768, - "grad_norm": 0.7092648473648441, - "learning_rate": 6.16e-05, - "loss": 0.9951, - "step": 308 - }, - { - "epoch": 0.15199212985735366, - "grad_norm": 0.6355870188527534, - "learning_rate": 6.18e-05, - "loss": 0.9423, - "step": 309 - }, - { - "epoch": 0.15248401377274964, - "grad_norm": 0.6242503654633647, - "learning_rate": 6.2e-05, - "loss": 0.9129, - "step": 310 - }, - { - "epoch": 0.1529758976881456, - "grad_norm": 0.6109391351151449, - "learning_rate": 6.220000000000001e-05, - "loss": 0.9067, - "step": 311 - }, - { - "epoch": 0.15346778160354158, - "grad_norm": 0.6300248986974131, - "learning_rate": 6.24e-05, - "loss": 0.9957, - "step": 312 - }, - { - "epoch": 0.15395966551893753, - "grad_norm": 0.6165606343456851, - "learning_rate": 6.26e-05, - "loss": 1.0315, - "step": 313 - }, - { - "epoch": 0.1544515494343335, - "grad_norm": 0.616948344272319, - "learning_rate": 6.280000000000001e-05, - "loss": 0.964, - "step": 314 - }, - { - "epoch": 0.15494343334972946, - "grad_norm": 0.6465167406806364, - "learning_rate": 6.3e-05, - "loss": 1.0329, - "step": 315 - }, - { - "epoch": 0.15543531726512544, - "grad_norm": 0.6153339495121846, - "learning_rate": 6.32e-05, - "loss": 1.035, - "step": 316 - }, - { - "epoch": 0.1559272011805214, - "grad_norm": 0.6170904248104213, - "learning_rate": 6.340000000000001e-05, - "loss": 0.97, - "step": 317 - }, - { - "epoch": 0.15641908509591737, - "grad_norm": 0.696445964079362, - "learning_rate": 6.36e-05, - "loss": 1.0645, - "step": 318 - }, - { - "epoch": 0.15691096901131332, - "grad_norm": 0.5936689219966211, - "learning_rate": 6.38e-05, - "loss": 0.9516, - "step": 319 - }, - { - "epoch": 0.1574028529267093, - "grad_norm": 0.6766475677939298, - "learning_rate": 6.400000000000001e-05, - "loss": 0.9874, - "step": 320 - }, - { - "epoch": 0.15789473684210525, - "grad_norm": 0.6199318696925801, - "learning_rate": 6.42e-05, - "loss": 0.9286, - "step": 321 - }, - { - "epoch": 0.15838662075750123, - "grad_norm": 0.6643394593362091, - "learning_rate": 6.440000000000001e-05, - "loss": 1.0774, - "step": 322 - }, - { - "epoch": 0.1588785046728972, - "grad_norm": 0.5898421821171212, - "learning_rate": 6.460000000000001e-05, - "loss": 0.8951, - "step": 323 - }, - { - "epoch": 0.15937038858829317, - "grad_norm": 0.6146746666462884, - "learning_rate": 6.48e-05, - "loss": 0.9303, - "step": 324 - }, - { - "epoch": 0.15986227250368912, - "grad_norm": 0.6423463369567147, - "learning_rate": 6.500000000000001e-05, - "loss": 1.0393, - "step": 325 - }, - { - "epoch": 0.1603541564190851, - "grad_norm": 0.6315059932248648, - "learning_rate": 6.52e-05, - "loss": 1.0133, - "step": 326 - }, - { - "epoch": 0.16084604033448105, - "grad_norm": 0.6815156367432513, - "learning_rate": 6.54e-05, - "loss": 0.9572, - "step": 327 - }, - { - "epoch": 0.16133792424987703, - "grad_norm": 0.6519666208797611, - "learning_rate": 6.560000000000001e-05, - "loss": 0.9082, - "step": 328 - }, - { - "epoch": 0.161829808165273, - "grad_norm": 0.62523957910431, - "learning_rate": 6.58e-05, - "loss": 0.9654, - "step": 329 - }, - { - "epoch": 0.16232169208066896, - "grad_norm": 0.7309020196058403, - "learning_rate": 6.6e-05, - "loss": 0.9375, - "step": 330 - }, - { - "epoch": 0.16281357599606494, - "grad_norm": 0.6695844264781273, - "learning_rate": 6.620000000000001e-05, - "loss": 0.9489, - "step": 331 - }, - { - "epoch": 0.1633054599114609, - "grad_norm": 0.6494479688502096, - "learning_rate": 6.64e-05, - "loss": 0.9377, - "step": 332 - }, - { - "epoch": 0.16379734382685687, - "grad_norm": 0.6336750522480793, - "learning_rate": 6.66e-05, - "loss": 0.9992, - "step": 333 - }, - { - "epoch": 0.16428922774225282, - "grad_norm": 0.6693397113005544, - "learning_rate": 6.680000000000001e-05, - "loss": 0.9959, - "step": 334 - }, - { - "epoch": 0.1647811116576488, - "grad_norm": 0.6002755640350812, - "learning_rate": 6.7e-05, - "loss": 0.9596, - "step": 335 - }, - { - "epoch": 0.16527299557304476, - "grad_norm": 0.6559694357848178, - "learning_rate": 6.720000000000001e-05, - "loss": 1.0273, - "step": 336 - }, - { - "epoch": 0.16576487948844074, - "grad_norm": 0.6545341522027837, - "learning_rate": 6.740000000000001e-05, - "loss": 1.0007, - "step": 337 - }, - { - "epoch": 0.1662567634038367, - "grad_norm": 0.640746586799343, - "learning_rate": 6.76e-05, - "loss": 1.0115, - "step": 338 - }, - { - "epoch": 0.16674864731923267, - "grad_norm": 0.6186436669731425, - "learning_rate": 6.780000000000001e-05, - "loss": 1.0891, - "step": 339 - }, - { - "epoch": 0.16724053123462862, - "grad_norm": 0.6119028812273772, - "learning_rate": 6.800000000000001e-05, - "loss": 0.9275, - "step": 340 - }, - { - "epoch": 0.1677324151500246, - "grad_norm": 0.6579939475287446, - "learning_rate": 6.82e-05, - "loss": 1.0235, - "step": 341 - }, - { - "epoch": 0.16822429906542055, - "grad_norm": 0.6383679027580579, - "learning_rate": 6.840000000000001e-05, - "loss": 1.0284, - "step": 342 - }, - { - "epoch": 0.16871618298081653, - "grad_norm": 0.6507778291909052, - "learning_rate": 6.860000000000001e-05, - "loss": 1.0164, - "step": 343 - }, - { - "epoch": 0.16920806689621248, - "grad_norm": 0.6559899499233537, - "learning_rate": 6.879999999999999e-05, - "loss": 1.0207, - "step": 344 - }, - { - "epoch": 0.16969995081160846, - "grad_norm": 0.6307205703320318, - "learning_rate": 6.9e-05, - "loss": 0.9824, - "step": 345 - }, - { - "epoch": 0.17019183472700442, - "grad_norm": 0.6484572895058113, - "learning_rate": 6.92e-05, - "loss": 0.9476, - "step": 346 - }, - { - "epoch": 0.1706837186424004, - "grad_norm": 0.5938424560381667, - "learning_rate": 6.939999999999999e-05, - "loss": 0.9291, - "step": 347 - }, - { - "epoch": 0.17117560255779635, - "grad_norm": 0.6077052340881578, - "learning_rate": 6.96e-05, - "loss": 0.9314, - "step": 348 - }, - { - "epoch": 0.17166748647319233, - "grad_norm": 0.5793665866913059, - "learning_rate": 6.98e-05, - "loss": 0.8881, - "step": 349 - }, - { - "epoch": 0.1721593703885883, - "grad_norm": 0.6217622467644931, - "learning_rate": 7e-05, - "loss": 0.9426, - "step": 350 - }, - { - "epoch": 0.17265125430398426, - "grad_norm": 0.6492966649268521, - "learning_rate": 7.02e-05, - "loss": 0.9888, - "step": 351 - }, - { - "epoch": 0.17314313821938024, - "grad_norm": 0.638633022430336, - "learning_rate": 7.04e-05, - "loss": 1.0077, - "step": 352 - }, - { - "epoch": 0.1736350221347762, - "grad_norm": 0.6228845062610631, - "learning_rate": 7.06e-05, - "loss": 0.9699, - "step": 353 - }, - { - "epoch": 0.17412690605017217, - "grad_norm": 0.5702229019173722, - "learning_rate": 7.08e-05, - "loss": 0.9429, - "step": 354 - }, - { - "epoch": 0.17461878996556812, - "grad_norm": 0.5924801567771402, - "learning_rate": 7.1e-05, - "loss": 0.9659, - "step": 355 - }, - { - "epoch": 0.1751106738809641, - "grad_norm": 0.6232365364163425, - "learning_rate": 7.12e-05, - "loss": 0.993, - "step": 356 - }, - { - "epoch": 0.17560255779636005, - "grad_norm": 0.6183797922468482, - "learning_rate": 7.14e-05, - "loss": 0.9918, - "step": 357 - }, - { - "epoch": 0.17609444171175603, - "grad_norm": 0.8735174977501221, - "learning_rate": 7.16e-05, - "loss": 1.1078, - "step": 358 - }, - { - "epoch": 0.17658632562715199, - "grad_norm": 0.6011381747794056, - "learning_rate": 7.18e-05, - "loss": 0.9236, - "step": 359 - }, - { - "epoch": 0.17707820954254797, - "grad_norm": 0.5965327186295076, - "learning_rate": 7.2e-05, - "loss": 0.9969, - "step": 360 - }, - { - "epoch": 0.17757009345794392, - "grad_norm": 0.6331964093992425, - "learning_rate": 7.22e-05, - "loss": 0.9924, - "step": 361 - }, - { - "epoch": 0.1780619773733399, - "grad_norm": 0.6412859155950007, - "learning_rate": 7.24e-05, - "loss": 0.9406, - "step": 362 - }, - { - "epoch": 0.17855386128873585, - "grad_norm": 0.638583426526138, - "learning_rate": 7.26e-05, - "loss": 1.0027, - "step": 363 - }, - { - "epoch": 0.17904574520413183, - "grad_norm": 0.5709523375451793, - "learning_rate": 7.280000000000001e-05, - "loss": 0.9009, - "step": 364 - }, - { - "epoch": 0.17953762911952778, - "grad_norm": 0.6474681443412542, - "learning_rate": 7.3e-05, - "loss": 1.0186, - "step": 365 - }, - { - "epoch": 0.18002951303492376, - "grad_norm": 0.7193510038246659, - "learning_rate": 7.32e-05, - "loss": 0.9623, - "step": 366 - }, - { - "epoch": 0.1805213969503197, - "grad_norm": 0.6479023597226308, - "learning_rate": 7.340000000000001e-05, - "loss": 1.0473, - "step": 367 - }, - { - "epoch": 0.1810132808657157, - "grad_norm": 0.6714577198688532, - "learning_rate": 7.36e-05, - "loss": 0.9741, - "step": 368 - }, - { - "epoch": 0.18150516478111164, - "grad_norm": 0.6626593218023097, - "learning_rate": 7.38e-05, - "loss": 1.0352, - "step": 369 - }, - { - "epoch": 0.18199704869650762, - "grad_norm": 0.6860491249523826, - "learning_rate": 7.4e-05, - "loss": 0.9544, - "step": 370 - }, - { - "epoch": 0.1824889326119036, - "grad_norm": 0.6577681496370567, - "learning_rate": 7.42e-05, - "loss": 1.0932, - "step": 371 - }, - { - "epoch": 0.18298081652729956, - "grad_norm": 0.5896122652070747, - "learning_rate": 7.44e-05, - "loss": 1.0346, - "step": 372 - }, - { - "epoch": 0.18347270044269554, - "grad_norm": 0.6350856778845727, - "learning_rate": 7.46e-05, - "loss": 1.0566, - "step": 373 - }, - { - "epoch": 0.1839645843580915, - "grad_norm": 0.6310622245330231, - "learning_rate": 7.48e-05, - "loss": 0.9427, - "step": 374 - }, - { - "epoch": 0.18445646827348747, - "grad_norm": 0.591393808769729, - "learning_rate": 7.500000000000001e-05, - "loss": 0.9367, - "step": 375 - }, - { - "epoch": 0.18494835218888342, - "grad_norm": 0.5983444228918225, - "learning_rate": 7.52e-05, - "loss": 0.8887, - "step": 376 - }, - { - "epoch": 0.1854402361042794, - "grad_norm": 0.6300800008512292, - "learning_rate": 7.54e-05, - "loss": 1.0046, - "step": 377 - }, - { - "epoch": 0.18593212001967535, - "grad_norm": 0.6248792310246031, - "learning_rate": 7.560000000000001e-05, - "loss": 0.9309, - "step": 378 - }, - { - "epoch": 0.18642400393507133, - "grad_norm": 0.6224165965967048, - "learning_rate": 7.58e-05, - "loss": 0.9787, - "step": 379 - }, - { - "epoch": 0.18691588785046728, - "grad_norm": 0.7023975725077084, - "learning_rate": 7.6e-05, - "loss": 1.0844, - "step": 380 - }, - { - "epoch": 0.18740777176586326, - "grad_norm": 0.7741352574238073, - "learning_rate": 7.620000000000001e-05, - "loss": 1.0394, - "step": 381 - }, - { - "epoch": 0.18789965568125921, - "grad_norm": 0.6098810052004804, - "learning_rate": 7.64e-05, - "loss": 0.9726, - "step": 382 - }, - { - "epoch": 0.1883915395966552, - "grad_norm": 0.5818813489381275, - "learning_rate": 7.66e-05, - "loss": 0.9709, - "step": 383 - }, - { - "epoch": 0.18888342351205115, - "grad_norm": 0.6226189074036584, - "learning_rate": 7.680000000000001e-05, - "loss": 0.9544, - "step": 384 - }, - { - "epoch": 0.18937530742744713, - "grad_norm": 0.6356790232097046, - "learning_rate": 7.7e-05, - "loss": 0.9817, - "step": 385 - }, - { - "epoch": 0.18986719134284308, - "grad_norm": 0.671564537895718, - "learning_rate": 7.72e-05, - "loss": 0.9338, - "step": 386 - }, - { - "epoch": 0.19035907525823906, - "grad_norm": 0.664535078510209, - "learning_rate": 7.740000000000001e-05, - "loss": 0.9018, - "step": 387 - }, - { - "epoch": 0.190850959173635, - "grad_norm": 0.6394011220372047, - "learning_rate": 7.76e-05, - "loss": 1.0591, - "step": 388 - }, - { - "epoch": 0.191342843089031, - "grad_norm": 0.5892373379666562, - "learning_rate": 7.780000000000001e-05, - "loss": 0.9627, - "step": 389 - }, - { - "epoch": 0.19183472700442694, - "grad_norm": 0.6377552139327307, - "learning_rate": 7.800000000000001e-05, - "loss": 1.0187, - "step": 390 - }, - { - "epoch": 0.19232661091982292, - "grad_norm": 0.6050739601789275, - "learning_rate": 7.82e-05, - "loss": 0.9378, - "step": 391 - }, - { - "epoch": 0.1928184948352189, - "grad_norm": 0.6674680863375297, - "learning_rate": 7.840000000000001e-05, - "loss": 1.0179, - "step": 392 - }, - { - "epoch": 0.19331037875061485, - "grad_norm": 0.6328515579503435, - "learning_rate": 7.860000000000001e-05, - "loss": 1.0017, - "step": 393 - }, - { - "epoch": 0.19380226266601083, - "grad_norm": 0.6908769012404306, - "learning_rate": 7.88e-05, - "loss": 1.0762, - "step": 394 - }, - { - "epoch": 0.19429414658140678, - "grad_norm": 0.6482324781379857, - "learning_rate": 7.900000000000001e-05, - "loss": 0.9735, - "step": 395 - }, - { - "epoch": 0.19478603049680276, - "grad_norm": 0.6018402507831343, - "learning_rate": 7.920000000000001e-05, - "loss": 0.9507, - "step": 396 - }, - { - "epoch": 0.19527791441219872, - "grad_norm": 0.6019084942068132, - "learning_rate": 7.94e-05, - "loss": 0.9597, - "step": 397 - }, - { - "epoch": 0.1957697983275947, - "grad_norm": 0.6818325977268064, - "learning_rate": 7.960000000000001e-05, - "loss": 0.8991, - "step": 398 - }, - { - "epoch": 0.19626168224299065, - "grad_norm": 0.5914021323075259, - "learning_rate": 7.98e-05, - "loss": 0.9103, - "step": 399 - }, - { - "epoch": 0.19675356615838663, - "grad_norm": 0.6391699256528783, - "learning_rate": 8e-05, - "loss": 0.9233, - "step": 400 - }, - { - "epoch": 0.19724545007378258, - "grad_norm": 0.6336439557662955, - "learning_rate": 8.020000000000001e-05, - "loss": 0.9096, - "step": 401 - }, - { - "epoch": 0.19773733398917856, - "grad_norm": 0.6266381712881892, - "learning_rate": 8.04e-05, - "loss": 0.997, - "step": 402 - }, - { - "epoch": 0.1982292179045745, - "grad_norm": 0.600600584008272, - "learning_rate": 8.060000000000001e-05, - "loss": 0.992, - "step": 403 - }, - { - "epoch": 0.1987211018199705, - "grad_norm": 0.6517734815494437, - "learning_rate": 8.080000000000001e-05, - "loss": 0.9817, - "step": 404 - }, - { - "epoch": 0.19921298573536644, - "grad_norm": 0.6293416217886094, - "learning_rate": 8.1e-05, - "loss": 1.0526, - "step": 405 - }, - { - "epoch": 0.19970486965076242, - "grad_norm": 0.6465733209160941, - "learning_rate": 8.120000000000001e-05, - "loss": 0.976, - "step": 406 - }, - { - "epoch": 0.20019675356615838, - "grad_norm": 0.6485332141795432, - "learning_rate": 8.14e-05, - "loss": 1.0016, - "step": 407 - }, - { - "epoch": 0.20068863748155436, - "grad_norm": 0.5718975648508007, - "learning_rate": 8.16e-05, - "loss": 0.9666, - "step": 408 - }, - { - "epoch": 0.2011805213969503, - "grad_norm": 0.6283887150004627, - "learning_rate": 8.18e-05, - "loss": 0.9837, - "step": 409 - }, - { - "epoch": 0.2016724053123463, - "grad_norm": 0.6345487037460759, - "learning_rate": 8.2e-05, - "loss": 0.981, - "step": 410 - }, - { - "epoch": 0.20216428922774227, - "grad_norm": 0.6248364172555066, - "learning_rate": 8.22e-05, - "loss": 0.9581, - "step": 411 - }, - { - "epoch": 0.20265617314313822, - "grad_norm": 0.6102819030010944, - "learning_rate": 8.24e-05, - "loss": 0.9207, - "step": 412 - }, - { - "epoch": 0.2031480570585342, - "grad_norm": 0.9706049727359666, - "learning_rate": 8.26e-05, - "loss": 0.9608, - "step": 413 - }, - { - "epoch": 0.20363994097393015, - "grad_norm": 0.6044328622763054, - "learning_rate": 8.28e-05, - "loss": 0.9308, - "step": 414 - }, - { - "epoch": 0.20413182488932613, - "grad_norm": 0.639778068725871, - "learning_rate": 8.3e-05, - "loss": 1.0425, - "step": 415 - }, - { - "epoch": 0.20462370880472208, - "grad_norm": 0.6167918577777772, - "learning_rate": 8.32e-05, - "loss": 0.9816, - "step": 416 - }, - { - "epoch": 0.20511559272011806, - "grad_norm": 0.5999991449288878, - "learning_rate": 8.34e-05, - "loss": 0.8474, - "step": 417 - }, - { - "epoch": 0.205607476635514, - "grad_norm": 0.6076168181208658, - "learning_rate": 8.36e-05, - "loss": 0.9624, - "step": 418 - }, - { - "epoch": 0.20609936055091, - "grad_norm": 0.6165342411399956, - "learning_rate": 8.38e-05, - "loss": 0.9529, - "step": 419 - }, - { - "epoch": 0.20659124446630595, - "grad_norm": 0.6132122494477279, - "learning_rate": 8.4e-05, - "loss": 0.989, - "step": 420 - }, - { - "epoch": 0.20708312838170193, - "grad_norm": 0.6027449919133129, - "learning_rate": 8.42e-05, - "loss": 0.9066, - "step": 421 - }, - { - "epoch": 0.20757501229709788, - "grad_norm": 0.5928239968088229, - "learning_rate": 8.44e-05, - "loss": 1.0044, - "step": 422 - }, - { - "epoch": 0.20806689621249386, - "grad_norm": 0.5869899950440051, - "learning_rate": 8.46e-05, - "loss": 0.9863, - "step": 423 - }, - { - "epoch": 0.2085587801278898, - "grad_norm": 0.5766767880122016, - "learning_rate": 8.48e-05, - "loss": 1.0377, - "step": 424 - }, - { - "epoch": 0.2090506640432858, - "grad_norm": 0.6264523722554275, - "learning_rate": 8.5e-05, - "loss": 0.947, - "step": 425 - }, - { - "epoch": 0.20954254795868174, - "grad_norm": 0.6045702512587927, - "learning_rate": 8.52e-05, - "loss": 0.9296, - "step": 426 - }, - { - "epoch": 0.21003443187407772, - "grad_norm": 0.5860636391406551, - "learning_rate": 8.54e-05, - "loss": 0.9624, - "step": 427 - }, - { - "epoch": 0.21052631578947367, - "grad_norm": 0.8151675635611291, - "learning_rate": 8.560000000000001e-05, - "loss": 0.997, - "step": 428 - }, - { - "epoch": 0.21101819970486965, - "grad_norm": 0.6030314739242377, - "learning_rate": 8.58e-05, - "loss": 0.9598, - "step": 429 - }, - { - "epoch": 0.2115100836202656, - "grad_norm": 0.5792174344697232, - "learning_rate": 8.6e-05, - "loss": 1.0389, - "step": 430 - }, - { - "epoch": 0.21200196753566158, - "grad_norm": 0.5684645313201525, - "learning_rate": 8.620000000000001e-05, - "loss": 0.8997, - "step": 431 - }, - { - "epoch": 0.21249385145105756, - "grad_norm": 0.5876143641689083, - "learning_rate": 8.64e-05, - "loss": 1.0035, - "step": 432 - }, - { - "epoch": 0.21298573536645352, - "grad_norm": 0.5872667882912961, - "learning_rate": 8.66e-05, - "loss": 0.9598, - "step": 433 - }, - { - "epoch": 0.2134776192818495, - "grad_norm": 0.637253451970368, - "learning_rate": 8.680000000000001e-05, - "loss": 0.9934, - "step": 434 - }, - { - "epoch": 0.21396950319724545, - "grad_norm": 0.5481379443674183, - "learning_rate": 8.7e-05, - "loss": 0.953, - "step": 435 - }, - { - "epoch": 0.21446138711264143, - "grad_norm": 0.5971331846018932, - "learning_rate": 8.72e-05, - "loss": 0.9386, - "step": 436 - }, - { - "epoch": 0.21495327102803738, - "grad_norm": 0.6307097366754557, - "learning_rate": 8.740000000000001e-05, - "loss": 0.9256, - "step": 437 - }, - { - "epoch": 0.21544515494343336, - "grad_norm": 0.5788073659558398, - "learning_rate": 8.76e-05, - "loss": 0.8838, - "step": 438 - }, - { - "epoch": 0.2159370388588293, - "grad_norm": 0.601923427178012, - "learning_rate": 8.78e-05, - "loss": 0.9056, - "step": 439 - }, - { - "epoch": 0.2164289227742253, - "grad_norm": 0.6370454166547089, - "learning_rate": 8.800000000000001e-05, - "loss": 1.0072, - "step": 440 - }, - { - "epoch": 0.21692080668962124, - "grad_norm": 0.6189606659633952, - "learning_rate": 8.82e-05, - "loss": 0.9057, - "step": 441 - }, - { - "epoch": 0.21741269060501722, - "grad_norm": 0.5936340681748756, - "learning_rate": 8.840000000000001e-05, - "loss": 0.9471, - "step": 442 - }, - { - "epoch": 0.21790457452041317, - "grad_norm": 0.7206104620301254, - "learning_rate": 8.86e-05, - "loss": 1.0461, - "step": 443 - }, - { - "epoch": 0.21839645843580915, - "grad_norm": 0.6075997639143946, - "learning_rate": 8.88e-05, - "loss": 0.9781, - "step": 444 - }, - { - "epoch": 0.2188883423512051, - "grad_norm": 0.6248001399556955, - "learning_rate": 8.900000000000001e-05, - "loss": 0.9315, - "step": 445 - }, - { - "epoch": 0.21938022626660109, - "grad_norm": 0.5813944293621506, - "learning_rate": 8.92e-05, - "loss": 0.9133, - "step": 446 - }, - { - "epoch": 0.21987211018199704, - "grad_norm": 0.636632987875858, - "learning_rate": 8.94e-05, - "loss": 0.9739, - "step": 447 - }, - { - "epoch": 0.22036399409739302, - "grad_norm": 0.590767131415477, - "learning_rate": 8.960000000000001e-05, - "loss": 0.8692, - "step": 448 - }, - { - "epoch": 0.22085587801278897, - "grad_norm": 0.617883557634732, - "learning_rate": 8.98e-05, - "loss": 0.961, - "step": 449 - }, - { - "epoch": 0.22134776192818495, - "grad_norm": 0.5876025309993035, - "learning_rate": 9e-05, - "loss": 0.8839, - "step": 450 - }, - { - "epoch": 0.2218396458435809, - "grad_norm": 0.567536847812842, - "learning_rate": 9.020000000000001e-05, - "loss": 0.9234, - "step": 451 - }, - { - "epoch": 0.22233152975897688, - "grad_norm": 0.6249958799726273, - "learning_rate": 9.04e-05, - "loss": 0.9916, - "step": 452 - }, - { - "epoch": 0.22282341367437286, - "grad_norm": 0.5903837871692948, - "learning_rate": 9.06e-05, - "loss": 0.9584, - "step": 453 - }, - { - "epoch": 0.2233152975897688, - "grad_norm": 0.5975533024049463, - "learning_rate": 9.080000000000001e-05, - "loss": 0.9771, - "step": 454 - }, - { - "epoch": 0.2238071815051648, - "grad_norm": 0.5772160499556087, - "learning_rate": 9.1e-05, - "loss": 1.0129, - "step": 455 - }, - { - "epoch": 0.22429906542056074, - "grad_norm": 0.6639280119847331, - "learning_rate": 9.120000000000001e-05, - "loss": 1.068, - "step": 456 - }, - { - "epoch": 0.22479094933595672, - "grad_norm": 0.5642570348460942, - "learning_rate": 9.140000000000001e-05, - "loss": 0.9018, - "step": 457 - }, - { - "epoch": 0.22528283325135268, - "grad_norm": 0.6103778845508339, - "learning_rate": 9.16e-05, - "loss": 0.932, - "step": 458 - }, - { - "epoch": 0.22577471716674866, - "grad_norm": 0.6456272560340363, - "learning_rate": 9.180000000000001e-05, - "loss": 1.0029, - "step": 459 - }, - { - "epoch": 0.2262666010821446, - "grad_norm": 0.5961434170382809, - "learning_rate": 9.200000000000001e-05, - "loss": 0.9617, - "step": 460 - }, - { - "epoch": 0.2267584849975406, - "grad_norm": 0.6314598969589529, - "learning_rate": 9.22e-05, - "loss": 0.9515, - "step": 461 - }, - { - "epoch": 0.22725036891293654, - "grad_norm": 0.5602422225660095, - "learning_rate": 9.240000000000001e-05, - "loss": 0.9644, - "step": 462 - }, - { - "epoch": 0.22774225282833252, - "grad_norm": 0.5977278837906834, - "learning_rate": 9.260000000000001e-05, - "loss": 0.8666, - "step": 463 - }, - { - "epoch": 0.22823413674372847, - "grad_norm": 0.5733762249981974, - "learning_rate": 9.28e-05, - "loss": 0.9176, - "step": 464 - }, - { - "epoch": 0.22872602065912445, - "grad_norm": 0.6065979413259693, - "learning_rate": 9.300000000000001e-05, - "loss": 0.9304, - "step": 465 - }, - { - "epoch": 0.2292179045745204, - "grad_norm": 0.6682669750658685, - "learning_rate": 9.320000000000002e-05, - "loss": 1.0568, - "step": 466 - }, - { - "epoch": 0.22970978848991638, - "grad_norm": 0.6114510975349654, - "learning_rate": 9.340000000000001e-05, - "loss": 0.9325, - "step": 467 - }, - { - "epoch": 0.23020167240531234, - "grad_norm": 0.9915938410046113, - "learning_rate": 9.360000000000001e-05, - "loss": 0.9733, - "step": 468 - }, - { - "epoch": 0.23069355632070832, - "grad_norm": 0.5858992558347075, - "learning_rate": 9.38e-05, - "loss": 0.9279, - "step": 469 - }, - { - "epoch": 0.23118544023610427, - "grad_norm": 0.5521035516591665, - "learning_rate": 9.4e-05, - "loss": 0.9738, - "step": 470 - }, - { - "epoch": 0.23167732415150025, - "grad_norm": 0.5754273126337525, - "learning_rate": 9.42e-05, - "loss": 0.9224, - "step": 471 - }, - { - "epoch": 0.2321692080668962, - "grad_norm": 0.6288424898995382, - "learning_rate": 9.44e-05, - "loss": 0.934, - "step": 472 - }, - { - "epoch": 0.23266109198229218, - "grad_norm": 0.5930926280595387, - "learning_rate": 9.46e-05, - "loss": 0.9048, - "step": 473 - }, - { - "epoch": 0.23315297589768816, - "grad_norm": 1.0573852034903584, - "learning_rate": 9.48e-05, - "loss": 0.9652, - "step": 474 - }, - { - "epoch": 0.2336448598130841, - "grad_norm": 0.7957217544718668, - "learning_rate": 9.5e-05, - "loss": 1.0241, - "step": 475 - }, - { - "epoch": 0.2341367437284801, - "grad_norm": 0.6637946045342709, - "learning_rate": 9.52e-05, - "loss": 1.0212, - "step": 476 - }, - { - "epoch": 0.23462862764387604, - "grad_norm": 0.6091148671336437, - "learning_rate": 9.54e-05, - "loss": 1.0706, - "step": 477 - }, - { - "epoch": 0.23512051155927202, - "grad_norm": 0.5928870673422334, - "learning_rate": 9.56e-05, - "loss": 1.024, - "step": 478 - }, - { - "epoch": 0.23561239547466797, - "grad_norm": 0.6328201225197264, - "learning_rate": 9.58e-05, - "loss": 1.004, - "step": 479 - }, - { - "epoch": 0.23610427939006395, - "grad_norm": 1.3909007846120647, - "learning_rate": 9.6e-05, - "loss": 1.0172, - "step": 480 - }, - { - "epoch": 0.2365961633054599, - "grad_norm": 0.6446533249348028, - "learning_rate": 9.620000000000001e-05, - "loss": 0.9927, - "step": 481 - }, - { - "epoch": 0.23708804722085589, - "grad_norm": 0.7692567910839619, - "learning_rate": 9.64e-05, - "loss": 0.9427, - "step": 482 - }, - { - "epoch": 0.23757993113625184, - "grad_norm": 0.6093984042747421, - "learning_rate": 9.66e-05, - "loss": 0.9684, - "step": 483 - }, - { - "epoch": 0.23807181505164782, - "grad_norm": 1.0005629998863774, - "learning_rate": 9.680000000000001e-05, - "loss": 0.9163, - "step": 484 - }, - { - "epoch": 0.23856369896704377, - "grad_norm": 2.1010922008411193, - "learning_rate": 9.7e-05, - "loss": 1.0023, - "step": 485 - }, - { - "epoch": 0.23905558288243975, - "grad_norm": 0.6795740877841306, - "learning_rate": 9.72e-05, - "loss": 0.9061, - "step": 486 - }, - { - "epoch": 0.2395474667978357, - "grad_norm": 1.3821862397613611, - "learning_rate": 9.74e-05, - "loss": 1.0303, - "step": 487 - }, - { - "epoch": 0.24003935071323168, - "grad_norm": 0.5980079275840284, - "learning_rate": 9.76e-05, - "loss": 0.8701, - "step": 488 - }, - { - "epoch": 0.24053123462862763, - "grad_norm": 0.6332421610937169, - "learning_rate": 9.78e-05, - "loss": 0.9758, - "step": 489 - }, - { - "epoch": 0.2410231185440236, - "grad_norm": 0.5999201682022608, - "learning_rate": 9.8e-05, - "loss": 0.9127, - "step": 490 - }, - { - "epoch": 0.24151500245941956, - "grad_norm": 1.2712794830931637, - "learning_rate": 9.82e-05, - "loss": 1.062, - "step": 491 - }, - { - "epoch": 0.24200688637481554, - "grad_norm": 0.6504036380794753, - "learning_rate": 9.84e-05, - "loss": 0.9864, - "step": 492 - }, - { - "epoch": 0.24249877029021152, - "grad_norm": 0.5651148499618845, - "learning_rate": 9.86e-05, - "loss": 0.9324, - "step": 493 - }, - { - "epoch": 0.24299065420560748, - "grad_norm": 0.5793954144082308, - "learning_rate": 9.88e-05, - "loss": 1.0368, - "step": 494 - }, - { - "epoch": 0.24348253812100346, - "grad_norm": 0.6124134428526112, - "learning_rate": 9.900000000000001e-05, - "loss": 0.9498, - "step": 495 - }, - { - "epoch": 0.2439744220363994, - "grad_norm": 0.5744591301399047, - "learning_rate": 9.92e-05, - "loss": 0.9139, - "step": 496 - }, - { - "epoch": 0.2444663059517954, - "grad_norm": 0.7292901522520773, - "learning_rate": 9.94e-05, - "loss": 0.96, - "step": 497 - }, - { - "epoch": 0.24495818986719134, - "grad_norm": 0.6198598397505529, - "learning_rate": 9.960000000000001e-05, - "loss": 0.9709, - "step": 498 - }, - { - "epoch": 0.24545007378258732, - "grad_norm": 0.7475634835580965, - "learning_rate": 9.98e-05, - "loss": 0.9326, - "step": 499 - }, - { - "epoch": 0.24594195769798327, - "grad_norm": 0.576130778925058, - "learning_rate": 0.0001, - "loss": 0.9667, - "step": 500 - }, - { - "epoch": 0.24643384161337925, - "grad_norm": 1.6126086957373478, - "learning_rate": 0.00010020000000000001, - "loss": 0.9386, - "step": 501 - }, - { - "epoch": 0.2469257255287752, - "grad_norm": 0.6396943451122885, - "learning_rate": 0.0001004, - "loss": 0.9154, - "step": 502 - }, - { - "epoch": 0.24741760944417118, - "grad_norm": 0.593136779240249, - "learning_rate": 0.0001006, - "loss": 0.9406, - "step": 503 - }, - { - "epoch": 0.24790949335956713, - "grad_norm": 0.5942229056137495, - "learning_rate": 0.00010080000000000001, - "loss": 1.0724, - "step": 504 - }, - { - "epoch": 0.24840137727496311, - "grad_norm": 0.5619219133034947, - "learning_rate": 0.000101, - "loss": 1.0055, - "step": 505 - }, - { - "epoch": 0.24889326119035907, - "grad_norm": 0.6672551418770766, - "learning_rate": 0.00010120000000000001, - "loss": 1.0772, - "step": 506 - }, - { - "epoch": 0.24938514510575505, - "grad_norm": 0.5996092644056796, - "learning_rate": 0.00010140000000000001, - "loss": 0.9409, - "step": 507 - }, - { - "epoch": 0.249877029021151, - "grad_norm": 0.5923281147260792, - "learning_rate": 0.0001016, - "loss": 0.9666, - "step": 508 - }, - { - "epoch": 0.25036891293654695, - "grad_norm": 0.636100143332497, - "learning_rate": 0.00010180000000000001, - "loss": 0.9769, - "step": 509 - }, - { - "epoch": 0.25086079685194296, - "grad_norm": 0.6200372518917829, - "learning_rate": 0.00010200000000000001, - "loss": 1.0639, - "step": 510 - }, - { - "epoch": 0.2513526807673389, - "grad_norm": 0.6502389809828354, - "learning_rate": 0.0001022, - "loss": 0.8989, - "step": 511 - }, - { - "epoch": 0.25184456468273486, - "grad_norm": 0.6335856938392092, - "learning_rate": 0.00010240000000000001, - "loss": 1.0302, - "step": 512 - }, - { - "epoch": 0.2523364485981308, - "grad_norm": 0.6054997347827139, - "learning_rate": 0.00010260000000000001, - "loss": 0.9907, - "step": 513 - }, - { - "epoch": 0.2528283325135268, - "grad_norm": 0.570723575147577, - "learning_rate": 0.0001028, - "loss": 0.9767, - "step": 514 - }, - { - "epoch": 0.2533202164289228, - "grad_norm": 0.6220939076031705, - "learning_rate": 0.00010300000000000001, - "loss": 0.9529, - "step": 515 - }, - { - "epoch": 0.2538121003443187, - "grad_norm": 0.6155466649132726, - "learning_rate": 0.0001032, - "loss": 1.0067, - "step": 516 - }, - { - "epoch": 0.25430398425971473, - "grad_norm": 0.8726025671843395, - "learning_rate": 0.0001034, - "loss": 0.9961, - "step": 517 - }, - { - "epoch": 0.2547958681751107, - "grad_norm": 0.6620076482195109, - "learning_rate": 0.00010360000000000001, - "loss": 0.9331, - "step": 518 - }, - { - "epoch": 0.25528775209050664, - "grad_norm": 1.2595302428131354, - "learning_rate": 0.0001038, - "loss": 0.8981, - "step": 519 - }, - { - "epoch": 0.2557796360059026, - "grad_norm": 0.6458264834672185, - "learning_rate": 0.00010400000000000001, - "loss": 0.9958, - "step": 520 - }, - { - "epoch": 0.2562715199212986, - "grad_norm": 0.7635091611089441, - "learning_rate": 0.00010420000000000001, - "loss": 0.9467, - "step": 521 - }, - { - "epoch": 0.25676340383669455, - "grad_norm": 0.714982120373078, - "learning_rate": 0.0001044, - "loss": 0.9966, - "step": 522 - }, - { - "epoch": 0.2572552877520905, - "grad_norm": 0.6425717367298969, - "learning_rate": 0.00010460000000000001, - "loss": 0.913, - "step": 523 - }, - { - "epoch": 0.25774717166748645, - "grad_norm": 0.5721649715746685, - "learning_rate": 0.00010480000000000001, - "loss": 1.0209, - "step": 524 - }, - { - "epoch": 0.25823905558288246, - "grad_norm": 0.7066309254383983, - "learning_rate": 0.000105, - "loss": 1.0178, - "step": 525 - }, - { - "epoch": 0.2587309394982784, - "grad_norm": 0.602685793418877, - "learning_rate": 0.00010520000000000001, - "loss": 0.9892, - "step": 526 - }, - { - "epoch": 0.25922282341367436, - "grad_norm": 0.6250830114089143, - "learning_rate": 0.00010540000000000001, - "loss": 1.038, - "step": 527 - }, - { - "epoch": 0.2597147073290703, - "grad_norm": 0.5996294671164073, - "learning_rate": 0.0001056, - "loss": 0.9697, - "step": 528 - }, - { - "epoch": 0.2602065912444663, - "grad_norm": 0.5670041707916343, - "learning_rate": 0.00010580000000000001, - "loss": 0.8589, - "step": 529 - }, - { - "epoch": 0.2606984751598623, - "grad_norm": 0.5992300105808054, - "learning_rate": 0.00010600000000000002, - "loss": 0.9784, - "step": 530 - }, - { - "epoch": 0.2611903590752582, - "grad_norm": 0.5734369344300806, - "learning_rate": 0.0001062, - "loss": 0.9789, - "step": 531 - }, - { - "epoch": 0.2616822429906542, - "grad_norm": 0.6202809724939895, - "learning_rate": 0.00010640000000000001, - "loss": 0.9404, - "step": 532 - }, - { - "epoch": 0.2621741269060502, - "grad_norm": 0.5708965613496635, - "learning_rate": 0.00010660000000000002, - "loss": 0.9989, - "step": 533 - }, - { - "epoch": 0.26266601082144614, - "grad_norm": 0.6050720664565258, - "learning_rate": 0.00010680000000000001, - "loss": 1.0119, - "step": 534 - }, - { - "epoch": 0.2631578947368421, - "grad_norm": 0.6503025204290864, - "learning_rate": 0.00010700000000000001, - "loss": 0.8838, - "step": 535 - }, - { - "epoch": 0.2636497786522381, - "grad_norm": 0.6003814831427035, - "learning_rate": 0.00010720000000000002, - "loss": 0.9797, - "step": 536 - }, - { - "epoch": 0.26414166256763405, - "grad_norm": 0.5700604678362049, - "learning_rate": 0.00010740000000000001, - "loss": 0.9529, - "step": 537 - }, - { - "epoch": 0.26463354648303, - "grad_norm": 0.6286797156268352, - "learning_rate": 0.00010760000000000001, - "loss": 1.0507, - "step": 538 - }, - { - "epoch": 0.26512543039842595, - "grad_norm": 0.5648484304652618, - "learning_rate": 0.00010780000000000002, - "loss": 0.9681, - "step": 539 - }, - { - "epoch": 0.26561731431382196, - "grad_norm": 0.6284114762154561, - "learning_rate": 0.00010800000000000001, - "loss": 0.9653, - "step": 540 - }, - { - "epoch": 0.2661091982292179, - "grad_norm": 0.5569323701062922, - "learning_rate": 0.00010820000000000001, - "loss": 0.9292, - "step": 541 - }, - { - "epoch": 0.26660108214461387, - "grad_norm": 0.5887770254355695, - "learning_rate": 0.00010840000000000002, - "loss": 0.9844, - "step": 542 - }, - { - "epoch": 0.2670929660600098, - "grad_norm": 0.6248044039199642, - "learning_rate": 0.00010860000000000001, - "loss": 0.8658, - "step": 543 - }, - { - "epoch": 0.2675848499754058, - "grad_norm": 0.662119932700776, - "learning_rate": 0.00010880000000000002, - "loss": 1.0137, - "step": 544 - }, - { - "epoch": 0.2680767338908018, - "grad_norm": 1.262688816523999, - "learning_rate": 0.000109, - "loss": 0.8925, - "step": 545 - }, - { - "epoch": 0.26856861780619773, - "grad_norm": 0.667479850878831, - "learning_rate": 0.00010920000000000001, - "loss": 1.0127, - "step": 546 - }, - { - "epoch": 0.2690605017215937, - "grad_norm": 0.5988917410592548, - "learning_rate": 0.00010940000000000002, - "loss": 0.9813, - "step": 547 - }, - { - "epoch": 0.2695523856369897, - "grad_norm": 0.646787588104879, - "learning_rate": 0.00010960000000000001, - "loss": 0.968, - "step": 548 - }, - { - "epoch": 0.27004426955238564, - "grad_norm": 0.5934709955522586, - "learning_rate": 0.00010980000000000001, - "loss": 0.9106, - "step": 549 - }, - { - "epoch": 0.2705361534677816, - "grad_norm": 0.6890356002678032, - "learning_rate": 0.00011000000000000002, - "loss": 0.896, - "step": 550 - }, - { - "epoch": 0.27102803738317754, - "grad_norm": 0.635952440140494, - "learning_rate": 0.00011020000000000001, - "loss": 1.0253, - "step": 551 - }, - { - "epoch": 0.27151992129857355, - "grad_norm": 0.5480143941867203, - "learning_rate": 0.00011040000000000001, - "loss": 0.9053, - "step": 552 - }, - { - "epoch": 0.2720118052139695, - "grad_norm": 0.5458926973694216, - "learning_rate": 0.00011060000000000002, - "loss": 0.9718, - "step": 553 - }, - { - "epoch": 0.27250368912936546, - "grad_norm": 0.5891788945755541, - "learning_rate": 0.00011080000000000001, - "loss": 0.937, - "step": 554 - }, - { - "epoch": 0.27299557304476146, - "grad_norm": 0.6604709314184619, - "learning_rate": 0.00011100000000000001, - "loss": 0.941, - "step": 555 - }, - { - "epoch": 0.2734874569601574, - "grad_norm": 0.6087029346118886, - "learning_rate": 0.00011120000000000002, - "loss": 1.0306, - "step": 556 - }, - { - "epoch": 0.27397934087555337, - "grad_norm": 0.5801967623798765, - "learning_rate": 0.00011140000000000001, - "loss": 0.9579, - "step": 557 - }, - { - "epoch": 0.2744712247909493, - "grad_norm": 1.7686976341139045, - "learning_rate": 0.00011160000000000002, - "loss": 0.9981, - "step": 558 - }, - { - "epoch": 0.2749631087063453, - "grad_norm": 0.606144888287221, - "learning_rate": 0.00011180000000000002, - "loss": 0.9892, - "step": 559 - }, - { - "epoch": 0.2754549926217413, - "grad_norm": 0.5641112522030708, - "learning_rate": 0.00011200000000000001, - "loss": 0.9197, - "step": 560 - }, - { - "epoch": 0.27594687653713723, - "grad_norm": 0.5348434591364023, - "learning_rate": 0.00011220000000000002, - "loss": 0.8833, - "step": 561 - }, - { - "epoch": 0.2764387604525332, - "grad_norm": 0.5621195777422913, - "learning_rate": 0.00011240000000000002, - "loss": 0.9392, - "step": 562 - }, - { - "epoch": 0.2769306443679292, - "grad_norm": 0.6183412922022422, - "learning_rate": 0.0001126, - "loss": 0.9989, - "step": 563 - }, - { - "epoch": 0.27742252828332514, - "grad_norm": 1.566745653338309, - "learning_rate": 0.00011279999999999999, - "loss": 0.9692, - "step": 564 - }, - { - "epoch": 0.2779144121987211, - "grad_norm": 0.5884266700151978, - "learning_rate": 0.000113, - "loss": 0.996, - "step": 565 - }, - { - "epoch": 0.27840629611411705, - "grad_norm": 0.5941883832990468, - "learning_rate": 0.0001132, - "loss": 0.8867, - "step": 566 - }, - { - "epoch": 0.27889818002951305, - "grad_norm": 0.5663337462783874, - "learning_rate": 0.00011339999999999999, - "loss": 0.9749, - "step": 567 - }, - { - "epoch": 0.279390063944909, - "grad_norm": 0.7083122369348663, - "learning_rate": 0.0001136, - "loss": 0.983, - "step": 568 - }, - { - "epoch": 0.27988194786030496, - "grad_norm": 0.6082393826155303, - "learning_rate": 0.0001138, - "loss": 0.996, - "step": 569 - }, - { - "epoch": 0.2803738317757009, - "grad_norm": 0.5778035016558593, - "learning_rate": 0.00011399999999999999, - "loss": 0.9854, - "step": 570 - }, - { - "epoch": 0.2808657156910969, - "grad_norm": 0.5871357641220516, - "learning_rate": 0.0001142, - "loss": 0.9928, - "step": 571 - }, - { - "epoch": 0.28135759960649287, - "grad_norm": 0.5950779514570566, - "learning_rate": 0.0001144, - "loss": 0.9732, - "step": 572 - }, - { - "epoch": 0.2818494835218888, - "grad_norm": 0.6282290640040025, - "learning_rate": 0.0001146, - "loss": 0.9407, - "step": 573 - }, - { - "epoch": 0.2823413674372848, - "grad_norm": 0.6091611582888863, - "learning_rate": 0.0001148, - "loss": 0.9945, - "step": 574 - }, - { - "epoch": 0.2828332513526808, - "grad_norm": 0.5445033036319282, - "learning_rate": 0.00011499999999999999, - "loss": 0.9237, - "step": 575 - }, - { - "epoch": 0.28332513526807673, - "grad_norm": 0.6028655740862312, - "learning_rate": 0.0001152, - "loss": 1.0068, - "step": 576 - }, - { - "epoch": 0.2838170191834727, - "grad_norm": 0.6450170796685454, - "learning_rate": 0.0001154, - "loss": 0.9833, - "step": 577 - }, - { - "epoch": 0.2843089030988687, - "grad_norm": 0.5774276129265211, - "learning_rate": 0.00011559999999999999, - "loss": 0.9708, - "step": 578 - }, - { - "epoch": 0.28480078701426464, - "grad_norm": 0.5252295207085985, - "learning_rate": 0.0001158, - "loss": 0.9146, - "step": 579 - }, - { - "epoch": 0.2852926709296606, - "grad_norm": 0.5846219170257404, - "learning_rate": 0.000116, - "loss": 0.9593, - "step": 580 - }, - { - "epoch": 0.28578455484505655, - "grad_norm": 0.6406247034473914, - "learning_rate": 0.00011619999999999999, - "loss": 0.9634, - "step": 581 - }, - { - "epoch": 0.28627643876045256, - "grad_norm": 0.5407092545690616, - "learning_rate": 0.0001164, - "loss": 0.9844, - "step": 582 - }, - { - "epoch": 0.2867683226758485, - "grad_norm": 0.5405736655392542, - "learning_rate": 0.0001166, - "loss": 0.885, - "step": 583 - }, - { - "epoch": 0.28726020659124446, - "grad_norm": 0.5635996908391186, - "learning_rate": 0.00011679999999999999, - "loss": 0.9904, - "step": 584 - }, - { - "epoch": 0.2877520905066404, - "grad_norm": 0.5386855826499043, - "learning_rate": 0.000117, - "loss": 0.9462, - "step": 585 - }, - { - "epoch": 0.2882439744220364, - "grad_norm": 0.5625326433463138, - "learning_rate": 0.0001172, - "loss": 0.9852, - "step": 586 - }, - { - "epoch": 0.28873585833743237, - "grad_norm": 0.5754031622934517, - "learning_rate": 0.0001174, - "loss": 0.9888, - "step": 587 - }, - { - "epoch": 0.2892277422528283, - "grad_norm": 0.5602175216543137, - "learning_rate": 0.0001176, - "loss": 1.0083, - "step": 588 - }, - { - "epoch": 0.2897196261682243, - "grad_norm": 0.6661953671814983, - "learning_rate": 0.0001178, - "loss": 0.9044, - "step": 589 - }, - { - "epoch": 0.2902115100836203, - "grad_norm": 0.5607320927918982, - "learning_rate": 0.000118, - "loss": 1.0375, - "step": 590 - }, - { - "epoch": 0.29070339399901624, - "grad_norm": 0.6091870680391808, - "learning_rate": 0.0001182, - "loss": 0.969, - "step": 591 - }, - { - "epoch": 0.2911952779144122, - "grad_norm": 0.6194195534546384, - "learning_rate": 0.0001184, - "loss": 0.9679, - "step": 592 - }, - { - "epoch": 0.29168716182980814, - "grad_norm": 0.5379078512348062, - "learning_rate": 0.0001186, - "loss": 0.9136, - "step": 593 - }, - { - "epoch": 0.29217904574520415, - "grad_norm": 0.5881165054322028, - "learning_rate": 0.0001188, - "loss": 0.8879, - "step": 594 - }, - { - "epoch": 0.2926709296606001, - "grad_norm": 0.5693384812772315, - "learning_rate": 0.000119, - "loss": 0.9097, - "step": 595 - }, - { - "epoch": 0.29316281357599605, - "grad_norm": 0.6220722328965506, - "learning_rate": 0.0001192, - "loss": 1.0315, - "step": 596 - }, - { - "epoch": 0.29365469749139206, - "grad_norm": 0.5885920602717346, - "learning_rate": 0.0001194, - "loss": 1.0039, - "step": 597 - }, - { - "epoch": 0.294146581406788, - "grad_norm": 0.5342119882180294, - "learning_rate": 0.00011960000000000001, - "loss": 0.9946, - "step": 598 - }, - { - "epoch": 0.29463846532218396, - "grad_norm": 0.5578159619400491, - "learning_rate": 0.0001198, - "loss": 0.975, - "step": 599 - }, - { - "epoch": 0.2951303492375799, - "grad_norm": 0.5673528225237494, - "learning_rate": 0.00012, - "loss": 0.8774, - "step": 600 - }, - { - "epoch": 0.2956222331529759, - "grad_norm": 0.6363250119030278, - "learning_rate": 0.00012020000000000001, - "loss": 0.9999, - "step": 601 - }, - { - "epoch": 0.2961141170683719, - "grad_norm": 0.5701824144888316, - "learning_rate": 0.0001204, - "loss": 1.0054, - "step": 602 - }, - { - "epoch": 0.2966060009837678, - "grad_norm": 0.5925396106953534, - "learning_rate": 0.0001206, - "loss": 0.9801, - "step": 603 - }, - { - "epoch": 0.2970978848991638, - "grad_norm": 0.8132860321261537, - "learning_rate": 0.0001208, - "loss": 0.9329, - "step": 604 - }, - { - "epoch": 0.2975897688145598, - "grad_norm": 0.5979623554943858, - "learning_rate": 0.000121, - "loss": 0.9376, - "step": 605 - }, - { - "epoch": 0.29808165272995574, - "grad_norm": 0.5537701303285555, - "learning_rate": 0.0001212, - "loss": 0.8985, - "step": 606 - }, - { - "epoch": 0.2985735366453517, - "grad_norm": 1.1934420587099575, - "learning_rate": 0.0001214, - "loss": 1.0415, - "step": 607 - }, - { - "epoch": 0.29906542056074764, - "grad_norm": 0.6390295391691274, - "learning_rate": 0.0001216, - "loss": 1.0092, - "step": 608 - }, - { - "epoch": 0.29955730447614365, - "grad_norm": 0.5632358051191003, - "learning_rate": 0.0001218, - "loss": 0.9522, - "step": 609 - }, - { - "epoch": 0.3000491883915396, - "grad_norm": 0.5846962460493994, - "learning_rate": 0.000122, - "loss": 0.9943, - "step": 610 - }, - { - "epoch": 0.30054107230693555, - "grad_norm": 0.720362221624888, - "learning_rate": 0.00012220000000000002, - "loss": 1.0228, - "step": 611 - }, - { - "epoch": 0.3010329562223315, - "grad_norm": 0.6473326791386469, - "learning_rate": 0.0001224, - "loss": 0.9099, - "step": 612 - }, - { - "epoch": 0.3015248401377275, - "grad_norm": 0.6463604247118291, - "learning_rate": 0.0001226, - "loss": 0.9361, - "step": 613 - }, - { - "epoch": 0.30201672405312346, - "grad_norm": 0.6032454455578362, - "learning_rate": 0.0001228, - "loss": 0.9172, - "step": 614 - }, - { - "epoch": 0.3025086079685194, - "grad_norm": 0.5975245447925146, - "learning_rate": 0.000123, - "loss": 0.9364, - "step": 615 - }, - { - "epoch": 0.30300049188391537, - "grad_norm": 0.6286030008050292, - "learning_rate": 0.0001232, - "loss": 0.9243, - "step": 616 - }, - { - "epoch": 0.3034923757993114, - "grad_norm": 0.7795705212149114, - "learning_rate": 0.00012340000000000002, - "loss": 0.8647, - "step": 617 - }, - { - "epoch": 0.30398425971470733, - "grad_norm": 0.6254254168046154, - "learning_rate": 0.0001236, - "loss": 0.9356, - "step": 618 - }, - { - "epoch": 0.3044761436301033, - "grad_norm": 0.5790348057840939, - "learning_rate": 0.0001238, - "loss": 0.9544, - "step": 619 - }, - { - "epoch": 0.3049680275454993, - "grad_norm": 0.587364870786678, - "learning_rate": 0.000124, - "loss": 0.9516, - "step": 620 - }, - { - "epoch": 0.30545991146089524, - "grad_norm": 0.5954110385983872, - "learning_rate": 0.0001242, - "loss": 1.0545, - "step": 621 - }, - { - "epoch": 0.3059517953762912, - "grad_norm": 0.5733986106469642, - "learning_rate": 0.00012440000000000002, - "loss": 0.9221, - "step": 622 - }, - { - "epoch": 0.30644367929168714, - "grad_norm": 0.6133244232392819, - "learning_rate": 0.0001246, - "loss": 0.9729, - "step": 623 - }, - { - "epoch": 0.30693556320708315, - "grad_norm": 0.6156606416669301, - "learning_rate": 0.0001248, - "loss": 1.077, - "step": 624 - }, - { - "epoch": 0.3074274471224791, - "grad_norm": 0.7156742684026377, - "learning_rate": 0.000125, - "loss": 0.9544, - "step": 625 - }, - { - "epoch": 0.30791933103787505, - "grad_norm": 0.5891986448979375, - "learning_rate": 0.0001252, - "loss": 0.9714, - "step": 626 - }, - { - "epoch": 0.308411214953271, - "grad_norm": 0.6154447166478924, - "learning_rate": 0.0001254, - "loss": 1.0105, - "step": 627 - }, - { - "epoch": 0.308903098868667, - "grad_norm": 0.5712629776576733, - "learning_rate": 0.00012560000000000002, - "loss": 0.9743, - "step": 628 - }, - { - "epoch": 0.30939498278406297, - "grad_norm": 0.603402460788961, - "learning_rate": 0.0001258, - "loss": 0.9682, - "step": 629 - }, - { - "epoch": 0.3098868666994589, - "grad_norm": 0.7361243548316179, - "learning_rate": 0.000126, - "loss": 0.9361, - "step": 630 - }, - { - "epoch": 0.31037875061485487, - "grad_norm": 0.5982206894600505, - "learning_rate": 0.0001262, - "loss": 0.9961, - "step": 631 - }, - { - "epoch": 0.3108706345302509, - "grad_norm": 0.5686996284709666, - "learning_rate": 0.0001264, - "loss": 1.0046, - "step": 632 - }, - { - "epoch": 0.31136251844564683, - "grad_norm": 0.6957390022948259, - "learning_rate": 0.00012660000000000001, - "loss": 1.1283, - "step": 633 - }, - { - "epoch": 0.3118544023610428, - "grad_norm": 0.5659975015734552, - "learning_rate": 0.00012680000000000002, - "loss": 1.0171, - "step": 634 - }, - { - "epoch": 0.31234628627643873, - "grad_norm": 0.6356091656285305, - "learning_rate": 0.000127, - "loss": 1.0067, - "step": 635 - }, - { - "epoch": 0.31283817019183474, - "grad_norm": 0.5765472435552592, - "learning_rate": 0.0001272, - "loss": 0.9492, - "step": 636 - }, - { - "epoch": 0.3133300541072307, - "grad_norm": 0.593052903331575, - "learning_rate": 0.0001274, - "loss": 0.9961, - "step": 637 - }, - { - "epoch": 0.31382193802262665, - "grad_norm": 0.5737673808109037, - "learning_rate": 0.0001276, - "loss": 0.8553, - "step": 638 - }, - { - "epoch": 0.31431382193802265, - "grad_norm": 0.583876297288899, - "learning_rate": 0.00012780000000000002, - "loss": 0.9115, - "step": 639 - }, - { - "epoch": 0.3148057058534186, - "grad_norm": 0.5507556107242984, - "learning_rate": 0.00012800000000000002, - "loss": 0.9111, - "step": 640 - }, - { - "epoch": 0.31529758976881456, - "grad_norm": 0.5657350366364802, - "learning_rate": 0.0001282, - "loss": 0.947, - "step": 641 - }, - { - "epoch": 0.3157894736842105, - "grad_norm": 0.5809533663037738, - "learning_rate": 0.0001284, - "loss": 0.9645, - "step": 642 - }, - { - "epoch": 0.3162813575996065, - "grad_norm": 0.5346005995438732, - "learning_rate": 0.0001286, - "loss": 0.916, - "step": 643 - }, - { - "epoch": 0.31677324151500247, - "grad_norm": 0.6486732450104193, - "learning_rate": 0.00012880000000000001, - "loss": 0.9752, - "step": 644 - }, - { - "epoch": 0.3172651254303984, - "grad_norm": 0.6276238792109974, - "learning_rate": 0.00012900000000000002, - "loss": 0.959, - "step": 645 - }, - { - "epoch": 0.3177570093457944, - "grad_norm": 0.5786657577934453, - "learning_rate": 0.00012920000000000002, - "loss": 0.9983, - "step": 646 - }, - { - "epoch": 0.3182488932611904, - "grad_norm": 0.600133902239411, - "learning_rate": 0.0001294, - "loss": 1.1194, - "step": 647 - }, - { - "epoch": 0.31874077717658633, - "grad_norm": 0.5756013538106385, - "learning_rate": 0.0001296, - "loss": 0.9643, - "step": 648 - }, - { - "epoch": 0.3192326610919823, - "grad_norm": 0.5724358955757081, - "learning_rate": 0.0001298, - "loss": 0.9263, - "step": 649 - }, - { - "epoch": 0.31972454500737824, - "grad_norm": 0.5865977260259286, - "learning_rate": 0.00013000000000000002, - "loss": 1.0196, - "step": 650 - }, - { - "epoch": 0.32021642892277424, - "grad_norm": 0.5509247698582813, - "learning_rate": 0.00013020000000000002, - "loss": 0.9469, - "step": 651 - }, - { - "epoch": 0.3207083128381702, - "grad_norm": 0.5375325028727678, - "learning_rate": 0.0001304, - "loss": 0.9322, - "step": 652 - }, - { - "epoch": 0.32120019675356615, - "grad_norm": 0.549825036834768, - "learning_rate": 0.0001306, - "loss": 0.9052, - "step": 653 - }, - { - "epoch": 0.3216920806689621, - "grad_norm": 0.7103188525342564, - "learning_rate": 0.0001308, - "loss": 1.0165, - "step": 654 - }, - { - "epoch": 0.3221839645843581, - "grad_norm": 0.5398051571072059, - "learning_rate": 0.000131, - "loss": 0.9455, - "step": 655 - }, - { - "epoch": 0.32267584849975406, - "grad_norm": 0.5815471566735355, - "learning_rate": 0.00013120000000000002, - "loss": 0.9447, - "step": 656 - }, - { - "epoch": 0.32316773241515, - "grad_norm": 0.5417798807684112, - "learning_rate": 0.00013140000000000002, - "loss": 0.9406, - "step": 657 - }, - { - "epoch": 0.323659616330546, - "grad_norm": 0.5301739122946627, - "learning_rate": 0.0001316, - "loss": 0.9318, - "step": 658 - }, - { - "epoch": 0.32415150024594197, - "grad_norm": 0.5534740199402641, - "learning_rate": 0.0001318, - "loss": 0.964, - "step": 659 - }, - { - "epoch": 0.3246433841613379, - "grad_norm": 0.5654722268298933, - "learning_rate": 0.000132, - "loss": 0.9994, - "step": 660 - }, - { - "epoch": 0.3251352680767339, - "grad_norm": 0.5967544855156218, - "learning_rate": 0.00013220000000000001, - "loss": 1.0159, - "step": 661 - }, - { - "epoch": 0.3256271519921299, - "grad_norm": 0.6466246670710497, - "learning_rate": 0.00013240000000000002, - "loss": 0.9938, - "step": 662 - }, - { - "epoch": 0.32611903590752583, - "grad_norm": 0.5539070788172011, - "learning_rate": 0.00013260000000000002, - "loss": 0.9236, - "step": 663 - }, - { - "epoch": 0.3266109198229218, - "grad_norm": 0.5357911496242144, - "learning_rate": 0.0001328, - "loss": 0.8679, - "step": 664 - }, - { - "epoch": 0.32710280373831774, - "grad_norm": 0.5757556362805178, - "learning_rate": 0.000133, - "loss": 0.9713, - "step": 665 - }, - { - "epoch": 0.32759468765371375, - "grad_norm": 1.595516807778794, - "learning_rate": 0.0001332, - "loss": 1.0152, - "step": 666 - }, - { - "epoch": 0.3280865715691097, - "grad_norm": 0.8092171184687358, - "learning_rate": 0.00013340000000000002, - "loss": 0.9439, - "step": 667 - }, - { - "epoch": 0.32857845548450565, - "grad_norm": 0.5547824246192261, - "learning_rate": 0.00013360000000000002, - "loss": 0.9708, - "step": 668 - }, - { - "epoch": 0.3290703393999016, - "grad_norm": 0.5667719349306606, - "learning_rate": 0.00013380000000000003, - "loss": 0.9921, - "step": 669 - }, - { - "epoch": 0.3295622233152976, - "grad_norm": 0.5602335299747019, - "learning_rate": 0.000134, - "loss": 0.9328, - "step": 670 - }, - { - "epoch": 0.33005410723069356, - "grad_norm": 1.542235688252758, - "learning_rate": 0.0001342, - "loss": 1.0415, - "step": 671 - }, - { - "epoch": 0.3305459911460895, - "grad_norm": 0.528859524564198, - "learning_rate": 0.00013440000000000001, - "loss": 0.9635, - "step": 672 - }, - { - "epoch": 0.33103787506148546, - "grad_norm": 0.570686563030655, - "learning_rate": 0.00013460000000000002, - "loss": 1.0653, - "step": 673 - }, - { - "epoch": 0.3315297589768815, - "grad_norm": 0.5205478000916222, - "learning_rate": 0.00013480000000000002, - "loss": 0.9937, - "step": 674 - }, - { - "epoch": 0.3320216428922774, - "grad_norm": 0.5968896498283343, - "learning_rate": 0.00013500000000000003, - "loss": 0.9731, - "step": 675 - }, - { - "epoch": 0.3325135268076734, - "grad_norm": 0.5933439978690628, - "learning_rate": 0.0001352, - "loss": 0.9518, - "step": 676 - }, - { - "epoch": 0.33300541072306933, - "grad_norm": 0.6183079211299322, - "learning_rate": 0.0001354, - "loss": 0.9279, - "step": 677 - }, - { - "epoch": 0.33349729463846534, - "grad_norm": 0.5681382320807932, - "learning_rate": 0.00013560000000000002, - "loss": 0.9453, - "step": 678 - }, - { - "epoch": 0.3339891785538613, - "grad_norm": 0.580113713966502, - "learning_rate": 0.00013580000000000002, - "loss": 0.9855, - "step": 679 - }, - { - "epoch": 0.33448106246925724, - "grad_norm": 0.6179511418209821, - "learning_rate": 0.00013600000000000003, - "loss": 1.0135, - "step": 680 - }, - { - "epoch": 0.33497294638465325, - "grad_norm": 0.6647931623300862, - "learning_rate": 0.0001362, - "loss": 0.8864, - "step": 681 - }, - { - "epoch": 0.3354648303000492, - "grad_norm": 0.5474827309408606, - "learning_rate": 0.0001364, - "loss": 0.9699, - "step": 682 - }, - { - "epoch": 0.33595671421544515, - "grad_norm": 0.71983895648374, - "learning_rate": 0.0001366, - "loss": 1.0405, - "step": 683 - }, - { - "epoch": 0.3364485981308411, - "grad_norm": 0.5996720854516058, - "learning_rate": 0.00013680000000000002, - "loss": 0.9509, - "step": 684 - }, - { - "epoch": 0.3369404820462371, - "grad_norm": 0.5638459937753646, - "learning_rate": 0.00013700000000000002, - "loss": 0.9376, - "step": 685 - }, - { - "epoch": 0.33743236596163306, - "grad_norm": 0.6071855878622703, - "learning_rate": 0.00013720000000000003, - "loss": 1.0412, - "step": 686 - }, - { - "epoch": 0.337924249877029, - "grad_norm": 0.6709708960381234, - "learning_rate": 0.0001374, - "loss": 0.9346, - "step": 687 - }, - { - "epoch": 0.33841613379242497, - "grad_norm": 0.5882971338580069, - "learning_rate": 0.00013759999999999998, - "loss": 1.0748, - "step": 688 - }, - { - "epoch": 0.338908017707821, - "grad_norm": 0.6895579608375398, - "learning_rate": 0.0001378, - "loss": 0.9703, - "step": 689 - }, - { - "epoch": 0.3393999016232169, - "grad_norm": 0.5549688261016558, - "learning_rate": 0.000138, - "loss": 0.8929, - "step": 690 - }, - { - "epoch": 0.3398917855386129, - "grad_norm": 0.5963572160505828, - "learning_rate": 0.0001382, - "loss": 1.0304, - "step": 691 - }, - { - "epoch": 0.34038366945400883, - "grad_norm": 0.6198648727259731, - "learning_rate": 0.0001384, - "loss": 0.9831, - "step": 692 - }, - { - "epoch": 0.34087555336940484, - "grad_norm": 0.5719964600673073, - "learning_rate": 0.0001386, - "loss": 0.9806, - "step": 693 - }, - { - "epoch": 0.3413674372848008, - "grad_norm": 0.5731736669696705, - "learning_rate": 0.00013879999999999999, - "loss": 0.8825, - "step": 694 - }, - { - "epoch": 0.34185932120019674, - "grad_norm": 0.5419254295863805, - "learning_rate": 0.000139, - "loss": 0.8876, - "step": 695 - }, - { - "epoch": 0.3423512051155927, - "grad_norm": 0.5480778481079349, - "learning_rate": 0.0001392, - "loss": 0.8689, - "step": 696 - }, - { - "epoch": 0.3428430890309887, - "grad_norm": 0.6061965287289426, - "learning_rate": 0.0001394, - "loss": 0.9779, - "step": 697 - }, - { - "epoch": 0.34333497294638465, - "grad_norm": 0.56418799698156, - "learning_rate": 0.0001396, - "loss": 0.9965, - "step": 698 - }, - { - "epoch": 0.3438268568617806, - "grad_norm": 0.5474847123144074, - "learning_rate": 0.0001398, - "loss": 0.9504, - "step": 699 - }, - { - "epoch": 0.3443187407771766, - "grad_norm": 0.6020447593211263, - "learning_rate": 0.00014, - "loss": 0.9916, - "step": 700 - }, - { - "epoch": 0.34481062469257256, - "grad_norm": 0.5716767111427228, - "learning_rate": 0.0001402, - "loss": 0.9785, - "step": 701 - }, - { - "epoch": 0.3453025086079685, - "grad_norm": 0.53738356089577, - "learning_rate": 0.0001404, - "loss": 0.913, - "step": 702 - }, - { - "epoch": 0.34579439252336447, - "grad_norm": 0.5533695050238833, - "learning_rate": 0.0001406, - "loss": 0.8827, - "step": 703 - }, - { - "epoch": 0.3462862764387605, - "grad_norm": 0.521861791445525, - "learning_rate": 0.0001408, - "loss": 0.963, - "step": 704 - }, - { - "epoch": 0.34677816035415643, - "grad_norm": 0.5467343292585995, - "learning_rate": 0.000141, - "loss": 0.9585, - "step": 705 - }, - { - "epoch": 0.3472700442695524, - "grad_norm": 0.6613807586137288, - "learning_rate": 0.0001412, - "loss": 0.9471, - "step": 706 - }, - { - "epoch": 0.34776192818494833, - "grad_norm": 0.5387654674317737, - "learning_rate": 0.0001414, - "loss": 0.894, - "step": 707 - }, - { - "epoch": 0.34825381210034434, - "grad_norm": 0.6187900254092165, - "learning_rate": 0.0001416, - "loss": 1.0506, - "step": 708 - }, - { - "epoch": 0.3487456960157403, - "grad_norm": 0.5842576555732705, - "learning_rate": 0.0001418, - "loss": 0.972, - "step": 709 - }, - { - "epoch": 0.34923757993113624, - "grad_norm": 0.5471657682259135, - "learning_rate": 0.000142, - "loss": 0.9363, - "step": 710 - }, - { - "epoch": 0.3497294638465322, - "grad_norm": 0.6713128120037045, - "learning_rate": 0.0001422, - "loss": 1.0076, - "step": 711 - }, - { - "epoch": 0.3502213477619282, - "grad_norm": 0.562939754525997, - "learning_rate": 0.0001424, - "loss": 0.994, - "step": 712 - }, - { - "epoch": 0.35071323167732416, - "grad_norm": 0.5427219860424152, - "learning_rate": 0.0001426, - "loss": 0.9664, - "step": 713 - }, - { - "epoch": 0.3512051155927201, - "grad_norm": 0.5336172312438766, - "learning_rate": 0.0001428, - "loss": 0.8873, - "step": 714 - }, - { - "epoch": 0.35169699950811606, - "grad_norm": 0.5759279718130945, - "learning_rate": 0.000143, - "loss": 0.994, - "step": 715 - }, - { - "epoch": 0.35218888342351207, - "grad_norm": 0.6210830795173621, - "learning_rate": 0.0001432, - "loss": 1.0437, - "step": 716 - }, - { - "epoch": 0.352680767338908, - "grad_norm": 0.5279473344404623, - "learning_rate": 0.0001434, - "loss": 0.8987, - "step": 717 - }, - { - "epoch": 0.35317265125430397, - "grad_norm": 0.575646122945982, - "learning_rate": 0.0001436, - "loss": 0.9883, - "step": 718 - }, - { - "epoch": 0.3536645351697, - "grad_norm": 0.5866741707982973, - "learning_rate": 0.0001438, - "loss": 0.9654, - "step": 719 - }, - { - "epoch": 0.35415641908509593, - "grad_norm": 0.8804767483817335, - "learning_rate": 0.000144, - "loss": 1.0005, - "step": 720 - }, - { - "epoch": 0.3546483030004919, - "grad_norm": 0.49721230503684766, - "learning_rate": 0.0001442, - "loss": 0.8645, - "step": 721 - }, - { - "epoch": 0.35514018691588783, - "grad_norm": 0.6135127465838539, - "learning_rate": 0.0001444, - "loss": 0.9552, - "step": 722 - }, - { - "epoch": 0.35563207083128384, - "grad_norm": 0.6006179727170639, - "learning_rate": 0.0001446, - "loss": 0.9883, - "step": 723 - }, - { - "epoch": 0.3561239547466798, - "grad_norm": 0.5791762925281315, - "learning_rate": 0.0001448, - "loss": 1.0127, - "step": 724 - }, - { - "epoch": 0.35661583866207575, - "grad_norm": 0.6211190579227377, - "learning_rate": 0.000145, - "loss": 0.9896, - "step": 725 - }, - { - "epoch": 0.3571077225774717, - "grad_norm": 0.5927954254139796, - "learning_rate": 0.0001452, - "loss": 0.9654, - "step": 726 - }, - { - "epoch": 0.3575996064928677, - "grad_norm": 0.5627595411888491, - "learning_rate": 0.0001454, - "loss": 0.9103, - "step": 727 - }, - { - "epoch": 0.35809149040826366, - "grad_norm": 0.7494298053751162, - "learning_rate": 0.00014560000000000002, - "loss": 1.0005, - "step": 728 - }, - { - "epoch": 0.3585833743236596, - "grad_norm": 0.536368405130208, - "learning_rate": 0.0001458, - "loss": 0.8954, - "step": 729 - }, - { - "epoch": 0.35907525823905556, - "grad_norm": 0.5323406282397959, - "learning_rate": 0.000146, - "loss": 0.9857, - "step": 730 - }, - { - "epoch": 0.35956714215445157, - "grad_norm": 0.5674356330933522, - "learning_rate": 0.0001462, - "loss": 0.927, - "step": 731 - }, - { - "epoch": 0.3600590260698475, - "grad_norm": 0.6209798732214817, - "learning_rate": 0.0001464, - "loss": 1.0156, - "step": 732 - }, - { - "epoch": 0.3605509099852435, - "grad_norm": 0.5546862564497353, - "learning_rate": 0.0001466, - "loss": 0.8843, - "step": 733 - }, - { - "epoch": 0.3610427939006394, - "grad_norm": 0.533114061617224, - "learning_rate": 0.00014680000000000002, - "loss": 0.8848, - "step": 734 - }, - { - "epoch": 0.36153467781603543, - "grad_norm": 0.6350819782389399, - "learning_rate": 0.000147, - "loss": 0.9055, - "step": 735 - }, - { - "epoch": 0.3620265617314314, - "grad_norm": 0.5678788134404464, - "learning_rate": 0.0001472, - "loss": 0.9804, - "step": 736 - }, - { - "epoch": 0.36251844564682734, - "grad_norm": 0.5760551838489592, - "learning_rate": 0.0001474, - "loss": 0.9949, - "step": 737 - }, - { - "epoch": 0.3630103295622233, - "grad_norm": 2.2923759269385813, - "learning_rate": 0.0001476, - "loss": 0.9776, - "step": 738 - }, - { - "epoch": 0.3635022134776193, - "grad_norm": 0.632221799207917, - "learning_rate": 0.00014780000000000001, - "loss": 1.0221, - "step": 739 - }, - { - "epoch": 0.36399409739301525, - "grad_norm": 0.6136556819957433, - "learning_rate": 0.000148, - "loss": 1.0076, - "step": 740 - }, - { - "epoch": 0.3644859813084112, - "grad_norm": 0.5539084756184474, - "learning_rate": 0.0001482, - "loss": 0.9471, - "step": 741 - }, - { - "epoch": 0.3649778652238072, - "grad_norm": 0.600388803994492, - "learning_rate": 0.0001484, - "loss": 1.0066, - "step": 742 - }, - { - "epoch": 0.36546974913920316, - "grad_norm": 0.5904080160729892, - "learning_rate": 0.0001486, - "loss": 0.9935, - "step": 743 - }, - { - "epoch": 0.3659616330545991, - "grad_norm": 0.6062535085595192, - "learning_rate": 0.0001488, - "loss": 0.9043, - "step": 744 - }, - { - "epoch": 0.36645351696999506, - "grad_norm": 0.581266851539047, - "learning_rate": 0.00014900000000000002, - "loss": 0.9436, - "step": 745 - }, - { - "epoch": 0.36694540088539107, - "grad_norm": 0.5531548464262217, - "learning_rate": 0.0001492, - "loss": 0.9334, - "step": 746 - }, - { - "epoch": 0.367437284800787, - "grad_norm": 0.8283535851545999, - "learning_rate": 0.0001494, - "loss": 0.9869, - "step": 747 - }, - { - "epoch": 0.367929168716183, - "grad_norm": 0.5998592619281596, - "learning_rate": 0.0001496, - "loss": 1.0054, - "step": 748 - }, - { - "epoch": 0.3684210526315789, - "grad_norm": 0.5254187775306348, - "learning_rate": 0.0001498, - "loss": 0.8801, - "step": 749 - }, - { - "epoch": 0.36891293654697493, - "grad_norm": 0.5840307586056854, - "learning_rate": 0.00015000000000000001, - "loss": 1.0775, - "step": 750 - }, - { - "epoch": 0.3694048204623709, - "grad_norm": 0.5548845893528461, - "learning_rate": 0.00015020000000000002, - "loss": 0.8954, - "step": 751 - }, - { - "epoch": 0.36989670437776684, - "grad_norm": 0.4779657950486935, - "learning_rate": 0.0001504, - "loss": 0.8264, - "step": 752 - }, - { - "epoch": 0.3703885882931628, - "grad_norm": 0.5680652462758233, - "learning_rate": 0.0001506, - "loss": 1.0293, - "step": 753 - }, - { - "epoch": 0.3708804722085588, - "grad_norm": 0.5506113131006278, - "learning_rate": 0.0001508, - "loss": 0.9605, - "step": 754 - }, - { - "epoch": 0.37137235612395475, - "grad_norm": 0.5241419248314438, - "learning_rate": 0.000151, - "loss": 0.95, - "step": 755 - }, - { - "epoch": 0.3718642400393507, - "grad_norm": 0.5548976086666146, - "learning_rate": 0.00015120000000000002, - "loss": 0.9271, - "step": 756 - }, - { - "epoch": 0.37235612395474665, - "grad_norm": 0.545441963837688, - "learning_rate": 0.00015140000000000002, - "loss": 0.9555, - "step": 757 - }, - { - "epoch": 0.37284800787014266, - "grad_norm": 0.51534507000048, - "learning_rate": 0.0001516, - "loss": 0.8994, - "step": 758 - }, - { - "epoch": 0.3733398917855386, - "grad_norm": 0.5373985678785738, - "learning_rate": 0.0001518, - "loss": 1.0127, - "step": 759 - }, - { - "epoch": 0.37383177570093457, - "grad_norm": 0.5746685531454623, - "learning_rate": 0.000152, - "loss": 1.0257, - "step": 760 - }, - { - "epoch": 0.3743236596163306, - "grad_norm": 0.503979910793055, - "learning_rate": 0.0001522, - "loss": 0.9056, - "step": 761 - }, - { - "epoch": 0.3748155435317265, - "grad_norm": 0.5913604557559807, - "learning_rate": 0.00015240000000000002, - "loss": 1.0301, - "step": 762 - }, - { - "epoch": 0.3753074274471225, - "grad_norm": 0.5398864774426727, - "learning_rate": 0.00015260000000000002, - "loss": 0.9607, - "step": 763 - }, - { - "epoch": 0.37579931136251843, - "grad_norm": 0.5495286862756644, - "learning_rate": 0.0001528, - "loss": 0.9597, - "step": 764 - }, - { - "epoch": 0.37629119527791444, - "grad_norm": 0.5907199039930229, - "learning_rate": 0.000153, - "loss": 0.8879, - "step": 765 - }, - { - "epoch": 0.3767830791933104, - "grad_norm": 0.6058728144562922, - "learning_rate": 0.0001532, - "loss": 1.0295, - "step": 766 - }, - { - "epoch": 0.37727496310870634, - "grad_norm": 0.5535252317067916, - "learning_rate": 0.00015340000000000002, - "loss": 0.9721, - "step": 767 - }, - { - "epoch": 0.3777668470241023, - "grad_norm": 0.5689401640933246, - "learning_rate": 0.00015360000000000002, - "loss": 0.9659, - "step": 768 - }, - { - "epoch": 0.3782587309394983, - "grad_norm": 0.5492311772960732, - "learning_rate": 0.0001538, - "loss": 1.0209, - "step": 769 - }, - { - "epoch": 0.37875061485489425, - "grad_norm": 0.547940922474521, - "learning_rate": 0.000154, - "loss": 0.989, - "step": 770 - }, - { - "epoch": 0.3792424987702902, - "grad_norm": 0.50370578360955, - "learning_rate": 0.0001542, - "loss": 0.9145, - "step": 771 - }, - { - "epoch": 0.37973438268568616, - "grad_norm": 0.5633163359060936, - "learning_rate": 0.0001544, - "loss": 0.9606, - "step": 772 - }, - { - "epoch": 0.38022626660108216, - "grad_norm": 0.5672102348479715, - "learning_rate": 0.00015460000000000002, - "loss": 1.0797, - "step": 773 - }, - { - "epoch": 0.3807181505164781, - "grad_norm": 0.48875960371628935, - "learning_rate": 0.00015480000000000002, - "loss": 0.8897, - "step": 774 - }, - { - "epoch": 0.38121003443187407, - "grad_norm": 0.5183070695492531, - "learning_rate": 0.000155, - "loss": 0.9114, - "step": 775 - }, - { - "epoch": 0.38170191834727, - "grad_norm": 0.5260847222140161, - "learning_rate": 0.0001552, - "loss": 1.0111, - "step": 776 - }, - { - "epoch": 0.382193802262666, - "grad_norm": 0.4873770357123154, - "learning_rate": 0.0001554, - "loss": 0.8981, - "step": 777 - }, - { - "epoch": 0.382685686178062, - "grad_norm": 0.533184635208334, - "learning_rate": 0.00015560000000000001, - "loss": 0.912, - "step": 778 - }, - { - "epoch": 0.38317757009345793, - "grad_norm": 0.5726377304478081, - "learning_rate": 0.00015580000000000002, - "loss": 1.0016, - "step": 779 - }, - { - "epoch": 0.3836694540088539, - "grad_norm": 0.5699433809365081, - "learning_rate": 0.00015600000000000002, - "loss": 1.0063, - "step": 780 - }, - { - "epoch": 0.3841613379242499, - "grad_norm": 0.52631644676918, - "learning_rate": 0.0001562, - "loss": 0.9563, - "step": 781 - }, - { - "epoch": 0.38465322183964584, - "grad_norm": 0.5634430508864812, - "learning_rate": 0.0001564, - "loss": 1.0055, - "step": 782 - }, - { - "epoch": 0.3851451057550418, - "grad_norm": 0.5221500631664022, - "learning_rate": 0.0001566, - "loss": 1.0167, - "step": 783 - }, - { - "epoch": 0.3856369896704378, - "grad_norm": 0.5654864366000731, - "learning_rate": 0.00015680000000000002, - "loss": 0.9569, - "step": 784 - }, - { - "epoch": 0.38612887358583375, - "grad_norm": 0.5180728219205175, - "learning_rate": 0.00015700000000000002, - "loss": 0.9176, - "step": 785 - }, - { - "epoch": 0.3866207575012297, - "grad_norm": 0.538064697836738, - "learning_rate": 0.00015720000000000003, - "loss": 0.9627, - "step": 786 - }, - { - "epoch": 0.38711264141662566, - "grad_norm": 0.5519014505895645, - "learning_rate": 0.0001574, - "loss": 0.986, - "step": 787 - }, - { - "epoch": 0.38760452533202167, - "grad_norm": 0.5921326184710332, - "learning_rate": 0.0001576, - "loss": 1.1092, - "step": 788 - }, - { - "epoch": 0.3880964092474176, - "grad_norm": 0.5365549041365112, - "learning_rate": 0.00015780000000000001, - "loss": 0.9762, - "step": 789 - }, - { - "epoch": 0.38858829316281357, - "grad_norm": 0.5285861559667572, - "learning_rate": 0.00015800000000000002, - "loss": 1.0393, - "step": 790 - }, - { - "epoch": 0.3890801770782095, - "grad_norm": 0.5353608332868288, - "learning_rate": 0.00015820000000000002, - "loss": 0.9721, - "step": 791 - }, - { - "epoch": 0.38957206099360553, - "grad_norm": 0.5676796635103992, - "learning_rate": 0.00015840000000000003, - "loss": 0.9707, - "step": 792 - }, - { - "epoch": 0.3900639449090015, - "grad_norm": 0.5324974277172961, - "learning_rate": 0.0001586, - "loss": 1.0181, - "step": 793 - }, - { - "epoch": 0.39055582882439743, - "grad_norm": 0.5444144065736729, - "learning_rate": 0.0001588, - "loss": 1.0322, - "step": 794 - }, - { - "epoch": 0.3910477127397934, - "grad_norm": 0.5216984893352097, - "learning_rate": 0.00015900000000000002, - "loss": 0.9468, - "step": 795 - }, - { - "epoch": 0.3915395966551894, - "grad_norm": 0.5118829818178605, - "learning_rate": 0.00015920000000000002, - "loss": 0.9302, - "step": 796 - }, - { - "epoch": 0.39203148057058534, - "grad_norm": 0.5560684653179324, - "learning_rate": 0.00015940000000000003, - "loss": 1.0283, - "step": 797 - }, - { - "epoch": 0.3925233644859813, - "grad_norm": 0.5137964259683437, - "learning_rate": 0.0001596, - "loss": 0.9219, - "step": 798 - }, - { - "epoch": 0.39301524840137725, - "grad_norm": 0.5972357390162277, - "learning_rate": 0.0001598, - "loss": 0.8816, - "step": 799 - }, - { - "epoch": 0.39350713231677326, - "grad_norm": 0.5731293092568074, - "learning_rate": 0.00016, - "loss": 1.0413, - "step": 800 - }, - { - "epoch": 0.3939990162321692, - "grad_norm": 0.5832125955169386, - "learning_rate": 0.00016020000000000002, - "loss": 0.9656, - "step": 801 - }, - { - "epoch": 0.39449090014756516, - "grad_norm": 0.5703505863315125, - "learning_rate": 0.00016040000000000002, - "loss": 0.9717, - "step": 802 - }, - { - "epoch": 0.39498278406296117, - "grad_norm": 0.5233933183593751, - "learning_rate": 0.00016060000000000003, - "loss": 0.986, - "step": 803 - }, - { - "epoch": 0.3954746679783571, - "grad_norm": 0.5239916997866086, - "learning_rate": 0.0001608, - "loss": 0.9461, - "step": 804 - }, - { - "epoch": 0.39596655189375307, - "grad_norm": 0.5361896576371097, - "learning_rate": 0.000161, - "loss": 0.931, - "step": 805 - }, - { - "epoch": 0.396458435809149, - "grad_norm": 1.84624704067933, - "learning_rate": 0.00016120000000000002, - "loss": 1.07, - "step": 806 - }, - { - "epoch": 0.39695031972454503, - "grad_norm": 0.6194161425941437, - "learning_rate": 0.00016140000000000002, - "loss": 0.9299, - "step": 807 - }, - { - "epoch": 0.397442203639941, - "grad_norm": 0.5598157158310759, - "learning_rate": 0.00016160000000000002, - "loss": 0.8952, - "step": 808 - }, - { - "epoch": 0.39793408755533693, - "grad_norm": 0.5147892749426699, - "learning_rate": 0.00016180000000000003, - "loss": 0.9648, - "step": 809 - }, - { - "epoch": 0.3984259714707329, - "grad_norm": 0.53381884733348, - "learning_rate": 0.000162, - "loss": 0.965, - "step": 810 - }, - { - "epoch": 0.3989178553861289, - "grad_norm": 0.5228813426312591, - "learning_rate": 0.0001622, - "loss": 0.9231, - "step": 811 - }, - { - "epoch": 0.39940973930152485, - "grad_norm": 0.5837534109522405, - "learning_rate": 0.00016240000000000002, - "loss": 0.9328, - "step": 812 - }, - { - "epoch": 0.3999016232169208, - "grad_norm": 0.5275571830323403, - "learning_rate": 0.0001626, - "loss": 0.8928, - "step": 813 - }, - { - "epoch": 0.40039350713231675, - "grad_norm": 0.5876104356865259, - "learning_rate": 0.0001628, - "loss": 1.0121, - "step": 814 - }, - { - "epoch": 0.40088539104771276, - "grad_norm": 0.6382169634620648, - "learning_rate": 0.000163, - "loss": 0.9843, - "step": 815 - }, - { - "epoch": 0.4013772749631087, - "grad_norm": 0.5801866766826601, - "learning_rate": 0.0001632, - "loss": 1.0471, - "step": 816 - }, - { - "epoch": 0.40186915887850466, - "grad_norm": 0.5510639872096187, - "learning_rate": 0.0001634, - "loss": 1.0638, - "step": 817 - }, - { - "epoch": 0.4023610427939006, - "grad_norm": 0.5140283673856436, - "learning_rate": 0.0001636, - "loss": 0.9217, - "step": 818 - }, - { - "epoch": 0.4028529267092966, - "grad_norm": 0.7924911609540576, - "learning_rate": 0.0001638, - "loss": 0.9755, - "step": 819 - }, - { - "epoch": 0.4033448106246926, - "grad_norm": 0.5331762791413481, - "learning_rate": 0.000164, - "loss": 0.9539, - "step": 820 - }, - { - "epoch": 0.4038366945400885, - "grad_norm": 0.5966286424829558, - "learning_rate": 0.0001642, - "loss": 0.8722, - "step": 821 - }, - { - "epoch": 0.40432857845548453, - "grad_norm": 1.309276772968705, - "learning_rate": 0.0001644, - "loss": 1.0592, - "step": 822 - }, - { - "epoch": 0.4048204623708805, - "grad_norm": 0.5380584739269623, - "learning_rate": 0.0001646, - "loss": 0.9616, - "step": 823 - }, - { - "epoch": 0.40531234628627644, - "grad_norm": 0.5424035960763122, - "learning_rate": 0.0001648, - "loss": 0.9627, - "step": 824 - }, - { - "epoch": 0.4058042302016724, - "grad_norm": 0.5538177464111131, - "learning_rate": 0.000165, - "loss": 1.0005, - "step": 825 - }, - { - "epoch": 0.4062961141170684, - "grad_norm": 0.6408546066646377, - "learning_rate": 0.0001652, - "loss": 0.8947, - "step": 826 - }, - { - "epoch": 0.40678799803246435, - "grad_norm": 0.6640867349404851, - "learning_rate": 0.0001654, - "loss": 1.0078, - "step": 827 - }, - { - "epoch": 0.4072798819478603, - "grad_norm": 0.6337291990645043, - "learning_rate": 0.0001656, - "loss": 1.0795, - "step": 828 - }, - { - "epoch": 0.40777176586325625, - "grad_norm": 0.5994443818623946, - "learning_rate": 0.0001658, - "loss": 1.0258, - "step": 829 - }, - { - "epoch": 0.40826364977865226, - "grad_norm": 0.6256523434556119, - "learning_rate": 0.000166, - "loss": 1.0448, - "step": 830 - }, - { - "epoch": 0.4087555336940482, - "grad_norm": 0.5442887937180465, - "learning_rate": 0.0001662, - "loss": 0.9358, - "step": 831 - }, - { - "epoch": 0.40924741760944416, - "grad_norm": 0.8138607813124575, - "learning_rate": 0.0001664, - "loss": 0.9839, - "step": 832 - }, - { - "epoch": 0.4097393015248401, - "grad_norm": 0.547143371707003, - "learning_rate": 0.0001666, - "loss": 0.9262, - "step": 833 - }, - { - "epoch": 0.4102311854402361, - "grad_norm": 0.5750895604971006, - "learning_rate": 0.0001668, - "loss": 1.0468, - "step": 834 - }, - { - "epoch": 0.4107230693556321, - "grad_norm": 0.5904281252379038, - "learning_rate": 0.000167, - "loss": 0.963, - "step": 835 - }, - { - "epoch": 0.411214953271028, - "grad_norm": 0.5016551397659511, - "learning_rate": 0.0001672, - "loss": 0.9286, - "step": 836 - }, - { - "epoch": 0.411706837186424, - "grad_norm": 0.5351125384196013, - "learning_rate": 0.0001674, - "loss": 0.9714, - "step": 837 - }, - { - "epoch": 0.41219872110182, - "grad_norm": 0.5382761575591485, - "learning_rate": 0.0001676, - "loss": 0.9532, - "step": 838 - }, - { - "epoch": 0.41269060501721594, - "grad_norm": 1.0701240247269224, - "learning_rate": 0.0001678, - "loss": 1.0374, - "step": 839 - }, - { - "epoch": 0.4131824889326119, - "grad_norm": 0.4991462766725644, - "learning_rate": 0.000168, - "loss": 0.8981, - "step": 840 - }, - { - "epoch": 0.41367437284800784, - "grad_norm": 0.6255019505856168, - "learning_rate": 0.0001682, - "loss": 0.9986, - "step": 841 - }, - { - "epoch": 0.41416625676340385, - "grad_norm": 0.5226316612893023, - "learning_rate": 0.0001684, - "loss": 0.8961, - "step": 842 - }, - { - "epoch": 0.4146581406787998, - "grad_norm": 0.5686473192385973, - "learning_rate": 0.0001686, - "loss": 0.9205, - "step": 843 - }, - { - "epoch": 0.41515002459419575, - "grad_norm": 0.5761042202049704, - "learning_rate": 0.0001688, - "loss": 1.008, - "step": 844 - }, - { - "epoch": 0.41564190850959176, - "grad_norm": 0.5630637985316533, - "learning_rate": 0.00016900000000000002, - "loss": 0.9981, - "step": 845 - }, - { - "epoch": 0.4161337924249877, - "grad_norm": 0.6032313914065442, - "learning_rate": 0.0001692, - "loss": 1.0199, - "step": 846 - }, - { - "epoch": 0.41662567634038367, - "grad_norm": 0.5261433904762187, - "learning_rate": 0.0001694, - "loss": 0.9452, - "step": 847 - }, - { - "epoch": 0.4171175602557796, - "grad_norm": 0.5330558170318862, - "learning_rate": 0.0001696, - "loss": 0.9222, - "step": 848 - }, - { - "epoch": 0.4176094441711756, - "grad_norm": 0.5732281296425276, - "learning_rate": 0.0001698, - "loss": 0.9501, - "step": 849 - }, - { - "epoch": 0.4181013280865716, - "grad_norm": 0.5491208816730045, - "learning_rate": 0.00017, - "loss": 0.9096, - "step": 850 - }, - { - "epoch": 0.41859321200196753, - "grad_norm": 0.5597683458893487, - "learning_rate": 0.00017020000000000002, - "loss": 1.0032, - "step": 851 - }, - { - "epoch": 0.4190850959173635, - "grad_norm": 0.5622591927744834, - "learning_rate": 0.0001704, - "loss": 0.9857, - "step": 852 - }, - { - "epoch": 0.4195769798327595, - "grad_norm": 0.5885794587196331, - "learning_rate": 0.0001706, - "loss": 1.0659, - "step": 853 - }, - { - "epoch": 0.42006886374815544, - "grad_norm": 0.5514053204871268, - "learning_rate": 0.0001708, - "loss": 0.9635, - "step": 854 - }, - { - "epoch": 0.4205607476635514, - "grad_norm": 0.5518416797926007, - "learning_rate": 0.000171, - "loss": 0.954, - "step": 855 - }, - { - "epoch": 0.42105263157894735, - "grad_norm": 0.6023882068182663, - "learning_rate": 0.00017120000000000001, - "loss": 0.9739, - "step": 856 - }, - { - "epoch": 0.42154451549434335, - "grad_norm": 0.5500016689819863, - "learning_rate": 0.0001714, - "loss": 0.9494, - "step": 857 - }, - { - "epoch": 0.4220363994097393, - "grad_norm": 0.5649553119783989, - "learning_rate": 0.0001716, - "loss": 0.9781, - "step": 858 - }, - { - "epoch": 0.42252828332513526, - "grad_norm": 0.5459792426412101, - "learning_rate": 0.0001718, - "loss": 0.941, - "step": 859 - }, - { - "epoch": 0.4230201672405312, - "grad_norm": 0.5652491524840089, - "learning_rate": 0.000172, - "loss": 0.9732, - "step": 860 - }, - { - "epoch": 0.4235120511559272, - "grad_norm": 0.5270656850538266, - "learning_rate": 0.0001722, - "loss": 0.9065, - "step": 861 - }, - { - "epoch": 0.42400393507132317, - "grad_norm": 0.5769523725720945, - "learning_rate": 0.00017240000000000002, - "loss": 1.0541, - "step": 862 - }, - { - "epoch": 0.4244958189867191, - "grad_norm": 0.7200570869145366, - "learning_rate": 0.0001726, - "loss": 0.9736, - "step": 863 - }, - { - "epoch": 0.4249877029021151, - "grad_norm": 0.5161154721973787, - "learning_rate": 0.0001728, - "loss": 1.0092, - "step": 864 - }, - { - "epoch": 0.4254795868175111, - "grad_norm": 0.5296402663634671, - "learning_rate": 0.000173, - "loss": 0.9159, - "step": 865 - }, - { - "epoch": 0.42597147073290703, - "grad_norm": 0.5562736193802396, - "learning_rate": 0.0001732, - "loss": 0.8968, - "step": 866 - }, - { - "epoch": 0.426463354648303, - "grad_norm": 0.5270375029235659, - "learning_rate": 0.0001734, - "loss": 0.9101, - "step": 867 - }, - { - "epoch": 0.426955238563699, - "grad_norm": 0.5405629707161682, - "learning_rate": 0.00017360000000000002, - "loss": 1.0218, - "step": 868 - }, - { - "epoch": 0.42744712247909494, - "grad_norm": 0.5370615414631368, - "learning_rate": 0.0001738, - "loss": 0.9493, - "step": 869 - }, - { - "epoch": 0.4279390063944909, - "grad_norm": 0.5836169487693725, - "learning_rate": 0.000174, - "loss": 1.0389, - "step": 870 - }, - { - "epoch": 0.42843089030988685, - "grad_norm": 0.5692881217128932, - "learning_rate": 0.0001742, - "loss": 1.0029, - "step": 871 - }, - { - "epoch": 0.42892277422528285, - "grad_norm": 0.5162749207602118, - "learning_rate": 0.0001744, - "loss": 0.9913, - "step": 872 - }, - { - "epoch": 0.4294146581406788, - "grad_norm": 0.5179704644692854, - "learning_rate": 0.00017460000000000002, - "loss": 0.9416, - "step": 873 - }, - { - "epoch": 0.42990654205607476, - "grad_norm": 0.5413434863970977, - "learning_rate": 0.00017480000000000002, - "loss": 0.9138, - "step": 874 - }, - { - "epoch": 0.4303984259714707, - "grad_norm": 0.5278059859029571, - "learning_rate": 0.000175, - "loss": 0.9468, - "step": 875 - }, - { - "epoch": 0.4308903098868667, - "grad_norm": 0.5119656198059847, - "learning_rate": 0.0001752, - "loss": 1.0543, - "step": 876 - }, - { - "epoch": 0.43138219380226267, - "grad_norm": 0.5377509485318009, - "learning_rate": 0.0001754, - "loss": 1.0107, - "step": 877 - }, - { - "epoch": 0.4318740777176586, - "grad_norm": 0.5820007835814583, - "learning_rate": 0.0001756, - "loss": 0.9246, - "step": 878 - }, - { - "epoch": 0.4323659616330546, - "grad_norm": 0.5343999036018037, - "learning_rate": 0.00017580000000000002, - "loss": 0.9566, - "step": 879 - }, - { - "epoch": 0.4328578455484506, - "grad_norm": 0.5062821082498787, - "learning_rate": 0.00017600000000000002, - "loss": 0.8971, - "step": 880 - }, - { - "epoch": 0.43334972946384653, - "grad_norm": 0.5503322884070123, - "learning_rate": 0.0001762, - "loss": 1.1004, - "step": 881 - }, - { - "epoch": 0.4338416133792425, - "grad_norm": 0.5212493269418079, - "learning_rate": 0.0001764, - "loss": 0.9533, - "step": 882 - }, - { - "epoch": 0.43433349729463844, - "grad_norm": 0.5388116816718315, - "learning_rate": 0.0001766, - "loss": 0.9838, - "step": 883 - }, - { - "epoch": 0.43482538121003445, - "grad_norm": 0.5735442764615539, - "learning_rate": 0.00017680000000000001, - "loss": 1.0515, - "step": 884 - }, - { - "epoch": 0.4353172651254304, - "grad_norm": 0.5155939464902994, - "learning_rate": 0.00017700000000000002, - "loss": 0.9761, - "step": 885 - }, - { - "epoch": 0.43580914904082635, - "grad_norm": 0.5067177828353471, - "learning_rate": 0.0001772, - "loss": 0.9218, - "step": 886 - }, - { - "epoch": 0.43630103295622236, - "grad_norm": 0.516882852883826, - "learning_rate": 0.0001774, - "loss": 0.9928, - "step": 887 - }, - { - "epoch": 0.4367929168716183, - "grad_norm": 0.541258076851383, - "learning_rate": 0.0001776, - "loss": 1.0355, - "step": 888 - }, - { - "epoch": 0.43728480078701426, - "grad_norm": 0.5145423495342223, - "learning_rate": 0.0001778, - "loss": 0.9292, - "step": 889 - }, - { - "epoch": 0.4377766847024102, - "grad_norm": 0.5142489348461433, - "learning_rate": 0.00017800000000000002, - "loss": 0.9434, - "step": 890 - }, - { - "epoch": 0.4382685686178062, - "grad_norm": 0.5150661220780518, - "learning_rate": 0.00017820000000000002, - "loss": 0.8795, - "step": 891 - }, - { - "epoch": 0.43876045253320217, - "grad_norm": 0.5739910608494103, - "learning_rate": 0.0001784, - "loss": 1.0217, - "step": 892 - }, - { - "epoch": 0.4392523364485981, - "grad_norm": 0.5210480855295413, - "learning_rate": 0.0001786, - "loss": 0.9, - "step": 893 - }, - { - "epoch": 0.4397442203639941, - "grad_norm": 0.5226029458504111, - "learning_rate": 0.0001788, - "loss": 0.9757, - "step": 894 - }, - { - "epoch": 0.4402361042793901, - "grad_norm": 0.5390926276584241, - "learning_rate": 0.00017900000000000001, - "loss": 0.9254, - "step": 895 - }, - { - "epoch": 0.44072798819478604, - "grad_norm": 0.6096776794834838, - "learning_rate": 0.00017920000000000002, - "loss": 1.0365, - "step": 896 - }, - { - "epoch": 0.441219872110182, - "grad_norm": 0.9843822292629351, - "learning_rate": 0.00017940000000000002, - "loss": 0.9873, - "step": 897 - }, - { - "epoch": 0.44171175602557794, - "grad_norm": 0.5597177313333231, - "learning_rate": 0.0001796, - "loss": 1.0479, - "step": 898 - }, - { - "epoch": 0.44220363994097395, - "grad_norm": 0.5127121453283586, - "learning_rate": 0.0001798, - "loss": 0.9685, - "step": 899 - }, - { - "epoch": 0.4426955238563699, - "grad_norm": 0.5030124116175764, - "learning_rate": 0.00018, - "loss": 0.8948, - "step": 900 - }, - { - "epoch": 0.44318740777176585, - "grad_norm": 0.5293459541657133, - "learning_rate": 0.00018020000000000002, - "loss": 0.9261, - "step": 901 - }, - { - "epoch": 0.4436792916871618, - "grad_norm": 0.5054173854921744, - "learning_rate": 0.00018040000000000002, - "loss": 1.0131, - "step": 902 - }, - { - "epoch": 0.4441711756025578, - "grad_norm": 0.5217743352033825, - "learning_rate": 0.00018060000000000003, - "loss": 0.9877, - "step": 903 - }, - { - "epoch": 0.44466305951795376, - "grad_norm": 0.501600899425512, - "learning_rate": 0.0001808, - "loss": 0.82, - "step": 904 - }, - { - "epoch": 0.4451549434333497, - "grad_norm": 0.525501434091119, - "learning_rate": 0.000181, - "loss": 0.9267, - "step": 905 - }, - { - "epoch": 0.4456468273487457, - "grad_norm": 0.6120434569760427, - "learning_rate": 0.0001812, - "loss": 1.1088, - "step": 906 - }, - { - "epoch": 0.4461387112641417, - "grad_norm": 0.493055662564238, - "learning_rate": 0.00018140000000000002, - "loss": 0.9278, - "step": 907 - }, - { - "epoch": 0.4466305951795376, - "grad_norm": 0.4836970336324819, - "learning_rate": 0.00018160000000000002, - "loss": 0.8893, - "step": 908 - }, - { - "epoch": 0.4471224790949336, - "grad_norm": 0.5465285130057251, - "learning_rate": 0.00018180000000000003, - "loss": 0.9715, - "step": 909 - }, - { - "epoch": 0.4476143630103296, - "grad_norm": 0.5609985643623988, - "learning_rate": 0.000182, - "loss": 0.9634, - "step": 910 - }, - { - "epoch": 0.44810624692572554, - "grad_norm": 0.5492145145037122, - "learning_rate": 0.0001822, - "loss": 0.98, - "step": 911 - }, - { - "epoch": 0.4485981308411215, - "grad_norm": 0.5143204844427829, - "learning_rate": 0.00018240000000000002, - "loss": 0.9637, - "step": 912 - }, - { - "epoch": 0.44909001475651744, - "grad_norm": 0.50718221347543, - "learning_rate": 0.00018260000000000002, - "loss": 0.9899, - "step": 913 - }, - { - "epoch": 0.44958189867191345, - "grad_norm": 0.4881790127376402, - "learning_rate": 0.00018280000000000003, - "loss": 1.0146, - "step": 914 - }, - { - "epoch": 0.4500737825873094, - "grad_norm": 0.5273502353074515, - "learning_rate": 0.000183, - "loss": 0.9837, - "step": 915 - }, - { - "epoch": 0.45056566650270535, - "grad_norm": 1.4151392968827463, - "learning_rate": 0.0001832, - "loss": 1.0776, - "step": 916 - }, - { - "epoch": 0.4510575504181013, - "grad_norm": 0.5161037772931225, - "learning_rate": 0.0001834, - "loss": 0.9127, - "step": 917 - }, - { - "epoch": 0.4515494343334973, - "grad_norm": 0.553790492621958, - "learning_rate": 0.00018360000000000002, - "loss": 0.982, - "step": 918 - }, - { - "epoch": 0.45204131824889326, - "grad_norm": 0.8272049234808955, - "learning_rate": 0.00018380000000000002, - "loss": 0.9753, - "step": 919 - }, - { - "epoch": 0.4525332021642892, - "grad_norm": 0.5646572338940326, - "learning_rate": 0.00018400000000000003, - "loss": 1.0917, - "step": 920 - }, - { - "epoch": 0.45302508607968517, - "grad_norm": 1.1220925918626086, - "learning_rate": 0.0001842, - "loss": 0.9684, - "step": 921 - }, - { - "epoch": 0.4535169699950812, - "grad_norm": 0.5288359765330555, - "learning_rate": 0.0001844, - "loss": 0.964, - "step": 922 - }, - { - "epoch": 0.45400885391047713, - "grad_norm": 0.5561258676542358, - "learning_rate": 0.00018460000000000001, - "loss": 1.0354, - "step": 923 - }, - { - "epoch": 0.4545007378258731, - "grad_norm": 0.6450660996673966, - "learning_rate": 0.00018480000000000002, - "loss": 0.9265, - "step": 924 - }, - { - "epoch": 0.4549926217412691, - "grad_norm": 0.6319923842774048, - "learning_rate": 0.00018500000000000002, - "loss": 0.969, - "step": 925 - }, - { - "epoch": 0.45548450565666504, - "grad_norm": 0.5424189796922686, - "learning_rate": 0.00018520000000000003, - "loss": 0.9816, - "step": 926 - }, - { - "epoch": 0.455976389572061, - "grad_norm": 0.5781436446478081, - "learning_rate": 0.0001854, - "loss": 0.9374, - "step": 927 - }, - { - "epoch": 0.45646827348745694, - "grad_norm": 0.5578660553072525, - "learning_rate": 0.0001856, - "loss": 0.9408, - "step": 928 - }, - { - "epoch": 0.45696015740285295, - "grad_norm": 0.5143529191819146, - "learning_rate": 0.00018580000000000002, - "loss": 0.9493, - "step": 929 - }, - { - "epoch": 0.4574520413182489, - "grad_norm": 0.5566530266352897, - "learning_rate": 0.00018600000000000002, - "loss": 0.9306, - "step": 930 - }, - { - "epoch": 0.45794392523364486, - "grad_norm": 0.5693834575300597, - "learning_rate": 0.00018620000000000003, - "loss": 0.9072, - "step": 931 - }, - { - "epoch": 0.4584358091490408, - "grad_norm": 0.5883519629457379, - "learning_rate": 0.00018640000000000003, - "loss": 0.878, - "step": 932 - }, - { - "epoch": 0.4589276930644368, - "grad_norm": 0.5931664627910979, - "learning_rate": 0.0001866, - "loss": 1.0526, - "step": 933 - }, - { - "epoch": 0.45941957697983277, - "grad_norm": 0.5474015587714414, - "learning_rate": 0.00018680000000000001, - "loss": 0.9676, - "step": 934 - }, - { - "epoch": 0.4599114608952287, - "grad_norm": 0.5177988921047931, - "learning_rate": 0.00018700000000000002, - "loss": 0.9288, - "step": 935 - }, - { - "epoch": 0.46040334481062467, - "grad_norm": 0.545548390053368, - "learning_rate": 0.00018720000000000002, - "loss": 1.0001, - "step": 936 - }, - { - "epoch": 0.4608952287260207, - "grad_norm": 0.519325393867732, - "learning_rate": 0.00018740000000000003, - "loss": 0.9482, - "step": 937 - }, - { - "epoch": 0.46138711264141663, - "grad_norm": 0.5456394285516738, - "learning_rate": 0.0001876, - "loss": 0.9549, - "step": 938 - }, - { - "epoch": 0.4618789965568126, - "grad_norm": 0.5610463252748996, - "learning_rate": 0.0001878, - "loss": 0.9887, - "step": 939 - }, - { - "epoch": 0.46237088047220853, - "grad_norm": 0.5088615249359095, - "learning_rate": 0.000188, - "loss": 0.9997, - "step": 940 - }, - { - "epoch": 0.46286276438760454, - "grad_norm": 0.6403892090296743, - "learning_rate": 0.0001882, - "loss": 0.9992, - "step": 941 - }, - { - "epoch": 0.4633546483030005, - "grad_norm": 0.5459758884645787, - "learning_rate": 0.0001884, - "loss": 0.9136, - "step": 942 - }, - { - "epoch": 0.46384653221839645, - "grad_norm": 0.5264417005084878, - "learning_rate": 0.0001886, - "loss": 1.0359, - "step": 943 - }, - { - "epoch": 0.4643384161337924, - "grad_norm": 0.5452103362565185, - "learning_rate": 0.0001888, - "loss": 1.0104, - "step": 944 - }, - { - "epoch": 0.4648303000491884, - "grad_norm": 0.49040713618929604, - "learning_rate": 0.00018899999999999999, - "loss": 1.0123, - "step": 945 - }, - { - "epoch": 0.46532218396458436, - "grad_norm": 0.5444843316477975, - "learning_rate": 0.0001892, - "loss": 0.9674, - "step": 946 - }, - { - "epoch": 0.4658140678799803, - "grad_norm": 0.5154842838561677, - "learning_rate": 0.0001894, - "loss": 0.954, - "step": 947 - }, - { - "epoch": 0.4663059517953763, - "grad_norm": 0.5311303516764446, - "learning_rate": 0.0001896, - "loss": 0.9436, - "step": 948 - }, - { - "epoch": 0.46679783571077227, - "grad_norm": 0.6332795107793114, - "learning_rate": 0.0001898, - "loss": 0.9715, - "step": 949 - }, - { - "epoch": 0.4672897196261682, - "grad_norm": 0.5310883218087131, - "learning_rate": 0.00019, - "loss": 0.9733, - "step": 950 - }, - { - "epoch": 0.4677816035415642, - "grad_norm": 0.5585087599292401, - "learning_rate": 0.0001902, - "loss": 0.9766, - "step": 951 - }, - { - "epoch": 0.4682734874569602, - "grad_norm": 0.5379772982249258, - "learning_rate": 0.0001904, - "loss": 0.9725, - "step": 952 - }, - { - "epoch": 0.46876537137235613, - "grad_norm": 1.0927315848471355, - "learning_rate": 0.0001906, - "loss": 1.0, - "step": 953 - }, - { - "epoch": 0.4692572552877521, - "grad_norm": 0.522722072916259, - "learning_rate": 0.0001908, - "loss": 0.9734, - "step": 954 - }, - { - "epoch": 0.46974913920314804, - "grad_norm": 0.5722683426740471, - "learning_rate": 0.000191, - "loss": 0.9747, - "step": 955 - }, - { - "epoch": 0.47024102311854404, - "grad_norm": 0.6054667509782405, - "learning_rate": 0.0001912, - "loss": 0.9534, - "step": 956 - }, - { - "epoch": 0.47073290703394, - "grad_norm": 0.5028530415420397, - "learning_rate": 0.0001914, - "loss": 0.9583, - "step": 957 - }, - { - "epoch": 0.47122479094933595, - "grad_norm": 0.4567247279556181, - "learning_rate": 0.0001916, - "loss": 0.9365, - "step": 958 - }, - { - "epoch": 0.4717166748647319, - "grad_norm": 0.5415736824580925, - "learning_rate": 0.0001918, - "loss": 1.0211, - "step": 959 - }, - { - "epoch": 0.4722085587801279, - "grad_norm": 0.4893423926343375, - "learning_rate": 0.000192, - "loss": 0.9563, - "step": 960 - }, - { - "epoch": 0.47270044269552386, - "grad_norm": 0.5508130258707056, - "learning_rate": 0.0001922, - "loss": 1.008, - "step": 961 - }, - { - "epoch": 0.4731923266109198, - "grad_norm": 0.5786410786759495, - "learning_rate": 0.00019240000000000001, - "loss": 1.0173, - "step": 962 - }, - { - "epoch": 0.47368421052631576, - "grad_norm": 0.5611499337597293, - "learning_rate": 0.0001926, - "loss": 0.9662, - "step": 963 - }, - { - "epoch": 0.47417609444171177, - "grad_norm": 0.5198201756826512, - "learning_rate": 0.0001928, - "loss": 0.9753, - "step": 964 - }, - { - "epoch": 0.4746679783571077, - "grad_norm": 0.5852887651233413, - "learning_rate": 0.000193, - "loss": 0.9793, - "step": 965 - }, - { - "epoch": 0.4751598622725037, - "grad_norm": 0.6813951087616439, - "learning_rate": 0.0001932, - "loss": 0.9821, - "step": 966 - }, - { - "epoch": 0.4756517461878997, - "grad_norm": 0.5327706348908753, - "learning_rate": 0.0001934, - "loss": 0.8946, - "step": 967 - }, - { - "epoch": 0.47614363010329563, - "grad_norm": 0.5161668091121259, - "learning_rate": 0.00019360000000000002, - "loss": 0.9447, - "step": 968 - }, - { - "epoch": 0.4766355140186916, - "grad_norm": 0.5417441809512521, - "learning_rate": 0.0001938, - "loss": 0.9962, - "step": 969 - }, - { - "epoch": 0.47712739793408754, - "grad_norm": 0.5813128131414306, - "learning_rate": 0.000194, - "loss": 1.0569, - "step": 970 - }, - { - "epoch": 0.47761928184948355, - "grad_norm": 0.5322364446816521, - "learning_rate": 0.0001942, - "loss": 0.9452, - "step": 971 - }, - { - "epoch": 0.4781111657648795, - "grad_norm": 0.5523793272808772, - "learning_rate": 0.0001944, - "loss": 0.9407, - "step": 972 - }, - { - "epoch": 0.47860304968027545, - "grad_norm": 0.5598749774485714, - "learning_rate": 0.00019460000000000001, - "loss": 1.0484, - "step": 973 - }, - { - "epoch": 0.4790949335956714, - "grad_norm": 0.5043454954607122, - "learning_rate": 0.0001948, - "loss": 0.8841, - "step": 974 - }, - { - "epoch": 0.4795868175110674, - "grad_norm": 0.5271800626889586, - "learning_rate": 0.000195, - "loss": 1.0506, - "step": 975 - }, - { - "epoch": 0.48007870142646336, - "grad_norm": 0.5863380204930064, - "learning_rate": 0.0001952, - "loss": 1.0006, - "step": 976 - }, - { - "epoch": 0.4805705853418593, - "grad_norm": 0.5863752827782015, - "learning_rate": 0.0001954, - "loss": 1.0227, - "step": 977 - }, - { - "epoch": 0.48106246925725527, - "grad_norm": 0.5420702202322971, - "learning_rate": 0.0001956, - "loss": 1.0879, - "step": 978 - }, - { - "epoch": 0.4815543531726513, - "grad_norm": 0.46710007809882115, - "learning_rate": 0.00019580000000000002, - "loss": 0.9074, - "step": 979 - }, - { - "epoch": 0.4820462370880472, - "grad_norm": 0.4979039222094224, - "learning_rate": 0.000196, - "loss": 0.9127, - "step": 980 - }, - { - "epoch": 0.4825381210034432, - "grad_norm": 0.5270304948270585, - "learning_rate": 0.0001962, - "loss": 0.9595, - "step": 981 - }, - { - "epoch": 0.48303000491883913, - "grad_norm": 0.5074612848390806, - "learning_rate": 0.0001964, - "loss": 1.0176, - "step": 982 - }, - { - "epoch": 0.48352188883423514, - "grad_norm": 0.5303100630890872, - "learning_rate": 0.0001966, - "loss": 0.89, - "step": 983 - }, - { - "epoch": 0.4840137727496311, - "grad_norm": 0.47382999672479026, - "learning_rate": 0.0001968, - "loss": 0.9367, - "step": 984 - }, - { - "epoch": 0.48450565666502704, - "grad_norm": 0.5519787323789412, - "learning_rate": 0.00019700000000000002, - "loss": 1.0038, - "step": 985 - }, - { - "epoch": 0.48499754058042305, - "grad_norm": 0.4897366081420991, - "learning_rate": 0.0001972, - "loss": 0.9696, - "step": 986 - }, - { - "epoch": 0.485489424495819, - "grad_norm": 0.49158245345946805, - "learning_rate": 0.0001974, - "loss": 0.952, - "step": 987 - }, - { - "epoch": 0.48598130841121495, - "grad_norm": 0.5105032086419014, - "learning_rate": 0.0001976, - "loss": 0.9443, - "step": 988 - }, - { - "epoch": 0.4864731923266109, - "grad_norm": 0.4699398564080473, - "learning_rate": 0.0001978, - "loss": 0.9536, - "step": 989 - }, - { - "epoch": 0.4869650762420069, - "grad_norm": 0.5173446919886702, - "learning_rate": 0.00019800000000000002, - "loss": 0.8655, - "step": 990 - }, - { - "epoch": 0.48745696015740286, - "grad_norm": 0.5004505477278638, - "learning_rate": 0.00019820000000000002, - "loss": 0.9124, - "step": 991 - }, - { - "epoch": 0.4879488440727988, - "grad_norm": 0.48220413607150886, - "learning_rate": 0.0001984, - "loss": 0.9546, - "step": 992 - }, - { - "epoch": 0.48844072798819477, - "grad_norm": 0.4819792774684789, - "learning_rate": 0.0001986, - "loss": 0.8738, - "step": 993 - }, - { - "epoch": 0.4889326119035908, - "grad_norm": 0.5164599873717493, - "learning_rate": 0.0001988, - "loss": 0.9594, - "step": 994 - }, - { - "epoch": 0.4894244958189867, - "grad_norm": 0.5672787019265206, - "learning_rate": 0.000199, - "loss": 0.9997, - "step": 995 - }, - { - "epoch": 0.4899163797343827, - "grad_norm": 0.48163904907656424, - "learning_rate": 0.00019920000000000002, - "loss": 0.9249, - "step": 996 - }, - { - "epoch": 0.49040826364977863, - "grad_norm": 0.4992747022764088, - "learning_rate": 0.00019940000000000002, - "loss": 0.9101, - "step": 997 - }, - { - "epoch": 0.49090014756517464, - "grad_norm": 0.505152562500964, - "learning_rate": 0.0001996, - "loss": 0.98, - "step": 998 - }, - { - "epoch": 0.4913920314805706, - "grad_norm": 0.49220963895087283, - "learning_rate": 0.0001998, - "loss": 1.0135, - "step": 999 - }, - { - "epoch": 0.49188391539596654, - "grad_norm": 0.499785327862804, - "learning_rate": 0.0002, - "loss": 0.9809, - "step": 1000 - }, - { - "epoch": 0.4923757993113625, - "grad_norm": 0.5155171431912795, - "learning_rate": 0.00019999998104216116, - "loss": 1.0481, - "step": 1001 - }, - { - "epoch": 0.4928676832267585, - "grad_norm": 0.4980545715907866, - "learning_rate": 0.00019999992416865176, - "loss": 0.8527, - "step": 1002 - }, - { - "epoch": 0.49335956714215445, - "grad_norm": 0.47724253947130507, - "learning_rate": 0.0001999998293794934, - "loss": 0.9135, - "step": 1003 - }, - { - "epoch": 0.4938514510575504, - "grad_norm": 0.5720455995618264, - "learning_rate": 0.00019999969667472203, - "loss": 0.9878, - "step": 1004 - }, - { - "epoch": 0.49434333497294636, - "grad_norm": 0.5371333285470227, - "learning_rate": 0.00019999952605438795, - "loss": 1.0797, - "step": 1005 - }, - { - "epoch": 0.49483521888834237, - "grad_norm": 0.4819222664190339, - "learning_rate": 0.00019999931751855582, - "loss": 0.9576, - "step": 1006 - }, - { - "epoch": 0.4953271028037383, - "grad_norm": 0.4823918029759922, - "learning_rate": 0.00019999907106730479, - "loss": 0.9503, - "step": 1007 - }, - { - "epoch": 0.49581898671913427, - "grad_norm": 0.5089751526246875, - "learning_rate": 0.0001999987867007282, - "loss": 0.9793, - "step": 1008 - }, - { - "epoch": 0.4963108706345303, - "grad_norm": 0.5511405448092508, - "learning_rate": 0.00019999846441893398, - "loss": 1.1435, - "step": 1009 - }, - { - "epoch": 0.49680275454992623, - "grad_norm": 0.5157899143785711, - "learning_rate": 0.00019999810422204422, - "loss": 1.017, - "step": 1010 - }, - { - "epoch": 0.4972946384653222, - "grad_norm": 0.5026713002369468, - "learning_rate": 0.00019999770611019556, - "loss": 1.0423, - "step": 1011 - }, - { - "epoch": 0.49778652238071813, - "grad_norm": 0.4821734284298085, - "learning_rate": 0.0001999972700835389, - "loss": 0.9666, - "step": 1012 - }, - { - "epoch": 0.49827840629611414, - "grad_norm": 0.5215944398758797, - "learning_rate": 0.00019999679614223962, - "loss": 0.9825, - "step": 1013 - }, - { - "epoch": 0.4987702902115101, - "grad_norm": 0.5304865325619055, - "learning_rate": 0.00019999628428647736, - "loss": 0.995, - "step": 1014 - }, - { - "epoch": 0.49926217412690604, - "grad_norm": 0.5204191713213968, - "learning_rate": 0.00019999573451644627, - "loss": 0.9588, - "step": 1015 - }, - { - "epoch": 0.499754058042302, - "grad_norm": 0.5174085296709169, - "learning_rate": 0.0001999951468323547, - "loss": 0.9716, - "step": 1016 - }, - { - "epoch": 0.500245941957698, - "grad_norm": 0.4938326578972806, - "learning_rate": 0.00019999452123442554, - "loss": 1.0, - "step": 1017 - }, - { - "epoch": 0.500245941957698, - "eval_loss": 0.8863905668258667, - "eval_runtime": 6679.255, - "eval_samples_per_second": 4.266, - "eval_steps_per_second": 2.133, - "step": 1017 - }, - { - "epoch": 0.5007378258730939, - "grad_norm": 0.49214519001737467, - "learning_rate": 0.00019999385772289597, - "loss": 0.9059, - "step": 1018 - }, - { - "epoch": 0.5012297097884899, - "grad_norm": 0.5509060070684917, - "learning_rate": 0.00019999315629801759, - "loss": 0.9234, - "step": 1019 - }, - { - "epoch": 0.5017215937038859, - "grad_norm": 0.5360077984580891, - "learning_rate": 0.00019999241696005632, - "loss": 1.0058, - "step": 1020 - }, - { - "epoch": 0.5022134776192818, - "grad_norm": 0.6575060966897581, - "learning_rate": 0.00019999163970929246, - "loss": 0.9754, - "step": 1021 - }, - { - "epoch": 0.5027053615346778, - "grad_norm": 0.4964465882401881, - "learning_rate": 0.00019999082454602078, - "loss": 0.8975, - "step": 1022 - }, - { - "epoch": 0.5031972454500738, - "grad_norm": 0.48294639824987906, - "learning_rate": 0.0001999899714705503, - "loss": 0.9591, - "step": 1023 - }, - { - "epoch": 0.5036891293654697, - "grad_norm": 0.5251594738406836, - "learning_rate": 0.0001999890804832045, - "loss": 0.9166, - "step": 1024 - }, - { - "epoch": 0.5041810132808657, - "grad_norm": 0.5536997769465125, - "learning_rate": 0.00019998815158432117, - "loss": 1.0339, - "step": 1025 - }, - { - "epoch": 0.5046728971962616, - "grad_norm": 0.5365066362511384, - "learning_rate": 0.00019998718477425256, - "loss": 0.9394, - "step": 1026 - }, - { - "epoch": 0.5051647811116576, - "grad_norm": 0.5220932228818115, - "learning_rate": 0.0001999861800533652, - "loss": 0.9513, - "step": 1027 - }, - { - "epoch": 0.5056566650270536, - "grad_norm": 0.48675880030109264, - "learning_rate": 0.00019998513742204005, - "loss": 0.9856, - "step": 1028 - }, - { - "epoch": 0.5061485489424495, - "grad_norm": 0.5340826835480033, - "learning_rate": 0.0001999840568806724, - "loss": 0.9793, - "step": 1029 - }, - { - "epoch": 0.5066404328578455, - "grad_norm": 0.5435428108318967, - "learning_rate": 0.000199982938429672, - "loss": 0.971, - "step": 1030 - }, - { - "epoch": 0.5071323167732416, - "grad_norm": 0.49665018666396576, - "learning_rate": 0.0001999817820694629, - "loss": 1.0116, - "step": 1031 - }, - { - "epoch": 0.5076242006886375, - "grad_norm": 0.5290700436374691, - "learning_rate": 0.00019998058780048352, - "loss": 1.0281, - "step": 1032 - }, - { - "epoch": 0.5081160846040335, - "grad_norm": 0.46656726365676354, - "learning_rate": 0.0001999793556231867, - "loss": 0.9635, - "step": 1033 - }, - { - "epoch": 0.5086079685194295, - "grad_norm": 0.5230829519946276, - "learning_rate": 0.0001999780855380396, - "loss": 1.0314, - "step": 1034 - }, - { - "epoch": 0.5090998524348254, - "grad_norm": 0.4974803748014261, - "learning_rate": 0.00019997677754552383, - "loss": 0.9608, - "step": 1035 - }, - { - "epoch": 0.5095917363502214, - "grad_norm": 0.48499834789667756, - "learning_rate": 0.00019997543164613525, - "loss": 0.9624, - "step": 1036 - }, - { - "epoch": 0.5100836202656173, - "grad_norm": 0.5263013642858179, - "learning_rate": 0.00019997404784038425, - "loss": 1.0824, - "step": 1037 - }, - { - "epoch": 0.5105755041810133, - "grad_norm": 0.48615387278539696, - "learning_rate": 0.00019997262612879543, - "loss": 1.0257, - "step": 1038 - }, - { - "epoch": 0.5110673880964093, - "grad_norm": 0.4918583874695705, - "learning_rate": 0.00019997116651190793, - "loss": 0.9189, - "step": 1039 - }, - { - "epoch": 0.5115592720118052, - "grad_norm": 0.5158751454023438, - "learning_rate": 0.0001999696689902751, - "loss": 0.9687, - "step": 1040 - }, - { - "epoch": 0.5120511559272012, - "grad_norm": 0.5091459685499787, - "learning_rate": 0.00019996813356446477, - "loss": 0.9564, - "step": 1041 - }, - { - "epoch": 0.5125430398425972, - "grad_norm": 0.5316267225900095, - "learning_rate": 0.00019996656023505907, - "loss": 0.9381, - "step": 1042 - }, - { - "epoch": 0.5130349237579931, - "grad_norm": 0.5552813111100937, - "learning_rate": 0.0001999649490026546, - "loss": 0.9379, - "step": 1043 - }, - { - "epoch": 0.5135268076733891, - "grad_norm": 0.5229496871549363, - "learning_rate": 0.0001999632998678622, - "loss": 0.9411, - "step": 1044 - }, - { - "epoch": 0.514018691588785, - "grad_norm": 0.5479240671596419, - "learning_rate": 0.0001999616128313072, - "loss": 0.9772, - "step": 1045 - }, - { - "epoch": 0.514510575504181, - "grad_norm": 0.5028138794727268, - "learning_rate": 0.00019995988789362924, - "loss": 0.9436, - "step": 1046 - }, - { - "epoch": 0.515002459419577, - "grad_norm": 0.5192800745783225, - "learning_rate": 0.00019995812505548235, - "loss": 1.0119, - "step": 1047 - }, - { - "epoch": 0.5154943433349729, - "grad_norm": 0.5052205027465987, - "learning_rate": 0.0001999563243175349, - "loss": 0.965, - "step": 1048 - }, - { - "epoch": 0.5159862272503689, - "grad_norm": 0.5076807151419906, - "learning_rate": 0.00019995448568046968, - "loss": 1.0358, - "step": 1049 - }, - { - "epoch": 0.5164781111657649, - "grad_norm": 0.5583453877277026, - "learning_rate": 0.0001999526091449838, - "loss": 0.9359, - "step": 1050 - }, - { - "epoch": 0.5169699950811608, - "grad_norm": 0.4883208945615239, - "learning_rate": 0.00019995069471178873, - "loss": 0.8411, - "step": 1051 - }, - { - "epoch": 0.5174618789965568, - "grad_norm": 0.48140272826016023, - "learning_rate": 0.0001999487423816104, - "loss": 0.9294, - "step": 1052 - }, - { - "epoch": 0.5179537629119528, - "grad_norm": 0.49741183449374904, - "learning_rate": 0.00019994675215518904, - "loss": 1.0319, - "step": 1053 - }, - { - "epoch": 0.5184456468273487, - "grad_norm": 0.5049671413313106, - "learning_rate": 0.00019994472403327924, - "loss": 0.9564, - "step": 1054 - }, - { - "epoch": 0.5189375307427447, - "grad_norm": 0.5014596762915716, - "learning_rate": 0.00019994265801664995, - "loss": 1.0328, - "step": 1055 - }, - { - "epoch": 0.5194294146581406, - "grad_norm": 0.5439278599357487, - "learning_rate": 0.0001999405541060846, - "loss": 0.9626, - "step": 1056 - }, - { - "epoch": 0.5199212985735366, - "grad_norm": 0.516105235953036, - "learning_rate": 0.0001999384123023808, - "loss": 0.9725, - "step": 1057 - }, - { - "epoch": 0.5204131824889326, - "grad_norm": 0.5134502517325076, - "learning_rate": 0.0001999362326063507, - "loss": 0.9918, - "step": 1058 - }, - { - "epoch": 0.5209050664043285, - "grad_norm": 1.130623670607306, - "learning_rate": 0.0001999340150188207, - "loss": 0.9511, - "step": 1059 - }, - { - "epoch": 0.5213969503197246, - "grad_norm": 0.571277939096185, - "learning_rate": 0.00019993175954063162, - "loss": 1.0004, - "step": 1060 - }, - { - "epoch": 0.5218888342351206, - "grad_norm": 0.5638754527966803, - "learning_rate": 0.0001999294661726387, - "loss": 1.0458, - "step": 1061 - }, - { - "epoch": 0.5223807181505165, - "grad_norm": 0.4915903459752852, - "learning_rate": 0.00019992713491571141, - "loss": 0.9286, - "step": 1062 - }, - { - "epoch": 0.5228726020659125, - "grad_norm": 0.526938834777462, - "learning_rate": 0.00019992476577073372, - "loss": 0.9886, - "step": 1063 - }, - { - "epoch": 0.5233644859813084, - "grad_norm": 0.4815199240735547, - "learning_rate": 0.00019992235873860387, - "loss": 0.9171, - "step": 1064 - }, - { - "epoch": 0.5238563698967044, - "grad_norm": 0.5525309180666556, - "learning_rate": 0.0001999199138202345, - "loss": 0.9285, - "step": 1065 - }, - { - "epoch": 0.5243482538121004, - "grad_norm": 0.4876592580799642, - "learning_rate": 0.0001999174310165526, - "loss": 0.9306, - "step": 1066 - }, - { - "epoch": 0.5248401377274963, - "grad_norm": 0.4956962542956257, - "learning_rate": 0.00019991491032849963, - "loss": 0.952, - "step": 1067 - }, - { - "epoch": 0.5253320216428923, - "grad_norm": 0.5124942889923908, - "learning_rate": 0.00019991235175703125, - "loss": 0.9061, - "step": 1068 - }, - { - "epoch": 0.5258239055582883, - "grad_norm": 0.5240364330047325, - "learning_rate": 0.0001999097553031176, - "loss": 0.9607, - "step": 1069 - }, - { - "epoch": 0.5263157894736842, - "grad_norm": 2.4267417582396087, - "learning_rate": 0.0001999071209677431, - "loss": 1.0853, - "step": 1070 - }, - { - "epoch": 0.5268076733890802, - "grad_norm": 0.6139702116859586, - "learning_rate": 0.00019990444875190658, - "loss": 1.0365, - "step": 1071 - }, - { - "epoch": 0.5272995573044762, - "grad_norm": 0.519507374423165, - "learning_rate": 0.00019990173865662124, - "loss": 0.9761, - "step": 1072 - }, - { - "epoch": 0.5277914412198721, - "grad_norm": 0.5584447614688508, - "learning_rate": 0.00019989899068291467, - "loss": 0.9908, - "step": 1073 - }, - { - "epoch": 0.5282833251352681, - "grad_norm": 0.6773970980327138, - "learning_rate": 0.00019989620483182874, - "loss": 1.0332, - "step": 1074 - }, - { - "epoch": 0.528775209050664, - "grad_norm": 0.5557178461152342, - "learning_rate": 0.00019989338110441973, - "loss": 0.9724, - "step": 1075 - }, - { - "epoch": 0.52926709296606, - "grad_norm": 0.5603461502619792, - "learning_rate": 0.00019989051950175827, - "loss": 0.9609, - "step": 1076 - }, - { - "epoch": 0.529758976881456, - "grad_norm": 0.6223679108164312, - "learning_rate": 0.00019988762002492937, - "loss": 1.0384, - "step": 1077 - }, - { - "epoch": 0.5302508607968519, - "grad_norm": 0.5221436427607583, - "learning_rate": 0.0001998846826750324, - "loss": 0.9731, - "step": 1078 - }, - { - "epoch": 0.5307427447122479, - "grad_norm": 0.5006124844249952, - "learning_rate": 0.00019988170745318103, - "loss": 1.029, - "step": 1079 - }, - { - "epoch": 0.5312346286276439, - "grad_norm": 0.6447036478746674, - "learning_rate": 0.00019987869436050338, - "loss": 0.9187, - "step": 1080 - }, - { - "epoch": 0.5317265125430398, - "grad_norm": 0.4770607221759239, - "learning_rate": 0.00019987564339814183, - "loss": 0.894, - "step": 1081 - }, - { - "epoch": 0.5322183964584358, - "grad_norm": 0.5530214993105562, - "learning_rate": 0.00019987255456725326, - "loss": 0.966, - "step": 1082 - }, - { - "epoch": 0.5327102803738317, - "grad_norm": 0.49400308451337255, - "learning_rate": 0.00019986942786900876, - "loss": 0.9563, - "step": 1083 - }, - { - "epoch": 0.5332021642892277, - "grad_norm": 0.5066660160933474, - "learning_rate": 0.00019986626330459383, - "loss": 0.9655, - "step": 1084 - }, - { - "epoch": 0.5336940482046237, - "grad_norm": 0.48185408793585677, - "learning_rate": 0.00019986306087520838, - "loss": 0.8772, - "step": 1085 - }, - { - "epoch": 0.5341859321200196, - "grad_norm": 0.5570005012138782, - "learning_rate": 0.0001998598205820666, - "loss": 1.0364, - "step": 1086 - }, - { - "epoch": 0.5346778160354156, - "grad_norm": 0.4649427073411664, - "learning_rate": 0.00019985654242639709, - "loss": 0.9675, - "step": 1087 - }, - { - "epoch": 0.5351696999508117, - "grad_norm": 0.5067231805794385, - "learning_rate": 0.00019985322640944276, - "loss": 0.9838, - "step": 1088 - }, - { - "epoch": 0.5356615838662075, - "grad_norm": 0.5434555214676685, - "learning_rate": 0.0001998498725324609, - "loss": 0.9817, - "step": 1089 - }, - { - "epoch": 0.5361534677816036, - "grad_norm": 0.5006330847052317, - "learning_rate": 0.0001998464807967232, - "loss": 0.9275, - "step": 1090 - }, - { - "epoch": 0.5366453516969996, - "grad_norm": 0.48403912095323887, - "learning_rate": 0.0001998430512035156, - "loss": 0.98, - "step": 1091 - }, - { - "epoch": 0.5371372356123955, - "grad_norm": 0.8596836891088817, - "learning_rate": 0.0001998395837541385, - "loss": 0.9859, - "step": 1092 - }, - { - "epoch": 0.5376291195277915, - "grad_norm": 0.5041580903732553, - "learning_rate": 0.00019983607844990662, - "loss": 1.0673, - "step": 1093 - }, - { - "epoch": 0.5381210034431874, - "grad_norm": 0.5221800187321489, - "learning_rate": 0.00019983253529214892, - "loss": 0.9914, - "step": 1094 - }, - { - "epoch": 0.5386128873585834, - "grad_norm": 0.5192987066849946, - "learning_rate": 0.00019982895428220893, - "loss": 0.9408, - "step": 1095 - }, - { - "epoch": 0.5391047712739794, - "grad_norm": 0.8540398302865382, - "learning_rate": 0.00019982533542144438, - "loss": 0.9425, - "step": 1096 - }, - { - "epoch": 0.5395966551893753, - "grad_norm": 0.5089958604203334, - "learning_rate": 0.0001998216787112273, - "loss": 0.9503, - "step": 1097 - }, - { - "epoch": 0.5400885391047713, - "grad_norm": 0.5214487492607818, - "learning_rate": 0.0001998179841529443, - "loss": 0.9329, - "step": 1098 - }, - { - "epoch": 0.5405804230201673, - "grad_norm": 0.5145236945942359, - "learning_rate": 0.00019981425174799607, - "loss": 0.976, - "step": 1099 - }, - { - "epoch": 0.5410723069355632, - "grad_norm": 0.5933597180220942, - "learning_rate": 0.00019981048149779785, - "loss": 0.9122, - "step": 1100 - }, - { - "epoch": 0.5415641908509592, - "grad_norm": 1.0519914958129526, - "learning_rate": 0.00019980667340377915, - "loss": 1.0156, - "step": 1101 - }, - { - "epoch": 0.5420560747663551, - "grad_norm": 0.5179856400801137, - "learning_rate": 0.00019980282746738385, - "loss": 0.9248, - "step": 1102 - }, - { - "epoch": 0.5425479586817511, - "grad_norm": 0.5259129704713565, - "learning_rate": 0.00019979894369007007, - "loss": 0.9591, - "step": 1103 - }, - { - "epoch": 0.5430398425971471, - "grad_norm": 0.523228801819585, - "learning_rate": 0.00019979502207331047, - "loss": 1.023, - "step": 1104 - }, - { - "epoch": 0.543531726512543, - "grad_norm": 1.138365192252825, - "learning_rate": 0.00019979106261859192, - "loss": 0.9991, - "step": 1105 - }, - { - "epoch": 0.544023610427939, - "grad_norm": 0.509377639787038, - "learning_rate": 0.0001997870653274157, - "loss": 0.9631, - "step": 1106 - }, - { - "epoch": 0.544515494343335, - "grad_norm": 0.5211249290231439, - "learning_rate": 0.00019978303020129734, - "loss": 0.9886, - "step": 1107 - }, - { - "epoch": 0.5450073782587309, - "grad_norm": 0.5462673297624403, - "learning_rate": 0.00019977895724176685, - "loss": 1.1203, - "step": 1108 - }, - { - "epoch": 0.5454992621741269, - "grad_norm": 0.5584142311810938, - "learning_rate": 0.0001997748464503685, - "loss": 0.9207, - "step": 1109 - }, - { - "epoch": 0.5459911460895229, - "grad_norm": 0.543254237701427, - "learning_rate": 0.00019977069782866092, - "loss": 1.0187, - "step": 1110 - }, - { - "epoch": 0.5464830300049188, - "grad_norm": 0.7324250838584815, - "learning_rate": 0.00019976651137821713, - "loss": 1.044, - "step": 1111 - }, - { - "epoch": 0.5469749139203148, - "grad_norm": 0.5505484318466812, - "learning_rate": 0.0001997622871006244, - "loss": 0.9891, - "step": 1112 - }, - { - "epoch": 0.5474667978357107, - "grad_norm": 0.6030395790239121, - "learning_rate": 0.00019975802499748438, - "loss": 0.9314, - "step": 1113 - }, - { - "epoch": 0.5479586817511067, - "grad_norm": 0.5111429687043084, - "learning_rate": 0.00019975372507041313, - "loss": 0.9423, - "step": 1114 - }, - { - "epoch": 0.5484505656665027, - "grad_norm": 0.5204916826717528, - "learning_rate": 0.00019974938732104095, - "loss": 0.9904, - "step": 1115 - }, - { - "epoch": 0.5489424495818986, - "grad_norm": 0.54202739853523, - "learning_rate": 0.00019974501175101258, - "loss": 1.0219, - "step": 1116 - }, - { - "epoch": 0.5494343334972946, - "grad_norm": 0.5908784365319103, - "learning_rate": 0.00019974059836198697, - "loss": 0.9277, - "step": 1117 - }, - { - "epoch": 0.5499262174126907, - "grad_norm": 0.5817756149253414, - "learning_rate": 0.00019973614715563758, - "loss": 0.9432, - "step": 1118 - }, - { - "epoch": 0.5504181013280866, - "grad_norm": 3.5849082606997693, - "learning_rate": 0.00019973165813365204, - "loss": 0.9221, - "step": 1119 - }, - { - "epoch": 0.5509099852434826, - "grad_norm": 0.5720745065681223, - "learning_rate": 0.00019972713129773242, - "loss": 1.0715, - "step": 1120 - }, - { - "epoch": 0.5514018691588785, - "grad_norm": 0.5050276723287347, - "learning_rate": 0.00019972256664959514, - "loss": 0.9377, - "step": 1121 - }, - { - "epoch": 0.5518937530742745, - "grad_norm": 0.5225068931361693, - "learning_rate": 0.00019971796419097082, - "loss": 1.008, - "step": 1122 - }, - { - "epoch": 0.5523856369896705, - "grad_norm": 0.523775827439192, - "learning_rate": 0.0001997133239236046, - "loss": 0.9946, - "step": 1123 - }, - { - "epoch": 0.5528775209050664, - "grad_norm": 0.48384120477333176, - "learning_rate": 0.00019970864584925582, - "loss": 0.9144, - "step": 1124 - }, - { - "epoch": 0.5533694048204624, - "grad_norm": 0.5114787640705357, - "learning_rate": 0.00019970392996969825, - "loss": 1.0152, - "step": 1125 - }, - { - "epoch": 0.5538612887358584, - "grad_norm": 0.6021346846711332, - "learning_rate": 0.0001996991762867199, - "loss": 0.9429, - "step": 1126 - }, - { - "epoch": 0.5543531726512543, - "grad_norm": 0.5523618315595479, - "learning_rate": 0.00019969438480212318, - "loss": 1.0008, - "step": 1127 - }, - { - "epoch": 0.5548450565666503, - "grad_norm": 0.5380313924950736, - "learning_rate": 0.00019968955551772483, - "loss": 1.0406, - "step": 1128 - }, - { - "epoch": 0.5553369404820462, - "grad_norm": 0.48910840408788725, - "learning_rate": 0.00019968468843535592, - "loss": 0.9843, - "step": 1129 - }, - { - "epoch": 0.5558288243974422, - "grad_norm": 0.5355562704481304, - "learning_rate": 0.00019967978355686175, - "loss": 0.9486, - "step": 1130 - }, - { - "epoch": 0.5563207083128382, - "grad_norm": 1.016066263257823, - "learning_rate": 0.00019967484088410212, - "loss": 1.0011, - "step": 1131 - }, - { - "epoch": 0.5568125922282341, - "grad_norm": 0.5723587103276893, - "learning_rate": 0.00019966986041895107, - "loss": 1.0672, - "step": 1132 - }, - { - "epoch": 0.5573044761436301, - "grad_norm": 0.5703152223843464, - "learning_rate": 0.00019966484216329695, - "loss": 1.051, - "step": 1133 - }, - { - "epoch": 0.5577963600590261, - "grad_norm": 2.282358305729226, - "learning_rate": 0.00019965978611904248, - "loss": 1.0211, - "step": 1134 - }, - { - "epoch": 0.558288243974422, - "grad_norm": 0.5525840883553625, - "learning_rate": 0.00019965469228810467, - "loss": 0.962, - "step": 1135 - }, - { - "epoch": 0.558780127889818, - "grad_norm": 0.5215527967589574, - "learning_rate": 0.00019964956067241492, - "loss": 0.9639, - "step": 1136 - }, - { - "epoch": 0.559272011805214, - "grad_norm": 0.7241043949117516, - "learning_rate": 0.0001996443912739189, - "loss": 1.0431, - "step": 1137 - }, - { - "epoch": 0.5597638957206099, - "grad_norm": 0.6687373612723888, - "learning_rate": 0.0001996391840945766, - "loss": 0.9453, - "step": 1138 - }, - { - "epoch": 0.5602557796360059, - "grad_norm": 0.5545674299287187, - "learning_rate": 0.0001996339391363624, - "loss": 1.0497, - "step": 1139 - }, - { - "epoch": 0.5607476635514018, - "grad_norm": 0.5352437956107986, - "learning_rate": 0.00019962865640126495, - "loss": 0.9849, - "step": 1140 - }, - { - "epoch": 0.5612395474667978, - "grad_norm": 0.6055837858580955, - "learning_rate": 0.0001996233358912872, - "loss": 0.9791, - "step": 1141 - }, - { - "epoch": 0.5617314313821938, - "grad_norm": 0.4846389948189242, - "learning_rate": 0.0001996179776084465, - "loss": 0.9185, - "step": 1142 - }, - { - "epoch": 0.5622233152975897, - "grad_norm": 0.5139802802931714, - "learning_rate": 0.00019961258155477447, - "loss": 0.966, - "step": 1143 - }, - { - "epoch": 0.5627151992129857, - "grad_norm": 0.6870938004187139, - "learning_rate": 0.00019960714773231702, - "loss": 0.9575, - "step": 1144 - }, - { - "epoch": 0.5632070831283817, - "grad_norm": 0.5113594529829487, - "learning_rate": 0.00019960167614313447, - "loss": 0.9578, - "step": 1145 - }, - { - "epoch": 0.5636989670437776, - "grad_norm": 3.9031183834515217, - "learning_rate": 0.0001995961667893014, - "loss": 1.0727, - "step": 1146 - }, - { - "epoch": 0.5641908509591737, - "grad_norm": 0.5574019687648437, - "learning_rate": 0.00019959061967290672, - "loss": 0.8689, - "step": 1147 - }, - { - "epoch": 0.5646827348745695, - "grad_norm": 0.5221563977062531, - "learning_rate": 0.0001995850347960536, - "loss": 1.0548, - "step": 1148 - }, - { - "epoch": 0.5651746187899656, - "grad_norm": 0.5267773017129473, - "learning_rate": 0.00019957941216085966, - "loss": 0.9983, - "step": 1149 - }, - { - "epoch": 0.5656665027053616, - "grad_norm": 0.526975611698099, - "learning_rate": 0.00019957375176945675, - "loss": 1.0295, - "step": 1150 - }, - { - "epoch": 0.5661583866207575, - "grad_norm": 0.577297577131239, - "learning_rate": 0.000199568053623991, - "loss": 0.9194, - "step": 1151 - }, - { - "epoch": 0.5666502705361535, - "grad_norm": 0.502403890036141, - "learning_rate": 0.00019956231772662292, - "loss": 0.8761, - "step": 1152 - }, - { - "epoch": 0.5671421544515495, - "grad_norm": 0.8534168950724117, - "learning_rate": 0.00019955654407952735, - "loss": 0.9752, - "step": 1153 - }, - { - "epoch": 0.5676340383669454, - "grad_norm": 0.4922632287741393, - "learning_rate": 0.00019955073268489336, - "loss": 1.0172, - "step": 1154 - }, - { - "epoch": 0.5681259222823414, - "grad_norm": 0.5336179384604482, - "learning_rate": 0.0001995448835449244, - "loss": 1.0188, - "step": 1155 - }, - { - "epoch": 0.5686178061977374, - "grad_norm": 0.4844042264946027, - "learning_rate": 0.00019953899666183824, - "loss": 0.9233, - "step": 1156 - }, - { - "epoch": 0.5691096901131333, - "grad_norm": 0.507882364764521, - "learning_rate": 0.00019953307203786688, - "loss": 0.957, - "step": 1157 - }, - { - "epoch": 0.5696015740285293, - "grad_norm": 0.591541215870336, - "learning_rate": 0.0001995271096752567, - "loss": 0.9356, - "step": 1158 - }, - { - "epoch": 0.5700934579439252, - "grad_norm": 0.6186382260669341, - "learning_rate": 0.0001995211095762684, - "loss": 0.9369, - "step": 1159 - }, - { - "epoch": 0.5705853418593212, - "grad_norm": 0.5616378717608467, - "learning_rate": 0.0001995150717431769, - "loss": 0.9946, - "step": 1160 - }, - { - "epoch": 0.5710772257747172, - "grad_norm": 0.5799201922520058, - "learning_rate": 0.00019950899617827153, - "loss": 1.0119, - "step": 1161 - }, - { - "epoch": 0.5715691096901131, - "grad_norm": 0.5253536377945148, - "learning_rate": 0.00019950288288385587, - "loss": 1.0241, - "step": 1162 - }, - { - "epoch": 0.5720609936055091, - "grad_norm": 0.5309270499178786, - "learning_rate": 0.00019949673186224782, - "loss": 1.0884, - "step": 1163 - }, - { - "epoch": 0.5725528775209051, - "grad_norm": 0.47403768845019634, - "learning_rate": 0.00019949054311577957, - "loss": 0.8939, - "step": 1164 - }, - { - "epoch": 0.573044761436301, - "grad_norm": 0.47354505756863635, - "learning_rate": 0.00019948431664679764, - "loss": 0.9801, - "step": 1165 - }, - { - "epoch": 0.573536645351697, - "grad_norm": 1.58911521402257, - "learning_rate": 0.0001994780524576628, - "loss": 0.9705, - "step": 1166 - }, - { - "epoch": 0.5740285292670929, - "grad_norm": 0.7408555909645079, - "learning_rate": 0.00019947175055075021, - "loss": 1.0306, - "step": 1167 - }, - { - "epoch": 0.5745204131824889, - "grad_norm": 0.5432257419818766, - "learning_rate": 0.0001994654109284493, - "loss": 0.976, - "step": 1168 - }, - { - "epoch": 0.5750122970978849, - "grad_norm": 0.49371878758607346, - "learning_rate": 0.00019945903359316368, - "loss": 0.913, - "step": 1169 - }, - { - "epoch": 0.5755041810132808, - "grad_norm": 0.5461157822068061, - "learning_rate": 0.0001994526185473115, - "loss": 1.0219, - "step": 1170 - }, - { - "epoch": 0.5759960649286768, - "grad_norm": 0.5938609107667081, - "learning_rate": 0.0001994461657933249, - "loss": 0.9624, - "step": 1171 - }, - { - "epoch": 0.5764879488440728, - "grad_norm": 2.2098702053471238, - "learning_rate": 0.00019943967533365061, - "loss": 1.1616, - "step": 1172 - }, - { - "epoch": 0.5769798327594687, - "grad_norm": 0.5867682418120316, - "learning_rate": 0.00019943314717074952, - "loss": 0.9791, - "step": 1173 - }, - { - "epoch": 0.5774717166748647, - "grad_norm": 0.6149364776768556, - "learning_rate": 0.0001994265813070968, - "loss": 1.045, - "step": 1174 - }, - { - "epoch": 0.5779636005902608, - "grad_norm": 0.580305060422531, - "learning_rate": 0.00019941997774518193, - "loss": 1.0056, - "step": 1175 - }, - { - "epoch": 0.5784554845056566, - "grad_norm": 0.576886147710524, - "learning_rate": 0.0001994133364875087, - "loss": 0.9277, - "step": 1176 - }, - { - "epoch": 0.5789473684210527, - "grad_norm": 0.5761376880421364, - "learning_rate": 0.00019940665753659522, - "loss": 1.007, - "step": 1177 - }, - { - "epoch": 0.5794392523364486, - "grad_norm": 0.538725198753228, - "learning_rate": 0.00019939994089497384, - "loss": 0.9512, - "step": 1178 - }, - { - "epoch": 0.5799311362518446, - "grad_norm": 0.5906770112529988, - "learning_rate": 0.00019939318656519119, - "loss": 1.0418, - "step": 1179 - }, - { - "epoch": 0.5804230201672406, - "grad_norm": 0.5235094859182425, - "learning_rate": 0.00019938639454980826, - "loss": 0.9882, - "step": 1180 - }, - { - "epoch": 0.5809149040826365, - "grad_norm": 0.5289012994896675, - "learning_rate": 0.0001993795648514003, - "loss": 0.9857, - "step": 1181 - }, - { - "epoch": 0.5814067879980325, - "grad_norm": 0.5185599758845685, - "learning_rate": 0.0001993726974725568, - "loss": 0.9397, - "step": 1182 - }, - { - "epoch": 0.5818986719134285, - "grad_norm": 0.4900753841084743, - "learning_rate": 0.0001993657924158816, - "loss": 0.9016, - "step": 1183 - }, - { - "epoch": 0.5823905558288244, - "grad_norm": 0.5322886410713817, - "learning_rate": 0.00019935884968399277, - "loss": 0.9766, - "step": 1184 - }, - { - "epoch": 0.5828824397442204, - "grad_norm": 0.49780528534311047, - "learning_rate": 0.0001993518692795227, - "loss": 1.0136, - "step": 1185 - }, - { - "epoch": 0.5833743236596163, - "grad_norm": 0.5118283118364332, - "learning_rate": 0.0001993448512051181, - "loss": 1.0426, - "step": 1186 - }, - { - "epoch": 0.5838662075750123, - "grad_norm": 0.5038756348443018, - "learning_rate": 0.00019933779546343983, - "loss": 1.017, - "step": 1187 - }, - { - "epoch": 0.5843580914904083, - "grad_norm": 0.5377683615665799, - "learning_rate": 0.00019933070205716328, - "loss": 0.9851, - "step": 1188 - }, - { - "epoch": 0.5848499754058042, - "grad_norm": 0.4875958919404516, - "learning_rate": 0.00019932357098897775, - "loss": 0.9638, - "step": 1189 - }, - { - "epoch": 0.5853418593212002, - "grad_norm": 0.6352966387789497, - "learning_rate": 0.0001993164022615872, - "loss": 0.9676, - "step": 1190 - }, - { - "epoch": 0.5858337432365962, - "grad_norm": 0.5477111727893049, - "learning_rate": 0.00019930919587770967, - "loss": 1.035, - "step": 1191 - }, - { - "epoch": 0.5863256271519921, - "grad_norm": 0.49050963486189864, - "learning_rate": 0.00019930195184007747, - "loss": 0.9813, - "step": 1192 - }, - { - "epoch": 0.5868175110673881, - "grad_norm": 1.6622331873903418, - "learning_rate": 0.00019929467015143723, - "loss": 1.0632, - "step": 1193 - }, - { - "epoch": 0.5873093949827841, - "grad_norm": 0.5400732997976715, - "learning_rate": 0.00019928735081454986, - "loss": 1.0002, - "step": 1194 - }, - { - "epoch": 0.58780127889818, - "grad_norm": 0.5174335529269121, - "learning_rate": 0.00019927999383219055, - "loss": 1.0093, - "step": 1195 - }, - { - "epoch": 0.588293162813576, - "grad_norm": 0.5367307503099527, - "learning_rate": 0.00019927259920714873, - "loss": 0.9744, - "step": 1196 - }, - { - "epoch": 0.5887850467289719, - "grad_norm": 0.510879953323215, - "learning_rate": 0.00019926516694222817, - "loss": 0.9288, - "step": 1197 - }, - { - "epoch": 0.5892769306443679, - "grad_norm": 0.5591430284311459, - "learning_rate": 0.0001992576970402468, - "loss": 1.0922, - "step": 1198 - }, - { - "epoch": 0.5897688145597639, - "grad_norm": 0.68786396492244, - "learning_rate": 0.00019925018950403688, - "loss": 1.0722, - "step": 1199 - }, - { - "epoch": 0.5902606984751598, - "grad_norm": 0.5197894423013034, - "learning_rate": 0.000199242644336445, - "loss": 0.9052, - "step": 1200 - }, - { - "epoch": 0.5907525823905558, - "grad_norm": 0.540254752280019, - "learning_rate": 0.00019923506154033197, - "loss": 0.9469, - "step": 1201 - }, - { - "epoch": 0.5912444663059518, - "grad_norm": 0.5093910387920063, - "learning_rate": 0.00019922744111857278, - "loss": 1.0185, - "step": 1202 - }, - { - "epoch": 0.5917363502213477, - "grad_norm": 0.46809841121528273, - "learning_rate": 0.00019921978307405684, - "loss": 0.9627, - "step": 1203 - }, - { - "epoch": 0.5922282341367437, - "grad_norm": 0.504913090098908, - "learning_rate": 0.00019921208740968769, - "loss": 1.0126, - "step": 1204 - }, - { - "epoch": 0.5927201180521396, - "grad_norm": 0.8418977179759569, - "learning_rate": 0.00019920435412838322, - "loss": 0.9788, - "step": 1205 - }, - { - "epoch": 0.5932120019675357, - "grad_norm": 0.5038224844723608, - "learning_rate": 0.00019919658323307559, - "loss": 0.9563, - "step": 1206 - }, - { - "epoch": 0.5937038858829317, - "grad_norm": 0.5071611037513739, - "learning_rate": 0.0001991887747267111, - "loss": 0.9306, - "step": 1207 - }, - { - "epoch": 0.5941957697983276, - "grad_norm": 0.4857967851656961, - "learning_rate": 0.0001991809286122505, - "loss": 0.8986, - "step": 1208 - }, - { - "epoch": 0.5946876537137236, - "grad_norm": 0.47897108328724575, - "learning_rate": 0.0001991730448926686, - "loss": 0.9278, - "step": 1209 - }, - { - "epoch": 0.5951795376291196, - "grad_norm": 0.4990242027411677, - "learning_rate": 0.00019916512357095467, - "loss": 0.9378, - "step": 1210 - }, - { - "epoch": 0.5956714215445155, - "grad_norm": 0.505568450647096, - "learning_rate": 0.00019915716465011208, - "loss": 1.0102, - "step": 1211 - }, - { - "epoch": 0.5961633054599115, - "grad_norm": 0.4829731223656389, - "learning_rate": 0.00019914916813315844, - "loss": 0.9408, - "step": 1212 - }, - { - "epoch": 0.5966551893753075, - "grad_norm": 0.4668168249073365, - "learning_rate": 0.00019914113402312583, - "loss": 0.9245, - "step": 1213 - }, - { - "epoch": 0.5971470732907034, - "grad_norm": 0.5323856425343063, - "learning_rate": 0.0001991330623230603, - "loss": 0.8927, - "step": 1214 - }, - { - "epoch": 0.5976389572060994, - "grad_norm": 0.5509541021295224, - "learning_rate": 0.00019912495303602237, - "loss": 1.0505, - "step": 1215 - }, - { - "epoch": 0.5981308411214953, - "grad_norm": 0.488808472489773, - "learning_rate": 0.00019911680616508672, - "loss": 0.9372, - "step": 1216 - }, - { - "epoch": 0.5986227250368913, - "grad_norm": 0.5384313294484143, - "learning_rate": 0.00019910862171334227, - "loss": 0.9781, - "step": 1217 - }, - { - "epoch": 0.5991146089522873, - "grad_norm": 0.5379481451615976, - "learning_rate": 0.00019910039968389223, - "loss": 0.9996, - "step": 1218 - }, - { - "epoch": 0.5996064928676832, - "grad_norm": 0.4961902093761553, - "learning_rate": 0.00019909214007985402, - "loss": 0.9365, - "step": 1219 - }, - { - "epoch": 0.6000983767830792, - "grad_norm": 0.5302134253201395, - "learning_rate": 0.00019908384290435934, - "loss": 0.9992, - "step": 1220 - }, - { - "epoch": 0.6005902606984752, - "grad_norm": 0.5672680562736032, - "learning_rate": 0.00019907550816055408, - "loss": 1.0397, - "step": 1221 - }, - { - "epoch": 0.6010821446138711, - "grad_norm": 0.46181095225446894, - "learning_rate": 0.00019906713585159848, - "loss": 0.9246, - "step": 1222 - }, - { - "epoch": 0.6015740285292671, - "grad_norm": 0.5811155767195028, - "learning_rate": 0.00019905872598066692, - "loss": 1.0781, - "step": 1223 - }, - { - "epoch": 0.602065912444663, - "grad_norm": 0.49787784398474455, - "learning_rate": 0.00019905027855094808, - "loss": 0.9936, - "step": 1224 - }, - { - "epoch": 0.602557796360059, - "grad_norm": 0.5149302314602863, - "learning_rate": 0.00019904179356564482, - "loss": 0.9727, - "step": 1225 - }, - { - "epoch": 0.603049680275455, - "grad_norm": 0.5116850293447144, - "learning_rate": 0.00019903327102797433, - "loss": 0.9836, - "step": 1226 - }, - { - "epoch": 0.6035415641908509, - "grad_norm": 0.485077962795156, - "learning_rate": 0.00019902471094116795, - "loss": 0.9013, - "step": 1227 - }, - { - "epoch": 0.6040334481062469, - "grad_norm": 0.5385744118882313, - "learning_rate": 0.00019901611330847132, - "loss": 0.9741, - "step": 1228 - }, - { - "epoch": 0.6045253320216429, - "grad_norm": 0.4745263928640794, - "learning_rate": 0.00019900747813314426, - "loss": 0.9313, - "step": 1229 - }, - { - "epoch": 0.6050172159370388, - "grad_norm": 0.5045603418455769, - "learning_rate": 0.0001989988054184609, - "loss": 0.9431, - "step": 1230 - }, - { - "epoch": 0.6055090998524348, - "grad_norm": 0.5058346863207296, - "learning_rate": 0.00019899009516770953, - "loss": 1.0214, - "step": 1231 - }, - { - "epoch": 0.6060009837678307, - "grad_norm": 0.47596987066450536, - "learning_rate": 0.00019898134738419268, - "loss": 0.8772, - "step": 1232 - }, - { - "epoch": 0.6064928676832267, - "grad_norm": 0.4769610363097206, - "learning_rate": 0.00019897256207122719, - "loss": 1.0398, - "step": 1233 - }, - { - "epoch": 0.6069847515986228, - "grad_norm": 0.5993985205285812, - "learning_rate": 0.000198963739232144, - "loss": 0.9669, - "step": 1234 - }, - { - "epoch": 0.6074766355140186, - "grad_norm": 0.5040906972160556, - "learning_rate": 0.00019895487887028841, - "loss": 0.9493, - "step": 1235 - }, - { - "epoch": 0.6079685194294147, - "grad_norm": 0.4917920275499752, - "learning_rate": 0.00019894598098901988, - "loss": 0.9296, - "step": 1236 - }, - { - "epoch": 0.6084604033448107, - "grad_norm": 0.43862876354585245, - "learning_rate": 0.00019893704559171202, - "loss": 0.8733, - "step": 1237 - }, - { - "epoch": 0.6089522872602066, - "grad_norm": 0.5183451409292822, - "learning_rate": 0.00019892807268175283, - "loss": 1.0264, - "step": 1238 - }, - { - "epoch": 0.6094441711756026, - "grad_norm": 0.8865642362813331, - "learning_rate": 0.0001989190622625445, - "loss": 1.0417, - "step": 1239 - }, - { - "epoch": 0.6099360550909986, - "grad_norm": 0.5153707302595621, - "learning_rate": 0.00019891001433750325, - "loss": 0.97, - "step": 1240 - }, - { - "epoch": 0.6104279390063945, - "grad_norm": 0.5403104228621635, - "learning_rate": 0.00019890092891005975, - "loss": 1.0097, - "step": 1241 - }, - { - "epoch": 0.6109198229217905, - "grad_norm": 0.5487171784369699, - "learning_rate": 0.00019889180598365878, - "loss": 0.8768, - "step": 1242 - }, - { - "epoch": 0.6114117068371864, - "grad_norm": 0.5709273233038086, - "learning_rate": 0.00019888264556175935, - "loss": 0.9997, - "step": 1243 - }, - { - "epoch": 0.6119035907525824, - "grad_norm": 0.5536470447507988, - "learning_rate": 0.00019887344764783475, - "loss": 1.053, - "step": 1244 - }, - { - "epoch": 0.6123954746679784, - "grad_norm": 0.49447844921601386, - "learning_rate": 0.00019886421224537239, - "loss": 0.9231, - "step": 1245 - }, - { - "epoch": 0.6128873585833743, - "grad_norm": 0.47170407712589835, - "learning_rate": 0.0001988549393578739, - "loss": 0.8496, - "step": 1246 - }, - { - "epoch": 0.6133792424987703, - "grad_norm": 0.5020565154047474, - "learning_rate": 0.0001988456289888552, - "loss": 0.9665, - "step": 1247 - }, - { - "epoch": 0.6138711264141663, - "grad_norm": 0.5190702352179734, - "learning_rate": 0.0001988362811418464, - "loss": 1.01, - "step": 1248 - }, - { - "epoch": 0.6143630103295622, - "grad_norm": 0.4880703088966864, - "learning_rate": 0.00019882689582039175, - "loss": 0.904, - "step": 1249 - }, - { - "epoch": 0.6148548942449582, - "grad_norm": 0.47574493833336534, - "learning_rate": 0.0001988174730280498, - "loss": 0.9665, - "step": 1250 - }, - { - "epoch": 0.6153467781603541, - "grad_norm": 0.5148282636836543, - "learning_rate": 0.00019880801276839325, - "loss": 0.9884, - "step": 1251 - }, - { - "epoch": 0.6158386620757501, - "grad_norm": 0.5235160138996259, - "learning_rate": 0.000198798515045009, - "loss": 0.9887, - "step": 1252 - }, - { - "epoch": 0.6163305459911461, - "grad_norm": 0.5085056626140083, - "learning_rate": 0.00019878897986149824, - "loss": 0.9317, - "step": 1253 - }, - { - "epoch": 0.616822429906542, - "grad_norm": 0.5445136987605921, - "learning_rate": 0.0001987794072214762, - "loss": 1.0149, - "step": 1254 - }, - { - "epoch": 0.617314313821938, - "grad_norm": 0.4930655828797307, - "learning_rate": 0.0001987697971285725, - "loss": 0.9999, - "step": 1255 - }, - { - "epoch": 0.617806197737334, - "grad_norm": 0.465599374379101, - "learning_rate": 0.0001987601495864308, - "loss": 0.8981, - "step": 1256 - }, - { - "epoch": 0.6182980816527299, - "grad_norm": 0.5238286908374731, - "learning_rate": 0.0001987504645987091, - "loss": 0.9931, - "step": 1257 - }, - { - "epoch": 0.6187899655681259, - "grad_norm": 0.4479033955221958, - "learning_rate": 0.0001987407421690795, - "loss": 0.9034, - "step": 1258 - }, - { - "epoch": 0.6192818494835219, - "grad_norm": 0.46701498124585433, - "learning_rate": 0.0001987309823012283, - "loss": 0.9245, - "step": 1259 - }, - { - "epoch": 0.6197737333989178, - "grad_norm": 0.48555597397308403, - "learning_rate": 0.0001987211849988561, - "loss": 0.939, - "step": 1260 - }, - { - "epoch": 0.6202656173143138, - "grad_norm": 0.5381445047831256, - "learning_rate": 0.00019871135026567748, - "loss": 0.9292, - "step": 1261 - }, - { - "epoch": 0.6207575012297097, - "grad_norm": 0.5216798455491977, - "learning_rate": 0.00019870147810542148, - "loss": 0.9461, - "step": 1262 - }, - { - "epoch": 0.6212493851451057, - "grad_norm": 0.5029059724053576, - "learning_rate": 0.00019869156852183112, - "loss": 0.9747, - "step": 1263 - }, - { - "epoch": 0.6217412690605018, - "grad_norm": 0.47676542086643253, - "learning_rate": 0.00019868162151866371, - "loss": 0.9085, - "step": 1264 - }, - { - "epoch": 0.6222331529758977, - "grad_norm": 0.4891345162549392, - "learning_rate": 0.0001986716370996907, - "loss": 1.0671, - "step": 1265 - }, - { - "epoch": 0.6227250368912937, - "grad_norm": 0.6370750809827045, - "learning_rate": 0.0001986616152686978, - "loss": 0.9901, - "step": 1266 - }, - { - "epoch": 0.6232169208066897, - "grad_norm": 0.45380153940893, - "learning_rate": 0.0001986515560294848, - "loss": 0.9482, - "step": 1267 - }, - { - "epoch": 0.6237088047220856, - "grad_norm": 0.49341269333462745, - "learning_rate": 0.00019864145938586574, - "loss": 0.9426, - "step": 1268 - }, - { - "epoch": 0.6242006886374816, - "grad_norm": 0.5444189677286736, - "learning_rate": 0.00019863132534166886, - "loss": 1.0082, - "step": 1269 - }, - { - "epoch": 0.6246925725528775, - "grad_norm": 0.4401484698050124, - "learning_rate": 0.00019862115390073654, - "loss": 0.9203, - "step": 1270 - }, - { - "epoch": 0.6251844564682735, - "grad_norm": 0.559815902272095, - "learning_rate": 0.00019861094506692535, - "loss": 0.9326, - "step": 1271 - }, - { - "epoch": 0.6256763403836695, - "grad_norm": 0.5123525668852091, - "learning_rate": 0.00019860069884410604, - "loss": 0.9681, - "step": 1272 - }, - { - "epoch": 0.6261682242990654, - "grad_norm": 0.5021160581899634, - "learning_rate": 0.0001985904152361635, - "loss": 1.0304, - "step": 1273 - }, - { - "epoch": 0.6266601082144614, - "grad_norm": 0.6376400508102466, - "learning_rate": 0.00019858009424699686, - "loss": 0.9213, - "step": 1274 - }, - { - "epoch": 0.6271519921298574, - "grad_norm": 0.4733845157988784, - "learning_rate": 0.00019856973588051942, - "loss": 0.9236, - "step": 1275 - }, - { - "epoch": 0.6276438760452533, - "grad_norm": 0.49203738543908704, - "learning_rate": 0.00019855934014065857, - "loss": 1.0325, - "step": 1276 - }, - { - "epoch": 0.6281357599606493, - "grad_norm": 0.46907799363323144, - "learning_rate": 0.00019854890703135597, - "loss": 1.0026, - "step": 1277 - }, - { - "epoch": 0.6286276438760453, - "grad_norm": 0.45569539638688455, - "learning_rate": 0.00019853843655656737, - "loss": 0.9547, - "step": 1278 - }, - { - "epoch": 0.6291195277914412, - "grad_norm": 0.49872856225546003, - "learning_rate": 0.00019852792872026274, - "loss": 0.9291, - "step": 1279 - }, - { - "epoch": 0.6296114117068372, - "grad_norm": 0.5584940782919813, - "learning_rate": 0.0001985173835264262, - "loss": 0.947, - "step": 1280 - }, - { - "epoch": 0.6301032956222331, - "grad_norm": 0.5770565277492296, - "learning_rate": 0.00019850680097905602, - "loss": 0.9817, - "step": 1281 - }, - { - "epoch": 0.6305951795376291, - "grad_norm": 0.48890068341395715, - "learning_rate": 0.00019849618108216466, - "loss": 0.9959, - "step": 1282 - }, - { - "epoch": 0.6310870634530251, - "grad_norm": 0.49117862102377047, - "learning_rate": 0.00019848552383977872, - "loss": 0.9741, - "step": 1283 - }, - { - "epoch": 0.631578947368421, - "grad_norm": 0.5035023302634901, - "learning_rate": 0.00019847482925593895, - "loss": 0.9934, - "step": 1284 - }, - { - "epoch": 0.632070831283817, - "grad_norm": 0.4952634613565297, - "learning_rate": 0.0001984640973347003, - "loss": 0.8869, - "step": 1285 - }, - { - "epoch": 0.632562715199213, - "grad_norm": 0.49315288560193077, - "learning_rate": 0.00019845332808013182, - "loss": 0.9259, - "step": 1286 - }, - { - "epoch": 0.6330545991146089, - "grad_norm": 0.47740099661206553, - "learning_rate": 0.00019844252149631676, - "loss": 0.9996, - "step": 1287 - }, - { - "epoch": 0.6335464830300049, - "grad_norm": 0.46708786273246256, - "learning_rate": 0.00019843167758735254, - "loss": 0.9772, - "step": 1288 - }, - { - "epoch": 0.6340383669454008, - "grad_norm": 0.4705108678393681, - "learning_rate": 0.00019842079635735066, - "loss": 0.9669, - "step": 1289 - }, - { - "epoch": 0.6345302508607968, - "grad_norm": 0.5466072337028248, - "learning_rate": 0.00019840987781043684, - "loss": 0.9398, - "step": 1290 - }, - { - "epoch": 0.6350221347761928, - "grad_norm": 0.4874484074629659, - "learning_rate": 0.0001983989219507509, - "loss": 1.0452, - "step": 1291 - }, - { - "epoch": 0.6355140186915887, - "grad_norm": 0.46611852462326536, - "learning_rate": 0.0001983879287824468, - "loss": 0.8998, - "step": 1292 - }, - { - "epoch": 0.6360059026069848, - "grad_norm": 0.5081887752375128, - "learning_rate": 0.00019837689830969274, - "loss": 1.0055, - "step": 1293 - }, - { - "epoch": 0.6364977865223808, - "grad_norm": 0.4677023400470637, - "learning_rate": 0.00019836583053667095, - "loss": 0.9816, - "step": 1294 - }, - { - "epoch": 0.6369896704377767, - "grad_norm": 0.47492532520981673, - "learning_rate": 0.0001983547254675779, - "loss": 0.9368, - "step": 1295 - }, - { - "epoch": 0.6374815543531727, - "grad_norm": 0.49346995310388697, - "learning_rate": 0.0001983435831066241, - "loss": 0.9676, - "step": 1296 - }, - { - "epoch": 0.6379734382685687, - "grad_norm": 0.47036017948332953, - "learning_rate": 0.00019833240345803427, - "loss": 0.9913, - "step": 1297 - }, - { - "epoch": 0.6384653221839646, - "grad_norm": 0.48065973778314525, - "learning_rate": 0.00019832118652604727, - "loss": 0.8905, - "step": 1298 - }, - { - "epoch": 0.6389572060993606, - "grad_norm": 0.46647813373553965, - "learning_rate": 0.00019830993231491605, - "loss": 0.8878, - "step": 1299 - }, - { - "epoch": 0.6394490900147565, - "grad_norm": 0.5114555581077286, - "learning_rate": 0.00019829864082890772, - "loss": 0.9685, - "step": 1300 - }, - { - "epoch": 0.6399409739301525, - "grad_norm": 0.4965084945995025, - "learning_rate": 0.00019828731207230355, - "loss": 1.0056, - "step": 1301 - }, - { - "epoch": 0.6404328578455485, - "grad_norm": 0.5214245592678145, - "learning_rate": 0.0001982759460493989, - "loss": 1.0231, - "step": 1302 - }, - { - "epoch": 0.6409247417609444, - "grad_norm": 0.46714813612874007, - "learning_rate": 0.00019826454276450326, - "loss": 0.9114, - "step": 1303 - }, - { - "epoch": 0.6414166256763404, - "grad_norm": 0.49141817093497603, - "learning_rate": 0.00019825310222194026, - "loss": 0.9891, - "step": 1304 - }, - { - "epoch": 0.6419085095917364, - "grad_norm": 0.460648180573316, - "learning_rate": 0.0001982416244260477, - "loss": 0.9866, - "step": 1305 - }, - { - "epoch": 0.6424003935071323, - "grad_norm": 0.4887121271829397, - "learning_rate": 0.00019823010938117743, - "loss": 0.9695, - "step": 1306 - }, - { - "epoch": 0.6428922774225283, - "grad_norm": 0.5027267845537975, - "learning_rate": 0.00019821855709169545, - "loss": 0.921, - "step": 1307 - }, - { - "epoch": 0.6433841613379242, - "grad_norm": 0.48475309963053514, - "learning_rate": 0.00019820696756198193, - "loss": 0.9635, - "step": 1308 - }, - { - "epoch": 0.6438760452533202, - "grad_norm": 0.48365476545550495, - "learning_rate": 0.00019819534079643108, - "loss": 0.9205, - "step": 1309 - }, - { - "epoch": 0.6443679291687162, - "grad_norm": 0.5501815549047151, - "learning_rate": 0.00019818367679945128, - "loss": 1.0474, - "step": 1310 - }, - { - "epoch": 0.6448598130841121, - "grad_norm": 0.502069360256173, - "learning_rate": 0.000198171975575465, - "loss": 0.9187, - "step": 1311 - }, - { - "epoch": 0.6453516969995081, - "grad_norm": 0.48678101208708974, - "learning_rate": 0.0001981602371289089, - "loss": 0.9325, - "step": 1312 - }, - { - "epoch": 0.6458435809149041, - "grad_norm": 0.506409525085648, - "learning_rate": 0.0001981484614642336, - "loss": 0.9562, - "step": 1313 - }, - { - "epoch": 0.6463354648303, - "grad_norm": 0.4661370633764652, - "learning_rate": 0.00019813664858590397, - "loss": 0.9183, - "step": 1314 - }, - { - "epoch": 0.646827348745696, - "grad_norm": 0.4806886534468273, - "learning_rate": 0.00019812479849839893, - "loss": 0.9439, - "step": 1315 - }, - { - "epoch": 0.647319232661092, - "grad_norm": 0.47126460868823805, - "learning_rate": 0.00019811291120621155, - "loss": 0.951, - "step": 1316 - }, - { - "epoch": 0.6478111165764879, - "grad_norm": 0.4542406666227788, - "learning_rate": 0.00019810098671384894, - "loss": 0.9383, - "step": 1317 - }, - { - "epoch": 0.6483030004918839, - "grad_norm": 0.4994704028630951, - "learning_rate": 0.0001980890250258324, - "loss": 0.9752, - "step": 1318 - }, - { - "epoch": 0.6487948844072798, - "grad_norm": 0.4930725970805956, - "learning_rate": 0.00019807702614669723, - "loss": 0.9901, - "step": 1319 - }, - { - "epoch": 0.6492867683226758, - "grad_norm": 0.45835005141114954, - "learning_rate": 0.0001980649900809929, - "loss": 0.9519, - "step": 1320 - }, - { - "epoch": 0.6497786522380719, - "grad_norm": 0.48788496599573516, - "learning_rate": 0.000198052916833283, - "loss": 0.8987, - "step": 1321 - }, - { - "epoch": 0.6502705361534677, - "grad_norm": 0.5118753458003786, - "learning_rate": 0.00019804080640814514, - "loss": 0.9839, - "step": 1322 - }, - { - "epoch": 0.6507624200688638, - "grad_norm": 0.4831313513667437, - "learning_rate": 0.00019802865881017115, - "loss": 1.021, - "step": 1323 - }, - { - "epoch": 0.6512543039842598, - "grad_norm": 0.5060562265707746, - "learning_rate": 0.00019801647404396676, - "loss": 0.9929, - "step": 1324 - }, - { - "epoch": 0.6517461878996557, - "grad_norm": 0.4782416160343314, - "learning_rate": 0.00019800425211415195, - "loss": 1.0068, - "step": 1325 - }, - { - "epoch": 0.6522380718150517, - "grad_norm": 0.48910673323559956, - "learning_rate": 0.0001979919930253608, - "loss": 0.9691, - "step": 1326 - }, - { - "epoch": 0.6527299557304476, - "grad_norm": 0.548161295704732, - "learning_rate": 0.00019797969678224134, - "loss": 0.9615, - "step": 1327 - }, - { - "epoch": 0.6532218396458436, - "grad_norm": 0.4363558582637419, - "learning_rate": 0.00019796736338945588, - "loss": 0.8841, - "step": 1328 - }, - { - "epoch": 0.6537137235612396, - "grad_norm": 0.4364977172112859, - "learning_rate": 0.00019795499285168059, - "loss": 1.0105, - "step": 1329 - }, - { - "epoch": 0.6542056074766355, - "grad_norm": 0.49734668944721677, - "learning_rate": 0.00019794258517360594, - "loss": 1.0651, - "step": 1330 - }, - { - "epoch": 0.6546974913920315, - "grad_norm": 0.45890224119065426, - "learning_rate": 0.00019793014035993634, - "loss": 0.9658, - "step": 1331 - }, - { - "epoch": 0.6551893753074275, - "grad_norm": 0.47898528377522087, - "learning_rate": 0.0001979176584153903, - "loss": 1.0042, - "step": 1332 - }, - { - "epoch": 0.6556812592228234, - "grad_norm": 0.47836600015020336, - "learning_rate": 0.0001979051393447005, - "loss": 0.8972, - "step": 1333 - }, - { - "epoch": 0.6561731431382194, - "grad_norm": 0.6329808475765831, - "learning_rate": 0.0001978925831526136, - "loss": 0.9891, - "step": 1334 - }, - { - "epoch": 0.6566650270536154, - "grad_norm": 0.5212393325985413, - "learning_rate": 0.00019787998984389035, - "loss": 1.0377, - "step": 1335 - }, - { - "epoch": 0.6571569109690113, - "grad_norm": 0.5012305937786911, - "learning_rate": 0.00019786735942330558, - "loss": 0.9978, - "step": 1336 - }, - { - "epoch": 0.6576487948844073, - "grad_norm": 0.5522986750214255, - "learning_rate": 0.00019785469189564827, - "loss": 1.0115, - "step": 1337 - }, - { - "epoch": 0.6581406787998032, - "grad_norm": 0.6066439726715466, - "learning_rate": 0.0001978419872657213, - "loss": 1.03, - "step": 1338 - }, - { - "epoch": 0.6586325627151992, - "grad_norm": 0.46082593127000604, - "learning_rate": 0.00019782924553834178, - "loss": 0.959, - "step": 1339 - }, - { - "epoch": 0.6591244466305952, - "grad_norm": 0.5021208184636669, - "learning_rate": 0.00019781646671834083, - "loss": 0.9057, - "step": 1340 - }, - { - "epoch": 0.6596163305459911, - "grad_norm": 0.4844685339555825, - "learning_rate": 0.00019780365081056357, - "loss": 0.9454, - "step": 1341 - }, - { - "epoch": 0.6601082144613871, - "grad_norm": 1.9164930910785711, - "learning_rate": 0.00019779079781986932, - "loss": 0.9429, - "step": 1342 - }, - { - "epoch": 0.6606000983767831, - "grad_norm": 0.6216687573184569, - "learning_rate": 0.0001977779077511313, - "loss": 0.9998, - "step": 1343 - }, - { - "epoch": 0.661091982292179, - "grad_norm": 0.4977795406068352, - "learning_rate": 0.0001977649806092369, - "loss": 0.9701, - "step": 1344 - }, - { - "epoch": 0.661583866207575, - "grad_norm": 0.5243996014098755, - "learning_rate": 0.00019775201639908753, - "loss": 0.8677, - "step": 1345 - }, - { - "epoch": 0.6620757501229709, - "grad_norm": 0.4812749828993618, - "learning_rate": 0.00019773901512559866, - "loss": 0.9539, - "step": 1346 - }, - { - "epoch": 0.6625676340383669, - "grad_norm": 0.546259167840415, - "learning_rate": 0.00019772597679369982, - "loss": 0.9659, - "step": 1347 - }, - { - "epoch": 0.663059517953763, - "grad_norm": 0.45627340502765906, - "learning_rate": 0.00019771290140833457, - "loss": 0.9403, - "step": 1348 - }, - { - "epoch": 0.6635514018691588, - "grad_norm": 0.5198080634215432, - "learning_rate": 0.00019769978897446052, - "loss": 0.9559, - "step": 1349 - }, - { - "epoch": 0.6640432857845548, - "grad_norm": 0.5238002127306112, - "learning_rate": 0.00019768663949704934, - "loss": 1.0228, - "step": 1350 - }, - { - "epoch": 0.6645351696999509, - "grad_norm": 0.5208634930014021, - "learning_rate": 0.0001976734529810868, - "loss": 1.0147, - "step": 1351 - }, - { - "epoch": 0.6650270536153468, - "grad_norm": 0.5248961222985504, - "learning_rate": 0.0001976602294315726, - "loss": 0.9727, - "step": 1352 - }, - { - "epoch": 0.6655189375307428, - "grad_norm": 0.6254830439164408, - "learning_rate": 0.00019764696885352052, - "loss": 0.9649, - "step": 1353 - }, - { - "epoch": 0.6660108214461387, - "grad_norm": 0.47110297597909734, - "learning_rate": 0.00019763367125195846, - "loss": 0.9369, - "step": 1354 - }, - { - "epoch": 0.6665027053615347, - "grad_norm": 0.47187579909328614, - "learning_rate": 0.00019762033663192825, - "loss": 0.94, - "step": 1355 - }, - { - "epoch": 0.6669945892769307, - "grad_norm": 0.5772950964597373, - "learning_rate": 0.00019760696499848581, - "loss": 1.0795, - "step": 1356 - }, - { - "epoch": 0.6674864731923266, - "grad_norm": 0.4882232660721926, - "learning_rate": 0.00019759355635670108, - "loss": 0.95, - "step": 1357 - }, - { - "epoch": 0.6679783571077226, - "grad_norm": 0.5487075048985032, - "learning_rate": 0.00019758011071165806, - "loss": 0.9049, - "step": 1358 - }, - { - "epoch": 0.6684702410231186, - "grad_norm": 0.5135018536157103, - "learning_rate": 0.00019756662806845476, - "loss": 0.9378, - "step": 1359 - }, - { - "epoch": 0.6689621249385145, - "grad_norm": 0.5064050586331412, - "learning_rate": 0.0001975531084322032, - "loss": 1.0298, - "step": 1360 - }, - { - "epoch": 0.6694540088539105, - "grad_norm": 0.48251250028109743, - "learning_rate": 0.00019753955180802947, - "loss": 1.0018, - "step": 1361 - }, - { - "epoch": 0.6699458927693065, - "grad_norm": 0.465794161860618, - "learning_rate": 0.00019752595820107357, - "loss": 0.9575, - "step": 1362 - }, - { - "epoch": 0.6704377766847024, - "grad_norm": 0.46197345974106235, - "learning_rate": 0.00019751232761648968, - "loss": 0.9247, - "step": 1363 - }, - { - "epoch": 0.6709296606000984, - "grad_norm": 0.4872691668123498, - "learning_rate": 0.00019749866005944596, - "loss": 0.9579, - "step": 1364 - }, - { - "epoch": 0.6714215445154943, - "grad_norm": 0.5246678631985456, - "learning_rate": 0.00019748495553512446, - "loss": 0.9364, - "step": 1365 - }, - { - "epoch": 0.6719134284308903, - "grad_norm": 0.44793193058858854, - "learning_rate": 0.0001974712140487214, - "loss": 0.961, - "step": 1366 - }, - { - "epoch": 0.6724053123462863, - "grad_norm": 0.448453558256479, - "learning_rate": 0.000197457435605447, - "loss": 0.8742, - "step": 1367 - }, - { - "epoch": 0.6728971962616822, - "grad_norm": 0.43038781161979683, - "learning_rate": 0.00019744362021052538, - "loss": 0.8682, - "step": 1368 - }, - { - "epoch": 0.6733890801770782, - "grad_norm": 0.5163316258068651, - "learning_rate": 0.00019742976786919477, - "loss": 1.0356, - "step": 1369 - }, - { - "epoch": 0.6738809640924742, - "grad_norm": 0.44590154547646654, - "learning_rate": 0.00019741587858670734, - "loss": 0.8705, - "step": 1370 - }, - { - "epoch": 0.6743728480078701, - "grad_norm": 0.4936612873848726, - "learning_rate": 0.0001974019523683294, - "loss": 1.068, - "step": 1371 - }, - { - "epoch": 0.6748647319232661, - "grad_norm": 0.4693563025941603, - "learning_rate": 0.00019738798921934106, - "loss": 0.9408, - "step": 1372 - }, - { - "epoch": 0.675356615838662, - "grad_norm": 0.4809559613105093, - "learning_rate": 0.00019737398914503659, - "loss": 0.9443, - "step": 1373 - }, - { - "epoch": 0.675848499754058, - "grad_norm": 0.5138724513993868, - "learning_rate": 0.00019735995215072424, - "loss": 1.0152, - "step": 1374 - }, - { - "epoch": 0.676340383669454, - "grad_norm": 0.5055424715655393, - "learning_rate": 0.0001973458782417262, - "loss": 1.0696, - "step": 1375 - }, - { - "epoch": 0.6768322675848499, - "grad_norm": 0.4667703894682567, - "learning_rate": 0.0001973317674233787, - "loss": 0.9734, - "step": 1376 - }, - { - "epoch": 0.6773241515002459, - "grad_norm": 0.5384300104335755, - "learning_rate": 0.00019731761970103194, - "loss": 1.0105, - "step": 1377 - }, - { - "epoch": 0.677816035415642, - "grad_norm": 0.4655179511026166, - "learning_rate": 0.0001973034350800501, - "loss": 0.9076, - "step": 1378 - }, - { - "epoch": 0.6783079193310378, - "grad_norm": 0.5347175128994112, - "learning_rate": 0.00019728921356581145, - "loss": 0.9431, - "step": 1379 - }, - { - "epoch": 0.6787998032464339, - "grad_norm": 0.4656927796859899, - "learning_rate": 0.0001972749551637081, - "loss": 0.9451, - "step": 1380 - }, - { - "epoch": 0.6792916871618299, - "grad_norm": 0.48177774646127997, - "learning_rate": 0.00019726065987914627, - "loss": 0.9401, - "step": 1381 - }, - { - "epoch": 0.6797835710772258, - "grad_norm": 0.5123952935408527, - "learning_rate": 0.0001972463277175461, - "loss": 1.0117, - "step": 1382 - }, - { - "epoch": 0.6802754549926218, - "grad_norm": 0.47515439460474146, - "learning_rate": 0.00019723195868434173, - "loss": 0.8981, - "step": 1383 - }, - { - "epoch": 0.6807673389080177, - "grad_norm": 0.4758402685560721, - "learning_rate": 0.00019721755278498125, - "loss": 1.0036, - "step": 1384 - }, - { - "epoch": 0.6812592228234137, - "grad_norm": 0.4368396159224531, - "learning_rate": 0.00019720311002492674, - "loss": 0.8823, - "step": 1385 - }, - { - "epoch": 0.6817511067388097, - "grad_norm": 0.49245664440439657, - "learning_rate": 0.00019718863040965433, - "loss": 0.9077, - "step": 1386 - }, - { - "epoch": 0.6822429906542056, - "grad_norm": 0.45367647317868026, - "learning_rate": 0.00019717411394465403, - "loss": 0.8926, - "step": 1387 - }, - { - "epoch": 0.6827348745696016, - "grad_norm": 0.44195369077293045, - "learning_rate": 0.00019715956063542987, - "loss": 0.8891, - "step": 1388 - }, - { - "epoch": 0.6832267584849976, - "grad_norm": 0.46903923611598164, - "learning_rate": 0.00019714497048749983, - "loss": 0.9259, - "step": 1389 - }, - { - "epoch": 0.6837186424003935, - "grad_norm": 0.465613582505028, - "learning_rate": 0.00019713034350639586, - "loss": 0.9348, - "step": 1390 - }, - { - "epoch": 0.6842105263157895, - "grad_norm": 0.4582440126316995, - "learning_rate": 0.00019711567969766386, - "loss": 0.985, - "step": 1391 - }, - { - "epoch": 0.6847024102311854, - "grad_norm": 0.4392419955464664, - "learning_rate": 0.0001971009790668638, - "loss": 0.9216, - "step": 1392 - }, - { - "epoch": 0.6851942941465814, - "grad_norm": 0.4491326846139963, - "learning_rate": 0.00019708624161956937, - "loss": 0.961, - "step": 1393 - }, - { - "epoch": 0.6856861780619774, - "grad_norm": 0.4696919061376751, - "learning_rate": 0.0001970714673613685, - "loss": 0.9931, - "step": 1394 - }, - { - "epoch": 0.6861780619773733, - "grad_norm": 0.4884458175997585, - "learning_rate": 0.0001970566562978629, - "loss": 0.9676, - "step": 1395 - }, - { - "epoch": 0.6866699458927693, - "grad_norm": 0.525966371118796, - "learning_rate": 0.00019704180843466832, - "loss": 0.9762, - "step": 1396 - }, - { - "epoch": 0.6871618298081653, - "grad_norm": 0.44826665427966983, - "learning_rate": 0.00019702692377741437, - "loss": 0.9858, - "step": 1397 - }, - { - "epoch": 0.6876537137235612, - "grad_norm": 0.46657972521671054, - "learning_rate": 0.0001970120023317447, - "loss": 0.9516, - "step": 1398 - }, - { - "epoch": 0.6881455976389572, - "grad_norm": 0.4902442987303898, - "learning_rate": 0.0001969970441033169, - "loss": 0.9438, - "step": 1399 - }, - { - "epoch": 0.6886374815543532, - "grad_norm": 0.4967055119940703, - "learning_rate": 0.00019698204909780244, - "loss": 0.9215, - "step": 1400 - }, - { - "epoch": 0.6891293654697491, - "grad_norm": 0.4407476049359459, - "learning_rate": 0.00019696701732088682, - "loss": 0.9008, - "step": 1401 - }, - { - "epoch": 0.6896212493851451, - "grad_norm": 0.48602316707693566, - "learning_rate": 0.00019695194877826942, - "loss": 0.9536, - "step": 1402 - }, - { - "epoch": 0.690113133300541, - "grad_norm": 0.4338969018010427, - "learning_rate": 0.00019693684347566357, - "loss": 0.9118, - "step": 1403 - }, - { - "epoch": 0.690605017215937, - "grad_norm": 0.4617062326813443, - "learning_rate": 0.00019692170141879655, - "loss": 0.962, - "step": 1404 - }, - { - "epoch": 0.691096901131333, - "grad_norm": 0.4853395184959136, - "learning_rate": 0.0001969065226134096, - "loss": 0.9644, - "step": 1405 - }, - { - "epoch": 0.6915887850467289, - "grad_norm": 0.45556059903509, - "learning_rate": 0.00019689130706525783, - "loss": 0.9241, - "step": 1406 - }, - { - "epoch": 0.692080668962125, - "grad_norm": 0.4845371013575807, - "learning_rate": 0.00019687605478011037, - "loss": 0.9443, - "step": 1407 - }, - { - "epoch": 0.692572552877521, - "grad_norm": 0.5027071999875199, - "learning_rate": 0.00019686076576375016, - "loss": 1.0104, - "step": 1408 - }, - { - "epoch": 0.6930644367929168, - "grad_norm": 0.49588782762936817, - "learning_rate": 0.00019684544002197417, - "loss": 0.9522, - "step": 1409 - }, - { - "epoch": 0.6935563207083129, - "grad_norm": 0.49351571536552424, - "learning_rate": 0.00019683007756059325, - "loss": 0.8928, - "step": 1410 - }, - { - "epoch": 0.6940482046237088, - "grad_norm": 0.46301918368683015, - "learning_rate": 0.00019681467838543224, - "loss": 1.0038, - "step": 1411 - }, - { - "epoch": 0.6945400885391048, - "grad_norm": 0.45749522584031316, - "learning_rate": 0.00019679924250232974, - "loss": 0.9363, - "step": 1412 - }, - { - "epoch": 0.6950319724545008, - "grad_norm": 0.44525199283556677, - "learning_rate": 0.00019678376991713844, - "loss": 0.9343, - "step": 1413 - }, - { - "epoch": 0.6955238563698967, - "grad_norm": 0.4744116189440004, - "learning_rate": 0.00019676826063572483, - "loss": 0.978, - "step": 1414 - }, - { - "epoch": 0.6960157402852927, - "grad_norm": 0.46033717354531234, - "learning_rate": 0.00019675271466396943, - "loss": 0.9656, - "step": 1415 - }, - { - "epoch": 0.6965076242006887, - "grad_norm": 0.4836554184941867, - "learning_rate": 0.00019673713200776653, - "loss": 0.9155, - "step": 1416 - }, - { - "epoch": 0.6969995081160846, - "grad_norm": 0.46460747714447387, - "learning_rate": 0.0001967215126730244, - "loss": 0.8926, - "step": 1417 - }, - { - "epoch": 0.6974913920314806, - "grad_norm": 0.4583940507740178, - "learning_rate": 0.0001967058566656653, - "loss": 0.9965, - "step": 1418 - }, - { - "epoch": 0.6979832759468766, - "grad_norm": 1.748934833714311, - "learning_rate": 0.0001966901639916252, - "loss": 1.0506, - "step": 1419 - }, - { - "epoch": 0.6984751598622725, - "grad_norm": 0.5196255454458959, - "learning_rate": 0.0001966744346568542, - "loss": 0.9866, - "step": 1420 - }, - { - "epoch": 0.6989670437776685, - "grad_norm": 0.44519596488889035, - "learning_rate": 0.00019665866866731604, - "loss": 0.9717, - "step": 1421 - }, - { - "epoch": 0.6994589276930644, - "grad_norm": 0.4829070905960186, - "learning_rate": 0.0001966428660289886, - "loss": 0.9013, - "step": 1422 - }, - { - "epoch": 0.6999508116084604, - "grad_norm": 0.4924165229937081, - "learning_rate": 0.00019662702674786358, - "loss": 0.9345, - "step": 1423 - }, - { - "epoch": 0.7004426955238564, - "grad_norm": 0.4501450802072641, - "learning_rate": 0.00019661115082994648, - "loss": 0.9368, - "step": 1424 - }, - { - "epoch": 0.7009345794392523, - "grad_norm": 0.44427865956883583, - "learning_rate": 0.0001965952382812568, - "loss": 0.9414, - "step": 1425 - }, - { - "epoch": 0.7014264633546483, - "grad_norm": 0.44543902575339583, - "learning_rate": 0.00019657928910782788, - "loss": 0.9622, - "step": 1426 - }, - { - "epoch": 0.7019183472700443, - "grad_norm": 0.4700780537106658, - "learning_rate": 0.00019656330331570696, - "loss": 1.0538, - "step": 1427 - }, - { - "epoch": 0.7024102311854402, - "grad_norm": 0.43588081453106137, - "learning_rate": 0.00019654728091095516, - "loss": 0.8978, - "step": 1428 - }, - { - "epoch": 0.7029021151008362, - "grad_norm": 0.4843955149609755, - "learning_rate": 0.00019653122189964748, - "loss": 0.9875, - "step": 1429 - }, - { - "epoch": 0.7033939990162321, - "grad_norm": 0.4495943285880614, - "learning_rate": 0.00019651512628787284, - "loss": 0.9256, - "step": 1430 - }, - { - "epoch": 0.7038858829316281, - "grad_norm": 0.5071515618320818, - "learning_rate": 0.00019649899408173392, - "loss": 0.9805, - "step": 1431 - }, - { - "epoch": 0.7043777668470241, - "grad_norm": 0.4946709894083941, - "learning_rate": 0.00019648282528734743, - "loss": 1.0207, - "step": 1432 - }, - { - "epoch": 0.70486965076242, - "grad_norm": 0.5317316907653367, - "learning_rate": 0.00019646661991084386, - "loss": 0.9049, - "step": 1433 - }, - { - "epoch": 0.705361534677816, - "grad_norm": 0.47497952868667054, - "learning_rate": 0.00019645037795836757, - "loss": 1.0027, - "step": 1434 - }, - { - "epoch": 0.705853418593212, - "grad_norm": 0.42811971576461416, - "learning_rate": 0.0001964340994360768, - "loss": 0.8511, - "step": 1435 - }, - { - "epoch": 0.7063453025086079, - "grad_norm": 0.47933110770990467, - "learning_rate": 0.00019641778435014367, - "loss": 0.9524, - "step": 1436 - }, - { - "epoch": 0.706837186424004, - "grad_norm": 0.44032343135979374, - "learning_rate": 0.0001964014327067542, - "loss": 0.9693, - "step": 1437 - }, - { - "epoch": 0.7073290703394, - "grad_norm": 0.45546567885492706, - "learning_rate": 0.00019638504451210818, - "loss": 0.9352, - "step": 1438 - }, - { - "epoch": 0.7078209542547959, - "grad_norm": 0.4367818108711072, - "learning_rate": 0.0001963686197724193, - "loss": 0.8951, - "step": 1439 - }, - { - "epoch": 0.7083128381701919, - "grad_norm": 0.4519902395124329, - "learning_rate": 0.00019635215849391513, - "loss": 0.8983, - "step": 1440 - }, - { - "epoch": 0.7088047220855878, - "grad_norm": 0.49348356075891786, - "learning_rate": 0.0001963356606828371, - "loss": 0.9997, - "step": 1441 - }, - { - "epoch": 0.7092966060009838, - "grad_norm": 0.4313215319354549, - "learning_rate": 0.00019631912634544038, - "loss": 0.8413, - "step": 1442 - }, - { - "epoch": 0.7097884899163798, - "grad_norm": 0.48002828957831445, - "learning_rate": 0.00019630255548799418, - "loss": 0.9967, - "step": 1443 - }, - { - "epoch": 0.7102803738317757, - "grad_norm": 0.4623049897510101, - "learning_rate": 0.0001962859481167814, - "loss": 0.862, - "step": 1444 - }, - { - "epoch": 0.7107722577471717, - "grad_norm": 0.48333998986430504, - "learning_rate": 0.00019626930423809883, - "loss": 1.0378, - "step": 1445 - }, - { - "epoch": 0.7112641416625677, - "grad_norm": 0.4565159520400276, - "learning_rate": 0.00019625262385825713, - "loss": 0.9619, - "step": 1446 - }, - { - "epoch": 0.7117560255779636, - "grad_norm": 0.4620869378230684, - "learning_rate": 0.00019623590698358078, - "loss": 0.8501, - "step": 1447 - }, - { - "epoch": 0.7122479094933596, - "grad_norm": 0.44254477361234446, - "learning_rate": 0.0001962191536204081, - "loss": 0.9782, - "step": 1448 - }, - { - "epoch": 0.7127397934087555, - "grad_norm": 0.4353683138462511, - "learning_rate": 0.0001962023637750912, - "loss": 0.8796, - "step": 1449 - }, - { - "epoch": 0.7132316773241515, - "grad_norm": 0.46120579313640847, - "learning_rate": 0.00019618553745399614, - "loss": 0.946, - "step": 1450 - }, - { - "epoch": 0.7137235612395475, - "grad_norm": 0.47686488482110984, - "learning_rate": 0.00019616867466350265, - "loss": 0.9315, - "step": 1451 - }, - { - "epoch": 0.7142154451549434, - "grad_norm": 0.4633726294481254, - "learning_rate": 0.0001961517754100044, - "loss": 0.9626, - "step": 1452 - }, - { - "epoch": 0.7147073290703394, - "grad_norm": 0.45707485377320534, - "learning_rate": 0.00019613483969990888, - "loss": 0.9778, - "step": 1453 - }, - { - "epoch": 0.7151992129857354, - "grad_norm": 0.45880063101939245, - "learning_rate": 0.0001961178675396374, - "loss": 0.9926, - "step": 1454 - }, - { - "epoch": 0.7156910969011313, - "grad_norm": 0.46807713742194085, - "learning_rate": 0.000196100858935625, - "loss": 1.0291, - "step": 1455 - }, - { - "epoch": 0.7161829808165273, - "grad_norm": 0.4788185630398034, - "learning_rate": 0.00019608381389432062, - "loss": 0.8905, - "step": 1456 - }, - { - "epoch": 0.7166748647319232, - "grad_norm": 0.5116107496857625, - "learning_rate": 0.00019606673242218706, - "loss": 0.8742, - "step": 1457 - }, - { - "epoch": 0.7171667486473192, - "grad_norm": 0.44046305805249486, - "learning_rate": 0.00019604961452570084, - "loss": 0.8847, - "step": 1458 - }, - { - "epoch": 0.7176586325627152, - "grad_norm": 0.4835553328061689, - "learning_rate": 0.0001960324602113523, - "loss": 0.9298, - "step": 1459 - }, - { - "epoch": 0.7181505164781111, - "grad_norm": 0.5134368076985019, - "learning_rate": 0.00019601526948564567, - "loss": 1.0206, - "step": 1460 - }, - { - "epoch": 0.7186424003935071, - "grad_norm": 0.5960705026051694, - "learning_rate": 0.00019599804235509888, - "loss": 0.9985, - "step": 1461 - }, - { - "epoch": 0.7191342843089031, - "grad_norm": 0.621317736308171, - "learning_rate": 0.00019598077882624376, - "loss": 0.9574, - "step": 1462 - }, - { - "epoch": 0.719626168224299, - "grad_norm": 0.489190450109789, - "learning_rate": 0.00019596347890562586, - "loss": 0.9254, - "step": 1463 - }, - { - "epoch": 0.720118052139695, - "grad_norm": 0.6065307822795144, - "learning_rate": 0.00019594614259980456, - "loss": 0.9965, - "step": 1464 - }, - { - "epoch": 0.720609936055091, - "grad_norm": 0.45873329370551547, - "learning_rate": 0.00019592876991535304, - "loss": 0.9031, - "step": 1465 - }, - { - "epoch": 0.721101819970487, - "grad_norm": 0.467556547135862, - "learning_rate": 0.0001959113608588583, - "loss": 0.9664, - "step": 1466 - }, - { - "epoch": 0.721593703885883, - "grad_norm": 0.4758304877114131, - "learning_rate": 0.0001958939154369211, - "loss": 0.9792, - "step": 1467 - }, - { - "epoch": 0.7220855878012788, - "grad_norm": 0.478407499672416, - "learning_rate": 0.00019587643365615595, - "loss": 0.9107, - "step": 1468 - }, - { - "epoch": 0.7225774717166749, - "grad_norm": 0.5020595058918841, - "learning_rate": 0.00019585891552319123, - "loss": 0.9603, - "step": 1469 - }, - { - "epoch": 0.7230693556320709, - "grad_norm": 0.47531218980151596, - "learning_rate": 0.000195841361044669, - "loss": 0.9525, - "step": 1470 - }, - { - "epoch": 0.7235612395474668, - "grad_norm": 0.4309090855161772, - "learning_rate": 0.00019582377022724524, - "loss": 0.9628, - "step": 1471 - }, - { - "epoch": 0.7240531234628628, - "grad_norm": 0.4613319397147935, - "learning_rate": 0.00019580614307758952, - "loss": 0.972, - "step": 1472 - }, - { - "epoch": 0.7245450073782588, - "grad_norm": 0.4714298734153978, - "learning_rate": 0.0001957884796023854, - "loss": 0.9153, - "step": 1473 - }, - { - "epoch": 0.7250368912936547, - "grad_norm": 0.45793842944304836, - "learning_rate": 0.00019577077980833006, - "loss": 0.9442, - "step": 1474 - }, - { - "epoch": 0.7255287752090507, - "grad_norm": 0.4768470879202041, - "learning_rate": 0.0001957530437021345, - "loss": 0.8783, - "step": 1475 - }, - { - "epoch": 0.7260206591244466, - "grad_norm": 0.43647545764853873, - "learning_rate": 0.00019573527129052346, - "loss": 0.8677, - "step": 1476 - }, - { - "epoch": 0.7265125430398426, - "grad_norm": 0.4365078187098896, - "learning_rate": 0.00019571746258023555, - "loss": 0.967, - "step": 1477 - }, - { - "epoch": 0.7270044269552386, - "grad_norm": 0.4545650221688097, - "learning_rate": 0.00019569961757802298, - "loss": 0.9082, - "step": 1478 - }, - { - "epoch": 0.7274963108706345, - "grad_norm": 0.4401653990286907, - "learning_rate": 0.00019568173629065183, - "loss": 0.909, - "step": 1479 - }, - { - "epoch": 0.7279881947860305, - "grad_norm": 0.45995965989755017, - "learning_rate": 0.0001956638187249019, - "loss": 0.9893, - "step": 1480 - }, - { - "epoch": 0.7284800787014265, - "grad_norm": 0.44975805510063777, - "learning_rate": 0.0001956458648875668, - "loss": 0.9266, - "step": 1481 - }, - { - "epoch": 0.7289719626168224, - "grad_norm": 0.4930061220127927, - "learning_rate": 0.00019562787478545377, - "loss": 1.0045, - "step": 1482 - }, - { - "epoch": 0.7294638465322184, - "grad_norm": 0.5347367764789409, - "learning_rate": 0.00019560984842538397, - "loss": 1.022, - "step": 1483 - }, - { - "epoch": 0.7299557304476144, - "grad_norm": 0.4969844454587438, - "learning_rate": 0.00019559178581419215, - "loss": 0.9755, - "step": 1484 - }, - { - "epoch": 0.7304476143630103, - "grad_norm": 0.4771433816539928, - "learning_rate": 0.00019557368695872693, - "loss": 0.9305, - "step": 1485 - }, - { - "epoch": 0.7309394982784063, - "grad_norm": 0.49038254254723584, - "learning_rate": 0.00019555555186585055, - "loss": 0.8811, - "step": 1486 - }, - { - "epoch": 0.7314313821938022, - "grad_norm": 0.4400367538995633, - "learning_rate": 0.0001955373805424391, - "loss": 0.9098, - "step": 1487 - }, - { - "epoch": 0.7319232661091982, - "grad_norm": 0.5074890597051152, - "learning_rate": 0.00019551917299538232, - "loss": 0.9253, - "step": 1488 - }, - { - "epoch": 0.7324151500245942, - "grad_norm": 0.5091464986563263, - "learning_rate": 0.00019550092923158378, - "loss": 1.0048, - "step": 1489 - }, - { - "epoch": 0.7329070339399901, - "grad_norm": 0.472882246063679, - "learning_rate": 0.00019548264925796068, - "loss": 0.9963, - "step": 1490 - }, - { - "epoch": 0.7333989178553861, - "grad_norm": 0.4516101349724856, - "learning_rate": 0.00019546433308144403, - "loss": 0.9094, - "step": 1491 - }, - { - "epoch": 0.7338908017707821, - "grad_norm": 0.5118558111688102, - "learning_rate": 0.00019544598070897847, - "loss": 0.935, - "step": 1492 - }, - { - "epoch": 0.734382685686178, - "grad_norm": 0.45535268218100866, - "learning_rate": 0.0001954275921475225, - "loss": 0.9147, - "step": 1493 - }, - { - "epoch": 0.734874569601574, - "grad_norm": 0.42115777229307744, - "learning_rate": 0.0001954091674040482, - "loss": 0.91, - "step": 1494 - }, - { - "epoch": 0.7353664535169699, - "grad_norm": 0.4832454548127842, - "learning_rate": 0.00019539070648554155, - "loss": 0.9255, - "step": 1495 - }, - { - "epoch": 0.735858337432366, - "grad_norm": 0.4602081923293951, - "learning_rate": 0.00019537220939900202, - "loss": 0.9809, - "step": 1496 - }, - { - "epoch": 0.736350221347762, - "grad_norm": 0.47156152008319946, - "learning_rate": 0.0001953536761514429, - "loss": 0.9763, - "step": 1497 - }, - { - "epoch": 0.7368421052631579, - "grad_norm": 0.5141716716407748, - "learning_rate": 0.00019533510674989127, - "loss": 0.9673, - "step": 1498 - }, - { - "epoch": 0.7373339891785539, - "grad_norm": 0.47149772748268765, - "learning_rate": 0.00019531650120138783, - "loss": 0.9298, - "step": 1499 - }, - { - "epoch": 0.7378258730939499, - "grad_norm": 0.48429056140486454, - "learning_rate": 0.00019529785951298697, - "loss": 0.9077, - "step": 1500 - }, - { - "epoch": 0.7383177570093458, - "grad_norm": 0.5527017998587758, - "learning_rate": 0.00019527918169175683, - "loss": 1.0596, - "step": 1501 - }, - { - "epoch": 0.7388096409247418, - "grad_norm": 0.4619922352446741, - "learning_rate": 0.0001952604677447792, - "loss": 0.9087, - "step": 1502 - }, - { - "epoch": 0.7393015248401378, - "grad_norm": 0.49899778377780707, - "learning_rate": 0.00019524171767914967, - "loss": 0.9907, - "step": 1503 - }, - { - "epoch": 0.7397934087555337, - "grad_norm": 0.4404848899417216, - "learning_rate": 0.00019522293150197738, - "loss": 0.9308, - "step": 1504 - }, - { - "epoch": 0.7402852926709297, - "grad_norm": 0.4562171327754888, - "learning_rate": 0.0001952041092203853, - "loss": 0.9408, - "step": 1505 - }, - { - "epoch": 0.7407771765863256, - "grad_norm": 0.43446031665160695, - "learning_rate": 0.00019518525084150995, - "loss": 0.926, - "step": 1506 - }, - { - "epoch": 0.7412690605017216, - "grad_norm": 0.4900985254829056, - "learning_rate": 0.0001951663563725017, - "loss": 0.917, - "step": 1507 - }, - { - "epoch": 0.7417609444171176, - "grad_norm": 0.5238018635317627, - "learning_rate": 0.00019514742582052446, - "loss": 1.0288, - "step": 1508 - }, - { - "epoch": 0.7422528283325135, - "grad_norm": 0.45483497986719174, - "learning_rate": 0.00019512845919275587, - "loss": 0.9224, - "step": 1509 - }, - { - "epoch": 0.7427447122479095, - "grad_norm": 0.45470957670600526, - "learning_rate": 0.0001951094564963873, - "loss": 0.9517, - "step": 1510 - }, - { - "epoch": 0.7432365961633055, - "grad_norm": 0.4822942918152697, - "learning_rate": 0.0001950904177386237, - "loss": 1.0147, - "step": 1511 - }, - { - "epoch": 0.7437284800787014, - "grad_norm": 0.47794452427478296, - "learning_rate": 0.00019507134292668377, - "loss": 0.9073, - "step": 1512 - }, - { - "epoch": 0.7442203639940974, - "grad_norm": 0.44594834309930764, - "learning_rate": 0.00019505223206779987, - "loss": 0.8856, - "step": 1513 - }, - { - "epoch": 0.7447122479094933, - "grad_norm": 0.46883285255090074, - "learning_rate": 0.000195033085169218, - "loss": 0.9091, - "step": 1514 - }, - { - "epoch": 0.7452041318248893, - "grad_norm": 0.5042459269749029, - "learning_rate": 0.00019501390223819782, - "loss": 0.9948, - "step": 1515 - }, - { - "epoch": 0.7456960157402853, - "grad_norm": 0.5029030858543784, - "learning_rate": 0.00019499468328201269, - "loss": 0.9345, - "step": 1516 - }, - { - "epoch": 0.7461878996556812, - "grad_norm": 0.4252714821508642, - "learning_rate": 0.00019497542830794958, - "loss": 0.8709, - "step": 1517 - }, - { - "epoch": 0.7466797835710772, - "grad_norm": 0.5050335002582659, - "learning_rate": 0.00019495613732330917, - "loss": 0.9007, - "step": 1518 - }, - { - "epoch": 0.7471716674864732, - "grad_norm": 0.4890954632088174, - "learning_rate": 0.00019493681033540576, - "loss": 0.9823, - "step": 1519 - }, - { - "epoch": 0.7476635514018691, - "grad_norm": 0.4553543503953204, - "learning_rate": 0.00019491744735156732, - "loss": 0.8745, - "step": 1520 - }, - { - "epoch": 0.7481554353172651, - "grad_norm": 0.44030988669129767, - "learning_rate": 0.00019489804837913543, - "loss": 0.9011, - "step": 1521 - }, - { - "epoch": 0.7486473192326611, - "grad_norm": 0.4671129225093876, - "learning_rate": 0.00019487861342546537, - "loss": 0.8703, - "step": 1522 - }, - { - "epoch": 0.749139203148057, - "grad_norm": 0.48648288352303504, - "learning_rate": 0.00019485914249792603, - "loss": 0.9533, - "step": 1523 - }, - { - "epoch": 0.749631087063453, - "grad_norm": 0.43543064871374093, - "learning_rate": 0.0001948396356038999, - "loss": 0.9175, - "step": 1524 - }, - { - "epoch": 0.750122970978849, - "grad_norm": 0.48098356204014586, - "learning_rate": 0.0001948200927507832, - "loss": 0.9555, - "step": 1525 - }, - { - "epoch": 0.750614854894245, - "grad_norm": 0.46401849995174804, - "learning_rate": 0.00019480051394598572, - "loss": 0.9021, - "step": 1526 - }, - { - "epoch": 0.751106738809641, - "grad_norm": 0.5480349658843938, - "learning_rate": 0.00019478089919693092, - "loss": 0.9719, - "step": 1527 - }, - { - "epoch": 0.7515986227250369, - "grad_norm": 0.4450166515844905, - "learning_rate": 0.00019476124851105578, - "loss": 1.0222, - "step": 1528 - }, - { - "epoch": 0.7520905066404329, - "grad_norm": 0.46402663986264037, - "learning_rate": 0.00019474156189581111, - "loss": 0.9842, - "step": 1529 - }, - { - "epoch": 0.7525823905558289, - "grad_norm": 0.4983316468313297, - "learning_rate": 0.00019472183935866118, - "loss": 0.9784, - "step": 1530 - }, - { - "epoch": 0.7530742744712248, - "grad_norm": 0.442659315148361, - "learning_rate": 0.00019470208090708387, - "loss": 0.9043, - "step": 1531 - }, - { - "epoch": 0.7535661583866208, - "grad_norm": 0.44801710083887064, - "learning_rate": 0.0001946822865485708, - "loss": 0.9062, - "step": 1532 - }, - { - "epoch": 0.7540580423020167, - "grad_norm": 0.49620844983399426, - "learning_rate": 0.0001946624562906271, - "loss": 0.9754, - "step": 1533 - }, - { - "epoch": 0.7545499262174127, - "grad_norm": 0.4570053805043761, - "learning_rate": 0.00019464259014077153, - "loss": 1.0158, - "step": 1534 - }, - { - "epoch": 0.7550418101328087, - "grad_norm": 0.4604203532938178, - "learning_rate": 0.0001946226881065365, - "loss": 0.9914, - "step": 1535 - }, - { - "epoch": 0.7555336940482046, - "grad_norm": 0.4843548091166105, - "learning_rate": 0.00019460275019546802, - "loss": 0.997, - "step": 1536 - }, - { - "epoch": 0.7560255779636006, - "grad_norm": 0.4539157484540685, - "learning_rate": 0.00019458277641512566, - "loss": 0.9446, - "step": 1537 - }, - { - "epoch": 0.7565174618789966, - "grad_norm": 0.4448221663748266, - "learning_rate": 0.00019456276677308262, - "loss": 0.8979, - "step": 1538 - }, - { - "epoch": 0.7570093457943925, - "grad_norm": 0.45644392016358865, - "learning_rate": 0.00019454272127692569, - "loss": 0.9854, - "step": 1539 - }, - { - "epoch": 0.7575012297097885, - "grad_norm": 0.4623780984854484, - "learning_rate": 0.00019452263993425522, - "loss": 0.9623, - "step": 1540 - }, - { - "epoch": 0.7579931136251845, - "grad_norm": 0.4377138354111479, - "learning_rate": 0.00019450252275268528, - "loss": 0.9085, - "step": 1541 - }, - { - "epoch": 0.7584849975405804, - "grad_norm": 0.4309747845767413, - "learning_rate": 0.00019448236973984334, - "loss": 0.8849, - "step": 1542 - }, - { - "epoch": 0.7589768814559764, - "grad_norm": 0.42781127795203905, - "learning_rate": 0.0001944621809033706, - "loss": 0.8148, - "step": 1543 - }, - { - "epoch": 0.7594687653713723, - "grad_norm": 0.46547586850746064, - "learning_rate": 0.0001944419562509218, - "loss": 0.9215, - "step": 1544 - }, - { - "epoch": 0.7599606492867683, - "grad_norm": 0.4438294386361765, - "learning_rate": 0.00019442169579016522, - "loss": 0.9195, - "step": 1545 - }, - { - "epoch": 0.7604525332021643, - "grad_norm": 0.4403595828515475, - "learning_rate": 0.00019440139952878275, - "loss": 0.8287, - "step": 1546 - }, - { - "epoch": 0.7609444171175602, - "grad_norm": 0.45832092210999603, - "learning_rate": 0.0001943810674744699, - "loss": 0.8571, - "step": 1547 - }, - { - "epoch": 0.7614363010329562, - "grad_norm": 0.49296253953229596, - "learning_rate": 0.00019436069963493568, - "loss": 0.9385, - "step": 1548 - }, - { - "epoch": 0.7619281849483522, - "grad_norm": 0.473185289657026, - "learning_rate": 0.0001943402960179027, - "loss": 0.933, - "step": 1549 - }, - { - "epoch": 0.7624200688637481, - "grad_norm": 0.4296344153592067, - "learning_rate": 0.0001943198566311071, - "loss": 0.8782, - "step": 1550 - }, - { - "epoch": 0.7629119527791441, - "grad_norm": 0.4662749203028031, - "learning_rate": 0.00019429938148229865, - "loss": 0.8848, - "step": 1551 - }, - { - "epoch": 0.76340383669454, - "grad_norm": 0.4513214582091005, - "learning_rate": 0.0001942788705792406, - "loss": 0.9188, - "step": 1552 - }, - { - "epoch": 0.763895720609936, - "grad_norm": 0.4591419564477227, - "learning_rate": 0.00019425832392970987, - "loss": 0.9115, - "step": 1553 - }, - { - "epoch": 0.764387604525332, - "grad_norm": 0.4434159713443017, - "learning_rate": 0.0001942377415414968, - "loss": 0.9414, - "step": 1554 - }, - { - "epoch": 0.764879488440728, - "grad_norm": 0.4792413561377097, - "learning_rate": 0.00019421712342240538, - "loss": 0.9341, - "step": 1555 - }, - { - "epoch": 0.765371372356124, - "grad_norm": 0.4571233966332298, - "learning_rate": 0.00019419646958025304, - "loss": 0.9736, - "step": 1556 - }, - { - "epoch": 0.76586325627152, - "grad_norm": 0.5151264468857435, - "learning_rate": 0.0001941757800228709, - "loss": 1.0183, - "step": 1557 - }, - { - "epoch": 0.7663551401869159, - "grad_norm": 0.4554295322487166, - "learning_rate": 0.00019415505475810352, - "loss": 0.9483, - "step": 1558 - }, - { - "epoch": 0.7668470241023119, - "grad_norm": 0.46158790861470056, - "learning_rate": 0.00019413429379380902, - "loss": 0.9447, - "step": 1559 - }, - { - "epoch": 0.7673389080177078, - "grad_norm": 0.4813083752405416, - "learning_rate": 0.00019411349713785905, - "loss": 0.9625, - "step": 1560 - }, - { - "epoch": 0.7678307919331038, - "grad_norm": 0.46427574383257714, - "learning_rate": 0.00019409266479813883, - "loss": 0.9808, - "step": 1561 - }, - { - "epoch": 0.7683226758484998, - "grad_norm": 0.5031775742968747, - "learning_rate": 0.00019407179678254707, - "loss": 0.9054, - "step": 1562 - }, - { - "epoch": 0.7688145597638957, - "grad_norm": 0.4539929202407643, - "learning_rate": 0.00019405089309899602, - "loss": 0.9528, - "step": 1563 - }, - { - "epoch": 0.7693064436792917, - "grad_norm": 0.46450887484643755, - "learning_rate": 0.00019402995375541145, - "loss": 0.9818, - "step": 1564 - }, - { - "epoch": 0.7697983275946877, - "grad_norm": 0.4737268161140511, - "learning_rate": 0.00019400897875973265, - "loss": 1.0212, - "step": 1565 - }, - { - "epoch": 0.7702902115100836, - "grad_norm": 0.4746804064421646, - "learning_rate": 0.00019398796811991243, - "loss": 0.9546, - "step": 1566 - }, - { - "epoch": 0.7707820954254796, - "grad_norm": 0.4927982579852802, - "learning_rate": 0.00019396692184391715, - "loss": 0.9783, - "step": 1567 - }, - { - "epoch": 0.7712739793408756, - "grad_norm": 0.46978666141723746, - "learning_rate": 0.0001939458399397266, - "loss": 0.9595, - "step": 1568 - }, - { - "epoch": 0.7717658632562715, - "grad_norm": 0.4368343909072726, - "learning_rate": 0.00019392472241533417, - "loss": 0.9283, - "step": 1569 - }, - { - "epoch": 0.7722577471716675, - "grad_norm": 0.4729589519572927, - "learning_rate": 0.00019390356927874666, - "loss": 0.9241, - "step": 1570 - }, - { - "epoch": 0.7727496310870634, - "grad_norm": 0.4680601858523261, - "learning_rate": 0.0001938823805379845, - "loss": 0.9536, - "step": 1571 - }, - { - "epoch": 0.7732415150024594, - "grad_norm": 0.44741493265017857, - "learning_rate": 0.00019386115620108148, - "loss": 0.9878, - "step": 1572 - }, - { - "epoch": 0.7737333989178554, - "grad_norm": 0.45874614310505646, - "learning_rate": 0.00019383989627608496, - "loss": 0.9403, - "step": 1573 - }, - { - "epoch": 0.7742252828332513, - "grad_norm": 0.4783541327168479, - "learning_rate": 0.0001938186007710558, - "loss": 0.9792, - "step": 1574 - }, - { - "epoch": 0.7747171667486473, - "grad_norm": 0.47751385243285904, - "learning_rate": 0.00019379726969406832, - "loss": 0.9044, - "step": 1575 - }, - { - "epoch": 0.7752090506640433, - "grad_norm": 0.4815871742945832, - "learning_rate": 0.0001937759030532104, - "loss": 0.9999, - "step": 1576 - }, - { - "epoch": 0.7757009345794392, - "grad_norm": 0.4578006443117354, - "learning_rate": 0.00019375450085658326, - "loss": 0.9107, - "step": 1577 - }, - { - "epoch": 0.7761928184948352, - "grad_norm": 0.49119482793649694, - "learning_rate": 0.00019373306311230176, - "loss": 0.9841, - "step": 1578 - }, - { - "epoch": 0.7766847024102311, - "grad_norm": 6.823370082793621, - "learning_rate": 0.0001937115898284941, - "loss": 1.3784, - "step": 1579 - }, - { - "epoch": 0.7771765863256271, - "grad_norm": 0.5531959848646097, - "learning_rate": 0.0001936900810133021, - "loss": 0.9586, - "step": 1580 - }, - { - "epoch": 0.7776684702410231, - "grad_norm": 0.48994368995911186, - "learning_rate": 0.0001936685366748809, - "loss": 0.9523, - "step": 1581 - }, - { - "epoch": 0.778160354156419, - "grad_norm": 0.626551897369954, - "learning_rate": 0.00019364695682139922, - "loss": 0.972, - "step": 1582 - }, - { - "epoch": 0.778652238071815, - "grad_norm": 0.45270344719609446, - "learning_rate": 0.00019362534146103922, - "loss": 0.9204, - "step": 1583 - }, - { - "epoch": 0.7791441219872111, - "grad_norm": 0.4691826031541264, - "learning_rate": 0.00019360369060199647, - "loss": 0.9541, - "step": 1584 - }, - { - "epoch": 0.779636005902607, - "grad_norm": 0.5077218371602953, - "learning_rate": 0.00019358200425248002, - "loss": 0.8997, - "step": 1585 - }, - { - "epoch": 0.780127889818003, - "grad_norm": 0.4561702721741504, - "learning_rate": 0.00019356028242071248, - "loss": 0.891, - "step": 1586 - }, - { - "epoch": 0.780619773733399, - "grad_norm": 0.4676087402728596, - "learning_rate": 0.0001935385251149298, - "loss": 0.9643, - "step": 1587 - }, - { - "epoch": 0.7811116576487949, - "grad_norm": 0.47241355479761177, - "learning_rate": 0.00019351673234338138, - "loss": 0.9294, - "step": 1588 - }, - { - "epoch": 0.7816035415641909, - "grad_norm": 0.5681763043851413, - "learning_rate": 0.0001934949041143301, - "loss": 1.0596, - "step": 1589 - }, - { - "epoch": 0.7820954254795868, - "grad_norm": 0.4391500738716643, - "learning_rate": 0.0001934730404360523, - "loss": 0.8989, - "step": 1590 - }, - { - "epoch": 0.7825873093949828, - "grad_norm": 0.46519103493459424, - "learning_rate": 0.00019345114131683774, - "loss": 0.8926, - "step": 1591 - }, - { - "epoch": 0.7830791933103788, - "grad_norm": 0.502320252683488, - "learning_rate": 0.0001934292067649896, - "loss": 1.0, - "step": 1592 - }, - { - "epoch": 0.7835710772257747, - "grad_norm": 0.48300705699795393, - "learning_rate": 0.00019340723678882454, - "loss": 1.0473, - "step": 1593 - }, - { - "epoch": 0.7840629611411707, - "grad_norm": 0.47061023321541257, - "learning_rate": 0.00019338523139667262, - "loss": 1.0138, - "step": 1594 - }, - { - "epoch": 0.7845548450565667, - "grad_norm": 0.4224254111419423, - "learning_rate": 0.0001933631905968773, - "loss": 0.9177, - "step": 1595 - }, - { - "epoch": 0.7850467289719626, - "grad_norm": 0.4452485746247102, - "learning_rate": 0.00019334111439779558, - "loss": 0.8828, - "step": 1596 - }, - { - "epoch": 0.7855386128873586, - "grad_norm": 0.5015343696233722, - "learning_rate": 0.0001933190028077977, - "loss": 1.0458, - "step": 1597 - }, - { - "epoch": 0.7860304968027545, - "grad_norm": 0.4549251969089811, - "learning_rate": 0.00019329685583526748, - "loss": 0.8869, - "step": 1598 - }, - { - "epoch": 0.7865223807181505, - "grad_norm": 0.45312364995771337, - "learning_rate": 0.00019327467348860208, - "loss": 0.9665, - "step": 1599 - }, - { - "epoch": 0.7870142646335465, - "grad_norm": 0.5053126722237422, - "learning_rate": 0.00019325245577621209, - "loss": 0.9931, - "step": 1600 - }, - { - "epoch": 0.7875061485489424, - "grad_norm": 0.48392174160193185, - "learning_rate": 0.0001932302027065215, - "loss": 0.9856, - "step": 1601 - }, - { - "epoch": 0.7879980324643384, - "grad_norm": 0.5986324995620144, - "learning_rate": 0.00019320791428796774, - "loss": 0.9421, - "step": 1602 - }, - { - "epoch": 0.7884899163797344, - "grad_norm": 0.45550074064574225, - "learning_rate": 0.00019318559052900155, - "loss": 0.9373, - "step": 1603 - }, - { - "epoch": 0.7889818002951303, - "grad_norm": 0.48491000695876824, - "learning_rate": 0.00019316323143808722, - "loss": 1.0153, - "step": 1604 - }, - { - "epoch": 0.7894736842105263, - "grad_norm": 0.497246864253746, - "learning_rate": 0.00019314083702370226, - "loss": 0.9551, - "step": 1605 - }, - { - "epoch": 0.7899655681259223, - "grad_norm": 0.517932175705986, - "learning_rate": 0.00019311840729433773, - "loss": 1.0129, - "step": 1606 - }, - { - "epoch": 0.7904574520413182, - "grad_norm": 0.45489433714258243, - "learning_rate": 0.000193095942258498, - "loss": 0.8971, - "step": 1607 - }, - { - "epoch": 0.7909493359567142, - "grad_norm": 0.47133976803694805, - "learning_rate": 0.00019307344192470084, - "loss": 0.9763, - "step": 1608 - }, - { - "epoch": 0.7914412198721101, - "grad_norm": 0.46947040535121726, - "learning_rate": 0.00019305090630147742, - "loss": 0.9255, - "step": 1609 - }, - { - "epoch": 0.7919331037875061, - "grad_norm": 0.4420899586233333, - "learning_rate": 0.0001930283353973722, - "loss": 0.8474, - "step": 1610 - }, - { - "epoch": 0.7924249877029022, - "grad_norm": 0.4480831417460121, - "learning_rate": 0.00019300572922094318, - "loss": 0.9437, - "step": 1611 - }, - { - "epoch": 0.792916871618298, - "grad_norm": 0.4684174784436505, - "learning_rate": 0.0001929830877807616, - "loss": 0.999, - "step": 1612 - }, - { - "epoch": 0.793408755533694, - "grad_norm": 0.48752167381100503, - "learning_rate": 0.00019296041108541215, - "loss": 0.9927, - "step": 1613 - }, - { - "epoch": 0.7939006394490901, - "grad_norm": 0.5276968315456267, - "learning_rate": 0.0001929376991434928, - "loss": 0.9702, - "step": 1614 - }, - { - "epoch": 0.794392523364486, - "grad_norm": 0.4772786470315235, - "learning_rate": 0.00019291495196361496, - "loss": 1.0118, - "step": 1615 - }, - { - "epoch": 0.794884407279882, - "grad_norm": 0.4718448673875022, - "learning_rate": 0.00019289216955440338, - "loss": 1.0037, - "step": 1616 - }, - { - "epoch": 0.7953762911952779, - "grad_norm": 0.4537633482352533, - "learning_rate": 0.0001928693519244962, - "loss": 0.9617, - "step": 1617 - }, - { - "epoch": 0.7958681751106739, - "grad_norm": 0.5164387700241561, - "learning_rate": 0.00019284649908254479, - "loss": 0.9491, - "step": 1618 - }, - { - "epoch": 0.7963600590260699, - "grad_norm": 0.438994288744205, - "learning_rate": 0.00019282361103721405, - "loss": 0.8788, - "step": 1619 - }, - { - "epoch": 0.7968519429414658, - "grad_norm": 0.4631003609801896, - "learning_rate": 0.0001928006877971821, - "loss": 1.006, - "step": 1620 - }, - { - "epoch": 0.7973438268568618, - "grad_norm": 0.432299703741968, - "learning_rate": 0.00019277772937114046, - "loss": 0.9982, - "step": 1621 - }, - { - "epoch": 0.7978357107722578, - "grad_norm": 0.44108770983585777, - "learning_rate": 0.00019275473576779395, - "loss": 0.951, - "step": 1622 - }, - { - "epoch": 0.7983275946876537, - "grad_norm": 0.4342514741553981, - "learning_rate": 0.00019273170699586075, - "loss": 0.9107, - "step": 1623 - }, - { - "epoch": 0.7988194786030497, - "grad_norm": 0.46203229766886594, - "learning_rate": 0.00019270864306407238, - "loss": 0.9805, - "step": 1624 - }, - { - "epoch": 0.7993113625184457, - "grad_norm": 0.43764253671175973, - "learning_rate": 0.0001926855439811737, - "loss": 0.956, - "step": 1625 - }, - { - "epoch": 0.7998032464338416, - "grad_norm": 0.4553303193705595, - "learning_rate": 0.00019266240975592287, - "loss": 0.9146, - "step": 1626 - }, - { - "epoch": 0.8002951303492376, - "grad_norm": 0.43882644054933073, - "learning_rate": 0.00019263924039709138, - "loss": 0.9142, - "step": 1627 - }, - { - "epoch": 0.8007870142646335, - "grad_norm": 0.43967325794082146, - "learning_rate": 0.00019261603591346407, - "loss": 0.9005, - "step": 1628 - }, - { - "epoch": 0.8012788981800295, - "grad_norm": 0.45993695271604523, - "learning_rate": 0.00019259279631383906, - "loss": 0.9224, - "step": 1629 - }, - { - "epoch": 0.8017707820954255, - "grad_norm": 0.41841573061177034, - "learning_rate": 0.00019256952160702783, - "loss": 0.8089, - "step": 1630 - }, - { - "epoch": 0.8022626660108214, - "grad_norm": 0.4517922112489481, - "learning_rate": 0.0001925462118018551, - "loss": 0.9832, - "step": 1631 - }, - { - "epoch": 0.8027545499262174, - "grad_norm": 0.4294363143126728, - "learning_rate": 0.000192522866907159, - "loss": 0.9678, - "step": 1632 - }, - { - "epoch": 0.8032464338416134, - "grad_norm": 0.4655940714688177, - "learning_rate": 0.00019249948693179084, - "loss": 1.0027, - "step": 1633 - }, - { - "epoch": 0.8037383177570093, - "grad_norm": 0.4585318027515355, - "learning_rate": 0.00019247607188461533, - "loss": 1.0446, - "step": 1634 - }, - { - "epoch": 0.8042302016724053, - "grad_norm": 0.4389156114118129, - "learning_rate": 0.00019245262177451043, - "loss": 0.9155, - "step": 1635 - }, - { - "epoch": 0.8047220855878012, - "grad_norm": 0.4371706023446502, - "learning_rate": 0.00019242913661036742, - "loss": 0.9244, - "step": 1636 - }, - { - "epoch": 0.8052139695031972, - "grad_norm": 0.4429513462143878, - "learning_rate": 0.00019240561640109088, - "loss": 0.9678, - "step": 1637 - }, - { - "epoch": 0.8057058534185932, - "grad_norm": 0.45578339516845023, - "learning_rate": 0.0001923820611555986, - "loss": 0.9286, - "step": 1638 - }, - { - "epoch": 0.8061977373339891, - "grad_norm": 0.4438082735917897, - "learning_rate": 0.00019235847088282175, - "loss": 0.8986, - "step": 1639 - }, - { - "epoch": 0.8066896212493851, - "grad_norm": 0.4363231081040358, - "learning_rate": 0.00019233484559170474, - "loss": 0.9207, - "step": 1640 - }, - { - "epoch": 0.8071815051647812, - "grad_norm": 0.47427861827466716, - "learning_rate": 0.00019231118529120525, - "loss": 1.0304, - "step": 1641 - }, - { - "epoch": 0.807673389080177, - "grad_norm": 0.42566058975372983, - "learning_rate": 0.00019228748999029424, - "loss": 0.8729, - "step": 1642 - }, - { - "epoch": 0.8081652729955731, - "grad_norm": 0.47573325704432545, - "learning_rate": 0.00019226375969795596, - "loss": 0.9169, - "step": 1643 - }, - { - "epoch": 0.8086571569109691, - "grad_norm": 4.151933419356274, - "learning_rate": 0.00019223999442318792, - "loss": 0.9422, - "step": 1644 - }, - { - "epoch": 0.809149040826365, - "grad_norm": 0.47818491115512063, - "learning_rate": 0.00019221619417500086, - "loss": 0.9432, - "step": 1645 - }, - { - "epoch": 0.809640924741761, - "grad_norm": 2.5932967494548795, - "learning_rate": 0.00019219235896241878, - "loss": 0.8862, - "step": 1646 - }, - { - "epoch": 0.8101328086571569, - "grad_norm": 0.5538370036659264, - "learning_rate": 0.00019216848879447903, - "loss": 0.9647, - "step": 1647 - }, - { - "epoch": 0.8106246925725529, - "grad_norm": 0.5292785138239161, - "learning_rate": 0.00019214458368023213, - "loss": 0.9156, - "step": 1648 - }, - { - "epoch": 0.8111165764879489, - "grad_norm": 0.5013209288940909, - "learning_rate": 0.0001921206436287418, - "loss": 0.978, - "step": 1649 - }, - { - "epoch": 0.8116084604033448, - "grad_norm": 0.4825509785596944, - "learning_rate": 0.00019209666864908513, - "loss": 0.9485, - "step": 1650 - }, - { - "epoch": 0.8121003443187408, - "grad_norm": 0.4871980247632107, - "learning_rate": 0.00019207265875035242, - "loss": 0.9772, - "step": 1651 - }, - { - "epoch": 0.8125922282341368, - "grad_norm": 0.5304798536348684, - "learning_rate": 0.00019204861394164712, - "loss": 0.9387, - "step": 1652 - }, - { - "epoch": 0.8130841121495327, - "grad_norm": 0.4824297692753066, - "learning_rate": 0.000192024534232086, - "loss": 0.9482, - "step": 1653 - }, - { - "epoch": 0.8135759960649287, - "grad_norm": 0.44976498051806146, - "learning_rate": 0.0001920004196307991, - "loss": 0.8662, - "step": 1654 - }, - { - "epoch": 0.8140678799803246, - "grad_norm": 0.474231975096475, - "learning_rate": 0.00019197627014692957, - "loss": 0.9841, - "step": 1655 - }, - { - "epoch": 0.8145597638957206, - "grad_norm": 0.43054221145474725, - "learning_rate": 0.0001919520857896339, - "loss": 0.8923, - "step": 1656 - }, - { - "epoch": 0.8150516478111166, - "grad_norm": 0.43791924280616396, - "learning_rate": 0.00019192786656808172, - "loss": 0.9218, - "step": 1657 - }, - { - "epoch": 0.8155435317265125, - "grad_norm": 0.4502867626244973, - "learning_rate": 0.0001919036124914559, - "loss": 0.9307, - "step": 1658 - }, - { - "epoch": 0.8160354156419085, - "grad_norm": 0.4601747667702632, - "learning_rate": 0.00019187932356895255, - "loss": 0.888, - "step": 1659 - }, - { - "epoch": 0.8165272995573045, - "grad_norm": 0.41441192313089104, - "learning_rate": 0.000191854999809781, - "loss": 0.8535, - "step": 1660 - }, - { - "epoch": 0.8170191834727004, - "grad_norm": 0.46412142092536074, - "learning_rate": 0.00019183064122316377, - "loss": 0.9116, - "step": 1661 - }, - { - "epoch": 0.8175110673880964, - "grad_norm": 0.4781238738230767, - "learning_rate": 0.00019180624781833653, - "loss": 0.9667, - "step": 1662 - }, - { - "epoch": 0.8180029513034923, - "grad_norm": 0.47809572634639297, - "learning_rate": 0.0001917818196045483, - "loss": 0.9217, - "step": 1663 - }, - { - "epoch": 0.8184948352188883, - "grad_norm": 0.4289354306018643, - "learning_rate": 0.0001917573565910611, - "loss": 0.9228, - "step": 1664 - }, - { - "epoch": 0.8189867191342843, - "grad_norm": 0.4376385534401151, - "learning_rate": 0.0001917328587871503, - "loss": 0.9193, - "step": 1665 - }, - { - "epoch": 0.8194786030496802, - "grad_norm": 0.4602183329184602, - "learning_rate": 0.0001917083262021044, - "loss": 0.9596, - "step": 1666 - }, - { - "epoch": 0.8199704869650762, - "grad_norm": 0.4705010674999434, - "learning_rate": 0.0001916837588452251, - "loss": 0.9906, - "step": 1667 - }, - { - "epoch": 0.8204623708804722, - "grad_norm": 0.4540706040744854, - "learning_rate": 0.00019165915672582728, - "loss": 0.9111, - "step": 1668 - }, - { - "epoch": 0.8209542547958681, - "grad_norm": 0.4399156745771892, - "learning_rate": 0.000191634519853239, - "loss": 0.8972, - "step": 1669 - }, - { - "epoch": 0.8214461387112642, - "grad_norm": 0.4423058953446686, - "learning_rate": 0.0001916098482368015, - "loss": 0.9503, - "step": 1670 - }, - { - "epoch": 0.8219380226266602, - "grad_norm": 0.4505275951880537, - "learning_rate": 0.00019158514188586919, - "loss": 0.9449, - "step": 1671 - }, - { - "epoch": 0.822429906542056, - "grad_norm": 0.4408739893258349, - "learning_rate": 0.00019156040080980962, - "loss": 0.9479, - "step": 1672 - }, - { - "epoch": 0.8229217904574521, - "grad_norm": 0.4380737534669539, - "learning_rate": 0.00019153562501800355, - "loss": 1.0027, - "step": 1673 - }, - { - "epoch": 0.823413674372848, - "grad_norm": 0.4787880601377095, - "learning_rate": 0.00019151081451984495, - "loss": 0.9935, - "step": 1674 - }, - { - "epoch": 0.823905558288244, - "grad_norm": 1.119589826970695, - "learning_rate": 0.00019148596932474078, - "loss": 1.0879, - "step": 1675 - }, - { - "epoch": 0.82439744220364, - "grad_norm": 0.46572840439247004, - "learning_rate": 0.00019146108944211138, - "loss": 0.8878, - "step": 1676 - }, - { - "epoch": 0.8248893261190359, - "grad_norm": 0.4659208296839495, - "learning_rate": 0.00019143617488139004, - "loss": 0.9279, - "step": 1677 - }, - { - "epoch": 0.8253812100344319, - "grad_norm": 0.44553558928537207, - "learning_rate": 0.00019141122565202335, - "loss": 0.9658, - "step": 1678 - }, - { - "epoch": 0.8258730939498279, - "grad_norm": 0.4184334521190072, - "learning_rate": 0.0001913862417634709, - "loss": 0.9047, - "step": 1679 - }, - { - "epoch": 0.8263649778652238, - "grad_norm": 0.467572541474465, - "learning_rate": 0.00019136122322520555, - "loss": 0.9362, - "step": 1680 - }, - { - "epoch": 0.8268568617806198, - "grad_norm": 0.4733217009790865, - "learning_rate": 0.00019133617004671324, - "loss": 0.9328, - "step": 1681 - }, - { - "epoch": 0.8273487456960157, - "grad_norm": 0.4349385845127341, - "learning_rate": 0.0001913110822374931, - "loss": 0.9613, - "step": 1682 - }, - { - "epoch": 0.8278406296114117, - "grad_norm": 0.6962138041986241, - "learning_rate": 0.00019128595980705726, - "loss": 0.9973, - "step": 1683 - }, - { - "epoch": 0.8283325135268077, - "grad_norm": 0.4687083158184151, - "learning_rate": 0.0001912608027649311, - "loss": 0.919, - "step": 1684 - }, - { - "epoch": 0.8288243974422036, - "grad_norm": 0.7325556364348405, - "learning_rate": 0.0001912356111206531, - "loss": 0.9348, - "step": 1685 - }, - { - "epoch": 0.8293162813575996, - "grad_norm": 0.45290954712544573, - "learning_rate": 0.00019121038488377481, - "loss": 0.941, - "step": 1686 - }, - { - "epoch": 0.8298081652729956, - "grad_norm": 0.4638279977461688, - "learning_rate": 0.00019118512406386093, - "loss": 0.9467, - "step": 1687 - }, - { - "epoch": 0.8303000491883915, - "grad_norm": 0.4937722375033083, - "learning_rate": 0.0001911598286704893, - "loss": 1.0182, - "step": 1688 - }, - { - "epoch": 0.8307919331037875, - "grad_norm": 0.4978750826206753, - "learning_rate": 0.00019113449871325082, - "loss": 0.9416, - "step": 1689 - }, - { - "epoch": 0.8312838170191835, - "grad_norm": 0.47674968324113887, - "learning_rate": 0.00019110913420174953, - "loss": 0.9751, - "step": 1690 - }, - { - "epoch": 0.8317757009345794, - "grad_norm": 0.5853175017337767, - "learning_rate": 0.00019108373514560253, - "loss": 0.9332, - "step": 1691 - }, - { - "epoch": 0.8322675848499754, - "grad_norm": 0.8346336216787719, - "learning_rate": 0.00019105830155444006, - "loss": 0.942, - "step": 1692 - }, - { - "epoch": 0.8327594687653713, - "grad_norm": 0.4704554197974211, - "learning_rate": 0.00019103283343790544, - "loss": 0.9197, - "step": 1693 - }, - { - "epoch": 0.8332513526807673, - "grad_norm": 0.44791715506213525, - "learning_rate": 0.00019100733080565507, - "loss": 0.8747, - "step": 1694 - }, - { - "epoch": 0.8337432365961633, - "grad_norm": 0.4785105991498684, - "learning_rate": 0.00019098179366735846, - "loss": 0.9503, - "step": 1695 - }, - { - "epoch": 0.8342351205115592, - "grad_norm": 0.44983205244026786, - "learning_rate": 0.00019095622203269818, - "loss": 0.9216, - "step": 1696 - }, - { - "epoch": 0.8347270044269552, - "grad_norm": 0.4145305835128189, - "learning_rate": 0.00019093061591136988, - "loss": 0.9095, - "step": 1697 - }, - { - "epoch": 0.8352188883423513, - "grad_norm": 0.5481351760124087, - "learning_rate": 0.00019090497531308232, - "loss": 0.9839, - "step": 1698 - }, - { - "epoch": 0.8357107722577471, - "grad_norm": 0.4298929399943066, - "learning_rate": 0.00019087930024755729, - "loss": 0.8435, - "step": 1699 - }, - { - "epoch": 0.8362026561731432, - "grad_norm": 0.5331492927411966, - "learning_rate": 0.00019085359072452965, - "loss": 0.9511, - "step": 1700 - }, - { - "epoch": 0.836694540088539, - "grad_norm": 0.4547815109154883, - "learning_rate": 0.00019082784675374738, - "loss": 0.9574, - "step": 1701 - }, - { - "epoch": 0.8371864240039351, - "grad_norm": 0.4669681706844525, - "learning_rate": 0.00019080206834497143, - "loss": 0.8932, - "step": 1702 - }, - { - "epoch": 0.8376783079193311, - "grad_norm": 0.5294472646601358, - "learning_rate": 0.00019077625550797592, - "loss": 0.8594, - "step": 1703 - }, - { - "epoch": 0.838170191834727, - "grad_norm": 0.49214535071449894, - "learning_rate": 0.0001907504082525479, - "loss": 0.9738, - "step": 1704 - }, - { - "epoch": 0.838662075750123, - "grad_norm": 0.43244479297440763, - "learning_rate": 0.00019072452658848755, - "loss": 0.942, - "step": 1705 - }, - { - "epoch": 0.839153959665519, - "grad_norm": 0.4904340790964634, - "learning_rate": 0.00019069861052560812, - "loss": 0.9537, - "step": 1706 - }, - { - "epoch": 0.8396458435809149, - "grad_norm": 0.43774183775168746, - "learning_rate": 0.00019067266007373582, - "loss": 0.8301, - "step": 1707 - }, - { - "epoch": 0.8401377274963109, - "grad_norm": 0.46050558873106656, - "learning_rate": 0.00019064667524270994, - "loss": 0.953, - "step": 1708 - }, - { - "epoch": 0.8406296114117069, - "grad_norm": 0.4484627683624173, - "learning_rate": 0.0001906206560423828, - "loss": 0.918, - "step": 1709 - }, - { - "epoch": 0.8411214953271028, - "grad_norm": 0.4296532777857049, - "learning_rate": 0.00019059460248261977, - "loss": 0.824, - "step": 1710 - }, - { - "epoch": 0.8416133792424988, - "grad_norm": 0.45532748283712865, - "learning_rate": 0.00019056851457329926, - "loss": 0.991, - "step": 1711 - }, - { - "epoch": 0.8421052631578947, - "grad_norm": 0.4160688152743597, - "learning_rate": 0.00019054239232431264, - "loss": 0.8287, - "step": 1712 - }, - { - "epoch": 0.8425971470732907, - "grad_norm": 0.4187920785854741, - "learning_rate": 0.00019051623574556435, - "loss": 0.9079, - "step": 1713 - }, - { - "epoch": 0.8430890309886867, - "grad_norm": 0.4483790378833997, - "learning_rate": 0.00019049004484697184, - "loss": 0.9457, - "step": 1714 - }, - { - "epoch": 0.8435809149040826, - "grad_norm": 0.4868550243867067, - "learning_rate": 0.00019046381963846553, - "loss": 0.961, - "step": 1715 - }, - { - "epoch": 0.8440727988194786, - "grad_norm": 0.43569359093080523, - "learning_rate": 0.00019043756012998895, - "loss": 0.9038, - "step": 1716 - }, - { - "epoch": 0.8445646827348746, - "grad_norm": 0.44509012324050984, - "learning_rate": 0.0001904112663314985, - "loss": 0.983, - "step": 1717 - }, - { - "epoch": 0.8450565666502705, - "grad_norm": 0.476317419826852, - "learning_rate": 0.0001903849382529637, - "loss": 0.9143, - "step": 1718 - }, - { - "epoch": 0.8455484505656665, - "grad_norm": 0.41731740875023754, - "learning_rate": 0.00019035857590436704, - "loss": 0.8802, - "step": 1719 - }, - { - "epoch": 0.8460403344810624, - "grad_norm": 0.47613499203651155, - "learning_rate": 0.00019033217929570391, - "loss": 0.9191, - "step": 1720 - }, - { - "epoch": 0.8465322183964584, - "grad_norm": 0.4464178651406063, - "learning_rate": 0.00019030574843698281, - "loss": 0.9648, - "step": 1721 - }, - { - "epoch": 0.8470241023118544, - "grad_norm": 0.44463784745952545, - "learning_rate": 0.00019027928333822515, - "loss": 0.8904, - "step": 1722 - }, - { - "epoch": 0.8475159862272503, - "grad_norm": 0.42619905969166394, - "learning_rate": 0.00019025278400946542, - "loss": 0.8552, - "step": 1723 - }, - { - "epoch": 0.8480078701426463, - "grad_norm": 0.4261901301127405, - "learning_rate": 0.00019022625046075095, - "loss": 0.9629, - "step": 1724 - }, - { - "epoch": 0.8484997540580423, - "grad_norm": 0.5796037243857726, - "learning_rate": 0.00019019968270214213, - "loss": 0.9205, - "step": 1725 - }, - { - "epoch": 0.8489916379734382, - "grad_norm": 0.44780252737015724, - "learning_rate": 0.00019017308074371233, - "loss": 0.9937, - "step": 1726 - }, - { - "epoch": 0.8494835218888342, - "grad_norm": 0.4877912151431391, - "learning_rate": 0.00019014644459554786, - "loss": 0.9661, - "step": 1727 - }, - { - "epoch": 0.8499754058042303, - "grad_norm": 0.48469045725544496, - "learning_rate": 0.00019011977426774797, - "loss": 0.9564, - "step": 1728 - }, - { - "epoch": 0.8504672897196262, - "grad_norm": 0.44586207792614124, - "learning_rate": 0.00019009306977042488, - "loss": 0.9836, - "step": 1729 - }, - { - "epoch": 0.8509591736350222, - "grad_norm": 0.41808746204810365, - "learning_rate": 0.00019006633111370386, - "loss": 0.8683, - "step": 1730 - }, - { - "epoch": 0.8514510575504181, - "grad_norm": 0.4524247488232523, - "learning_rate": 0.00019003955830772297, - "loss": 0.8832, - "step": 1731 - }, - { - "epoch": 0.8519429414658141, - "grad_norm": 0.4613344924505022, - "learning_rate": 0.00019001275136263333, - "loss": 0.9312, - "step": 1732 - }, - { - "epoch": 0.8524348253812101, - "grad_norm": 0.45390650900046664, - "learning_rate": 0.00018998591028859902, - "loss": 0.8647, - "step": 1733 - }, - { - "epoch": 0.852926709296606, - "grad_norm": 0.4582601512809806, - "learning_rate": 0.00018995903509579694, - "loss": 0.9831, - "step": 1734 - }, - { - "epoch": 0.853418593212002, - "grad_norm": 0.4711376053055614, - "learning_rate": 0.00018993212579441703, - "loss": 0.9372, - "step": 1735 - }, - { - "epoch": 0.853910477127398, - "grad_norm": 0.44223921031020474, - "learning_rate": 0.00018990518239466215, - "loss": 0.9191, - "step": 1736 - }, - { - "epoch": 0.8544023610427939, - "grad_norm": 0.43119686684735786, - "learning_rate": 0.00018987820490674805, - "loss": 0.9995, - "step": 1737 - }, - { - "epoch": 0.8548942449581899, - "grad_norm": 0.4472573559680049, - "learning_rate": 0.00018985119334090348, - "loss": 0.9674, - "step": 1738 - }, - { - "epoch": 0.8553861288735858, - "grad_norm": 0.44812802548277875, - "learning_rate": 0.00018982414770737, - "loss": 0.94, - "step": 1739 - }, - { - "epoch": 0.8558780127889818, - "grad_norm": 0.4269593800378158, - "learning_rate": 0.00018979706801640212, - "loss": 0.8899, - "step": 1740 - }, - { - "epoch": 0.8563698967043778, - "grad_norm": 0.42748425771216847, - "learning_rate": 0.00018976995427826736, - "loss": 0.9473, - "step": 1741 - }, - { - "epoch": 0.8568617806197737, - "grad_norm": 0.434640751370082, - "learning_rate": 0.00018974280650324606, - "loss": 0.9424, - "step": 1742 - }, - { - "epoch": 0.8573536645351697, - "grad_norm": 0.44851806532954686, - "learning_rate": 0.00018971562470163146, - "loss": 0.9983, - "step": 1743 - }, - { - "epoch": 0.8578455484505657, - "grad_norm": 0.4552608523036954, - "learning_rate": 0.00018968840888372972, - "loss": 0.9899, - "step": 1744 - }, - { - "epoch": 0.8583374323659616, - "grad_norm": 0.4286141547884595, - "learning_rate": 0.00018966115905985994, - "loss": 0.9454, - "step": 1745 - }, - { - "epoch": 0.8588293162813576, - "grad_norm": 0.41057848902217786, - "learning_rate": 0.00018963387524035405, - "loss": 0.947, - "step": 1746 - }, - { - "epoch": 0.8593212001967536, - "grad_norm": 0.4325695053662549, - "learning_rate": 0.0001896065574355569, - "loss": 0.9442, - "step": 1747 - }, - { - "epoch": 0.8598130841121495, - "grad_norm": 0.42886033764967324, - "learning_rate": 0.0001895792056558262, - "loss": 0.9677, - "step": 1748 - }, - { - "epoch": 0.8603049680275455, - "grad_norm": 0.4614553992069052, - "learning_rate": 0.00018955181991153262, - "loss": 0.9718, - "step": 1749 - }, - { - "epoch": 0.8607968519429414, - "grad_norm": 0.5977683144717094, - "learning_rate": 0.00018952440021305958, - "loss": 0.8984, - "step": 1750 - }, - { - "epoch": 0.8612887358583374, - "grad_norm": 0.4444559794221156, - "learning_rate": 0.00018949694657080347, - "loss": 0.9548, - "step": 1751 - }, - { - "epoch": 0.8617806197737334, - "grad_norm": 0.44414338599542635, - "learning_rate": 0.00018946945899517355, - "loss": 0.923, - "step": 1752 - }, - { - "epoch": 0.8622725036891293, - "grad_norm": 0.4367775065694892, - "learning_rate": 0.0001894419374965919, - "loss": 0.9239, - "step": 1753 - }, - { - "epoch": 0.8627643876045253, - "grad_norm": 0.4332821664486575, - "learning_rate": 0.00018941438208549348, - "loss": 0.9647, - "step": 1754 - }, - { - "epoch": 0.8632562715199213, - "grad_norm": 0.442896623249607, - "learning_rate": 0.0001893867927723261, - "loss": 0.9067, - "step": 1755 - }, - { - "epoch": 0.8637481554353172, - "grad_norm": 0.4765982923498218, - "learning_rate": 0.00018935916956755049, - "loss": 0.8763, - "step": 1756 - }, - { - "epoch": 0.8642400393507133, - "grad_norm": 0.5085873054440848, - "learning_rate": 0.00018933151248164013, - "loss": 0.9125, - "step": 1757 - }, - { - "epoch": 0.8647319232661091, - "grad_norm": 0.4450768587114687, - "learning_rate": 0.00018930382152508135, - "loss": 0.9892, - "step": 1758 - }, - { - "epoch": 0.8652238071815052, - "grad_norm": 0.44517932512121955, - "learning_rate": 0.00018927609670837345, - "loss": 0.9625, - "step": 1759 - }, - { - "epoch": 0.8657156910969012, - "grad_norm": 0.48411990596383014, - "learning_rate": 0.0001892483380420284, - "loss": 1.005, - "step": 1760 - }, - { - "epoch": 0.8662075750122971, - "grad_norm": 0.45385066165339627, - "learning_rate": 0.0001892205455365712, - "loss": 0.9319, - "step": 1761 - }, - { - "epoch": 0.8666994589276931, - "grad_norm": 0.45143595633541134, - "learning_rate": 0.00018919271920253946, - "loss": 0.9441, - "step": 1762 - }, - { - "epoch": 0.8671913428430891, - "grad_norm": 0.45260700492803047, - "learning_rate": 0.00018916485905048376, - "loss": 0.9624, - "step": 1763 - }, - { - "epoch": 0.867683226758485, - "grad_norm": 0.5206644890104113, - "learning_rate": 0.00018913696509096744, - "loss": 0.9982, - "step": 1764 - }, - { - "epoch": 0.868175110673881, - "grad_norm": 0.43102045635046177, - "learning_rate": 0.00018910903733456675, - "loss": 0.8632, - "step": 1765 - }, - { - "epoch": 0.8686669945892769, - "grad_norm": 0.45239651443635964, - "learning_rate": 0.00018908107579187062, - "loss": 0.9662, - "step": 1766 - }, - { - "epoch": 0.8691588785046729, - "grad_norm": 0.44902898324976237, - "learning_rate": 0.00018905308047348091, - "loss": 0.8942, - "step": 1767 - }, - { - "epoch": 0.8696507624200689, - "grad_norm": 0.44351856444298865, - "learning_rate": 0.00018902505139001217, - "loss": 0.8789, - "step": 1768 - }, - { - "epoch": 0.8701426463354648, - "grad_norm": 0.4375309473126057, - "learning_rate": 0.00018899698855209191, - "loss": 0.9314, - "step": 1769 - }, - { - "epoch": 0.8706345302508608, - "grad_norm": 0.6783015497558752, - "learning_rate": 0.00018896889197036028, - "loss": 0.9623, - "step": 1770 - }, - { - "epoch": 0.8711264141662568, - "grad_norm": 0.460625183828372, - "learning_rate": 0.00018894076165547026, - "loss": 0.9257, - "step": 1771 - }, - { - "epoch": 0.8716182980816527, - "grad_norm": 0.41840175884135405, - "learning_rate": 0.0001889125976180877, - "loss": 0.941, - "step": 1772 - }, - { - "epoch": 0.8721101819970487, - "grad_norm": 0.4473604481778361, - "learning_rate": 0.0001888843998688912, - "loss": 0.9419, - "step": 1773 - }, - { - "epoch": 0.8726020659124447, - "grad_norm": 0.4575990059010378, - "learning_rate": 0.00018885616841857213, - "loss": 0.9284, - "step": 1774 - }, - { - "epoch": 0.8730939498278406, - "grad_norm": 0.4737021978731937, - "learning_rate": 0.00018882790327783456, - "loss": 0.9625, - "step": 1775 - }, - { - "epoch": 0.8735858337432366, - "grad_norm": 0.4677052281961736, - "learning_rate": 0.00018879960445739545, - "loss": 0.9286, - "step": 1776 - }, - { - "epoch": 0.8740777176586325, - "grad_norm": 0.4466015227284853, - "learning_rate": 0.00018877127196798458, - "loss": 0.9675, - "step": 1777 - }, - { - "epoch": 0.8745696015740285, - "grad_norm": 0.4380609013163242, - "learning_rate": 0.00018874290582034426, - "loss": 0.8788, - "step": 1778 - }, - { - "epoch": 0.8750614854894245, - "grad_norm": 0.42955753585069034, - "learning_rate": 0.00018871450602522978, - "loss": 0.9219, - "step": 1779 - }, - { - "epoch": 0.8755533694048204, - "grad_norm": 0.4673997530348016, - "learning_rate": 0.00018868607259340912, - "loss": 1.0069, - "step": 1780 - }, - { - "epoch": 0.8760452533202164, - "grad_norm": 0.45299558876329443, - "learning_rate": 0.00018865760553566296, - "loss": 0.8895, - "step": 1781 - }, - { - "epoch": 0.8765371372356124, - "grad_norm": 0.43026142358832664, - "learning_rate": 0.00018862910486278485, - "loss": 0.9287, - "step": 1782 - }, - { - "epoch": 0.8770290211510083, - "grad_norm": 0.45346870074831724, - "learning_rate": 0.00018860057058558096, - "loss": 0.967, - "step": 1783 - }, - { - "epoch": 0.8775209050664043, - "grad_norm": 0.41438574406103895, - "learning_rate": 0.0001885720027148703, - "loss": 0.8696, - "step": 1784 - }, - { - "epoch": 0.8780127889818002, - "grad_norm": 0.4209309017198743, - "learning_rate": 0.0001885434012614845, - "loss": 0.893, - "step": 1785 - }, - { - "epoch": 0.8785046728971962, - "grad_norm": 0.45927453652065253, - "learning_rate": 0.00018851476623626804, - "loss": 0.9263, - "step": 1786 - }, - { - "epoch": 0.8789965568125923, - "grad_norm": 0.4428204057844921, - "learning_rate": 0.0001884860976500781, - "loss": 0.9492, - "step": 1787 - }, - { - "epoch": 0.8794884407279882, - "grad_norm": 0.4360707709403154, - "learning_rate": 0.00018845739551378454, - "loss": 0.9493, - "step": 1788 - }, - { - "epoch": 0.8799803246433842, - "grad_norm": 0.42945002857421083, - "learning_rate": 0.00018842865983827, - "loss": 0.9494, - "step": 1789 - }, - { - "epoch": 0.8804722085587802, - "grad_norm": 0.42044826504153504, - "learning_rate": 0.00018839989063442974, - "loss": 0.9344, - "step": 1790 - }, - { - "epoch": 0.8809640924741761, - "grad_norm": 0.4175668456403529, - "learning_rate": 0.00018837108791317192, - "loss": 0.8691, - "step": 1791 - }, - { - "epoch": 0.8814559763895721, - "grad_norm": 0.41589198575210096, - "learning_rate": 0.00018834225168541714, - "loss": 0.8277, - "step": 1792 - }, - { - "epoch": 0.8819478603049681, - "grad_norm": 0.45472823744226093, - "learning_rate": 0.00018831338196209898, - "loss": 0.9579, - "step": 1793 - }, - { - "epoch": 0.882439744220364, - "grad_norm": 0.43642395638749093, - "learning_rate": 0.0001882844787541635, - "loss": 0.8524, - "step": 1794 - }, - { - "epoch": 0.88293162813576, - "grad_norm": 2.101447151660097, - "learning_rate": 0.00018825554207256958, - "loss": 1.0781, - "step": 1795 - }, - { - "epoch": 0.8834235120511559, - "grad_norm": 0.4709293264588742, - "learning_rate": 0.00018822657192828877, - "loss": 0.944, - "step": 1796 - }, - { - "epoch": 0.8839153959665519, - "grad_norm": 0.4847808483734464, - "learning_rate": 0.00018819756833230526, - "loss": 0.912, - "step": 1797 - }, - { - "epoch": 0.8844072798819479, - "grad_norm": 0.4565319301550743, - "learning_rate": 0.00018816853129561601, - "loss": 0.9549, - "step": 1798 - }, - { - "epoch": 0.8848991637973438, - "grad_norm": 0.447116787345367, - "learning_rate": 0.00018813946082923058, - "loss": 0.9802, - "step": 1799 - }, - { - "epoch": 0.8853910477127398, - "grad_norm": 0.45879294419820016, - "learning_rate": 0.00018811035694417126, - "loss": 0.9126, - "step": 1800 - }, - { - "epoch": 0.8858829316281358, - "grad_norm": 0.43693103995260546, - "learning_rate": 0.00018808121965147294, - "loss": 0.8992, - "step": 1801 - }, - { - "epoch": 0.8863748155435317, - "grad_norm": 0.48290914109320904, - "learning_rate": 0.00018805204896218326, - "loss": 0.9972, - "step": 1802 - }, - { - "epoch": 0.8868666994589277, - "grad_norm": 0.4240014058321878, - "learning_rate": 0.00018802284488736245, - "loss": 0.8836, - "step": 1803 - }, - { - "epoch": 0.8873585833743236, - "grad_norm": 0.45171863426094383, - "learning_rate": 0.00018799360743808346, - "loss": 0.9427, - "step": 1804 - }, - { - "epoch": 0.8878504672897196, - "grad_norm": 0.4464676215669015, - "learning_rate": 0.0001879643366254319, - "loss": 0.9595, - "step": 1805 - }, - { - "epoch": 0.8883423512051156, - "grad_norm": 0.4561868913576593, - "learning_rate": 0.00018793503246050593, - "loss": 0.9077, - "step": 1806 - }, - { - "epoch": 0.8888342351205115, - "grad_norm": 0.41949886953612353, - "learning_rate": 0.00018790569495441645, - "loss": 0.8574, - "step": 1807 - }, - { - "epoch": 0.8893261190359075, - "grad_norm": 0.446706485049033, - "learning_rate": 0.00018787632411828697, - "loss": 0.9026, - "step": 1808 - }, - { - "epoch": 0.8898180029513035, - "grad_norm": 0.4697568797443752, - "learning_rate": 0.00018784691996325366, - "loss": 0.968, - "step": 1809 - }, - { - "epoch": 0.8903098868666994, - "grad_norm": 0.44579647844335313, - "learning_rate": 0.0001878174825004653, - "loss": 1.0486, - "step": 1810 - }, - { - "epoch": 0.8908017707820954, - "grad_norm": 0.44467914706379713, - "learning_rate": 0.00018778801174108327, - "loss": 0.9979, - "step": 1811 - }, - { - "epoch": 0.8912936546974914, - "grad_norm": 0.44786684227577395, - "learning_rate": 0.00018775850769628164, - "loss": 1.013, - "step": 1812 - }, - { - "epoch": 0.8917855386128873, - "grad_norm": 0.42923655580706227, - "learning_rate": 0.00018772897037724708, - "loss": 0.9077, - "step": 1813 - }, - { - "epoch": 0.8922774225282833, - "grad_norm": 0.42947719794182665, - "learning_rate": 0.00018769939979517883, - "loss": 0.9336, - "step": 1814 - }, - { - "epoch": 0.8927693064436792, - "grad_norm": 0.4515765543587999, - "learning_rate": 0.0001876697959612888, - "loss": 0.9822, - "step": 1815 - }, - { - "epoch": 0.8932611903590753, - "grad_norm": 0.4433570383113576, - "learning_rate": 0.0001876401588868015, - "loss": 0.8458, - "step": 1816 - }, - { - "epoch": 0.8937530742744713, - "grad_norm": 0.44141775764517255, - "learning_rate": 0.00018761048858295396, - "loss": 0.9131, - "step": 1817 - }, - { - "epoch": 0.8942449581898672, - "grad_norm": 0.4907548622677925, - "learning_rate": 0.00018758078506099594, - "loss": 0.9734, - "step": 1818 - }, - { - "epoch": 0.8947368421052632, - "grad_norm": 0.42816057571922717, - "learning_rate": 0.0001875510483321897, - "loss": 0.8329, - "step": 1819 - }, - { - "epoch": 0.8952287260206592, - "grad_norm": 0.4278009393349627, - "learning_rate": 0.00018752127840781016, - "loss": 0.9078, - "step": 1820 - }, - { - "epoch": 0.8957206099360551, - "grad_norm": 0.4558932669114811, - "learning_rate": 0.00018749147529914477, - "loss": 0.9961, - "step": 1821 - }, - { - "epoch": 0.8962124938514511, - "grad_norm": 0.46760053325802814, - "learning_rate": 0.00018746163901749356, - "loss": 0.8574, - "step": 1822 - }, - { - "epoch": 0.896704377766847, - "grad_norm": 0.44382122126076795, - "learning_rate": 0.0001874317695741692, - "loss": 0.9047, - "step": 1823 - }, - { - "epoch": 0.897196261682243, - "grad_norm": 0.46080332998879403, - "learning_rate": 0.00018740186698049682, - "loss": 0.8905, - "step": 1824 - }, - { - "epoch": 0.897688145597639, - "grad_norm": 0.4685192139052703, - "learning_rate": 0.00018737193124781425, - "loss": 0.9739, - "step": 1825 - }, - { - "epoch": 0.8981800295130349, - "grad_norm": 0.4594300245239511, - "learning_rate": 0.00018734196238747185, - "loss": 0.9846, - "step": 1826 - }, - { - "epoch": 0.8986719134284309, - "grad_norm": 0.5865265442015458, - "learning_rate": 0.00018731196041083244, - "loss": 0.8612, - "step": 1827 - }, - { - "epoch": 0.8991637973438269, - "grad_norm": 0.42629041077369384, - "learning_rate": 0.0001872819253292715, - "loss": 0.9517, - "step": 1828 - }, - { - "epoch": 0.8996556812592228, - "grad_norm": 0.45426128027116996, - "learning_rate": 0.00018725185715417708, - "loss": 0.953, - "step": 1829 - }, - { - "epoch": 0.9001475651746188, - "grad_norm": 0.46208254440984714, - "learning_rate": 0.0001872217558969497, - "loss": 0.8879, - "step": 1830 - }, - { - "epoch": 0.9006394490900148, - "grad_norm": 0.4518760176288391, - "learning_rate": 0.00018719162156900242, - "loss": 0.9349, - "step": 1831 - }, - { - "epoch": 0.9011313330054107, - "grad_norm": 0.4507850113266245, - "learning_rate": 0.00018716145418176092, - "loss": 0.9039, - "step": 1832 - }, - { - "epoch": 0.9016232169208067, - "grad_norm": 0.4257408028936146, - "learning_rate": 0.00018713125374666338, - "loss": 0.8838, - "step": 1833 - }, - { - "epoch": 0.9021151008362026, - "grad_norm": 0.48557527688552893, - "learning_rate": 0.00018710102027516047, - "loss": 1.0541, - "step": 1834 - }, - { - "epoch": 0.9026069847515986, - "grad_norm": 0.4236455716031953, - "learning_rate": 0.00018707075377871543, - "loss": 0.8758, - "step": 1835 - }, - { - "epoch": 0.9030988686669946, - "grad_norm": 0.464174581369836, - "learning_rate": 0.00018704045426880397, - "loss": 0.8693, - "step": 1836 - }, - { - "epoch": 0.9035907525823905, - "grad_norm": 0.42982053434237333, - "learning_rate": 0.0001870101217569144, - "loss": 0.9359, - "step": 1837 - }, - { - "epoch": 0.9040826364977865, - "grad_norm": 0.511779869667633, - "learning_rate": 0.0001869797562545475, - "loss": 1.0436, - "step": 1838 - }, - { - "epoch": 0.9045745204131825, - "grad_norm": 0.4888814299069477, - "learning_rate": 0.00018694935777321652, - "loss": 0.8595, - "step": 1839 - }, - { - "epoch": 0.9050664043285784, - "grad_norm": 0.4508780961225434, - "learning_rate": 0.0001869189263244473, - "loss": 0.895, - "step": 1840 - }, - { - "epoch": 0.9055582882439744, - "grad_norm": 0.4475570054583435, - "learning_rate": 0.00018688846191977808, - "loss": 0.9599, - "step": 1841 - }, - { - "epoch": 0.9060501721593703, - "grad_norm": 0.4866002867345899, - "learning_rate": 0.00018685796457075963, - "loss": 0.9138, - "step": 1842 - }, - { - "epoch": 0.9065420560747663, - "grad_norm": 0.46993297013305496, - "learning_rate": 0.0001868274342889553, - "loss": 1.0056, - "step": 1843 - }, - { - "epoch": 0.9070339399901624, - "grad_norm": 0.4298751366446946, - "learning_rate": 0.00018679687108594082, - "loss": 0.869, - "step": 1844 - }, - { - "epoch": 0.9075258239055582, - "grad_norm": 0.4444019633615243, - "learning_rate": 0.0001867662749733044, - "loss": 0.9685, - "step": 1845 - }, - { - "epoch": 0.9080177078209543, - "grad_norm": 0.4722580187848079, - "learning_rate": 0.0001867356459626468, - "loss": 0.9801, - "step": 1846 - }, - { - "epoch": 0.9085095917363503, - "grad_norm": 0.4628673412343863, - "learning_rate": 0.00018670498406558124, - "loss": 0.9607, - "step": 1847 - }, - { - "epoch": 0.9090014756517462, - "grad_norm": 0.44309659797891776, - "learning_rate": 0.00018667428929373335, - "loss": 0.9297, - "step": 1848 - }, - { - "epoch": 0.9094933595671422, - "grad_norm": 0.46922581862168145, - "learning_rate": 0.00018664356165874123, - "loss": 0.9994, - "step": 1849 - }, - { - "epoch": 0.9099852434825382, - "grad_norm": 0.4398640279326358, - "learning_rate": 0.00018661280117225555, - "loss": 0.9195, - "step": 1850 - }, - { - "epoch": 0.9104771273979341, - "grad_norm": 0.43088492080767665, - "learning_rate": 0.00018658200784593928, - "loss": 0.8881, - "step": 1851 - }, - { - "epoch": 0.9109690113133301, - "grad_norm": 0.46171660251764074, - "learning_rate": 0.00018655118169146797, - "loss": 0.9336, - "step": 1852 - }, - { - "epoch": 0.911460895228726, - "grad_norm": 0.49775393116646227, - "learning_rate": 0.00018652032272052958, - "loss": 0.9685, - "step": 1853 - }, - { - "epoch": 0.911952779144122, - "grad_norm": 0.4944238767925219, - "learning_rate": 0.00018648943094482442, - "loss": 0.8922, - "step": 1854 - }, - { - "epoch": 0.912444663059518, - "grad_norm": 0.4581618552443388, - "learning_rate": 0.00018645850637606537, - "loss": 0.9662, - "step": 1855 - }, - { - "epoch": 0.9129365469749139, - "grad_norm": 0.4489032703552213, - "learning_rate": 0.0001864275490259777, - "loss": 0.9587, - "step": 1856 - }, - { - "epoch": 0.9134284308903099, - "grad_norm": 0.42582539486312315, - "learning_rate": 0.00018639655890629909, - "loss": 0.919, - "step": 1857 - }, - { - "epoch": 0.9139203148057059, - "grad_norm": 0.4377317417163939, - "learning_rate": 0.00018636553602877963, - "loss": 0.8757, - "step": 1858 - }, - { - "epoch": 0.9144121987211018, - "grad_norm": 0.4032185122618743, - "learning_rate": 0.00018633448040518186, - "loss": 0.845, - "step": 1859 - }, - { - "epoch": 0.9149040826364978, - "grad_norm": 0.7158335031811692, - "learning_rate": 0.00018630339204728076, - "loss": 1.0356, - "step": 1860 - }, - { - "epoch": 0.9153959665518937, - "grad_norm": 0.44170501967634584, - "learning_rate": 0.00018627227096686366, - "loss": 0.9958, - "step": 1861 - }, - { - "epoch": 0.9158878504672897, - "grad_norm": 0.416661324416237, - "learning_rate": 0.00018624111717573035, - "loss": 0.8612, - "step": 1862 - }, - { - "epoch": 0.9163797343826857, - "grad_norm": 0.4382768634773933, - "learning_rate": 0.000186209930685693, - "loss": 0.9949, - "step": 1863 - }, - { - "epoch": 0.9168716182980816, - "grad_norm": 0.5154781599152937, - "learning_rate": 0.00018617871150857616, - "loss": 0.9891, - "step": 1864 - }, - { - "epoch": 0.9173635022134776, - "grad_norm": 0.45208056070997843, - "learning_rate": 0.00018614745965621677, - "loss": 0.8528, - "step": 1865 - }, - { - "epoch": 0.9178553861288736, - "grad_norm": 0.5629004632365258, - "learning_rate": 0.00018611617514046426, - "loss": 0.8792, - "step": 1866 - }, - { - "epoch": 0.9183472700442695, - "grad_norm": 0.43089672623691216, - "learning_rate": 0.00018608485797318028, - "loss": 0.9007, - "step": 1867 - }, - { - "epoch": 0.9188391539596655, - "grad_norm": 0.44375858233910803, - "learning_rate": 0.00018605350816623903, - "loss": 0.9179, - "step": 1868 - }, - { - "epoch": 0.9193310378750615, - "grad_norm": 0.4396800077461436, - "learning_rate": 0.00018602212573152693, - "loss": 0.8481, - "step": 1869 - }, - { - "epoch": 0.9198229217904574, - "grad_norm": 0.5170865212268547, - "learning_rate": 0.0001859907106809429, - "loss": 0.9622, - "step": 1870 - }, - { - "epoch": 0.9203148057058534, - "grad_norm": 0.48029214092701894, - "learning_rate": 0.00018595926302639813, - "loss": 0.9137, - "step": 1871 - }, - { - "epoch": 0.9208066896212493, - "grad_norm": 0.42481304631971484, - "learning_rate": 0.0001859277827798162, - "loss": 0.9244, - "step": 1872 - }, - { - "epoch": 0.9212985735366453, - "grad_norm": 0.7570373907074284, - "learning_rate": 0.00018589626995313313, - "loss": 0.9975, - "step": 1873 - }, - { - "epoch": 0.9217904574520414, - "grad_norm": 0.4345765525864509, - "learning_rate": 0.00018586472455829714, - "loss": 0.9243, - "step": 1874 - }, - { - "epoch": 0.9222823413674373, - "grad_norm": 0.6711781365151313, - "learning_rate": 0.00018583314660726888, - "loss": 0.9892, - "step": 1875 - }, - { - "epoch": 0.9227742252828333, - "grad_norm": 0.4348899393531873, - "learning_rate": 0.00018580153611202143, - "loss": 0.9168, - "step": 1876 - }, - { - "epoch": 0.9232661091982293, - "grad_norm": 0.41152523031057653, - "learning_rate": 0.00018576989308454004, - "loss": 0.8642, - "step": 1877 - }, - { - "epoch": 0.9237579931136252, - "grad_norm": 0.43576406709295346, - "learning_rate": 0.00018573821753682242, - "loss": 0.9849, - "step": 1878 - }, - { - "epoch": 0.9242498770290212, - "grad_norm": 0.5755530057660726, - "learning_rate": 0.00018570650948087857, - "loss": 0.9933, - "step": 1879 - }, - { - "epoch": 0.9247417609444171, - "grad_norm": 0.45628544644922775, - "learning_rate": 0.0001856747689287308, - "loss": 0.9279, - "step": 1880 - }, - { - "epoch": 0.9252336448598131, - "grad_norm": 0.49067584406698944, - "learning_rate": 0.00018564299589241375, - "loss": 1.007, - "step": 1881 - }, - { - "epoch": 0.9257255287752091, - "grad_norm": 0.552174844805938, - "learning_rate": 0.0001856111903839744, - "loss": 0.9696, - "step": 1882 - }, - { - "epoch": 0.926217412690605, - "grad_norm": 0.4464444978628804, - "learning_rate": 0.000185579352415472, - "loss": 0.9474, - "step": 1883 - }, - { - "epoch": 0.926709296606001, - "grad_norm": 0.4491417360834481, - "learning_rate": 0.00018554748199897813, - "loss": 0.9586, - "step": 1884 - }, - { - "epoch": 0.927201180521397, - "grad_norm": 0.43677339425529066, - "learning_rate": 0.0001855155791465767, - "loss": 0.896, - "step": 1885 - }, - { - "epoch": 0.9276930644367929, - "grad_norm": 0.43873421867582957, - "learning_rate": 0.0001854836438703639, - "loss": 0.8738, - "step": 1886 - }, - { - "epoch": 0.9281849483521889, - "grad_norm": 0.4324939602446171, - "learning_rate": 0.0001854516761824482, - "loss": 0.863, - "step": 1887 - }, - { - "epoch": 0.9286768322675848, - "grad_norm": 0.4551328220273714, - "learning_rate": 0.00018541967609495032, - "loss": 0.9598, - "step": 1888 - }, - { - "epoch": 0.9291687161829808, - "grad_norm": 0.4478932130948516, - "learning_rate": 0.00018538764362000337, - "loss": 0.9441, - "step": 1889 - }, - { - "epoch": 0.9296606000983768, - "grad_norm": 0.44028936711062305, - "learning_rate": 0.00018535557876975264, - "loss": 0.9548, - "step": 1890 - }, - { - "epoch": 0.9301524840137727, - "grad_norm": 0.43604633499273593, - "learning_rate": 0.00018532348155635576, - "loss": 0.9036, - "step": 1891 - }, - { - "epoch": 0.9306443679291687, - "grad_norm": 0.44310703755069564, - "learning_rate": 0.00018529135199198257, - "loss": 0.864, - "step": 1892 - }, - { - "epoch": 0.9311362518445647, - "grad_norm": 0.4195606656335038, - "learning_rate": 0.00018525919008881525, - "loss": 0.8824, - "step": 1893 - }, - { - "epoch": 0.9316281357599606, - "grad_norm": 0.4752966391769209, - "learning_rate": 0.00018522699585904822, - "loss": 0.9175, - "step": 1894 - }, - { - "epoch": 0.9321200196753566, - "grad_norm": 0.43006861648123, - "learning_rate": 0.00018519476931488807, - "loss": 0.9666, - "step": 1895 - }, - { - "epoch": 0.9326119035907526, - "grad_norm": 0.4441307733494571, - "learning_rate": 0.0001851625104685538, - "loss": 0.9484, - "step": 1896 - }, - { - "epoch": 0.9331037875061485, - "grad_norm": 0.47865967363559037, - "learning_rate": 0.00018513021933227647, - "loss": 0.9295, - "step": 1897 - }, - { - "epoch": 0.9335956714215445, - "grad_norm": 0.5011728461681424, - "learning_rate": 0.00018509789591829957, - "loss": 0.9939, - "step": 1898 - }, - { - "epoch": 0.9340875553369404, - "grad_norm": 0.44450221135964785, - "learning_rate": 0.0001850655402388787, - "loss": 0.937, - "step": 1899 - }, - { - "epoch": 0.9345794392523364, - "grad_norm": 0.47221343746374045, - "learning_rate": 0.00018503315230628176, - "loss": 0.9073, - "step": 1900 - }, - { - "epoch": 0.9350713231677324, - "grad_norm": 0.4526235162335821, - "learning_rate": 0.0001850007321327888, - "loss": 0.9197, - "step": 1901 - }, - { - "epoch": 0.9355632070831283, - "grad_norm": 0.4397491491786641, - "learning_rate": 0.00018496827973069223, - "loss": 0.9732, - "step": 1902 - }, - { - "epoch": 0.9360550909985244, - "grad_norm": 0.42750906504865316, - "learning_rate": 0.00018493579511229656, - "loss": 0.9582, - "step": 1903 - }, - { - "epoch": 0.9365469749139204, - "grad_norm": 0.44320595941036217, - "learning_rate": 0.00018490327828991852, - "loss": 0.953, - "step": 1904 - }, - { - "epoch": 0.9370388588293163, - "grad_norm": 0.42991703773405404, - "learning_rate": 0.0001848707292758871, - "loss": 0.9255, - "step": 1905 - }, - { - "epoch": 0.9375307427447123, - "grad_norm": 0.5780618216337025, - "learning_rate": 0.0001848381480825435, - "loss": 0.9496, - "step": 1906 - }, - { - "epoch": 0.9380226266601082, - "grad_norm": 0.4831354064630574, - "learning_rate": 0.00018480553472224114, - "loss": 1.0226, - "step": 1907 - }, - { - "epoch": 0.9385145105755042, - "grad_norm": 0.7668692313759613, - "learning_rate": 0.0001847728892073455, - "loss": 0.9747, - "step": 1908 - }, - { - "epoch": 0.9390063944909002, - "grad_norm": 0.4471821230833163, - "learning_rate": 0.0001847402115502344, - "loss": 0.885, - "step": 1909 - }, - { - "epoch": 0.9394982784062961, - "grad_norm": 0.49974458096838337, - "learning_rate": 0.00018470750176329781, - "loss": 0.9677, - "step": 1910 - }, - { - "epoch": 0.9399901623216921, - "grad_norm": 0.5012870583762135, - "learning_rate": 0.00018467475985893783, - "loss": 0.9629, - "step": 1911 - }, - { - "epoch": 0.9404820462370881, - "grad_norm": 0.4367049482339108, - "learning_rate": 0.0001846419858495688, - "loss": 0.8721, - "step": 1912 - }, - { - "epoch": 0.940973930152484, - "grad_norm": 0.4699932772670166, - "learning_rate": 0.00018460917974761717, - "loss": 0.8508, - "step": 1913 - }, - { - "epoch": 0.94146581406788, - "grad_norm": 0.4718433943987246, - "learning_rate": 0.00018457634156552168, - "loss": 0.8985, - "step": 1914 - }, - { - "epoch": 0.941957697983276, - "grad_norm": 0.4974602660062723, - "learning_rate": 0.00018454347131573306, - "loss": 1.0154, - "step": 1915 - }, - { - "epoch": 0.9424495818986719, - "grad_norm": 0.4258535297177866, - "learning_rate": 0.00018451056901071434, - "loss": 0.9385, - "step": 1916 - }, - { - "epoch": 0.9429414658140679, - "grad_norm": 0.5211541078048513, - "learning_rate": 0.00018447763466294067, - "loss": 0.9845, - "step": 1917 - }, - { - "epoch": 0.9434333497294638, - "grad_norm": 0.45479187706728935, - "learning_rate": 0.00018444466828489923, - "loss": 0.9489, - "step": 1918 - }, - { - "epoch": 0.9439252336448598, - "grad_norm": 0.46391141014345066, - "learning_rate": 0.00018441166988908956, - "loss": 0.9351, - "step": 1919 - }, - { - "epoch": 0.9444171175602558, - "grad_norm": 0.4745079284455217, - "learning_rate": 0.00018437863948802315, - "loss": 0.9196, - "step": 1920 - }, - { - "epoch": 0.9449090014756517, - "grad_norm": 0.5151632004372109, - "learning_rate": 0.00018434557709422376, - "loss": 0.9256, - "step": 1921 - }, - { - "epoch": 0.9454008853910477, - "grad_norm": 0.47566366606315597, - "learning_rate": 0.00018431248272022716, - "loss": 0.9531, - "step": 1922 - }, - { - "epoch": 0.9458927693064437, - "grad_norm": 0.44407590076749176, - "learning_rate": 0.00018427935637858135, - "loss": 0.9311, - "step": 1923 - }, - { - "epoch": 0.9463846532218396, - "grad_norm": 0.46497905603862427, - "learning_rate": 0.0001842461980818464, - "loss": 0.916, - "step": 1924 - }, - { - "epoch": 0.9468765371372356, - "grad_norm": 0.4074746058828995, - "learning_rate": 0.0001842130078425945, - "loss": 0.8571, - "step": 1925 - }, - { - "epoch": 0.9473684210526315, - "grad_norm": 0.46808976566775873, - "learning_rate": 0.00018417978567340996, - "loss": 0.9876, - "step": 1926 - }, - { - "epoch": 0.9478603049680275, - "grad_norm": 0.4544606752020085, - "learning_rate": 0.00018414653158688914, - "loss": 0.8897, - "step": 1927 - }, - { - "epoch": 0.9483521888834235, - "grad_norm": 0.5237793407899939, - "learning_rate": 0.0001841132455956406, - "loss": 0.959, - "step": 1928 - }, - { - "epoch": 0.9488440727988194, - "grad_norm": 0.4504943544679579, - "learning_rate": 0.00018407992771228497, - "loss": 0.9106, - "step": 1929 - }, - { - "epoch": 0.9493359567142154, - "grad_norm": 0.4247941082073109, - "learning_rate": 0.0001840465779494549, - "loss": 0.9221, - "step": 1930 - }, - { - "epoch": 0.9498278406296115, - "grad_norm": 0.5203584029782751, - "learning_rate": 0.0001840131963197952, - "loss": 1.0279, - "step": 1931 - }, - { - "epoch": 0.9503197245450073, - "grad_norm": 0.462036073986436, - "learning_rate": 0.00018397978283596274, - "loss": 0.8616, - "step": 1932 - }, - { - "epoch": 0.9508116084604034, - "grad_norm": 0.408317546888834, - "learning_rate": 0.0001839463375106265, - "loss": 0.9045, - "step": 1933 - }, - { - "epoch": 0.9513034923757994, - "grad_norm": 0.4588025515575967, - "learning_rate": 0.0001839128603564675, - "loss": 0.9361, - "step": 1934 - }, - { - "epoch": 0.9517953762911953, - "grad_norm": 0.4317585534792301, - "learning_rate": 0.00018387935138617875, - "loss": 0.8756, - "step": 1935 - }, - { - "epoch": 0.9522872602065913, - "grad_norm": 0.4519435724897018, - "learning_rate": 0.00018384581061246547, - "loss": 0.8793, - "step": 1936 - }, - { - "epoch": 0.9527791441219872, - "grad_norm": 0.4520350266397909, - "learning_rate": 0.00018381223804804484, - "loss": 0.9543, - "step": 1937 - }, - { - "epoch": 0.9532710280373832, - "grad_norm": 0.41525151813417516, - "learning_rate": 0.00018377863370564617, - "loss": 0.8823, - "step": 1938 - }, - { - "epoch": 0.9537629119527792, - "grad_norm": 0.4268831399183173, - "learning_rate": 0.00018374499759801074, - "loss": 0.9312, - "step": 1939 - }, - { - "epoch": 0.9542547958681751, - "grad_norm": 0.4253702887268113, - "learning_rate": 0.00018371132973789192, - "loss": 0.8114, - "step": 1940 - }, - { - "epoch": 0.9547466797835711, - "grad_norm": 0.4819668518465043, - "learning_rate": 0.00018367763013805508, - "loss": 0.9675, - "step": 1941 - }, - { - "epoch": 0.9552385636989671, - "grad_norm": 0.4547317992060099, - "learning_rate": 0.0001836438988112777, - "loss": 0.957, - "step": 1942 - }, - { - "epoch": 0.955730447614363, - "grad_norm": 0.44309298006477255, - "learning_rate": 0.0001836101357703492, - "loss": 0.9799, - "step": 1943 - }, - { - "epoch": 0.956222331529759, - "grad_norm": 0.4664756768139931, - "learning_rate": 0.00018357634102807112, - "loss": 1.0056, - "step": 1944 - }, - { - "epoch": 0.9567142154451549, - "grad_norm": 0.4539679694753678, - "learning_rate": 0.0001835425145972569, - "loss": 0.8856, - "step": 1945 - }, - { - "epoch": 0.9572060993605509, - "grad_norm": 0.44164075657025315, - "learning_rate": 0.0001835086564907321, - "loss": 0.8615, - "step": 1946 - }, - { - "epoch": 0.9576979832759469, - "grad_norm": 0.45247831990312676, - "learning_rate": 0.00018347476672133422, - "loss": 0.9535, - "step": 1947 - }, - { - "epoch": 0.9581898671913428, - "grad_norm": 0.44221000588098075, - "learning_rate": 0.00018344084530191282, - "loss": 0.9398, - "step": 1948 - }, - { - "epoch": 0.9586817511067388, - "grad_norm": 0.4213546371708338, - "learning_rate": 0.0001834068922453294, - "loss": 0.8714, - "step": 1949 - }, - { - "epoch": 0.9591736350221348, - "grad_norm": 0.46276619653227, - "learning_rate": 0.00018337290756445757, - "loss": 0.9773, - "step": 1950 - }, - { - "epoch": 0.9596655189375307, - "grad_norm": 0.41336397476424397, - "learning_rate": 0.00018333889127218278, - "loss": 0.8715, - "step": 1951 - }, - { - "epoch": 0.9601574028529267, - "grad_norm": 0.4323107804018267, - "learning_rate": 0.0001833048433814026, - "loss": 0.9519, - "step": 1952 - }, - { - "epoch": 0.9606492867683227, - "grad_norm": 0.4161203428466473, - "learning_rate": 0.00018327076390502645, - "loss": 0.9167, - "step": 1953 - }, - { - "epoch": 0.9611411706837186, - "grad_norm": 0.41943509986690275, - "learning_rate": 0.00018323665285597583, - "loss": 0.9048, - "step": 1954 - }, - { - "epoch": 0.9616330545991146, - "grad_norm": 0.4043101057931535, - "learning_rate": 0.00018320251024718414, - "loss": 0.8473, - "step": 1955 - }, - { - "epoch": 0.9621249385145105, - "grad_norm": 0.42181886666341173, - "learning_rate": 0.00018316833609159682, - "loss": 0.9289, - "step": 1956 - }, - { - "epoch": 0.9626168224299065, - "grad_norm": 0.40080973404412934, - "learning_rate": 0.00018313413040217124, - "loss": 0.8799, - "step": 1957 - }, - { - "epoch": 0.9631087063453025, - "grad_norm": 0.4487507479231779, - "learning_rate": 0.00018309989319187673, - "loss": 0.9578, - "step": 1958 - }, - { - "epoch": 0.9636005902606984, - "grad_norm": 0.44091223877862584, - "learning_rate": 0.00018306562447369449, - "loss": 1.0044, - "step": 1959 - }, - { - "epoch": 0.9640924741760944, - "grad_norm": 0.47143391770110404, - "learning_rate": 0.00018303132426061782, - "loss": 0.8695, - "step": 1960 - }, - { - "epoch": 0.9645843580914905, - "grad_norm": 0.40991211217100143, - "learning_rate": 0.0001829969925656518, - "loss": 0.8824, - "step": 1961 - }, - { - "epoch": 0.9650762420068864, - "grad_norm": 0.44327349800592386, - "learning_rate": 0.0001829626294018136, - "loss": 0.9499, - "step": 1962 - }, - { - "epoch": 0.9655681259222824, - "grad_norm": 0.5725296845392949, - "learning_rate": 0.00018292823478213217, - "loss": 0.9597, - "step": 1963 - }, - { - "epoch": 0.9660600098376783, - "grad_norm": 0.4440215432884469, - "learning_rate": 0.0001828938087196485, - "loss": 0.9184, - "step": 1964 - }, - { - "epoch": 0.9665518937530743, - "grad_norm": 0.42147265987377897, - "learning_rate": 0.00018285935122741554, - "loss": 0.9281, - "step": 1965 - }, - { - "epoch": 0.9670437776684703, - "grad_norm": 0.4188232523231526, - "learning_rate": 0.00018282486231849794, - "loss": 0.8534, - "step": 1966 - }, - { - "epoch": 0.9675356615838662, - "grad_norm": 1.2377315932762718, - "learning_rate": 0.00018279034200597248, - "loss": 0.9696, - "step": 1967 - }, - { - "epoch": 0.9680275454992622, - "grad_norm": 0.4472282474869554, - "learning_rate": 0.00018275579030292778, - "loss": 0.9089, - "step": 1968 - }, - { - "epoch": 0.9685194294146582, - "grad_norm": 0.4191650854370175, - "learning_rate": 0.00018272120722246434, - "loss": 0.8669, - "step": 1969 - }, - { - "epoch": 0.9690113133300541, - "grad_norm": 0.43753161804389906, - "learning_rate": 0.00018268659277769457, - "loss": 0.8819, - "step": 1970 - }, - { - "epoch": 0.9695031972454501, - "grad_norm": 0.4176731671835176, - "learning_rate": 0.00018265194698174276, - "loss": 0.9417, - "step": 1971 - }, - { - "epoch": 0.9699950811608461, - "grad_norm": 0.5380636017973098, - "learning_rate": 0.00018261726984774508, - "loss": 0.8814, - "step": 1972 - }, - { - "epoch": 0.970486965076242, - "grad_norm": 0.4693424487883787, - "learning_rate": 0.00018258256138884967, - "loss": 1.08, - "step": 1973 - }, - { - "epoch": 0.970978848991638, - "grad_norm": 0.44287467230128147, - "learning_rate": 0.0001825478216182164, - "loss": 0.8724, - "step": 1974 - }, - { - "epoch": 0.9714707329070339, - "grad_norm": 0.47401241076558015, - "learning_rate": 0.00018251305054901713, - "loss": 0.9435, - "step": 1975 - }, - { - "epoch": 0.9719626168224299, - "grad_norm": 0.4716995663392478, - "learning_rate": 0.00018247824819443554, - "loss": 0.8926, - "step": 1976 - }, - { - "epoch": 0.9724545007378259, - "grad_norm": 0.4048631732017963, - "learning_rate": 0.00018244341456766718, - "loss": 0.8727, - "step": 1977 - }, - { - "epoch": 0.9729463846532218, - "grad_norm": 0.5501419065196186, - "learning_rate": 0.00018240854968191945, - "loss": 0.9842, - "step": 1978 - }, - { - "epoch": 0.9734382685686178, - "grad_norm": 0.4983078287581272, - "learning_rate": 0.0001823736535504116, - "loss": 0.9187, - "step": 1979 - }, - { - "epoch": 0.9739301524840138, - "grad_norm": 0.456374927682862, - "learning_rate": 0.00018233872618637475, - "loss": 0.8765, - "step": 1980 - }, - { - "epoch": 0.9744220363994097, - "grad_norm": 0.4361771046637676, - "learning_rate": 0.00018230376760305185, - "loss": 0.9327, - "step": 1981 - }, - { - "epoch": 0.9749139203148057, - "grad_norm": 0.5218870199750633, - "learning_rate": 0.0001822687778136977, - "loss": 0.9182, - "step": 1982 - }, - { - "epoch": 0.9754058042302016, - "grad_norm": 0.4377434158553101, - "learning_rate": 0.00018223375683157884, - "loss": 0.9139, - "step": 1983 - }, - { - "epoch": 0.9758976881455976, - "grad_norm": 0.441508679114558, - "learning_rate": 0.0001821987046699738, - "loss": 0.8847, - "step": 1984 - }, - { - "epoch": 0.9763895720609936, - "grad_norm": 0.4231708327844232, - "learning_rate": 0.0001821636213421728, - "loss": 0.8684, - "step": 1985 - }, - { - "epoch": 0.9768814559763895, - "grad_norm": 0.46641621666423816, - "learning_rate": 0.00018212850686147793, - "loss": 0.9781, - "step": 1986 - }, - { - "epoch": 0.9773733398917855, - "grad_norm": 0.47553941657395565, - "learning_rate": 0.00018209336124120308, - "loss": 0.9948, - "step": 1987 - }, - { - "epoch": 0.9778652238071815, - "grad_norm": 0.46381453154697494, - "learning_rate": 0.00018205818449467398, - "loss": 0.9003, - "step": 1988 - }, - { - "epoch": 0.9783571077225774, - "grad_norm": 0.396028159540396, - "learning_rate": 0.00018202297663522807, - "loss": 0.8598, - "step": 1989 - }, - { - "epoch": 0.9788489916379735, - "grad_norm": 0.44853225265759955, - "learning_rate": 0.0001819877376762147, - "loss": 0.9052, - "step": 1990 - }, - { - "epoch": 0.9793408755533693, - "grad_norm": 0.4410343172515614, - "learning_rate": 0.00018195246763099494, - "loss": 0.8545, - "step": 1991 - }, - { - "epoch": 0.9798327594687654, - "grad_norm": 0.445285394442272, - "learning_rate": 0.00018191716651294167, - "loss": 0.9216, - "step": 1992 - }, - { - "epoch": 0.9803246433841614, - "grad_norm": 0.46444107702664084, - "learning_rate": 0.0001818818343354396, - "loss": 0.9005, - "step": 1993 - }, - { - "epoch": 0.9808165272995573, - "grad_norm": 0.4673226564151492, - "learning_rate": 0.00018184647111188506, - "loss": 0.9788, - "step": 1994 - }, - { - "epoch": 0.9813084112149533, - "grad_norm": 0.4338580685874424, - "learning_rate": 0.0001818110768556863, - "loss": 0.9595, - "step": 1995 - }, - { - "epoch": 0.9818002951303493, - "grad_norm": 0.4605529891465894, - "learning_rate": 0.00018177565158026334, - "loss": 0.8888, - "step": 1996 - }, - { - "epoch": 0.9822921790457452, - "grad_norm": 0.461882261804312, - "learning_rate": 0.00018174019529904785, - "loss": 0.9512, - "step": 1997 - }, - { - "epoch": 0.9827840629611412, - "grad_norm": 0.4185592648902373, - "learning_rate": 0.00018170470802548336, - "loss": 0.8775, - "step": 1998 - }, - { - "epoch": 0.9832759468765372, - "grad_norm": 0.46017098695300246, - "learning_rate": 0.00018166918977302506, - "loss": 0.9257, - "step": 1999 - }, - { - "epoch": 0.9837678307919331, - "grad_norm": 0.43770465454669844, - "learning_rate": 0.00018163364055514002, - "loss": 0.8512, - "step": 2000 - }, - { - "epoch": 0.9842597147073291, - "grad_norm": 0.425725999392596, - "learning_rate": 0.00018159806038530688, - "loss": 0.9506, - "step": 2001 - }, - { - "epoch": 0.984751598622725, - "grad_norm": 0.4363217963946529, - "learning_rate": 0.00018156244927701616, - "loss": 0.8999, - "step": 2002 - }, - { - "epoch": 0.985243482538121, - "grad_norm": 0.693829954051338, - "learning_rate": 0.00018152680724377004, - "loss": 0.8858, - "step": 2003 - }, - { - "epoch": 0.985735366453517, - "grad_norm": 0.45623689519322974, - "learning_rate": 0.00018149113429908242, - "loss": 0.8933, - "step": 2004 - }, - { - "epoch": 0.9862272503689129, - "grad_norm": 0.3961063923489086, - "learning_rate": 0.00018145543045647896, - "loss": 0.8796, - "step": 2005 - }, - { - "epoch": 0.9867191342843089, - "grad_norm": 0.44731886453675274, - "learning_rate": 0.000181419695729497, - "loss": 0.9288, - "step": 2006 - }, - { - "epoch": 0.9872110181997049, - "grad_norm": 0.39864669645559797, - "learning_rate": 0.0001813839301316856, - "loss": 0.8674, - "step": 2007 - }, - { - "epoch": 0.9877029021151008, - "grad_norm": 0.4178994922394768, - "learning_rate": 0.00018134813367660555, - "loss": 0.9006, - "step": 2008 - }, - { - "epoch": 0.9881947860304968, - "grad_norm": 0.4998770359493896, - "learning_rate": 0.0001813123063778293, - "loss": 0.9131, - "step": 2009 - }, - { - "epoch": 0.9886866699458927, - "grad_norm": 0.4541264244365027, - "learning_rate": 0.00018127644824894102, - "loss": 0.9381, - "step": 2010 - }, - { - "epoch": 0.9891785538612887, - "grad_norm": 0.45508736170043473, - "learning_rate": 0.00018124055930353653, - "loss": 0.9072, - "step": 2011 - }, - { - "epoch": 0.9896704377766847, - "grad_norm": 0.4873708340449761, - "learning_rate": 0.00018120463955522345, - "loss": 0.8998, - "step": 2012 - }, - { - "epoch": 0.9901623216920806, - "grad_norm": 0.45549724651475265, - "learning_rate": 0.00018116868901762092, - "loss": 0.934, - "step": 2013 - }, - { - "epoch": 0.9906542056074766, - "grad_norm": 0.42652167135627134, - "learning_rate": 0.00018113270770435985, - "loss": 0.9115, - "step": 2014 - }, - { - "epoch": 0.9911460895228726, - "grad_norm": 0.4453477772420515, - "learning_rate": 0.0001810966956290828, - "loss": 0.9395, - "step": 2015 - }, - { - "epoch": 0.9916379734382685, - "grad_norm": 0.4046617832179208, - "learning_rate": 0.000181060652805444, - "loss": 0.8994, - "step": 2016 - }, - { - "epoch": 0.9921298573536645, - "grad_norm": 0.4283315925798483, - "learning_rate": 0.00018102457924710935, - "loss": 0.8584, - "step": 2017 - }, - { - "epoch": 0.9926217412690606, - "grad_norm": 0.3899504326189742, - "learning_rate": 0.00018098847496775635, - "loss": 0.8959, - "step": 2018 - }, - { - "epoch": 0.9931136251844564, - "grad_norm": 1.2193680639418478, - "learning_rate": 0.0001809523399810742, - "loss": 0.9118, - "step": 2019 - }, - { - "epoch": 0.9936055090998525, - "grad_norm": 0.4479688984977468, - "learning_rate": 0.0001809161743007637, - "loss": 0.9338, - "step": 2020 - }, - { - "epoch": 0.9940973930152484, - "grad_norm": 0.44531325059233584, - "learning_rate": 0.00018087997794053733, - "loss": 1.0175, - "step": 2021 - }, - { - "epoch": 0.9945892769306444, - "grad_norm": 0.7929071437715141, - "learning_rate": 0.0001808437509141192, - "loss": 0.9242, - "step": 2022 - }, - { - "epoch": 0.9950811608460404, - "grad_norm": 0.41910134077034905, - "learning_rate": 0.000180807493235245, - "loss": 0.9019, - "step": 2023 - }, - { - "epoch": 0.9955730447614363, - "grad_norm": 0.5899825884926907, - "learning_rate": 0.00018077120491766208, - "loss": 0.9682, - "step": 2024 - }, - { - "epoch": 0.9960649286768323, - "grad_norm": 0.44754835592415726, - "learning_rate": 0.00018073488597512944, - "loss": 0.944, - "step": 2025 - }, - { - "epoch": 0.9965568125922283, - "grad_norm": 0.4195479696566459, - "learning_rate": 0.00018069853642141762, - "loss": 0.965, - "step": 2026 - }, - { - "epoch": 0.9970486965076242, - "grad_norm": 0.4288060668519654, - "learning_rate": 0.0001806621562703088, - "loss": 0.9673, - "step": 2027 - }, - { - "epoch": 0.9975405804230202, - "grad_norm": 0.41813471233399524, - "learning_rate": 0.0001806257455355968, - "loss": 0.8521, - "step": 2028 - }, - { - "epoch": 0.9980324643384161, - "grad_norm": 0.40576008584353307, - "learning_rate": 0.00018058930423108693, - "loss": 0.9327, - "step": 2029 - }, - { - "epoch": 0.9985243482538121, - "grad_norm": 5.737513840425279, - "learning_rate": 0.00018055283237059622, - "loss": 0.9231, - "step": 2030 - }, - { - "epoch": 0.9990162321692081, - "grad_norm": 0.4424306323239711, - "learning_rate": 0.00018051632996795317, - "loss": 0.9896, - "step": 2031 - }, - { - "epoch": 0.999508116084604, - "grad_norm": 0.4472237618121447, - "learning_rate": 0.00018047979703699797, - "loss": 0.8683, - "step": 2032 - }, - { - "epoch": 1.0, - "grad_norm": 0.484997707838406, - "learning_rate": 0.0001804432335915823, - "loss": 1.0024, - "step": 2033 - }, - { - "epoch": 1.000491883915396, - "grad_norm": 0.46317454321941864, - "learning_rate": 0.0001804066396455694, - "loss": 0.9482, - "step": 2034 - }, - { - "epoch": 1.000491883915396, - "eval_loss": 0.8394731283187866, - "eval_runtime": 6667.7597, - "eval_samples_per_second": 4.274, - "eval_steps_per_second": 2.137, - "step": 2034 - }, - { - "epoch": 1.000983767830792, - "grad_norm": 0.44238663483109564, - "learning_rate": 0.00018037001521283418, - "loss": 0.8669, - "step": 2035 - }, - { - "epoch": 1.0014756517461878, - "grad_norm": 0.4357808574170002, - "learning_rate": 0.000180333360307263, - "loss": 0.9221, - "step": 2036 - }, - { - "epoch": 1.0019675356615838, - "grad_norm": 0.4211852718407382, - "learning_rate": 0.0001802966749427538, - "loss": 0.8529, - "step": 2037 - }, - { - "epoch": 1.000184535892231, - "grad_norm": 0.4965038022203159, - "learning_rate": 0.00018025995913321615, - "loss": 0.8953, - "step": 2038 - }, - { - "epoch": 1.0006766316048472, - "grad_norm": 0.554681476096924, - "learning_rate": 0.00018022321289257103, - "loss": 0.716, - "step": 2039 - }, - { - "epoch": 1.0011687273174632, - "grad_norm": 0.556727123533325, - "learning_rate": 0.00018018643623475105, - "loss": 0.7354, - "step": 2040 - }, - { - "epoch": 1.0016608230300794, - "grad_norm": 0.485911946754523, - "learning_rate": 0.00018014962917370036, - "loss": 0.7437, - "step": 2041 - }, - { - "epoch": 1.0021529187426954, - "grad_norm": 0.701277557366079, - "learning_rate": 0.00018011279172337456, - "loss": 0.7734, - "step": 2042 - }, - { - "epoch": 1.0026450144553116, - "grad_norm": 0.5987959982751151, - "learning_rate": 0.00018007592389774086, - "loss": 0.7576, - "step": 2043 - }, - { - "epoch": 1.0031371101679276, - "grad_norm": 0.5011515957192303, - "learning_rate": 0.00018003902571077794, - "loss": 0.7445, - "step": 2044 - }, - { - "epoch": 1.0036292058805438, - "grad_norm": 0.5326564253251685, - "learning_rate": 0.00018000209717647595, - "loss": 0.7925, - "step": 2045 - }, - { - "epoch": 1.0041213015931598, - "grad_norm": 0.5289613239491519, - "learning_rate": 0.00017996513830883664, - "loss": 0.7474, - "step": 2046 - }, - { - "epoch": 1.004613397305776, - "grad_norm": 0.48940192303125907, - "learning_rate": 0.00017992814912187318, - "loss": 0.7219, - "step": 2047 - }, - { - "epoch": 1.005105493018392, - "grad_norm": 0.5272442447192495, - "learning_rate": 0.00017989112962961033, - "loss": 0.7195, - "step": 2048 - }, - { - "epoch": 1.0055975887310082, - "grad_norm": 0.4363098304248932, - "learning_rate": 0.0001798540798460842, - "loss": 0.6887, - "step": 2049 - }, - { - "epoch": 1.0060896844436242, - "grad_norm": 0.46195532336117584, - "learning_rate": 0.00017981699978534256, - "loss": 0.7519, - "step": 2050 - }, - { - "epoch": 1.0065817801562404, - "grad_norm": 0.46424481755052327, - "learning_rate": 0.0001797798894614445, - "loss": 0.6281, - "step": 2051 - }, - { - "epoch": 1.0070738758688564, - "grad_norm": 1.671733380094309, - "learning_rate": 0.00017974274888846065, - "loss": 0.6882, - "step": 2052 - }, - { - "epoch": 1.0075659715814727, - "grad_norm": 0.4677554918316789, - "learning_rate": 0.00017970557808047314, - "loss": 0.7027, - "step": 2053 - }, - { - "epoch": 1.0080580672940886, - "grad_norm": 0.5008453388603415, - "learning_rate": 0.0001796683770515755, - "loss": 0.7026, - "step": 2054 - }, - { - "epoch": 1.0085501630067049, - "grad_norm": 0.44121797690454334, - "learning_rate": 0.0001796311458158728, - "loss": 0.7218, - "step": 2055 - }, - { - "epoch": 1.0090422587193209, - "grad_norm": 0.47635340403842863, - "learning_rate": 0.00017959388438748151, - "loss": 0.7384, - "step": 2056 - }, - { - "epoch": 1.009534354431937, - "grad_norm": 0.4765597636860264, - "learning_rate": 0.0001795565927805295, - "loss": 0.778, - "step": 2057 - }, - { - "epoch": 1.010026450144553, - "grad_norm": 0.5194102087512645, - "learning_rate": 0.00017951927100915617, - "loss": 0.7989, - "step": 2058 - }, - { - "epoch": 1.0105185458571693, - "grad_norm": 0.4455080918099978, - "learning_rate": 0.00017948191908751234, - "loss": 0.683, - "step": 2059 - }, - { - "epoch": 1.0110106415697853, - "grad_norm": 0.46978662112903474, - "learning_rate": 0.00017944453702976022, - "loss": 0.753, - "step": 2060 - }, - { - "epoch": 1.0115027372824015, - "grad_norm": 0.4894148552681891, - "learning_rate": 0.00017940712485007347, - "loss": 0.7841, - "step": 2061 - }, - { - "epoch": 1.0119948329950175, - "grad_norm": 0.49302032266441115, - "learning_rate": 0.00017936968256263719, - "loss": 0.8665, - "step": 2062 - }, - { - "epoch": 1.0124869287076337, - "grad_norm": 0.4643383424097789, - "learning_rate": 0.00017933221018164784, - "loss": 0.7664, - "step": 2063 - }, - { - "epoch": 1.0129790244202497, - "grad_norm": 0.43714680751204893, - "learning_rate": 0.00017929470772131336, - "loss": 0.686, - "step": 2064 - }, - { - "epoch": 1.0134711201328659, - "grad_norm": 0.4343407075088194, - "learning_rate": 0.00017925717519585305, - "loss": 0.6869, - "step": 2065 - }, - { - "epoch": 1.0139632158454819, - "grad_norm": 0.4373287916799852, - "learning_rate": 0.00017921961261949763, - "loss": 0.7374, - "step": 2066 - }, - { - "epoch": 1.014455311558098, - "grad_norm": 0.46816481061573334, - "learning_rate": 0.00017918202000648917, - "loss": 0.7419, - "step": 2067 - }, - { - "epoch": 1.014947407270714, - "grad_norm": 0.49053088889668656, - "learning_rate": 0.00017914439737108128, - "loss": 0.6899, - "step": 2068 - }, - { - "epoch": 1.0154395029833303, - "grad_norm": 0.4802762766167413, - "learning_rate": 0.00017910674472753865, - "loss": 0.7384, - "step": 2069 - }, - { - "epoch": 1.0159315986959463, - "grad_norm": 0.44872570424600855, - "learning_rate": 0.0001790690620901377, - "loss": 0.7102, - "step": 2070 - }, - { - "epoch": 1.0164236944085625, - "grad_norm": 0.4705487092947291, - "learning_rate": 0.00017903134947316594, - "loss": 0.741, - "step": 2071 - }, - { - "epoch": 1.0169157901211785, - "grad_norm": 0.4321593935650233, - "learning_rate": 0.00017899360689092248, - "loss": 0.6556, - "step": 2072 - }, - { - "epoch": 1.0174078858337947, - "grad_norm": 0.5212546002206624, - "learning_rate": 0.00017895583435771758, - "loss": 0.8173, - "step": 2073 - }, - { - "epoch": 1.0178999815464107, - "grad_norm": 0.4635438789335315, - "learning_rate": 0.000178918031887873, - "loss": 0.7334, - "step": 2074 - }, - { - "epoch": 1.018392077259027, - "grad_norm": 0.4436241029043569, - "learning_rate": 0.00017888019949572178, - "loss": 0.7896, - "step": 2075 - }, - { - "epoch": 1.018884172971643, - "grad_norm": 0.45448345835544507, - "learning_rate": 0.00017884233719560832, - "loss": 0.8205, - "step": 2076 - }, - { - "epoch": 1.0193762686842591, - "grad_norm": 0.46201987319120674, - "learning_rate": 0.0001788044450018884, - "loss": 0.7084, - "step": 2077 - }, - { - "epoch": 1.0198683643968751, - "grad_norm": 0.47161403666785723, - "learning_rate": 0.0001787665229289291, - "loss": 0.714, - "step": 2078 - }, - { - "epoch": 1.0203604601094913, - "grad_norm": 0.4755426078626228, - "learning_rate": 0.0001787285709911088, - "loss": 0.7326, - "step": 2079 - }, - { - "epoch": 1.0208525558221073, - "grad_norm": 0.4505436214042156, - "learning_rate": 0.00017869058920281727, - "loss": 0.699, - "step": 2080 - }, - { - "epoch": 1.0213446515347235, - "grad_norm": 0.4579915340549243, - "learning_rate": 0.0001786525775784555, - "loss": 0.6847, - "step": 2081 - }, - { - "epoch": 1.0218367472473395, - "grad_norm": 0.4577831593591981, - "learning_rate": 0.00017861453613243593, - "loss": 0.7078, - "step": 2082 - }, - { - "epoch": 1.0223288429599557, - "grad_norm": 0.5072246200909621, - "learning_rate": 0.0001785764648791822, - "loss": 0.8272, - "step": 2083 - }, - { - "epoch": 1.0228209386725717, - "grad_norm": 0.47134968568286834, - "learning_rate": 0.0001785383638331293, - "loss": 0.7323, - "step": 2084 - }, - { - "epoch": 1.023313034385188, - "grad_norm": 0.45264240486960794, - "learning_rate": 0.00017850023300872346, - "loss": 0.7033, - "step": 2085 - }, - { - "epoch": 1.023805130097804, - "grad_norm": 0.47031111770245215, - "learning_rate": 0.00017846207242042228, - "loss": 0.7706, - "step": 2086 - }, - { - "epoch": 1.0242972258104202, - "grad_norm": 0.4658262557406207, - "learning_rate": 0.00017842388208269457, - "loss": 0.7325, - "step": 2087 - }, - { - "epoch": 1.0247893215230361, - "grad_norm": 0.46871993189108374, - "learning_rate": 0.00017838566201002046, - "loss": 0.7374, - "step": 2088 - }, - { - "epoch": 1.0252814172356524, - "grad_norm": 0.4754992124043651, - "learning_rate": 0.00017834741221689143, - "loss": 0.7363, - "step": 2089 - }, - { - "epoch": 1.0257735129482684, - "grad_norm": 0.4538167742547551, - "learning_rate": 0.00017830913271781005, - "loss": 0.7165, - "step": 2090 - }, - { - "epoch": 1.0262656086608846, - "grad_norm": 0.46410781780592864, - "learning_rate": 0.00017827082352729026, - "loss": 0.7135, - "step": 2091 - }, - { - "epoch": 1.0267577043735006, - "grad_norm": 0.4420478759408918, - "learning_rate": 0.00017823248465985732, - "loss": 0.7796, - "step": 2092 - }, - { - "epoch": 1.0272498000861168, - "grad_norm": 0.4443775997398954, - "learning_rate": 0.0001781941161300476, - "loss": 0.7298, - "step": 2093 - }, - { - "epoch": 1.0277418957987328, - "grad_norm": 0.46999968781222573, - "learning_rate": 0.0001781557179524088, - "loss": 0.7263, - "step": 2094 - }, - { - "epoch": 1.028233991511349, - "grad_norm": 0.4482118121027141, - "learning_rate": 0.0001781172901414999, - "loss": 0.722, - "step": 2095 - }, - { - "epoch": 1.028726087223965, - "grad_norm": 0.4601441480915822, - "learning_rate": 0.00017807883271189098, - "loss": 0.7515, - "step": 2096 - }, - { - "epoch": 1.0292181829365812, - "grad_norm": 0.46318942282534875, - "learning_rate": 0.0001780403456781635, - "loss": 0.7406, - "step": 2097 - }, - { - "epoch": 1.0297102786491972, - "grad_norm": 0.4350085824417785, - "learning_rate": 0.0001780018290549101, - "loss": 0.6987, - "step": 2098 - }, - { - "epoch": 1.0302023743618134, - "grad_norm": 0.45798874605425643, - "learning_rate": 0.00017796328285673454, - "loss": 0.6562, - "step": 2099 - }, - { - "epoch": 1.0306944700744294, - "grad_norm": 0.4656180646396438, - "learning_rate": 0.00017792470709825193, - "loss": 0.7056, - "step": 2100 - }, - { - "epoch": 1.0311865657870456, - "grad_norm": 0.5069432109096865, - "learning_rate": 0.00017788610179408852, - "loss": 0.7901, - "step": 2101 - }, - { - "epoch": 1.0316786614996616, - "grad_norm": 0.4717718604355321, - "learning_rate": 0.00017784746695888174, - "loss": 0.8137, - "step": 2102 - }, - { - "epoch": 1.0321707572122778, - "grad_norm": 0.43441187391989783, - "learning_rate": 0.0001778088026072803, - "loss": 0.6866, - "step": 2103 - }, - { - "epoch": 1.0326628529248938, - "grad_norm": 0.45221372848824265, - "learning_rate": 0.00017777010875394403, - "loss": 0.672, - "step": 2104 - }, - { - "epoch": 1.03315494863751, - "grad_norm": 0.4328549218543093, - "learning_rate": 0.00017773138541354397, - "loss": 0.6551, - "step": 2105 - }, - { - "epoch": 1.033647044350126, - "grad_norm": 0.45683272510185197, - "learning_rate": 0.00017769263260076232, - "loss": 0.7069, - "step": 2106 - }, - { - "epoch": 1.0341391400627422, - "grad_norm": 0.5068602570947087, - "learning_rate": 0.00017765385033029248, - "loss": 0.6768, - "step": 2107 - }, - { - "epoch": 1.0346312357753582, - "grad_norm": 0.45976586253535545, - "learning_rate": 0.000177615038616839, - "loss": 0.7623, - "step": 2108 - }, - { - "epoch": 1.0351233314879744, - "grad_norm": 0.45961453783662415, - "learning_rate": 0.00017757619747511765, - "loss": 0.7822, - "step": 2109 - }, - { - "epoch": 1.0356154272005904, - "grad_norm": 0.4663721653624339, - "learning_rate": 0.00017753732691985525, - "loss": 0.7308, - "step": 2110 - }, - { - "epoch": 1.0361075229132066, - "grad_norm": 0.4374999127844936, - "learning_rate": 0.00017749842696578987, - "loss": 0.7453, - "step": 2111 - }, - { - "epoch": 1.0365996186258226, - "grad_norm": 0.4549081742466566, - "learning_rate": 0.00017745949762767072, - "loss": 0.83, - "step": 2112 - }, - { - "epoch": 1.0370917143384388, - "grad_norm": 0.44504919310170415, - "learning_rate": 0.00017742053892025802, - "loss": 0.727, - "step": 2113 - }, - { - "epoch": 1.0375838100510548, - "grad_norm": 0.4609188149655354, - "learning_rate": 0.00017738155085832337, - "loss": 0.6878, - "step": 2114 - }, - { - "epoch": 1.038075905763671, - "grad_norm": 0.47085891920119277, - "learning_rate": 0.00017734253345664925, - "loss": 0.7279, - "step": 2115 - }, - { - "epoch": 1.0385680014762873, - "grad_norm": 0.45176435553856, - "learning_rate": 0.0001773034867300294, - "loss": 0.7784, - "step": 2116 - }, - { - "epoch": 1.0390600971889032, - "grad_norm": 0.42587095371218864, - "learning_rate": 0.00017726441069326865, - "loss": 0.6972, - "step": 2117 - }, - { - "epoch": 1.0395521929015192, - "grad_norm": 0.44384491365038603, - "learning_rate": 0.000177225305361183, - "loss": 0.7523, - "step": 2118 - }, - { - "epoch": 1.0400442886141354, - "grad_norm": 0.43432218603855066, - "learning_rate": 0.0001771861707485994, - "loss": 0.7433, - "step": 2119 - }, - { - "epoch": 1.0405363843267517, - "grad_norm": 0.4729564357018369, - "learning_rate": 0.00017714700687035607, - "loss": 0.7847, - "step": 2120 - }, - { - "epoch": 1.0410284800393677, - "grad_norm": 0.4516088185913565, - "learning_rate": 0.00017710781374130226, - "loss": 0.7404, - "step": 2121 - }, - { - "epoch": 1.0415205757519839, - "grad_norm": 0.5076092053877148, - "learning_rate": 0.00017706859137629825, - "loss": 0.7635, - "step": 2122 - }, - { - "epoch": 1.0420126714645999, - "grad_norm": 0.43121740216834487, - "learning_rate": 0.00017702933979021554, - "loss": 0.7523, - "step": 2123 - }, - { - "epoch": 1.042504767177216, - "grad_norm": 0.4377234039381213, - "learning_rate": 0.0001769900589979366, - "loss": 0.7238, - "step": 2124 - }, - { - "epoch": 1.042996862889832, - "grad_norm": 0.44984533226658263, - "learning_rate": 0.00017695074901435506, - "loss": 0.731, - "step": 2125 - }, - { - "epoch": 1.0434889586024483, - "grad_norm": 0.45407476366180155, - "learning_rate": 0.0001769114098543755, - "loss": 0.7172, - "step": 2126 - }, - { - "epoch": 1.0439810543150643, - "grad_norm": 0.4727108415085289, - "learning_rate": 0.00017687204153291365, - "loss": 0.7908, - "step": 2127 - }, - { - "epoch": 1.0444731500276805, - "grad_norm": 0.405208877396721, - "learning_rate": 0.00017683264406489625, - "loss": 0.6784, - "step": 2128 - }, - { - "epoch": 1.0449652457402965, - "grad_norm": 0.434904148515732, - "learning_rate": 0.00017679321746526117, - "loss": 0.6561, - "step": 2129 - }, - { - "epoch": 1.0454573414529127, - "grad_norm": 0.4592627747583997, - "learning_rate": 0.00017675376174895724, - "loss": 0.7214, - "step": 2130 - }, - { - "epoch": 1.0459494371655287, - "grad_norm": 0.44081523246731175, - "learning_rate": 0.00017671427693094437, - "loss": 0.7354, - "step": 2131 - }, - { - "epoch": 1.046441532878145, - "grad_norm": 0.4423562297857163, - "learning_rate": 0.00017667476302619354, - "loss": 0.6852, - "step": 2132 - }, - { - "epoch": 1.046933628590761, - "grad_norm": 0.485161027908596, - "learning_rate": 0.00017663522004968658, - "loss": 0.7069, - "step": 2133 - }, - { - "epoch": 1.047425724303377, - "grad_norm": 0.49260707389446934, - "learning_rate": 0.00017659564801641664, - "loss": 0.752, - "step": 2134 - }, - { - "epoch": 1.047917820015993, - "grad_norm": 0.45045532516594605, - "learning_rate": 0.00017655604694138762, - "loss": 0.7502, - "step": 2135 - }, - { - "epoch": 1.0484099157286093, - "grad_norm": 0.46244390432668536, - "learning_rate": 0.00017651641683961457, - "loss": 0.7466, - "step": 2136 - }, - { - "epoch": 1.0489020114412253, - "grad_norm": 0.47524874462793565, - "learning_rate": 0.00017647675772612353, - "loss": 0.7272, - "step": 2137 - }, - { - "epoch": 1.0493941071538415, - "grad_norm": 0.4493309339046456, - "learning_rate": 0.00017643706961595148, - "loss": 0.7626, - "step": 2138 - }, - { - "epoch": 1.0498862028664575, - "grad_norm": 0.48658266275674944, - "learning_rate": 0.00017639735252414647, - "loss": 0.7389, - "step": 2139 - }, - { - "epoch": 1.0503782985790737, - "grad_norm": 0.4842574407728781, - "learning_rate": 0.00017635760646576748, - "loss": 0.8179, - "step": 2140 - }, - { - "epoch": 1.0508703942916897, - "grad_norm": 0.48790494389203287, - "learning_rate": 0.00017631783145588453, - "loss": 0.7801, - "step": 2141 - }, - { - "epoch": 1.051362490004306, - "grad_norm": 0.43366852253965477, - "learning_rate": 0.00017627802750957853, - "loss": 0.6612, - "step": 2142 - }, - { - "epoch": 1.051854585716922, - "grad_norm": 0.43545977277735415, - "learning_rate": 0.00017623819464194148, - "loss": 0.7348, - "step": 2143 - }, - { - "epoch": 1.0523466814295381, - "grad_norm": 0.4494947095246579, - "learning_rate": 0.00017619833286807625, - "loss": 0.6911, - "step": 2144 - }, - { - "epoch": 1.0528387771421541, - "grad_norm": 0.4832898099383395, - "learning_rate": 0.00017615844220309667, - "loss": 0.7399, - "step": 2145 - }, - { - "epoch": 1.0533308728547703, - "grad_norm": 0.4845604328962755, - "learning_rate": 0.00017611852266212762, - "loss": 0.7519, - "step": 2146 - }, - { - "epoch": 1.0538229685673863, - "grad_norm": 0.5155619319445711, - "learning_rate": 0.00017607857426030484, - "loss": 0.7451, - "step": 2147 - }, - { - "epoch": 1.0543150642800025, - "grad_norm": 0.4203321493512875, - "learning_rate": 0.00017603859701277502, - "loss": 0.7028, - "step": 2148 - }, - { - "epoch": 1.0548071599926185, - "grad_norm": 0.4389995266081142, - "learning_rate": 0.0001759985909346958, - "loss": 0.711, - "step": 2149 - }, - { - "epoch": 1.0552992557052348, - "grad_norm": 0.5154157780916582, - "learning_rate": 0.0001759585560412358, - "loss": 0.7303, - "step": 2150 - }, - { - "epoch": 1.0557913514178507, - "grad_norm": 0.42620801001840763, - "learning_rate": 0.00017591849234757447, - "loss": 0.7073, - "step": 2151 - }, - { - "epoch": 1.056283447130467, - "grad_norm": 0.4419775781650784, - "learning_rate": 0.00017587839986890228, - "loss": 0.6977, - "step": 2152 - }, - { - "epoch": 1.056775542843083, - "grad_norm": 0.45640857304759624, - "learning_rate": 0.00017583827862042054, - "loss": 0.7826, - "step": 2153 - }, - { - "epoch": 1.0572676385556992, - "grad_norm": 0.4681387690418811, - "learning_rate": 0.00017579812861734143, - "loss": 0.7432, - "step": 2154 - }, - { - "epoch": 1.0577597342683152, - "grad_norm": 0.42792426567868647, - "learning_rate": 0.00017575794987488824, - "loss": 0.6719, - "step": 2155 - }, - { - "epoch": 1.0582518299809314, - "grad_norm": 0.43220590496957695, - "learning_rate": 0.00017571774240829487, - "loss": 0.7404, - "step": 2156 - }, - { - "epoch": 1.0587439256935474, - "grad_norm": 0.45791478079235115, - "learning_rate": 0.00017567750623280633, - "loss": 0.6881, - "step": 2157 - }, - { - "epoch": 1.0592360214061636, - "grad_norm": 0.43224341363914714, - "learning_rate": 0.00017563724136367842, - "loss": 0.7255, - "step": 2158 - }, - { - "epoch": 1.0597281171187796, - "grad_norm": 0.4473423163991253, - "learning_rate": 0.00017559694781617787, - "loss": 0.7639, - "step": 2159 - }, - { - "epoch": 1.0602202128313958, - "grad_norm": 0.4471786879502586, - "learning_rate": 0.0001755566256055822, - "loss": 0.6978, - "step": 2160 - }, - { - "epoch": 1.0607123085440118, - "grad_norm": 0.5020232214769531, - "learning_rate": 0.00017551627474717986, - "loss": 0.7834, - "step": 2161 - }, - { - "epoch": 1.061204404256628, - "grad_norm": 0.5026469782707866, - "learning_rate": 0.00017547589525627018, - "loss": 0.8847, - "step": 2162 - }, - { - "epoch": 1.061696499969244, - "grad_norm": 0.47665535206322823, - "learning_rate": 0.0001754354871481633, - "loss": 0.6764, - "step": 2163 - }, - { - "epoch": 1.0621885956818602, - "grad_norm": 0.4420774179727459, - "learning_rate": 0.0001753950504381802, - "loss": 0.7618, - "step": 2164 - }, - { - "epoch": 1.0626806913944762, - "grad_norm": 0.4582259735972832, - "learning_rate": 0.00017535458514165278, - "loss": 0.7019, - "step": 2165 - }, - { - "epoch": 1.0631727871070924, - "grad_norm": 0.4776584769453669, - "learning_rate": 0.00017531409127392373, - "loss": 0.6775, - "step": 2166 - }, - { - "epoch": 1.0636648828197084, - "grad_norm": 0.4363715883984462, - "learning_rate": 0.00017527356885034653, - "loss": 0.7295, - "step": 2167 - }, - { - "epoch": 1.0641569785323246, - "grad_norm": 0.45831574764188365, - "learning_rate": 0.00017523301788628556, - "loss": 0.7827, - "step": 2168 - }, - { - "epoch": 1.0646490742449406, - "grad_norm": 0.4590135901022549, - "learning_rate": 0.000175192438397116, - "loss": 0.7199, - "step": 2169 - }, - { - "epoch": 1.0651411699575568, - "grad_norm": 0.46030473454145515, - "learning_rate": 0.00017515183039822383, - "loss": 0.787, - "step": 2170 - }, - { - "epoch": 1.0656332656701728, - "grad_norm": 0.46878407586423176, - "learning_rate": 0.00017511119390500586, - "loss": 0.7143, - "step": 2171 - }, - { - "epoch": 1.066125361382789, - "grad_norm": 0.42573502630114696, - "learning_rate": 0.00017507052893286966, - "loss": 0.6712, - "step": 2172 - }, - { - "epoch": 1.066617457095405, - "grad_norm": 0.4370551028018308, - "learning_rate": 0.00017502983549723365, - "loss": 0.7221, - "step": 2173 - }, - { - "epoch": 1.0671095528080212, - "grad_norm": 0.4231519476593116, - "learning_rate": 0.00017498911361352702, - "loss": 0.6729, - "step": 2174 - }, - { - "epoch": 1.0676016485206372, - "grad_norm": 0.4891257396776434, - "learning_rate": 0.00017494836329718974, - "loss": 0.7522, - "step": 2175 - }, - { - "epoch": 1.0680937442332534, - "grad_norm": 0.4236571353722022, - "learning_rate": 0.0001749075845636726, - "loss": 0.6895, - "step": 2176 - }, - { - "epoch": 1.0685858399458694, - "grad_norm": 0.4899655849378574, - "learning_rate": 0.0001748667774284371, - "loss": 0.7899, - "step": 2177 - }, - { - "epoch": 1.0690779356584856, - "grad_norm": 0.5124751631506923, - "learning_rate": 0.00017482594190695557, - "loss": 0.7775, - "step": 2178 - }, - { - "epoch": 1.0695700313711016, - "grad_norm": 0.4604651269621623, - "learning_rate": 0.000174785078014711, - "loss": 0.6805, - "step": 2179 - }, - { - "epoch": 1.0700621270837178, - "grad_norm": 0.5027190232775441, - "learning_rate": 0.00017474418576719734, - "loss": 0.7402, - "step": 2180 - }, - { - "epoch": 1.0705542227963338, - "grad_norm": 0.4384068034829691, - "learning_rate": 0.00017470326517991905, - "loss": 0.7724, - "step": 2181 - }, - { - "epoch": 1.07104631850895, - "grad_norm": 0.45884553767656433, - "learning_rate": 0.0001746623162683915, - "loss": 0.7247, - "step": 2182 - }, - { - "epoch": 1.071538414221566, - "grad_norm": 0.496978133299866, - "learning_rate": 0.00017462133904814074, - "loss": 0.7243, - "step": 2183 - }, - { - "epoch": 1.0720305099341823, - "grad_norm": 0.4465078830394886, - "learning_rate": 0.00017458033353470354, - "loss": 0.6879, - "step": 2184 - }, - { - "epoch": 1.0725226056467982, - "grad_norm": 0.4536245142154501, - "learning_rate": 0.0001745392997436275, - "loss": 0.781, - "step": 2185 - }, - { - "epoch": 1.0730147013594145, - "grad_norm": 2.9669028314572117, - "learning_rate": 0.00017449823769047072, - "loss": 0.8559, - "step": 2186 - }, - { - "epoch": 1.0735067970720304, - "grad_norm": 0.5012689594444691, - "learning_rate": 0.00017445714739080227, - "loss": 0.8166, - "step": 2187 - }, - { - "epoch": 1.0739988927846467, - "grad_norm": 0.4543299719609067, - "learning_rate": 0.0001744160288602018, - "loss": 0.7688, - "step": 2188 - }, - { - "epoch": 1.0744909884972627, - "grad_norm": 0.49085748053135525, - "learning_rate": 0.00017437488211425957, - "loss": 0.7885, - "step": 2189 - }, - { - "epoch": 1.0749830842098789, - "grad_norm": 0.4799641811424035, - "learning_rate": 0.0001743337071685768, - "loss": 0.6744, - "step": 2190 - }, - { - "epoch": 1.0754751799224949, - "grad_norm": 0.47759641698512745, - "learning_rate": 0.0001742925040387652, - "loss": 0.7843, - "step": 2191 - }, - { - "epoch": 1.075967275635111, - "grad_norm": 0.4896965131557718, - "learning_rate": 0.00017425127274044714, - "loss": 0.7888, - "step": 2192 - }, - { - "epoch": 1.076459371347727, - "grad_norm": 0.4795673015714201, - "learning_rate": 0.00017421001328925585, - "loss": 0.772, - "step": 2193 - }, - { - "epoch": 1.0769514670603433, - "grad_norm": 0.4311158106248663, - "learning_rate": 0.00017416872570083508, - "loss": 0.7267, - "step": 2194 - }, - { - "epoch": 1.0774435627729593, - "grad_norm": 0.43624552001017447, - "learning_rate": 0.00017412740999083931, - "loss": 0.6747, - "step": 2195 - }, - { - "epoch": 1.0779356584855755, - "grad_norm": 0.4627150853495084, - "learning_rate": 0.00017408606617493367, - "loss": 0.726, - "step": 2196 - }, - { - "epoch": 1.0784277541981915, - "grad_norm": 0.47247549042808046, - "learning_rate": 0.00017404469426879392, - "loss": 0.7389, - "step": 2197 - }, - { - "epoch": 1.0789198499108077, - "grad_norm": 0.4475268862699268, - "learning_rate": 0.00017400329428810655, - "loss": 0.7291, - "step": 2198 - }, - { - "epoch": 1.0794119456234237, - "grad_norm": 0.46337366071213587, - "learning_rate": 0.00017396186624856863, - "loss": 0.8107, - "step": 2199 - }, - { - "epoch": 1.07990404133604, - "grad_norm": 0.4282607673568657, - "learning_rate": 0.00017392041016588781, - "loss": 0.7312, - "step": 2200 - }, - { - "epoch": 1.080396137048656, - "grad_norm": 0.44883945305298184, - "learning_rate": 0.00017387892605578257, - "loss": 0.7327, - "step": 2201 - }, - { - "epoch": 1.080888232761272, - "grad_norm": 0.4622041375900797, - "learning_rate": 0.00017383741393398177, - "loss": 0.7729, - "step": 2202 - }, - { - "epoch": 1.081380328473888, - "grad_norm": 0.447719502426236, - "learning_rate": 0.00017379587381622513, - "loss": 0.7054, - "step": 2203 - }, - { - "epoch": 1.0818724241865043, - "grad_norm": 0.43936405353026553, - "learning_rate": 0.00017375430571826277, - "loss": 0.7353, - "step": 2204 - }, - { - "epoch": 1.0823645198991203, - "grad_norm": 0.45444989271124786, - "learning_rate": 0.00017371270965585556, - "loss": 0.7765, - "step": 2205 - }, - { - "epoch": 1.0828566156117365, - "grad_norm": 0.4333094373678045, - "learning_rate": 0.0001736710856447749, - "loss": 0.7195, - "step": 2206 - }, - { - "epoch": 1.0833487113243525, - "grad_norm": 0.4426912048826289, - "learning_rate": 0.00017362943370080282, - "loss": 0.7756, - "step": 2207 - }, - { - "epoch": 1.0838408070369687, - "grad_norm": 0.46608728261291205, - "learning_rate": 0.00017358775383973201, - "loss": 0.7369, - "step": 2208 - }, - { - "epoch": 1.0843329027495847, - "grad_norm": 0.5499731215025803, - "learning_rate": 0.00017354604607736556, - "loss": 0.7295, - "step": 2209 - }, - { - "epoch": 1.084824998462201, - "grad_norm": 0.47059985575331065, - "learning_rate": 0.00017350431042951735, - "loss": 0.8143, - "step": 2210 - }, - { - "epoch": 1.085317094174817, - "grad_norm": 0.4987450515535781, - "learning_rate": 0.00017346254691201165, - "loss": 0.7546, - "step": 2211 - }, - { - "epoch": 1.0858091898874331, - "grad_norm": 0.4157649272895208, - "learning_rate": 0.00017342075554068343, - "loss": 0.7087, - "step": 2212 - }, - { - "epoch": 1.0863012856000491, - "grad_norm": 0.4394968554901881, - "learning_rate": 0.00017337893633137817, - "loss": 0.7286, - "step": 2213 - }, - { - "epoch": 1.0867933813126653, - "grad_norm": 0.4454638368328683, - "learning_rate": 0.00017333708929995192, - "loss": 0.7267, - "step": 2214 - }, - { - "epoch": 1.0872854770252813, - "grad_norm": 0.463833356574586, - "learning_rate": 0.00017329521446227122, - "loss": 0.673, - "step": 2215 - }, - { - "epoch": 1.0877775727378975, - "grad_norm": 0.5488935729149627, - "learning_rate": 0.00017325331183421324, - "loss": 0.8265, - "step": 2216 - }, - { - "epoch": 1.0882696684505135, - "grad_norm": 0.4388729476804382, - "learning_rate": 0.0001732113814316656, - "loss": 0.7175, - "step": 2217 - }, - { - "epoch": 1.0887617641631298, - "grad_norm": 0.4445625165505587, - "learning_rate": 0.00017316942327052652, - "loss": 0.7486, - "step": 2218 - }, - { - "epoch": 1.0892538598757457, - "grad_norm": 0.44796055484436026, - "learning_rate": 0.00017312743736670473, - "loss": 0.7858, - "step": 2219 - }, - { - "epoch": 1.089745955588362, - "grad_norm": 0.44343202198450404, - "learning_rate": 0.00017308542373611948, - "loss": 0.7668, - "step": 2220 - }, - { - "epoch": 1.090238051300978, - "grad_norm": 0.40910960991257106, - "learning_rate": 0.00017304338239470052, - "loss": 0.6735, - "step": 2221 - }, - { - "epoch": 1.0907301470135942, - "grad_norm": 0.46148660147352416, - "learning_rate": 0.00017300131335838806, - "loss": 0.7881, - "step": 2222 - }, - { - "epoch": 1.0912222427262102, - "grad_norm": 0.43129583307473374, - "learning_rate": 0.0001729592166431329, - "loss": 0.7698, - "step": 2223 - }, - { - "epoch": 1.0917143384388264, - "grad_norm": 0.4424498838899368, - "learning_rate": 0.00017291709226489635, - "loss": 0.6895, - "step": 2224 - }, - { - "epoch": 1.0922064341514424, - "grad_norm": 0.49562791398681033, - "learning_rate": 0.00017287494023965003, - "loss": 0.8248, - "step": 2225 - }, - { - "epoch": 1.0926985298640586, - "grad_norm": 0.46740114333129823, - "learning_rate": 0.00017283276058337624, - "loss": 0.7096, - "step": 2226 - }, - { - "epoch": 1.0931906255766746, - "grad_norm": 0.4376232396500971, - "learning_rate": 0.0001727905533120677, - "loss": 0.6524, - "step": 2227 - }, - { - "epoch": 1.0936827212892908, - "grad_norm": 0.4575713395407467, - "learning_rate": 0.00017274831844172757, - "loss": 0.7172, - "step": 2228 - }, - { - "epoch": 1.0941748170019068, - "grad_norm": 0.43953153703339076, - "learning_rate": 0.00017270605598836944, - "loss": 0.6764, - "step": 2229 - }, - { - "epoch": 1.094666912714523, - "grad_norm": 0.445750420266202, - "learning_rate": 0.00017266376596801743, - "loss": 0.7335, - "step": 2230 - }, - { - "epoch": 1.095159008427139, - "grad_norm": 0.4611985883315243, - "learning_rate": 0.00017262144839670613, - "loss": 0.7505, - "step": 2231 - }, - { - "epoch": 1.0956511041397552, - "grad_norm": 0.45842080565617677, - "learning_rate": 0.0001725791032904805, - "loss": 0.7346, - "step": 2232 - }, - { - "epoch": 1.0961431998523712, - "grad_norm": 0.5740138281698376, - "learning_rate": 0.00017253673066539596, - "loss": 0.8259, - "step": 2233 - }, - { - "epoch": 1.0966352955649874, - "grad_norm": 0.42582203968776655, - "learning_rate": 0.0001724943305375184, - "loss": 0.6761, - "step": 2234 - }, - { - "epoch": 1.0971273912776034, - "grad_norm": 0.4652739229260749, - "learning_rate": 0.00017245190292292412, - "loss": 0.6901, - "step": 2235 - }, - { - "epoch": 1.0976194869902196, - "grad_norm": 0.432367305195393, - "learning_rate": 0.0001724094478376998, - "loss": 0.7304, - "step": 2236 - }, - { - "epoch": 1.0981115827028356, - "grad_norm": 0.5989411933659401, - "learning_rate": 0.00017236696529794262, - "loss": 0.7272, - "step": 2237 - }, - { - "epoch": 1.0986036784154518, - "grad_norm": 0.4698301123236689, - "learning_rate": 0.00017232445531976012, - "loss": 0.7513, - "step": 2238 - }, - { - "epoch": 1.099095774128068, - "grad_norm": 0.46462850411082196, - "learning_rate": 0.0001722819179192702, - "loss": 0.7998, - "step": 2239 - }, - { - "epoch": 1.099587869840684, - "grad_norm": 0.4548416372221925, - "learning_rate": 0.00017223935311260125, - "loss": 0.6745, - "step": 2240 - }, - { - "epoch": 1.1000799655533, - "grad_norm": 0.45439739200697815, - "learning_rate": 0.00017219676091589198, - "loss": 0.6473, - "step": 2241 - }, - { - "epoch": 1.1005720612659162, - "grad_norm": 0.5158247670472813, - "learning_rate": 0.00017215414134529153, - "loss": 0.7285, - "step": 2242 - }, - { - "epoch": 1.1010641569785324, - "grad_norm": 0.44207593480252416, - "learning_rate": 0.00017211149441695938, - "loss": 0.7564, - "step": 2243 - }, - { - "epoch": 1.1015562526911484, - "grad_norm": 0.44620600073109734, - "learning_rate": 0.0001720688201470654, - "loss": 0.7439, - "step": 2244 - }, - { - "epoch": 1.1020483484037644, - "grad_norm": 0.48831775008135214, - "learning_rate": 0.00017202611855178987, - "loss": 0.7847, - "step": 2245 - }, - { - "epoch": 1.1025404441163806, - "grad_norm": 0.45163132354543045, - "learning_rate": 0.00017198338964732334, - "loss": 0.6993, - "step": 2246 - }, - { - "epoch": 1.1030325398289968, - "grad_norm": 0.47662254693220946, - "learning_rate": 0.00017194063344986676, - "loss": 0.7711, - "step": 2247 - }, - { - "epoch": 1.1035246355416128, - "grad_norm": 0.46584053994009733, - "learning_rate": 0.00017189784997563147, - "loss": 0.7479, - "step": 2248 - }, - { - "epoch": 1.1040167312542288, - "grad_norm": 0.4581977072685601, - "learning_rate": 0.0001718550392408391, - "loss": 0.7518, - "step": 2249 - }, - { - "epoch": 1.104508826966845, - "grad_norm": 1.106498959634675, - "learning_rate": 0.00017181220126172164, - "loss": 0.938, - "step": 2250 - }, - { - "epoch": 1.1050009226794613, - "grad_norm": 0.4374225120852344, - "learning_rate": 0.00017176933605452137, - "loss": 0.7394, - "step": 2251 - }, - { - "epoch": 1.1054930183920773, - "grad_norm": 0.46203521006258097, - "learning_rate": 0.00017172644363549092, - "loss": 0.7336, - "step": 2252 - }, - { - "epoch": 1.1059851141046932, - "grad_norm": 0.4646136658884963, - "learning_rate": 0.0001716835240208933, - "loss": 0.7741, - "step": 2253 - }, - { - "epoch": 1.1064772098173095, - "grad_norm": 0.4505833272163009, - "learning_rate": 0.00017164057722700174, - "loss": 0.6399, - "step": 2254 - }, - { - "epoch": 1.1069693055299257, - "grad_norm": 0.46665113137328695, - "learning_rate": 0.00017159760327009976, - "loss": 0.7365, - "step": 2255 - }, - { - "epoch": 1.1074614012425417, - "grad_norm": 0.4598719270091006, - "learning_rate": 0.00017155460216648131, - "loss": 0.7157, - "step": 2256 - }, - { - "epoch": 1.1079534969551579, - "grad_norm": 1.7020270183409554, - "learning_rate": 0.0001715115739324505, - "loss": 0.7406, - "step": 2257 - }, - { - "epoch": 1.1084455926677739, - "grad_norm": 0.45547883137071377, - "learning_rate": 0.0001714685185843218, - "loss": 0.7748, - "step": 2258 - }, - { - "epoch": 1.10893768838039, - "grad_norm": 0.48366139004901476, - "learning_rate": 0.0001714254361384199, - "loss": 0.778, - "step": 2259 - }, - { - "epoch": 1.109429784093006, - "grad_norm": 0.5294775630886707, - "learning_rate": 0.00017138232661107982, - "loss": 0.789, - "step": 2260 - }, - { - "epoch": 1.1099218798056223, - "grad_norm": 0.46851869269796753, - "learning_rate": 0.00017133919001864687, - "loss": 0.8326, - "step": 2261 - }, - { - "epoch": 1.1104139755182383, - "grad_norm": 0.46248523358403254, - "learning_rate": 0.0001712960263774765, - "loss": 0.6937, - "step": 2262 - }, - { - "epoch": 1.1109060712308545, - "grad_norm": 0.49243432547808386, - "learning_rate": 0.00017125283570393457, - "loss": 0.6992, - "step": 2263 - }, - { - "epoch": 1.1113981669434705, - "grad_norm": 0.4284797536044024, - "learning_rate": 0.0001712096180143971, - "loss": 0.8002, - "step": 2264 - }, - { - "epoch": 1.1118902626560867, - "grad_norm": 0.46352338618065314, - "learning_rate": 0.00017116637332525035, - "loss": 0.7287, - "step": 2265 - }, - { - "epoch": 1.1123823583687027, - "grad_norm": 0.4245556645074505, - "learning_rate": 0.00017112310165289082, - "loss": 0.6463, - "step": 2266 - }, - { - "epoch": 1.112874454081319, - "grad_norm": 0.48546026679680565, - "learning_rate": 0.00017107980301372532, - "loss": 0.7225, - "step": 2267 - }, - { - "epoch": 1.113366549793935, - "grad_norm": 0.46099006053184366, - "learning_rate": 0.0001710364774241708, - "loss": 0.7582, - "step": 2268 - }, - { - "epoch": 1.1138586455065511, - "grad_norm": 0.45945252544408977, - "learning_rate": 0.0001709931249006544, - "loss": 0.7532, - "step": 2269 - }, - { - "epoch": 1.114350741219167, - "grad_norm": 0.478142870545973, - "learning_rate": 0.00017094974545961357, - "loss": 0.7544, - "step": 2270 - }, - { - "epoch": 1.1148428369317833, - "grad_norm": 0.46784284146501, - "learning_rate": 0.0001709063391174959, - "loss": 0.7667, - "step": 2271 - }, - { - "epoch": 1.1153349326443993, - "grad_norm": 0.4652646918442406, - "learning_rate": 0.00017086290589075925, - "loss": 0.7029, - "step": 2272 - }, - { - "epoch": 1.1158270283570155, - "grad_norm": 0.42024375595041674, - "learning_rate": 0.00017081944579587152, - "loss": 0.6997, - "step": 2273 - }, - { - "epoch": 1.1163191240696315, - "grad_norm": 0.4588599547020526, - "learning_rate": 0.000170775958849311, - "loss": 0.7094, - "step": 2274 - }, - { - "epoch": 1.1168112197822477, - "grad_norm": 0.5138911216762359, - "learning_rate": 0.00017073244506756602, - "loss": 0.7752, - "step": 2275 - }, - { - "epoch": 1.1173033154948637, - "grad_norm": 0.4537775613774461, - "learning_rate": 0.00017068890446713512, - "loss": 0.7158, - "step": 2276 - }, - { - "epoch": 1.11779541120748, - "grad_norm": 0.5499498245682529, - "learning_rate": 0.00017064533706452704, - "loss": 0.7772, - "step": 2277 - }, - { - "epoch": 1.118287506920096, - "grad_norm": 0.476587121108361, - "learning_rate": 0.0001706017428762606, - "loss": 0.7724, - "step": 2278 - }, - { - "epoch": 1.1187796026327121, - "grad_norm": 0.6875046666894814, - "learning_rate": 0.0001705581219188649, - "loss": 0.8251, - "step": 2279 - }, - { - "epoch": 1.1192716983453281, - "grad_norm": 0.434734476091617, - "learning_rate": 0.00017051447420887906, - "loss": 0.7132, - "step": 2280 - }, - { - "epoch": 1.1197637940579444, - "grad_norm": 0.46175851533503126, - "learning_rate": 0.00017047079976285247, - "loss": 0.7632, - "step": 2281 - }, - { - "epoch": 1.1202558897705603, - "grad_norm": 0.4750203299707701, - "learning_rate": 0.00017042709859734455, - "loss": 0.7548, - "step": 2282 - }, - { - "epoch": 1.1207479854831766, - "grad_norm": 0.6847769258262533, - "learning_rate": 0.00017038337072892485, - "loss": 0.8399, - "step": 2283 - }, - { - "epoch": 1.1212400811957925, - "grad_norm": 0.42056091819107644, - "learning_rate": 0.0001703396161741732, - "loss": 0.7052, - "step": 2284 - }, - { - "epoch": 1.1217321769084088, - "grad_norm": 0.5053689475536761, - "learning_rate": 0.00017029583494967935, - "loss": 0.8291, - "step": 2285 - }, - { - "epoch": 1.1222242726210248, - "grad_norm": 0.4662819700684409, - "learning_rate": 0.00017025202707204325, - "loss": 0.715, - "step": 2286 - }, - { - "epoch": 1.122716368333641, - "grad_norm": 0.44534392370815595, - "learning_rate": 0.000170208192557875, - "loss": 0.7347, - "step": 2287 - }, - { - "epoch": 1.123208464046257, - "grad_norm": 0.6510306626472884, - "learning_rate": 0.00017016433142379473, - "loss": 0.8045, - "step": 2288 - }, - { - "epoch": 1.1237005597588732, - "grad_norm": 0.4419822695979343, - "learning_rate": 0.00017012044368643268, - "loss": 0.714, - "step": 2289 - }, - { - "epoch": 1.1241926554714892, - "grad_norm": 0.5317833744497776, - "learning_rate": 0.0001700765293624292, - "loss": 0.8055, - "step": 2290 - }, - { - "epoch": 1.1246847511841054, - "grad_norm": 0.4491957822634459, - "learning_rate": 0.00017003258846843465, - "loss": 0.7083, - "step": 2291 - }, - { - "epoch": 1.1251768468967214, - "grad_norm": 0.4861262392437043, - "learning_rate": 0.0001699886210211096, - "loss": 0.8051, - "step": 2292 - }, - { - "epoch": 1.1256689426093376, - "grad_norm": 0.4586144821407561, - "learning_rate": 0.00016994462703712456, - "loss": 0.7624, - "step": 2293 - }, - { - "epoch": 1.1261610383219536, - "grad_norm": 0.44717349121927574, - "learning_rate": 0.00016990060653316013, - "loss": 0.7486, - "step": 2294 - }, - { - "epoch": 1.1266531340345698, - "grad_norm": 0.47667487702788763, - "learning_rate": 0.00016985655952590702, - "loss": 0.7568, - "step": 2295 - }, - { - "epoch": 1.1271452297471858, - "grad_norm": 0.5202189184782849, - "learning_rate": 0.00016981248603206592, - "loss": 0.877, - "step": 2296 - }, - { - "epoch": 1.127637325459802, - "grad_norm": 0.45493093827344977, - "learning_rate": 0.00016976838606834764, - "loss": 0.7834, - "step": 2297 - }, - { - "epoch": 1.128129421172418, - "grad_norm": 0.48036913116061, - "learning_rate": 0.00016972425965147293, - "loss": 0.7749, - "step": 2298 - }, - { - "epoch": 1.1286215168850342, - "grad_norm": 0.4359946394576743, - "learning_rate": 0.00016968010679817264, - "loss": 0.7249, - "step": 2299 - }, - { - "epoch": 1.1291136125976502, - "grad_norm": 0.4377335882015854, - "learning_rate": 0.00016963592752518763, - "loss": 0.7379, - "step": 2300 - }, - { - "epoch": 1.1296057083102664, - "grad_norm": 0.43918112206406407, - "learning_rate": 0.00016959172184926875, - "loss": 0.7775, - "step": 2301 - }, - { - "epoch": 1.1300978040228824, - "grad_norm": 0.46085230534250393, - "learning_rate": 0.0001695474897871769, - "loss": 0.7272, - "step": 2302 - }, - { - "epoch": 1.1305898997354986, - "grad_norm": 0.47015545707857204, - "learning_rate": 0.00016950323135568298, - "loss": 0.7234, - "step": 2303 - }, - { - "epoch": 1.1310819954481146, - "grad_norm": 0.4944887088290653, - "learning_rate": 0.00016945894657156784, - "loss": 0.759, - "step": 2304 - }, - { - "epoch": 1.1315740911607308, - "grad_norm": 0.456194455456828, - "learning_rate": 0.00016941463545162234, - "loss": 0.7579, - "step": 2305 - }, - { - "epoch": 1.1320661868733468, - "grad_norm": 0.5019289572944563, - "learning_rate": 0.00016937029801264742, - "loss": 0.7301, - "step": 2306 - }, - { - "epoch": 1.132558282585963, - "grad_norm": 0.44457249428378054, - "learning_rate": 0.00016932593427145386, - "loss": 0.7618, - "step": 2307 - }, - { - "epoch": 1.133050378298579, - "grad_norm": 0.4319289977770867, - "learning_rate": 0.0001692815442448625, - "loss": 0.7327, - "step": 2308 - }, - { - "epoch": 1.1335424740111952, - "grad_norm": 0.4883241687960278, - "learning_rate": 0.0001692371279497041, - "loss": 0.7523, - "step": 2309 - }, - { - "epoch": 1.1340345697238112, - "grad_norm": 0.45780220449658393, - "learning_rate": 0.00016919268540281936, - "loss": 0.6821, - "step": 2310 - }, - { - "epoch": 1.1345266654364274, - "grad_norm": 0.4753729627358104, - "learning_rate": 0.00016914821662105908, - "loss": 0.8173, - "step": 2311 - }, - { - "epoch": 1.1350187611490434, - "grad_norm": 0.44447953352027786, - "learning_rate": 0.00016910372162128382, - "loss": 0.7236, - "step": 2312 - }, - { - "epoch": 1.1355108568616596, - "grad_norm": 0.4243710961328533, - "learning_rate": 0.00016905920042036417, - "loss": 0.7329, - "step": 2313 - }, - { - "epoch": 1.1360029525742756, - "grad_norm": 0.4468629999914307, - "learning_rate": 0.00016901465303518064, - "loss": 0.7685, - "step": 2314 - }, - { - "epoch": 1.1364950482868919, - "grad_norm": 0.45134253816361103, - "learning_rate": 0.00016897007948262372, - "loss": 0.7661, - "step": 2315 - }, - { - "epoch": 1.1369871439995078, - "grad_norm": 0.4478372330374271, - "learning_rate": 0.0001689254797795937, - "loss": 0.8289, - "step": 2316 - }, - { - "epoch": 1.137479239712124, - "grad_norm": 0.45223770169916977, - "learning_rate": 0.00016888085394300094, - "loss": 0.7879, - "step": 2317 - }, - { - "epoch": 1.13797133542474, - "grad_norm": 0.4183224861655834, - "learning_rate": 0.00016883620198976558, - "loss": 0.6802, - "step": 2318 - }, - { - "epoch": 1.1384634311373563, - "grad_norm": 0.44475791786138535, - "learning_rate": 0.0001687915239368177, - "loss": 0.7196, - "step": 2319 - }, - { - "epoch": 1.1389555268499723, - "grad_norm": 0.4389263002912342, - "learning_rate": 0.00016874681980109734, - "loss": 0.7083, - "step": 2320 - }, - { - "epoch": 1.1394476225625885, - "grad_norm": 0.4313633988441347, - "learning_rate": 0.00016870208959955435, - "loss": 0.8626, - "step": 2321 - }, - { - "epoch": 1.1399397182752045, - "grad_norm": 0.44536307722028046, - "learning_rate": 0.00016865733334914845, - "loss": 0.7838, - "step": 2322 - }, - { - "epoch": 1.1404318139878207, - "grad_norm": 0.4308906082632619, - "learning_rate": 0.00016861255106684933, - "loss": 0.7292, - "step": 2323 - }, - { - "epoch": 1.1409239097004367, - "grad_norm": 0.4715337438609384, - "learning_rate": 0.00016856774276963646, - "loss": 0.7318, - "step": 2324 - }, - { - "epoch": 1.1414160054130529, - "grad_norm": 0.5015199873249059, - "learning_rate": 0.0001685229084744992, - "loss": 0.7696, - "step": 2325 - }, - { - "epoch": 1.1419081011256689, - "grad_norm": 0.44433966071888925, - "learning_rate": 0.00016847804819843684, - "loss": 0.66, - "step": 2326 - }, - { - "epoch": 1.142400196838285, - "grad_norm": 0.46345689407796764, - "learning_rate": 0.00016843316195845842, - "loss": 0.7575, - "step": 2327 - }, - { - "epoch": 1.142892292550901, - "grad_norm": 0.43189372122649544, - "learning_rate": 0.00016838824977158284, - "loss": 0.7627, - "step": 2328 - }, - { - "epoch": 1.1433843882635173, - "grad_norm": 0.47553138873250456, - "learning_rate": 0.00016834331165483887, - "loss": 0.7917, - "step": 2329 - }, - { - "epoch": 1.1438764839761333, - "grad_norm": 0.43652638702069657, - "learning_rate": 0.00016829834762526513, - "loss": 0.7457, - "step": 2330 - }, - { - "epoch": 1.1443685796887495, - "grad_norm": 0.43384112294714045, - "learning_rate": 0.00016825335769991002, - "loss": 0.7283, - "step": 2331 - }, - { - "epoch": 1.1448606754013655, - "grad_norm": 0.46679399150765144, - "learning_rate": 0.00016820834189583175, - "loss": 0.7201, - "step": 2332 - }, - { - "epoch": 1.1453527711139817, - "grad_norm": 0.4471805264967156, - "learning_rate": 0.00016816330023009841, - "loss": 0.7751, - "step": 2333 - }, - { - "epoch": 1.1458448668265977, - "grad_norm": 0.5097809916933203, - "learning_rate": 0.00016811823271978784, - "loss": 0.7356, - "step": 2334 - }, - { - "epoch": 1.146336962539214, - "grad_norm": 0.45399847525617715, - "learning_rate": 0.00016807313938198767, - "loss": 0.7642, - "step": 2335 - }, - { - "epoch": 1.14682905825183, - "grad_norm": 0.44216215398989483, - "learning_rate": 0.00016802802023379538, - "loss": 0.7776, - "step": 2336 - }, - { - "epoch": 1.1473211539644461, - "grad_norm": 0.47391952554629074, - "learning_rate": 0.00016798287529231815, - "loss": 0.7241, - "step": 2337 - }, - { - "epoch": 1.147813249677062, - "grad_norm": 0.45151083393427893, - "learning_rate": 0.00016793770457467302, - "loss": 0.7254, - "step": 2338 - }, - { - "epoch": 1.1483053453896783, - "grad_norm": 0.44787964084182247, - "learning_rate": 0.0001678925080979868, - "loss": 0.7138, - "step": 2339 - }, - { - "epoch": 1.1487974411022943, - "grad_norm": 0.5878711015458571, - "learning_rate": 0.00016784728587939602, - "loss": 0.761, - "step": 2340 - }, - { - "epoch": 1.1492895368149105, - "grad_norm": 0.5067852039768288, - "learning_rate": 0.00016780203793604695, - "loss": 0.8009, - "step": 2341 - }, - { - "epoch": 1.1497816325275265, - "grad_norm": 0.4948494649935818, - "learning_rate": 0.0001677567642850957, - "loss": 0.7566, - "step": 2342 - }, - { - "epoch": 1.1502737282401427, - "grad_norm": 0.4445892575995358, - "learning_rate": 0.00016771146494370812, - "loss": 0.7102, - "step": 2343 - }, - { - "epoch": 1.1507658239527587, - "grad_norm": 0.4415227933373556, - "learning_rate": 0.00016766613992905965, - "loss": 0.787, - "step": 2344 - }, - { - "epoch": 1.151257919665375, - "grad_norm": 0.4369592312691189, - "learning_rate": 0.0001676207892583357, - "loss": 0.7441, - "step": 2345 - }, - { - "epoch": 1.151750015377991, - "grad_norm": 0.45450469510797614, - "learning_rate": 0.00016757541294873117, - "loss": 0.7359, - "step": 2346 - }, - { - "epoch": 1.1522421110906071, - "grad_norm": 0.4612742978346103, - "learning_rate": 0.00016753001101745088, - "loss": 0.743, - "step": 2347 - }, - { - "epoch": 1.1527342068032231, - "grad_norm": 0.4612558288498186, - "learning_rate": 0.00016748458348170924, - "loss": 0.7068, - "step": 2348 - }, - { - "epoch": 1.1532263025158394, - "grad_norm": 0.44262806429187956, - "learning_rate": 0.00016743913035873042, - "loss": 0.7073, - "step": 2349 - }, - { - "epoch": 1.1537183982284553, - "grad_norm": 0.49501560308668063, - "learning_rate": 0.00016739365166574827, - "loss": 0.7744, - "step": 2350 - }, - { - "epoch": 1.1542104939410716, - "grad_norm": 0.48938371898703736, - "learning_rate": 0.00016734814742000635, - "loss": 0.8085, - "step": 2351 - }, - { - "epoch": 1.1547025896536875, - "grad_norm": 0.44161795904827694, - "learning_rate": 0.0001673026176387579, - "loss": 0.7402, - "step": 2352 - }, - { - "epoch": 1.1551946853663038, - "grad_norm": 0.47192240761441423, - "learning_rate": 0.00016725706233926589, - "loss": 0.779, - "step": 2353 - }, - { - "epoch": 1.1556867810789198, - "grad_norm": 0.4462355788643901, - "learning_rate": 0.00016721148153880285, - "loss": 0.7466, - "step": 2354 - }, - { - "epoch": 1.156178876791536, - "grad_norm": 0.49915936011147427, - "learning_rate": 0.00016716587525465108, - "loss": 0.8115, - "step": 2355 - }, - { - "epoch": 1.156670972504152, - "grad_norm": 0.43363277540110245, - "learning_rate": 0.00016712024350410253, - "loss": 0.6893, - "step": 2356 - }, - { - "epoch": 1.1571630682167682, - "grad_norm": 0.48002361737554866, - "learning_rate": 0.00016707458630445875, - "loss": 0.7675, - "step": 2357 - }, - { - "epoch": 1.1576551639293844, - "grad_norm": 0.44353609318569737, - "learning_rate": 0.00016702890367303102, - "loss": 0.6764, - "step": 2358 - }, - { - "epoch": 1.1581472596420004, - "grad_norm": 0.41941593553602335, - "learning_rate": 0.0001669831956271402, - "loss": 0.692, - "step": 2359 - }, - { - "epoch": 1.1586393553546164, - "grad_norm": 0.42467355773936255, - "learning_rate": 0.00016693746218411677, - "loss": 0.7411, - "step": 2360 - }, - { - "epoch": 1.1591314510672326, - "grad_norm": 0.4282108620441184, - "learning_rate": 0.00016689170336130088, - "loss": 0.7182, - "step": 2361 - }, - { - "epoch": 1.1596235467798488, - "grad_norm": 0.3833084268608547, - "learning_rate": 0.0001668459191760424, - "loss": 0.643, - "step": 2362 - }, - { - "epoch": 1.1601156424924648, - "grad_norm": 0.4286783736712506, - "learning_rate": 0.00016680010964570058, - "loss": 0.8028, - "step": 2363 - }, - { - "epoch": 1.1606077382050808, - "grad_norm": 0.41664458769042473, - "learning_rate": 0.00016675427478764448, - "loss": 0.7111, - "step": 2364 - }, - { - "epoch": 1.161099833917697, - "grad_norm": 0.4283241918209358, - "learning_rate": 0.0001667084146192527, - "loss": 0.7747, - "step": 2365 - }, - { - "epoch": 1.1615919296303132, - "grad_norm": 0.45885220296525653, - "learning_rate": 0.00016666252915791346, - "loss": 0.6824, - "step": 2366 - }, - { - "epoch": 1.1620840253429292, - "grad_norm": 0.46532975154226963, - "learning_rate": 0.00016661661842102445, - "loss": 0.7824, - "step": 2367 - }, - { - "epoch": 1.1625761210555452, - "grad_norm": 0.575659023474276, - "learning_rate": 0.00016657068242599313, - "loss": 0.7705, - "step": 2368 - }, - { - "epoch": 1.1630682167681614, - "grad_norm": 0.44966093797615586, - "learning_rate": 0.00016652472119023636, - "loss": 0.7729, - "step": 2369 - }, - { - "epoch": 1.1635603124807776, - "grad_norm": 0.4393724975779391, - "learning_rate": 0.00016647873473118075, - "loss": 0.7316, - "step": 2370 - }, - { - "epoch": 1.1640524081933936, - "grad_norm": 0.4325600374647357, - "learning_rate": 0.0001664327230662623, - "loss": 0.7517, - "step": 2371 - }, - { - "epoch": 1.1645445039060096, - "grad_norm": 0.4338406490012966, - "learning_rate": 0.00016638668621292668, - "loss": 0.7504, - "step": 2372 - }, - { - "epoch": 1.1650365996186258, - "grad_norm": 0.43878306722290084, - "learning_rate": 0.00016634062418862907, - "loss": 0.6998, - "step": 2373 - }, - { - "epoch": 1.165528695331242, - "grad_norm": 0.46726108298167734, - "learning_rate": 0.0001662945370108342, - "loss": 0.7676, - "step": 2374 - }, - { - "epoch": 1.166020791043858, - "grad_norm": 0.4082613099682847, - "learning_rate": 0.00016624842469701632, - "loss": 0.7274, - "step": 2375 - }, - { - "epoch": 1.166512886756474, - "grad_norm": 0.4343343585815525, - "learning_rate": 0.00016620228726465922, - "loss": 0.7877, - "step": 2376 - }, - { - "epoch": 1.1670049824690902, - "grad_norm": 0.4424821203300492, - "learning_rate": 0.0001661561247312563, - "loss": 0.8563, - "step": 2377 - }, - { - "epoch": 1.1674970781817064, - "grad_norm": 0.42277985017173697, - "learning_rate": 0.00016610993711431028, - "loss": 0.7049, - "step": 2378 - }, - { - "epoch": 1.1679891738943224, - "grad_norm": 0.4095320909773662, - "learning_rate": 0.0001660637244313336, - "loss": 0.7574, - "step": 2379 - }, - { - "epoch": 1.1684812696069384, - "grad_norm": 0.44118747830725574, - "learning_rate": 0.00016601748669984806, - "loss": 0.7645, - "step": 2380 - }, - { - "epoch": 1.1689733653195546, - "grad_norm": 0.4247096205640213, - "learning_rate": 0.00016597122393738505, - "loss": 0.7665, - "step": 2381 - }, - { - "epoch": 1.1694654610321709, - "grad_norm": 0.4414360555795094, - "learning_rate": 0.00016592493616148535, - "loss": 0.7603, - "step": 2382 - }, - { - "epoch": 1.1699575567447869, - "grad_norm": 0.42371789178715363, - "learning_rate": 0.00016587862338969934, - "loss": 0.7234, - "step": 2383 - }, - { - "epoch": 1.1704496524574028, - "grad_norm": 0.4814483832167124, - "learning_rate": 0.00016583228563958678, - "loss": 0.7139, - "step": 2384 - }, - { - "epoch": 1.170941748170019, - "grad_norm": 0.44822704115743667, - "learning_rate": 0.00016578592292871698, - "loss": 0.7532, - "step": 2385 - }, - { - "epoch": 1.1714338438826353, - "grad_norm": 0.4535158060643928, - "learning_rate": 0.00016573953527466864, - "loss": 0.7691, - "step": 2386 - }, - { - "epoch": 1.1719259395952513, - "grad_norm": 0.40666153379654235, - "learning_rate": 0.00016569312269503, - "loss": 0.7385, - "step": 2387 - }, - { - "epoch": 1.1724180353078673, - "grad_norm": 0.39869523953436276, - "learning_rate": 0.00016564668520739867, - "loss": 0.715, - "step": 2388 - }, - { - "epoch": 1.1729101310204835, - "grad_norm": 0.4576211301504304, - "learning_rate": 0.00016560022282938172, - "loss": 0.799, - "step": 2389 - }, - { - "epoch": 1.1734022267330997, - "grad_norm": 0.42050802997000103, - "learning_rate": 0.00016555373557859573, - "loss": 0.7401, - "step": 2390 - }, - { - "epoch": 1.1738943224457157, - "grad_norm": 0.4058734036272197, - "learning_rate": 0.00016550722347266663, - "loss": 0.7186, - "step": 2391 - }, - { - "epoch": 1.174386418158332, - "grad_norm": 0.4460714696577042, - "learning_rate": 0.00016546068652922976, - "loss": 0.7377, - "step": 2392 - }, - { - "epoch": 1.1748785138709479, - "grad_norm": 0.4610839703524595, - "learning_rate": 0.00016541412476593, - "loss": 0.7252, - "step": 2393 - }, - { - "epoch": 1.175370609583564, - "grad_norm": 0.43171886607653037, - "learning_rate": 0.0001653675382004215, - "loss": 0.7036, - "step": 2394 - }, - { - "epoch": 1.17586270529618, - "grad_norm": 0.5673192512943801, - "learning_rate": 0.00016532092685036785, - "loss": 0.7262, - "step": 2395 - }, - { - "epoch": 1.1763548010087963, - "grad_norm": 0.45432051430363635, - "learning_rate": 0.0001652742907334421, - "loss": 0.7917, - "step": 2396 - }, - { - "epoch": 1.1768468967214123, - "grad_norm": 0.43763563327576555, - "learning_rate": 0.00016522762986732664, - "loss": 0.7891, - "step": 2397 - }, - { - "epoch": 1.1773389924340285, - "grad_norm": 0.4368731350206338, - "learning_rate": 0.0001651809442697133, - "loss": 0.7782, - "step": 2398 - }, - { - "epoch": 1.1778310881466445, - "grad_norm": 0.44500281421300364, - "learning_rate": 0.00016513423395830316, - "loss": 0.7669, - "step": 2399 - }, - { - "epoch": 1.1783231838592607, - "grad_norm": 0.41061630531196897, - "learning_rate": 0.0001650874989508068, - "loss": 0.7342, - "step": 2400 - }, - { - "epoch": 1.1788152795718767, - "grad_norm": 0.44385162575035897, - "learning_rate": 0.0001650407392649441, - "loss": 0.7506, - "step": 2401 - }, - { - "epoch": 1.179307375284493, - "grad_norm": 0.419968995168489, - "learning_rate": 0.0001649939549184443, - "loss": 0.7116, - "step": 2402 - }, - { - "epoch": 1.179799470997109, - "grad_norm": 0.45280405961816544, - "learning_rate": 0.00016494714592904606, - "loss": 0.7031, - "step": 2403 - }, - { - "epoch": 1.1802915667097251, - "grad_norm": 0.44911412973344544, - "learning_rate": 0.00016490031231449726, - "loss": 0.7743, - "step": 2404 - }, - { - "epoch": 1.1807836624223411, - "grad_norm": 0.43994851601899965, - "learning_rate": 0.0001648534540925552, - "loss": 0.7139, - "step": 2405 - }, - { - "epoch": 1.1812757581349573, - "grad_norm": 0.42151538382286186, - "learning_rate": 0.0001648065712809865, - "loss": 0.7426, - "step": 2406 - }, - { - "epoch": 1.1817678538475733, - "grad_norm": 0.421772510581625, - "learning_rate": 0.0001647596638975671, - "loss": 0.7238, - "step": 2407 - }, - { - "epoch": 1.1822599495601895, - "grad_norm": 0.43466246290054783, - "learning_rate": 0.00016471273196008224, - "loss": 0.7355, - "step": 2408 - }, - { - "epoch": 1.1827520452728055, - "grad_norm": 0.429120524397614, - "learning_rate": 0.00016466577548632648, - "loss": 0.7545, - "step": 2409 - }, - { - "epoch": 1.1832441409854217, - "grad_norm": 0.436247531580134, - "learning_rate": 0.0001646187944941037, - "loss": 0.7201, - "step": 2410 - }, - { - "epoch": 1.1837362366980377, - "grad_norm": 0.49303383774654047, - "learning_rate": 0.00016457178900122706, - "loss": 0.7482, - "step": 2411 - }, - { - "epoch": 1.184228332410654, - "grad_norm": 0.44407875153581605, - "learning_rate": 0.00016452475902551902, - "loss": 0.749, - "step": 2412 - }, - { - "epoch": 1.18472042812327, - "grad_norm": 0.3942366518830162, - "learning_rate": 0.00016447770458481124, - "loss": 0.6715, - "step": 2413 - }, - { - "epoch": 1.1852125238358862, - "grad_norm": 0.46875812009328377, - "learning_rate": 0.00016443062569694483, - "loss": 0.7764, - "step": 2414 - }, - { - "epoch": 1.1857046195485021, - "grad_norm": 0.4521465433035599, - "learning_rate": 0.00016438352237977007, - "loss": 0.736, - "step": 2415 - }, - { - "epoch": 1.1861967152611184, - "grad_norm": 0.431950611879082, - "learning_rate": 0.00016433639465114638, - "loss": 0.756, - "step": 2416 - }, - { - "epoch": 1.1866888109737344, - "grad_norm": 0.4258989569384757, - "learning_rate": 0.0001642892425289427, - "loss": 0.6816, - "step": 2417 - }, - { - "epoch": 1.1871809066863506, - "grad_norm": 0.4301938931562309, - "learning_rate": 0.000164242066031037, - "loss": 0.7712, - "step": 2418 - }, - { - "epoch": 1.1876730023989666, - "grad_norm": 0.433492593276431, - "learning_rate": 0.00016419486517531658, - "loss": 0.708, - "step": 2419 - }, - { - "epoch": 1.1881650981115828, - "grad_norm": 0.46069856340259263, - "learning_rate": 0.00016414763997967793, - "loss": 0.759, - "step": 2420 - }, - { - "epoch": 1.1886571938241988, - "grad_norm": 0.4239294138071813, - "learning_rate": 0.0001641003904620269, - "loss": 0.801, - "step": 2421 - }, - { - "epoch": 1.189149289536815, - "grad_norm": 0.4921042860737761, - "learning_rate": 0.00016405311664027838, - "loss": 0.8156, - "step": 2422 - }, - { - "epoch": 1.189641385249431, - "grad_norm": 0.4196707896059516, - "learning_rate": 0.0001640058185323566, - "loss": 0.6957, - "step": 2423 - }, - { - "epoch": 1.1901334809620472, - "grad_norm": 0.4578294751734398, - "learning_rate": 0.00016395849615619495, - "loss": 0.7897, - "step": 2424 - }, - { - "epoch": 1.1906255766746632, - "grad_norm": 0.4964683375347343, - "learning_rate": 0.00016391114952973602, - "loss": 0.7925, - "step": 2425 - }, - { - "epoch": 1.1911176723872794, - "grad_norm": 0.4711647247164641, - "learning_rate": 0.00016386377867093157, - "loss": 0.7544, - "step": 2426 - }, - { - "epoch": 1.1916097680998954, - "grad_norm": 0.433824246721515, - "learning_rate": 0.0001638163835977427, - "loss": 0.7688, - "step": 2427 - }, - { - "epoch": 1.1921018638125116, - "grad_norm": 0.4716366493461372, - "learning_rate": 0.00016376896432813943, - "loss": 0.7886, - "step": 2428 - }, - { - "epoch": 1.1925939595251276, - "grad_norm": 0.5575845841328761, - "learning_rate": 0.00016372152088010118, - "loss": 0.753, - "step": 2429 - }, - { - "epoch": 1.1930860552377438, - "grad_norm": 0.4323134443913653, - "learning_rate": 0.0001636740532716164, - "loss": 0.7244, - "step": 2430 - }, - { - "epoch": 1.1935781509503598, - "grad_norm": 0.43494712206575137, - "learning_rate": 0.00016362656152068287, - "loss": 0.7102, - "step": 2431 - }, - { - "epoch": 1.194070246662976, - "grad_norm": 0.4310915793828525, - "learning_rate": 0.00016357904564530726, - "loss": 0.7172, - "step": 2432 - }, - { - "epoch": 1.194562342375592, - "grad_norm": 0.49717458394673475, - "learning_rate": 0.00016353150566350563, - "loss": 0.7615, - "step": 2433 - }, - { - "epoch": 1.1950544380882082, - "grad_norm": 0.4609230308039873, - "learning_rate": 0.00016348394159330308, - "loss": 0.7557, - "step": 2434 - }, - { - "epoch": 1.1955465338008242, - "grad_norm": 0.46317593356743825, - "learning_rate": 0.0001634363534527338, - "loss": 0.7888, - "step": 2435 - }, - { - "epoch": 1.1960386295134404, - "grad_norm": 0.43803936604089877, - "learning_rate": 0.0001633887412598412, - "loss": 0.7291, - "step": 2436 - }, - { - "epoch": 1.1965307252260564, - "grad_norm": 0.4583230627898153, - "learning_rate": 0.00016334110503267777, - "loss": 0.7471, - "step": 2437 - }, - { - "epoch": 1.1970228209386726, - "grad_norm": 0.4465355586005098, - "learning_rate": 0.0001632934447893051, - "loss": 0.6913, - "step": 2438 - }, - { - "epoch": 1.1975149166512886, - "grad_norm": 0.4284055764535819, - "learning_rate": 0.00016324576054779386, - "loss": 0.7395, - "step": 2439 - }, - { - "epoch": 1.1980070123639048, - "grad_norm": 0.4336490156541325, - "learning_rate": 0.0001631980523262239, - "loss": 0.7994, - "step": 2440 - }, - { - "epoch": 1.1984991080765208, - "grad_norm": 0.4362318709859781, - "learning_rate": 0.0001631503201426841, - "loss": 0.675, - "step": 2441 - }, - { - "epoch": 1.198991203789137, - "grad_norm": 0.4972034285587136, - "learning_rate": 0.00016310256401527243, - "loss": 0.7645, - "step": 2442 - }, - { - "epoch": 1.199483299501753, - "grad_norm": 0.43066554321273104, - "learning_rate": 0.00016305478396209594, - "loss": 0.7103, - "step": 2443 - }, - { - "epoch": 1.1999753952143692, - "grad_norm": 0.4434164205535384, - "learning_rate": 0.0001630069800012708, - "loss": 0.7312, - "step": 2444 - }, - { - "epoch": 1.2004674909269852, - "grad_norm": 0.4419446094796589, - "learning_rate": 0.00016295915215092216, - "loss": 0.7484, - "step": 2445 - }, - { - "epoch": 1.2009595866396015, - "grad_norm": 0.436222360431337, - "learning_rate": 0.0001629113004291843, - "loss": 0.725, - "step": 2446 - }, - { - "epoch": 1.2014516823522174, - "grad_norm": 0.4381375698311324, - "learning_rate": 0.00016286342485420056, - "loss": 0.7175, - "step": 2447 - }, - { - "epoch": 1.2019437780648337, - "grad_norm": 0.4219271108947798, - "learning_rate": 0.0001628155254441232, - "loss": 0.7172, - "step": 2448 - }, - { - "epoch": 1.2024358737774496, - "grad_norm": 0.48131877200504575, - "learning_rate": 0.00016276760221711368, - "loss": 0.7589, - "step": 2449 - }, - { - "epoch": 1.2029279694900659, - "grad_norm": 0.4416067404334387, - "learning_rate": 0.0001627196551913424, - "loss": 0.7681, - "step": 2450 - }, - { - "epoch": 1.2034200652026819, - "grad_norm": 0.451485208062328, - "learning_rate": 0.00016267168438498879, - "loss": 0.7628, - "step": 2451 - }, - { - "epoch": 1.203912160915298, - "grad_norm": 0.4399384763222408, - "learning_rate": 0.0001626236898162413, - "loss": 0.7273, - "step": 2452 - }, - { - "epoch": 1.204404256627914, - "grad_norm": 0.4522320174695235, - "learning_rate": 0.0001625756715032974, - "loss": 0.7437, - "step": 2453 - }, - { - "epoch": 1.2048963523405303, - "grad_norm": 0.4509852633870999, - "learning_rate": 0.00016252762946436357, - "loss": 0.7971, - "step": 2454 - }, - { - "epoch": 1.2053884480531463, - "grad_norm": 0.4467131917702599, - "learning_rate": 0.00016247956371765528, - "loss": 0.7466, - "step": 2455 - }, - { - "epoch": 1.2058805437657625, - "grad_norm": 0.45212991474422043, - "learning_rate": 0.00016243147428139694, - "loss": 0.7864, - "step": 2456 - }, - { - "epoch": 1.2063726394783785, - "grad_norm": 0.4753890663625771, - "learning_rate": 0.00016238336117382204, - "loss": 0.8267, - "step": 2457 - }, - { - "epoch": 1.2068647351909947, - "grad_norm": 0.4779672497083118, - "learning_rate": 0.00016233522441317296, - "loss": 0.7139, - "step": 2458 - }, - { - "epoch": 1.2073568309036107, - "grad_norm": 0.4569575903984626, - "learning_rate": 0.00016228706401770108, - "loss": 0.7367, - "step": 2459 - }, - { - "epoch": 1.207848926616227, - "grad_norm": 0.45729088174785243, - "learning_rate": 0.00016223888000566677, - "loss": 0.7817, - "step": 2460 - }, - { - "epoch": 1.2083410223288429, - "grad_norm": 1.243983620542089, - "learning_rate": 0.0001621906723953393, - "loss": 0.8791, - "step": 2461 - }, - { - "epoch": 1.208833118041459, - "grad_norm": 0.4655000992948183, - "learning_rate": 0.0001621424412049969, - "loss": 0.7644, - "step": 2462 - }, - { - "epoch": 1.209325213754075, - "grad_norm": 0.45367678123478067, - "learning_rate": 0.00016209418645292674, - "loss": 0.7059, - "step": 2463 - }, - { - "epoch": 1.2098173094666913, - "grad_norm": 0.4402459317799849, - "learning_rate": 0.00016204590815742503, - "loss": 0.7171, - "step": 2464 - }, - { - "epoch": 1.2103094051793073, - "grad_norm": 0.41688045725088835, - "learning_rate": 0.00016199760633679668, - "loss": 0.7157, - "step": 2465 - }, - { - "epoch": 1.2108015008919235, - "grad_norm": 0.41866447525231854, - "learning_rate": 0.00016194928100935575, - "loss": 0.6409, - "step": 2466 - }, - { - "epoch": 1.2112935966045395, - "grad_norm": 0.4547058457916331, - "learning_rate": 0.0001619009321934251, - "loss": 0.7391, - "step": 2467 - }, - { - "epoch": 1.2117856923171557, - "grad_norm": 0.465123914545746, - "learning_rate": 0.00016185255990733649, - "loss": 0.7336, - "step": 2468 - }, - { - "epoch": 1.2122777880297717, - "grad_norm": 0.44621528747575384, - "learning_rate": 0.00016180416416943056, - "loss": 0.7793, - "step": 2469 - }, - { - "epoch": 1.212769883742388, - "grad_norm": 0.4547779940734016, - "learning_rate": 0.00016175574499805698, - "loss": 0.7979, - "step": 2470 - }, - { - "epoch": 1.213261979455004, - "grad_norm": 0.44229750375713783, - "learning_rate": 0.00016170730241157414, - "loss": 0.8386, - "step": 2471 - }, - { - "epoch": 1.2137540751676201, - "grad_norm": 0.41456072260825083, - "learning_rate": 0.00016165883642834937, - "loss": 0.7462, - "step": 2472 - }, - { - "epoch": 1.2142461708802361, - "grad_norm": 0.4664640247816799, - "learning_rate": 0.00016161034706675892, - "loss": 0.7187, - "step": 2473 - }, - { - "epoch": 1.2147382665928523, - "grad_norm": 0.438875039135234, - "learning_rate": 0.0001615618343451878, - "loss": 0.7352, - "step": 2474 - }, - { - "epoch": 1.2152303623054683, - "grad_norm": 0.43223943483733024, - "learning_rate": 0.00016151329828203, - "loss": 0.7417, - "step": 2475 - }, - { - "epoch": 1.2157224580180845, - "grad_norm": 0.5025039703358414, - "learning_rate": 0.00016146473889568828, - "loss": 0.7799, - "step": 2476 - }, - { - "epoch": 1.2162145537307005, - "grad_norm": 1.0461634593702156, - "learning_rate": 0.00016141615620457423, - "loss": 0.888, - "step": 2477 - }, - { - "epoch": 1.2167066494433167, - "grad_norm": 0.4425790122445545, - "learning_rate": 0.00016136755022710836, - "loss": 0.7175, - "step": 2478 - }, - { - "epoch": 1.2171987451559327, - "grad_norm": 0.40985986664650526, - "learning_rate": 0.00016131892098171987, - "loss": 0.7551, - "step": 2479 - }, - { - "epoch": 1.217690840868549, - "grad_norm": 0.43781332426632397, - "learning_rate": 0.00016127026848684696, - "loss": 0.7048, - "step": 2480 - }, - { - "epoch": 1.218182936581165, - "grad_norm": 0.4595583384205723, - "learning_rate": 0.0001612215927609365, - "loss": 0.7895, - "step": 2481 - }, - { - "epoch": 1.2186750322937812, - "grad_norm": 0.4446513017453665, - "learning_rate": 0.00016117289382244424, - "loss": 0.6593, - "step": 2482 - }, - { - "epoch": 1.2191671280063971, - "grad_norm": 0.4397360591458268, - "learning_rate": 0.00016112417168983472, - "loss": 0.7679, - "step": 2483 - }, - { - "epoch": 1.2196592237190134, - "grad_norm": 0.4188480806178211, - "learning_rate": 0.00016107542638158122, - "loss": 0.7232, - "step": 2484 - }, - { - "epoch": 1.2201513194316294, - "grad_norm": 0.5717598128986419, - "learning_rate": 0.0001610266579161659, - "loss": 0.7683, - "step": 2485 - }, - { - "epoch": 1.2206434151442456, - "grad_norm": 0.5937203025328015, - "learning_rate": 0.00016097786631207966, - "loss": 0.7768, - "step": 2486 - }, - { - "epoch": 1.2211355108568616, - "grad_norm": 0.43135713049256946, - "learning_rate": 0.00016092905158782215, - "loss": 0.7328, - "step": 2487 - }, - { - "epoch": 1.2216276065694778, - "grad_norm": 0.44961496046256094, - "learning_rate": 0.00016088021376190175, - "loss": 0.7784, - "step": 2488 - }, - { - "epoch": 1.222119702282094, - "grad_norm": 0.4459949098102091, - "learning_rate": 0.00016083135285283577, - "loss": 0.816, - "step": 2489 - }, - { - "epoch": 1.22261179799471, - "grad_norm": 0.44488660493938537, - "learning_rate": 0.00016078246887915007, - "loss": 0.7453, - "step": 2490 - }, - { - "epoch": 1.223103893707326, - "grad_norm": 0.45206552513800424, - "learning_rate": 0.00016073356185937934, - "loss": 0.7711, - "step": 2491 - }, - { - "epoch": 1.2235959894199422, - "grad_norm": 0.4449658085773436, - "learning_rate": 0.00016068463181206707, - "loss": 0.7194, - "step": 2492 - }, - { - "epoch": 1.2240880851325584, - "grad_norm": 0.43733253215838264, - "learning_rate": 0.00016063567875576536, - "loss": 0.7599, - "step": 2493 - }, - { - "epoch": 1.2245801808451744, - "grad_norm": 0.4470056028033283, - "learning_rate": 0.00016058670270903514, - "loss": 0.8031, - "step": 2494 - }, - { - "epoch": 1.2250722765577904, - "grad_norm": 0.41171495343876024, - "learning_rate": 0.00016053770369044593, - "loss": 0.7445, - "step": 2495 - }, - { - "epoch": 1.2255643722704066, - "grad_norm": 0.4786555526848234, - "learning_rate": 0.00016048868171857612, - "loss": 0.7908, - "step": 2496 - }, - { - "epoch": 1.2260564679830228, - "grad_norm": 0.4396705191785551, - "learning_rate": 0.0001604396368120127, - "loss": 0.7545, - "step": 2497 - }, - { - "epoch": 1.2265485636956388, - "grad_norm": 0.4525018260389032, - "learning_rate": 0.00016039056898935132, - "loss": 0.7737, - "step": 2498 - }, - { - "epoch": 1.2270406594082548, - "grad_norm": 0.5308962668568638, - "learning_rate": 0.00016034147826919648, - "loss": 0.855, - "step": 2499 - }, - { - "epoch": 1.227532755120871, - "grad_norm": 0.4557709197330377, - "learning_rate": 0.0001602923646701612, - "loss": 0.7744, - "step": 2500 - }, - { - "epoch": 1.2280248508334872, - "grad_norm": 0.4265830603196255, - "learning_rate": 0.00016024322821086724, - "loss": 0.7284, - "step": 2501 - }, - { - "epoch": 1.2285169465461032, - "grad_norm": 0.44895742693998336, - "learning_rate": 0.00016019406890994503, - "loss": 0.6931, - "step": 2502 - }, - { - "epoch": 1.2290090422587192, - "grad_norm": 0.47371664963505233, - "learning_rate": 0.00016014488678603366, - "loss": 0.8013, - "step": 2503 - }, - { - "epoch": 1.2295011379713354, - "grad_norm": 0.4581579926376679, - "learning_rate": 0.00016009568185778084, - "loss": 0.7289, - "step": 2504 - }, - { - "epoch": 1.2299932336839516, - "grad_norm": 0.4739863590086033, - "learning_rate": 0.00016004645414384293, - "loss": 0.8198, - "step": 2505 - }, - { - "epoch": 1.2304853293965676, - "grad_norm": 0.46902170356954354, - "learning_rate": 0.00015999720366288503, - "loss": 0.7792, - "step": 2506 - }, - { - "epoch": 1.2309774251091836, - "grad_norm": 0.4388140815686061, - "learning_rate": 0.00015994793043358073, - "loss": 0.7442, - "step": 2507 - }, - { - "epoch": 1.2314695208217998, - "grad_norm": 0.46388307849058275, - "learning_rate": 0.00015989863447461234, - "loss": 0.6854, - "step": 2508 - }, - { - "epoch": 1.231961616534416, - "grad_norm": 0.445047008840957, - "learning_rate": 0.00015984931580467074, - "loss": 0.7868, - "step": 2509 - }, - { - "epoch": 1.232453712247032, - "grad_norm": 0.4120760036892778, - "learning_rate": 0.00015979997444245543, - "loss": 0.7572, - "step": 2510 - }, - { - "epoch": 1.232945807959648, - "grad_norm": 0.4506870700950525, - "learning_rate": 0.00015975061040667454, - "loss": 0.8236, - "step": 2511 - }, - { - "epoch": 1.2334379036722642, - "grad_norm": 0.43353881339164957, - "learning_rate": 0.00015970122371604476, - "loss": 0.7513, - "step": 2512 - }, - { - "epoch": 1.2339299993848805, - "grad_norm": 0.4197091516443743, - "learning_rate": 0.00015965181438929146, - "loss": 0.7263, - "step": 2513 - }, - { - "epoch": 1.2344220950974965, - "grad_norm": 0.4573628112829376, - "learning_rate": 0.00015960238244514842, - "loss": 0.7554, - "step": 2514 - }, - { - "epoch": 1.2349141908101124, - "grad_norm": 0.4446582717598241, - "learning_rate": 0.00015955292790235815, - "loss": 0.7417, - "step": 2515 - }, - { - "epoch": 1.2354062865227287, - "grad_norm": 0.4382642106035462, - "learning_rate": 0.00015950345077967167, - "loss": 0.7643, - "step": 2516 - }, - { - "epoch": 1.2358983822353449, - "grad_norm": 0.46496882337561524, - "learning_rate": 0.00015945395109584856, - "loss": 0.6811, - "step": 2517 - }, - { - "epoch": 1.2363904779479609, - "grad_norm": 0.41513919237903263, - "learning_rate": 0.00015940442886965694, - "loss": 0.707, - "step": 2518 - }, - { - "epoch": 1.236882573660577, - "grad_norm": 0.456625249700212, - "learning_rate": 0.00015935488411987354, - "loss": 0.6953, - "step": 2519 - }, - { - "epoch": 1.237374669373193, - "grad_norm": 0.4234628275490277, - "learning_rate": 0.00015930531686528357, - "loss": 0.7408, - "step": 2520 - }, - { - "epoch": 1.2378667650858093, - "grad_norm": 0.43349572800507596, - "learning_rate": 0.00015925572712468074, - "loss": 0.6897, - "step": 2521 - }, - { - "epoch": 1.2383588607984253, - "grad_norm": 0.46079156509028607, - "learning_rate": 0.00015920611491686745, - "loss": 0.7461, - "step": 2522 - }, - { - "epoch": 1.2388509565110415, - "grad_norm": 0.44076547659732174, - "learning_rate": 0.0001591564802606544, - "loss": 0.7291, - "step": 2523 - }, - { - "epoch": 1.2393430522236575, - "grad_norm": 0.4482904356132328, - "learning_rate": 0.00015910682317486096, - "loss": 0.7393, - "step": 2524 - }, - { - "epoch": 1.2398351479362737, - "grad_norm": 0.44724083589325964, - "learning_rate": 0.00015905714367831495, - "loss": 0.7942, - "step": 2525 - }, - { - "epoch": 1.2403272436488897, - "grad_norm": 0.4400022233662369, - "learning_rate": 0.00015900744178985266, - "loss": 0.7723, - "step": 2526 - }, - { - "epoch": 1.240819339361506, - "grad_norm": 0.4405444597335698, - "learning_rate": 0.00015895771752831892, - "loss": 0.7166, - "step": 2527 - }, - { - "epoch": 1.241311435074122, - "grad_norm": 0.4766422904958864, - "learning_rate": 0.00015890797091256697, - "loss": 0.7676, - "step": 2528 - }, - { - "epoch": 1.241803530786738, - "grad_norm": 0.4096156751962372, - "learning_rate": 0.00015885820196145865, - "loss": 0.714, - "step": 2529 - }, - { - "epoch": 1.242295626499354, - "grad_norm": 0.4141158344096702, - "learning_rate": 0.00015880841069386417, - "loss": 0.7124, - "step": 2530 - }, - { - "epoch": 1.2427877222119703, - "grad_norm": 0.4383880333252046, - "learning_rate": 0.00015875859712866224, - "loss": 0.7772, - "step": 2531 - }, - { - "epoch": 1.2432798179245863, - "grad_norm": 0.4458031566093325, - "learning_rate": 0.00015870876128473994, - "loss": 0.8147, - "step": 2532 - }, - { - "epoch": 1.2437719136372025, - "grad_norm": 0.4421373461075213, - "learning_rate": 0.00015865890318099296, - "loss": 0.7479, - "step": 2533 - }, - { - "epoch": 1.2442640093498185, - "grad_norm": 0.4451710441613152, - "learning_rate": 0.0001586090228363253, - "loss": 0.685, - "step": 2534 - }, - { - "epoch": 1.2447561050624347, - "grad_norm": 0.43236320468734696, - "learning_rate": 0.0001585591202696494, - "loss": 0.7551, - "step": 2535 - }, - { - "epoch": 1.2452482007750507, - "grad_norm": 0.44693080770780086, - "learning_rate": 0.0001585091954998862, - "loss": 0.6972, - "step": 2536 - }, - { - "epoch": 1.245740296487667, - "grad_norm": 0.4090323440422212, - "learning_rate": 0.000158459248545965, - "loss": 0.6781, - "step": 2537 - }, - { - "epoch": 1.246232392200283, - "grad_norm": 0.417463605139135, - "learning_rate": 0.0001584092794268235, - "loss": 0.7371, - "step": 2538 - }, - { - "epoch": 1.2467244879128991, - "grad_norm": 0.4283222722616072, - "learning_rate": 0.0001583592881614079, - "loss": 0.7516, - "step": 2539 - }, - { - "epoch": 1.2472165836255151, - "grad_norm": 0.47081669612513066, - "learning_rate": 0.00015830927476867266, - "loss": 0.7918, - "step": 2540 - }, - { - "epoch": 1.2477086793381313, - "grad_norm": 0.46696184974319493, - "learning_rate": 0.00015825923926758071, - "loss": 0.8245, - "step": 2541 - }, - { - "epoch": 1.2482007750507473, - "grad_norm": 0.4222758370165774, - "learning_rate": 0.00015820918167710338, - "loss": 0.7826, - "step": 2542 - }, - { - "epoch": 1.2486928707633635, - "grad_norm": 0.41071507656104234, - "learning_rate": 0.00015815910201622032, - "loss": 0.6986, - "step": 2543 - }, - { - "epoch": 1.2491849664759795, - "grad_norm": 0.42879530840772173, - "learning_rate": 0.00015810900030391954, - "loss": 0.776, - "step": 2544 - }, - { - "epoch": 1.2496770621885958, - "grad_norm": 0.41098861871762604, - "learning_rate": 0.0001580588765591975, - "loss": 0.6994, - "step": 2545 - }, - { - "epoch": 1.2501691579012117, - "grad_norm": 0.4315909924016978, - "learning_rate": 0.00015800873080105895, - "loss": 0.7169, - "step": 2546 - }, - { - "epoch": 1.250661253613828, - "grad_norm": 0.4293799610847144, - "learning_rate": 0.00015795856304851697, - "loss": 0.7805, - "step": 2547 - }, - { - "epoch": 1.251153349326444, - "grad_norm": 0.4263069892019798, - "learning_rate": 0.000157908373320593, - "loss": 0.759, - "step": 2548 - }, - { - "epoch": 1.2516454450390602, - "grad_norm": 0.4616225247037991, - "learning_rate": 0.00015785816163631686, - "loss": 0.7394, - "step": 2549 - }, - { - "epoch": 1.2521375407516762, - "grad_norm": 0.4236887316755888, - "learning_rate": 0.0001578079280147266, - "loss": 0.7259, - "step": 2550 - }, - { - "epoch": 1.2526296364642924, - "grad_norm": 0.41721242240395623, - "learning_rate": 0.00015775767247486867, - "loss": 0.7236, - "step": 2551 - }, - { - "epoch": 1.2531217321769084, - "grad_norm": 0.43083139186312563, - "learning_rate": 0.0001577073950357978, - "loss": 0.7252, - "step": 2552 - }, - { - "epoch": 1.2536138278895246, - "grad_norm": 0.4812351771240485, - "learning_rate": 0.00015765709571657698, - "loss": 0.7246, - "step": 2553 - }, - { - "epoch": 1.2541059236021406, - "grad_norm": 0.4998599187138841, - "learning_rate": 0.00015760677453627756, - "loss": 0.7791, - "step": 2554 - }, - { - "epoch": 1.2545980193147568, - "grad_norm": 0.4951475330728904, - "learning_rate": 0.00015755643151397922, - "loss": 0.6723, - "step": 2555 - }, - { - "epoch": 1.2550901150273728, - "grad_norm": 0.46110719532733896, - "learning_rate": 0.00015750606666876978, - "loss": 0.7506, - "step": 2556 - }, - { - "epoch": 1.255582210739989, - "grad_norm": 0.41538497146655773, - "learning_rate": 0.00015745568001974538, - "loss": 0.7298, - "step": 2557 - }, - { - "epoch": 1.256074306452605, - "grad_norm": 0.4354878103310598, - "learning_rate": 0.00015740527158601055, - "loss": 0.7703, - "step": 2558 - }, - { - "epoch": 1.2565664021652212, - "grad_norm": 0.5616558173336792, - "learning_rate": 0.00015735484138667798, - "loss": 0.7723, - "step": 2559 - }, - { - "epoch": 1.2570584978778372, - "grad_norm": 0.4102438659567689, - "learning_rate": 0.00015730438944086856, - "loss": 0.7432, - "step": 2560 - }, - { - "epoch": 1.2575505935904534, - "grad_norm": 0.48142001034247683, - "learning_rate": 0.00015725391576771155, - "loss": 0.7573, - "step": 2561 - }, - { - "epoch": 1.2580426893030694, - "grad_norm": 0.4630310952308724, - "learning_rate": 0.0001572034203863443, - "loss": 0.7605, - "step": 2562 - }, - { - "epoch": 1.2585347850156856, - "grad_norm": 0.4395908066141942, - "learning_rate": 0.00015715290331591258, - "loss": 0.746, - "step": 2563 - }, - { - "epoch": 1.2590268807283016, - "grad_norm": 0.5529886102587573, - "learning_rate": 0.00015710236457557023, - "loss": 0.7502, - "step": 2564 - }, - { - "epoch": 1.2595189764409178, - "grad_norm": 0.42981905900486217, - "learning_rate": 0.00015705180418447933, - "loss": 0.7287, - "step": 2565 - }, - { - "epoch": 1.2600110721535338, - "grad_norm": 0.4216863490800534, - "learning_rate": 0.00015700122216181028, - "loss": 0.7159, - "step": 2566 - }, - { - "epoch": 1.26050316786615, - "grad_norm": 0.46652529393818115, - "learning_rate": 0.00015695061852674148, - "loss": 0.7915, - "step": 2567 - }, - { - "epoch": 1.260995263578766, - "grad_norm": 0.4500385023800325, - "learning_rate": 0.00015689999329845974, - "loss": 0.7151, - "step": 2568 - }, - { - "epoch": 1.2614873592913822, - "grad_norm": 0.4572461898834975, - "learning_rate": 0.00015684934649615986, - "loss": 0.8097, - "step": 2569 - }, - { - "epoch": 1.2619794550039982, - "grad_norm": 0.4059854077300595, - "learning_rate": 0.00015679867813904504, - "loss": 0.7113, - "step": 2570 - }, - { - "epoch": 1.2624715507166144, - "grad_norm": 0.4432979874271413, - "learning_rate": 0.00015674798824632646, - "loss": 0.7716, - "step": 2571 - }, - { - "epoch": 1.2629636464292304, - "grad_norm": 0.40674965149502756, - "learning_rate": 0.00015669727683722355, - "loss": 0.7182, - "step": 2572 - }, - { - "epoch": 1.2634557421418466, - "grad_norm": 0.4107488757313884, - "learning_rate": 0.00015664654393096386, - "loss": 0.727, - "step": 2573 - }, - { - "epoch": 1.2639478378544626, - "grad_norm": 0.4044172220207941, - "learning_rate": 0.00015659578954678314, - "loss": 0.658, - "step": 2574 - }, - { - "epoch": 1.2644399335670788, - "grad_norm": 0.47038385597153354, - "learning_rate": 0.00015654501370392524, - "loss": 0.7226, - "step": 2575 - }, - { - "epoch": 1.2649320292796948, - "grad_norm": 0.4349551722546035, - "learning_rate": 0.00015649421642164221, - "loss": 0.7548, - "step": 2576 - }, - { - "epoch": 1.265424124992311, - "grad_norm": 0.4729920720959477, - "learning_rate": 0.00015644339771919415, - "loss": 0.7762, - "step": 2577 - }, - { - "epoch": 1.265916220704927, - "grad_norm": 0.4288585025886129, - "learning_rate": 0.0001563925576158493, - "loss": 0.7939, - "step": 2578 - }, - { - "epoch": 1.2664083164175433, - "grad_norm": 0.412815562650632, - "learning_rate": 0.00015634169613088403, - "loss": 0.734, - "step": 2579 - }, - { - "epoch": 1.2669004121301592, - "grad_norm": 0.4468556521950253, - "learning_rate": 0.00015629081328358285, - "loss": 0.7285, - "step": 2580 - }, - { - "epoch": 1.2673925078427755, - "grad_norm": 0.44179881487038664, - "learning_rate": 0.00015623990909323834, - "loss": 0.7738, - "step": 2581 - }, - { - "epoch": 1.2678846035553915, - "grad_norm": 0.4285827365625489, - "learning_rate": 0.00015618898357915115, - "loss": 0.724, - "step": 2582 - }, - { - "epoch": 1.2683766992680077, - "grad_norm": 0.45272459710874907, - "learning_rate": 0.00015613803676063, - "loss": 0.81, - "step": 2583 - }, - { - "epoch": 1.2688687949806237, - "grad_norm": 0.41262992071549875, - "learning_rate": 0.00015608706865699178, - "loss": 0.7521, - "step": 2584 - }, - { - "epoch": 1.2693608906932399, - "grad_norm": 0.45924739219356164, - "learning_rate": 0.00015603607928756137, - "loss": 0.8915, - "step": 2585 - }, - { - "epoch": 1.2698529864058559, - "grad_norm": 0.4415343245957275, - "learning_rate": 0.0001559850686716717, - "loss": 0.7632, - "step": 2586 - }, - { - "epoch": 1.270345082118472, - "grad_norm": 0.4224491218804207, - "learning_rate": 0.00015593403682866389, - "loss": 0.7221, - "step": 2587 - }, - { - "epoch": 1.270837177831088, - "grad_norm": 0.432127519328177, - "learning_rate": 0.00015588298377788688, - "loss": 0.7502, - "step": 2588 - }, - { - "epoch": 1.2713292735437043, - "grad_norm": 0.42226438361882757, - "learning_rate": 0.00015583190953869785, - "loss": 0.7434, - "step": 2589 - }, - { - "epoch": 1.2718213692563203, - "grad_norm": 0.4201903008300504, - "learning_rate": 0.00015578081413046191, - "loss": 0.7284, - "step": 2590 - }, - { - "epoch": 1.2723134649689365, - "grad_norm": 0.4242112061271643, - "learning_rate": 0.0001557296975725523, - "loss": 0.7589, - "step": 2591 - }, - { - "epoch": 1.2728055606815525, - "grad_norm": 0.42544311018827574, - "learning_rate": 0.0001556785598843502, - "loss": 0.7441, - "step": 2592 - }, - { - "epoch": 1.2732976563941687, - "grad_norm": 0.41125085727573196, - "learning_rate": 0.00015562740108524472, - "loss": 0.7414, - "step": 2593 - }, - { - "epoch": 1.2737897521067847, - "grad_norm": 0.44852864466061376, - "learning_rate": 0.00015557622119463313, - "loss": 0.8235, - "step": 2594 - }, - { - "epoch": 1.274281847819401, - "grad_norm": 0.4539169694183089, - "learning_rate": 0.00015552502023192063, - "loss": 0.7368, - "step": 2595 - }, - { - "epoch": 1.274773943532017, - "grad_norm": 0.4166987356758154, - "learning_rate": 0.00015547379821652038, - "loss": 0.7476, - "step": 2596 - }, - { - "epoch": 1.275266039244633, - "grad_norm": 0.42703010260322033, - "learning_rate": 0.00015542255516785361, - "loss": 0.7314, - "step": 2597 - }, - { - "epoch": 1.275758134957249, - "grad_norm": 0.4769045049464421, - "learning_rate": 0.00015537129110534945, - "loss": 0.74, - "step": 2598 - }, - { - "epoch": 1.2762502306698653, - "grad_norm": 0.39678421251550167, - "learning_rate": 0.000155320006048445, - "loss": 0.7511, - "step": 2599 - }, - { - "epoch": 1.2767423263824815, - "grad_norm": 0.5044737420814177, - "learning_rate": 0.00015526870001658532, - "loss": 0.7248, - "step": 2600 - }, - { - "epoch": 1.2772344220950975, - "grad_norm": 0.40143263726257994, - "learning_rate": 0.00015521737302922348, - "loss": 0.7398, - "step": 2601 - }, - { - "epoch": 1.2777265178077135, - "grad_norm": 0.43710518643214913, - "learning_rate": 0.00015516602510582043, - "loss": 0.7526, - "step": 2602 - }, - { - "epoch": 1.2782186135203297, - "grad_norm": 0.4662321654183513, - "learning_rate": 0.00015511465626584509, - "loss": 0.7682, - "step": 2603 - }, - { - "epoch": 1.278710709232946, - "grad_norm": 0.4964709774050139, - "learning_rate": 0.00015506326652877433, - "loss": 0.8085, - "step": 2604 - }, - { - "epoch": 1.279202804945562, - "grad_norm": 0.44658978575127173, - "learning_rate": 0.00015501185591409287, - "loss": 0.7278, - "step": 2605 - }, - { - "epoch": 1.279694900658178, - "grad_norm": 0.4175225364618719, - "learning_rate": 0.00015496042444129338, - "loss": 0.7228, - "step": 2606 - }, - { - "epoch": 1.2801869963707941, - "grad_norm": 0.43449038536138557, - "learning_rate": 0.00015490897212987656, - "loss": 0.7747, - "step": 2607 - }, - { - "epoch": 1.2806790920834104, - "grad_norm": 0.41695126357161955, - "learning_rate": 0.0001548574989993508, - "loss": 0.7668, - "step": 2608 - }, - { - "epoch": 1.2811711877960263, - "grad_norm": 0.4381103385716402, - "learning_rate": 0.00015480600506923248, - "loss": 0.6944, - "step": 2609 - }, - { - "epoch": 1.2816632835086423, - "grad_norm": 0.4514121424525538, - "learning_rate": 0.00015475449035904596, - "loss": 0.7744, - "step": 2610 - }, - { - "epoch": 1.2821553792212586, - "grad_norm": 0.40863780724558096, - "learning_rate": 0.00015470295488832333, - "loss": 0.7081, - "step": 2611 - }, - { - "epoch": 1.2826474749338748, - "grad_norm": 0.5372182678341804, - "learning_rate": 0.0001546513986766046, - "loss": 0.7893, - "step": 2612 - }, - { - "epoch": 1.2831395706464908, - "grad_norm": 0.4379451533982603, - "learning_rate": 0.00015459982174343767, - "loss": 0.7394, - "step": 2613 - }, - { - "epoch": 1.2836316663591067, - "grad_norm": 0.4257835176263186, - "learning_rate": 0.00015454822410837832, - "loss": 0.7494, - "step": 2614 - }, - { - "epoch": 1.284123762071723, - "grad_norm": 0.47811432081825217, - "learning_rate": 0.00015449660579099013, - "loss": 0.7512, - "step": 2615 - }, - { - "epoch": 1.2846158577843392, - "grad_norm": 0.42179771281231876, - "learning_rate": 0.0001544449668108445, - "loss": 0.7946, - "step": 2616 - }, - { - "epoch": 1.2851079534969552, - "grad_norm": 0.44306737117407474, - "learning_rate": 0.0001543933071875207, - "loss": 0.6966, - "step": 2617 - }, - { - "epoch": 1.2856000492095712, - "grad_norm": 0.4833762980530231, - "learning_rate": 0.0001543416269406059, - "loss": 0.7088, - "step": 2618 - }, - { - "epoch": 1.2860921449221874, - "grad_norm": 0.3972083842652231, - "learning_rate": 0.0001542899260896949, - "loss": 0.7038, - "step": 2619 - }, - { - "epoch": 1.2865842406348036, - "grad_norm": 0.43506703855597095, - "learning_rate": 0.0001542382046543905, - "loss": 0.7748, - "step": 2620 - }, - { - "epoch": 1.2870763363474196, - "grad_norm": 0.4392513447268944, - "learning_rate": 0.00015418646265430327, - "loss": 0.7561, - "step": 2621 - }, - { - "epoch": 1.2875684320600356, - "grad_norm": 0.46418470143332413, - "learning_rate": 0.00015413470010905146, - "loss": 0.7978, - "step": 2622 - }, - { - "epoch": 1.2880605277726518, - "grad_norm": 0.43135991816412844, - "learning_rate": 0.00015408291703826127, - "loss": 0.7525, - "step": 2623 - }, - { - "epoch": 1.288552623485268, - "grad_norm": 0.4109210674724624, - "learning_rate": 0.00015403111346156648, - "loss": 0.7428, - "step": 2624 - }, - { - "epoch": 1.289044719197884, - "grad_norm": 0.43396204884755263, - "learning_rate": 0.00015397928939860886, - "loss": 0.7443, - "step": 2625 - }, - { - "epoch": 1.2895368149105, - "grad_norm": 0.40773554323909833, - "learning_rate": 0.00015392744486903787, - "loss": 0.6975, - "step": 2626 - }, - { - "epoch": 1.2900289106231162, - "grad_norm": 0.44892980909023017, - "learning_rate": 0.00015387557989251068, - "loss": 0.7925, - "step": 2627 - }, - { - "epoch": 1.2905210063357324, - "grad_norm": 0.4220792652483258, - "learning_rate": 0.00015382369448869226, - "loss": 0.6736, - "step": 2628 - }, - { - "epoch": 1.2910131020483484, - "grad_norm": 0.4209547035227594, - "learning_rate": 0.00015377178867725526, - "loss": 0.7182, - "step": 2629 - }, - { - "epoch": 1.2915051977609644, - "grad_norm": 0.42210710631459336, - "learning_rate": 0.00015371986247788018, - "loss": 0.786, - "step": 2630 - }, - { - "epoch": 1.2919972934735806, - "grad_norm": 0.4233103738421945, - "learning_rate": 0.0001536679159102552, - "loss": 0.6988, - "step": 2631 - }, - { - "epoch": 1.2924893891861968, - "grad_norm": 0.4403054673592731, - "learning_rate": 0.00015361594899407615, - "loss": 0.7675, - "step": 2632 - }, - { - "epoch": 1.2929814848988128, - "grad_norm": 0.42557645989762755, - "learning_rate": 0.00015356396174904665, - "loss": 0.7679, - "step": 2633 - }, - { - "epoch": 1.2934735806114288, - "grad_norm": 0.4236263081120599, - "learning_rate": 0.0001535119541948781, - "loss": 0.7581, - "step": 2634 - }, - { - "epoch": 1.293965676324045, - "grad_norm": 0.43803156837979607, - "learning_rate": 0.00015345992635128943, - "loss": 0.7325, - "step": 2635 - }, - { - "epoch": 1.2944577720366612, - "grad_norm": 0.4420668030812656, - "learning_rate": 0.00015340787823800737, - "loss": 0.6998, - "step": 2636 - }, - { - "epoch": 1.2949498677492772, - "grad_norm": 0.4209700553026789, - "learning_rate": 0.00015335580987476633, - "loss": 0.7103, - "step": 2637 - }, - { - "epoch": 1.2954419634618932, - "grad_norm": 0.4390333251021473, - "learning_rate": 0.0001533037212813084, - "loss": 0.8193, - "step": 2638 - }, - { - "epoch": 1.2959340591745094, - "grad_norm": 0.4397623052427127, - "learning_rate": 0.00015325161247738324, - "loss": 0.7609, - "step": 2639 - }, - { - "epoch": 1.2964261548871256, - "grad_norm": 0.4202037110563578, - "learning_rate": 0.00015319948348274835, - "loss": 0.7672, - "step": 2640 - }, - { - "epoch": 1.2969182505997416, - "grad_norm": 0.46863029685362373, - "learning_rate": 0.00015314733431716877, - "loss": 0.7904, - "step": 2641 - }, - { - "epoch": 1.2974103463123576, - "grad_norm": 0.45916684501587546, - "learning_rate": 0.00015309516500041718, - "loss": 0.7986, - "step": 2642 - }, - { - "epoch": 1.2979024420249738, - "grad_norm": 0.4146209464048312, - "learning_rate": 0.00015304297555227393, - "loss": 0.7309, - "step": 2643 - }, - { - "epoch": 1.29839453773759, - "grad_norm": 0.4345550879236155, - "learning_rate": 0.00015299076599252701, - "loss": 0.7794, - "step": 2644 - }, - { - "epoch": 1.298886633450206, - "grad_norm": 0.44004376957854224, - "learning_rate": 0.00015293853634097207, - "loss": 0.731, - "step": 2645 - }, - { - "epoch": 1.299378729162822, - "grad_norm": 0.42114874972899674, - "learning_rate": 0.00015288628661741229, - "loss": 0.6931, - "step": 2646 - }, - { - "epoch": 1.2998708248754383, - "grad_norm": 0.43650945760271986, - "learning_rate": 0.00015283401684165851, - "loss": 0.7435, - "step": 2647 - }, - { - "epoch": 1.3003629205880545, - "grad_norm": 0.49219684391170015, - "learning_rate": 0.00015278172703352916, - "loss": 0.7146, - "step": 2648 - }, - { - "epoch": 1.3008550163006705, - "grad_norm": 0.4322023793096788, - "learning_rate": 0.00015272941721285033, - "loss": 0.8173, - "step": 2649 - }, - { - "epoch": 1.3013471120132865, - "grad_norm": 0.44082095384624576, - "learning_rate": 0.00015267708739945558, - "loss": 0.8049, - "step": 2650 - }, - { - "epoch": 1.3018392077259027, - "grad_norm": 0.45126265501229934, - "learning_rate": 0.00015262473761318618, - "loss": 0.759, - "step": 2651 - }, - { - "epoch": 1.3023313034385189, - "grad_norm": 0.4483440426939856, - "learning_rate": 0.00015257236787389084, - "loss": 0.6709, - "step": 2652 - }, - { - "epoch": 1.3028233991511349, - "grad_norm": 0.42136193587078413, - "learning_rate": 0.00015251997820142594, - "loss": 0.722, - "step": 2653 - }, - { - "epoch": 1.3033154948637509, - "grad_norm": 0.4288100440936685, - "learning_rate": 0.00015246756861565536, - "loss": 0.7478, - "step": 2654 - }, - { - "epoch": 1.303807590576367, - "grad_norm": 0.43267631712947824, - "learning_rate": 0.00015241513913645055, - "loss": 0.751, - "step": 2655 - }, - { - "epoch": 1.3042996862889833, - "grad_norm": 0.42818314355417175, - "learning_rate": 0.00015236268978369052, - "loss": 0.7629, - "step": 2656 - }, - { - "epoch": 1.3047917820015993, - "grad_norm": 0.4099356351781274, - "learning_rate": 0.0001523102205772618, - "loss": 0.7288, - "step": 2657 - }, - { - "epoch": 1.3052838777142153, - "grad_norm": 0.4236925245280607, - "learning_rate": 0.0001522577315370584, - "loss": 0.7601, - "step": 2658 - }, - { - "epoch": 1.3057759734268315, - "grad_norm": 0.40563395419484544, - "learning_rate": 0.00015220522268298193, - "loss": 0.6793, - "step": 2659 - }, - { - "epoch": 1.3062680691394477, - "grad_norm": 0.40432994975635483, - "learning_rate": 0.0001521526940349415, - "loss": 0.6969, - "step": 2660 - }, - { - "epoch": 1.3067601648520637, - "grad_norm": 0.42299007827818075, - "learning_rate": 0.00015210014561285365, - "loss": 0.7626, - "step": 2661 - }, - { - "epoch": 1.30725226056468, - "grad_norm": 0.4537197460382443, - "learning_rate": 0.00015204757743664252, - "loss": 0.8275, - "step": 2662 - }, - { - "epoch": 1.307744356277296, - "grad_norm": 0.42102164306599055, - "learning_rate": 0.00015199498952623967, - "loss": 0.7387, - "step": 2663 - }, - { - "epoch": 1.3082364519899121, - "grad_norm": 0.40736146587821015, - "learning_rate": 0.00015194238190158416, - "loss": 0.76, - "step": 2664 - }, - { - "epoch": 1.3087285477025281, - "grad_norm": 0.4839056190832484, - "learning_rate": 0.00015188975458262256, - "loss": 0.7263, - "step": 2665 - }, - { - "epoch": 1.3092206434151443, - "grad_norm": 0.4228987201039662, - "learning_rate": 0.0001518371075893088, - "loss": 0.7212, - "step": 2666 - }, - { - "epoch": 1.3097127391277603, - "grad_norm": 0.4281709483909016, - "learning_rate": 0.0001517844409416044, - "loss": 0.7377, - "step": 2667 - }, - { - "epoch": 1.3102048348403765, - "grad_norm": 0.4202265635833236, - "learning_rate": 0.00015173175465947827, - "loss": 0.7236, - "step": 2668 - }, - { - "epoch": 1.3106969305529925, - "grad_norm": 0.41013205706430633, - "learning_rate": 0.00015167904876290677, - "loss": 0.698, - "step": 2669 - }, - { - "epoch": 1.3111890262656087, - "grad_norm": 0.4559053248434141, - "learning_rate": 0.00015162632327187368, - "loss": 0.6948, - "step": 2670 - }, - { - "epoch": 1.3116811219782247, - "grad_norm": 0.47860523913959824, - "learning_rate": 0.00015157357820637021, - "loss": 0.793, - "step": 2671 - }, - { - "epoch": 1.312173217690841, - "grad_norm": 0.43126462460049897, - "learning_rate": 0.0001515208135863951, - "loss": 0.781, - "step": 2672 - }, - { - "epoch": 1.312665313403457, - "grad_norm": 0.45629304452154124, - "learning_rate": 0.00015146802943195433, - "loss": 0.8345, - "step": 2673 - }, - { - "epoch": 1.3131574091160731, - "grad_norm": 0.41588679447347654, - "learning_rate": 0.00015141522576306136, - "loss": 0.7517, - "step": 2674 - }, - { - "epoch": 1.3136495048286891, - "grad_norm": 0.4430729681470319, - "learning_rate": 0.00015136240259973715, - "loss": 0.7445, - "step": 2675 - }, - { - "epoch": 1.3141416005413054, - "grad_norm": 0.45830387363452274, - "learning_rate": 0.0001513095599620099, - "loss": 0.78, - "step": 2676 - }, - { - "epoch": 1.3146336962539213, - "grad_norm": 0.43554428453591837, - "learning_rate": 0.0001512566978699152, - "loss": 0.748, - "step": 2677 - }, - { - "epoch": 1.3151257919665376, - "grad_norm": 0.4369067135485014, - "learning_rate": 0.00015120381634349617, - "loss": 0.7524, - "step": 2678 - }, - { - "epoch": 1.3156178876791536, - "grad_norm": 0.4438122596677527, - "learning_rate": 0.00015115091540280316, - "loss": 0.7754, - "step": 2679 - }, - { - "epoch": 1.3161099833917698, - "grad_norm": 0.4695610067237712, - "learning_rate": 0.0001510979950678939, - "loss": 0.7698, - "step": 2680 - }, - { - "epoch": 1.3166020791043858, - "grad_norm": 0.4290852887766823, - "learning_rate": 0.0001510450553588335, - "loss": 0.7156, - "step": 2681 - }, - { - "epoch": 1.317094174817002, - "grad_norm": 0.42483568878517364, - "learning_rate": 0.00015099209629569442, - "loss": 0.7312, - "step": 2682 - }, - { - "epoch": 1.317586270529618, - "grad_norm": 0.46614704246217564, - "learning_rate": 0.00015093911789855645, - "loss": 0.722, - "step": 2683 - }, - { - "epoch": 1.3180783662422342, - "grad_norm": 0.40866774536870926, - "learning_rate": 0.0001508861201875067, - "loss": 0.6605, - "step": 2684 - }, - { - "epoch": 1.3185704619548502, - "grad_norm": 0.47439495732029524, - "learning_rate": 0.00015083310318263964, - "loss": 0.7408, - "step": 2685 - }, - { - "epoch": 1.3190625576674664, - "grad_norm": 0.395423912110938, - "learning_rate": 0.000150780066904057, - "loss": 0.698, - "step": 2686 - }, - { - "epoch": 1.3195546533800824, - "grad_norm": 0.45237785162409755, - "learning_rate": 0.00015072701137186785, - "loss": 0.7701, - "step": 2687 - }, - { - "epoch": 1.3200467490926986, - "grad_norm": 0.4436148748238658, - "learning_rate": 0.00015067393660618853, - "loss": 0.7413, - "step": 2688 - }, - { - "epoch": 1.3205388448053146, - "grad_norm": 0.4026463831862003, - "learning_rate": 0.00015062084262714275, - "loss": 0.7118, - "step": 2689 - }, - { - "epoch": 1.3210309405179308, - "grad_norm": 0.41205806933136185, - "learning_rate": 0.0001505677294548614, - "loss": 0.6963, - "step": 2690 - }, - { - "epoch": 1.3215230362305468, - "grad_norm": 0.4355776162302929, - "learning_rate": 0.0001505145971094827, - "loss": 0.7035, - "step": 2691 - }, - { - "epoch": 1.322015131943163, - "grad_norm": 0.4265899734182896, - "learning_rate": 0.0001504614456111522, - "loss": 0.7754, - "step": 2692 - }, - { - "epoch": 1.322507227655779, - "grad_norm": 0.41098003634808444, - "learning_rate": 0.0001504082749800226, - "loss": 0.7134, - "step": 2693 - }, - { - "epoch": 1.3229993233683952, - "grad_norm": 0.4326632723797512, - "learning_rate": 0.0001503550852362539, - "loss": 0.7819, - "step": 2694 - }, - { - "epoch": 1.3234914190810112, - "grad_norm": 0.4223330074094377, - "learning_rate": 0.0001503018764000134, - "loss": 0.7096, - "step": 2695 - }, - { - "epoch": 1.3239835147936274, - "grad_norm": 0.42211742162701826, - "learning_rate": 0.00015024864849147554, - "loss": 0.7337, - "step": 2696 - }, - { - "epoch": 1.3244756105062434, - "grad_norm": 0.4143320500928967, - "learning_rate": 0.00015019540153082201, - "loss": 0.6731, - "step": 2697 - }, - { - "epoch": 1.3249677062188596, - "grad_norm": 0.418157045554795, - "learning_rate": 0.00015014213553824187, - "loss": 0.8011, - "step": 2698 - }, - { - "epoch": 1.3254598019314756, - "grad_norm": 0.43450430706254045, - "learning_rate": 0.0001500888505339312, - "loss": 0.7006, - "step": 2699 - }, - { - "epoch": 1.3259518976440918, - "grad_norm": 0.411237308827399, - "learning_rate": 0.00015003554653809342, - "loss": 0.7208, - "step": 2700 - }, - { - "epoch": 1.3264439933567078, - "grad_norm": 0.3991299352012469, - "learning_rate": 0.00014998222357093903, - "loss": 0.7275, - "step": 2701 - }, - { - "epoch": 1.326936089069324, - "grad_norm": 0.41991732320264646, - "learning_rate": 0.00014992888165268583, - "loss": 0.6953, - "step": 2702 - }, - { - "epoch": 1.32742818478194, - "grad_norm": 0.44469660282099244, - "learning_rate": 0.0001498755208035588, - "loss": 0.7119, - "step": 2703 - }, - { - "epoch": 1.3279202804945562, - "grad_norm": 0.4501658533700222, - "learning_rate": 0.00014982214104379, - "loss": 0.7995, - "step": 2704 - }, - { - "epoch": 1.3284123762071722, - "grad_norm": 0.42220874843132583, - "learning_rate": 0.00014976874239361882, - "loss": 0.7337, - "step": 2705 - }, - { - "epoch": 1.3289044719197884, - "grad_norm": 0.4381833508887948, - "learning_rate": 0.00014971532487329165, - "loss": 0.7355, - "step": 2706 - }, - { - "epoch": 1.3293965676324044, - "grad_norm": 0.41296393354127864, - "learning_rate": 0.0001496618885030621, - "loss": 0.6702, - "step": 2707 - }, - { - "epoch": 1.3298886633450206, - "grad_norm": 0.4449986243348617, - "learning_rate": 0.000149608433303191, - "loss": 0.7757, - "step": 2708 - }, - { - "epoch": 1.3303807590576366, - "grad_norm": 0.4125510892416481, - "learning_rate": 0.0001495549592939462, - "loss": 0.7235, - "step": 2709 - }, - { - "epoch": 1.3308728547702529, - "grad_norm": 0.4159871936786702, - "learning_rate": 0.00014950146649560274, - "loss": 0.7513, - "step": 2710 - }, - { - "epoch": 1.3313649504828688, - "grad_norm": 0.4562920096613522, - "learning_rate": 0.00014944795492844278, - "loss": 0.7282, - "step": 2711 - }, - { - "epoch": 1.331857046195485, - "grad_norm": 0.4236634548305959, - "learning_rate": 0.00014939442461275556, - "loss": 0.7675, - "step": 2712 - }, - { - "epoch": 1.332349141908101, - "grad_norm": 0.4483689418369436, - "learning_rate": 0.00014934087556883754, - "loss": 0.8156, - "step": 2713 - }, - { - "epoch": 1.3328412376207173, - "grad_norm": 0.41471430318775926, - "learning_rate": 0.00014928730781699212, - "loss": 0.7828, - "step": 2714 - }, - { - "epoch": 1.3333333333333333, - "grad_norm": 0.45280773074731756, - "learning_rate": 0.00014923372137752992, - "loss": 0.7223, - "step": 2715 - }, - { - "epoch": 1.3338254290459495, - "grad_norm": 0.4433372619402007, - "learning_rate": 0.0001491801162707686, - "loss": 0.7417, - "step": 2716 - }, - { - "epoch": 1.3343175247585655, - "grad_norm": 0.4218838171767951, - "learning_rate": 0.00014912649251703288, - "loss": 0.724, - "step": 2717 - }, - { - "epoch": 1.3348096204711817, - "grad_norm": 0.44886560272394316, - "learning_rate": 0.0001490728501366546, - "loss": 0.7547, - "step": 2718 - }, - { - "epoch": 1.3353017161837977, - "grad_norm": 0.5339889527003661, - "learning_rate": 0.0001490191891499726, - "loss": 0.7936, - "step": 2719 - }, - { - "epoch": 1.3357938118964139, - "grad_norm": 0.4508003935947812, - "learning_rate": 0.00014896550957733284, - "loss": 0.775, - "step": 2720 - }, - { - "epoch": 1.3362859076090299, - "grad_norm": 0.4299488485447041, - "learning_rate": 0.00014891181143908826, - "loss": 0.7989, - "step": 2721 - }, - { - "epoch": 1.336778003321646, - "grad_norm": 0.46604109703082197, - "learning_rate": 0.0001488580947555989, - "loss": 0.7592, - "step": 2722 - }, - { - "epoch": 1.337270099034262, - "grad_norm": 0.4696760045210466, - "learning_rate": 0.0001488043595472318, - "loss": 0.7481, - "step": 2723 - }, - { - "epoch": 1.3377621947468783, - "grad_norm": 0.43535020769769234, - "learning_rate": 0.00014875060583436101, - "loss": 0.7488, - "step": 2724 - }, - { - "epoch": 1.3382542904594943, - "grad_norm": 0.42759257210065515, - "learning_rate": 0.00014869683363736763, - "loss": 0.7461, - "step": 2725 - }, - { - "epoch": 1.3387463861721105, - "grad_norm": 0.4163764624439068, - "learning_rate": 0.00014864304297663975, - "loss": 0.7547, - "step": 2726 - }, - { - "epoch": 1.3392384818847267, - "grad_norm": 0.40831939882603613, - "learning_rate": 0.00014858923387257245, - "loss": 0.6954, - "step": 2727 - }, - { - "epoch": 1.3397305775973427, - "grad_norm": 0.4691562198083282, - "learning_rate": 0.00014853540634556783, - "loss": 0.7478, - "step": 2728 - }, - { - "epoch": 1.3402226733099587, - "grad_norm": 0.43429507939103096, - "learning_rate": 0.000148481560416035, - "loss": 0.7302, - "step": 2729 - }, - { - "epoch": 1.340714769022575, - "grad_norm": 0.4302405186388293, - "learning_rate": 0.00014842769610438992, - "loss": 0.7908, - "step": 2730 - }, - { - "epoch": 1.3412068647351911, - "grad_norm": 0.41467307101897777, - "learning_rate": 0.00014837381343105568, - "loss": 0.7113, - "step": 2731 - }, - { - "epoch": 1.3416989604478071, - "grad_norm": 0.4396330083425095, - "learning_rate": 0.00014831991241646226, - "loss": 0.7036, - "step": 2732 - }, - { - "epoch": 1.3421910561604231, - "grad_norm": 0.4273104827213855, - "learning_rate": 0.00014826599308104653, - "loss": 0.7805, - "step": 2733 - }, - { - "epoch": 1.3426831518730393, - "grad_norm": 0.4660312695309136, - "learning_rate": 0.00014821205544525244, - "loss": 0.7577, - "step": 2734 - }, - { - "epoch": 1.3431752475856555, - "grad_norm": 0.44721064984702363, - "learning_rate": 0.00014815809952953082, - "loss": 0.7386, - "step": 2735 - }, - { - "epoch": 1.3436673432982715, - "grad_norm": 0.44093357124612287, - "learning_rate": 0.00014810412535433935, - "loss": 0.6815, - "step": 2736 - }, - { - "epoch": 1.3441594390108875, - "grad_norm": 0.4320067264334088, - "learning_rate": 0.00014805013294014275, - "loss": 0.703, - "step": 2737 - }, - { - "epoch": 1.3446515347235037, - "grad_norm": 0.48853666968241427, - "learning_rate": 0.00014799612230741258, - "loss": 0.8311, - "step": 2738 - }, - { - "epoch": 1.34514363043612, - "grad_norm": 0.42564282859711905, - "learning_rate": 0.0001479420934766274, - "loss": 0.75, - "step": 2739 - }, - { - "epoch": 1.345635726148736, - "grad_norm": 0.4551383735721082, - "learning_rate": 0.0001478880464682725, - "loss": 0.7773, - "step": 2740 - }, - { - "epoch": 1.346127821861352, - "grad_norm": 0.4823949036191055, - "learning_rate": 0.0001478339813028403, - "loss": 0.7904, - "step": 2741 - }, - { - "epoch": 1.3466199175739682, - "grad_norm": 0.45454536255046707, - "learning_rate": 0.0001477798980008299, - "loss": 0.7397, - "step": 2742 - }, - { - "epoch": 1.3471120132865844, - "grad_norm": 0.4896659331141192, - "learning_rate": 0.00014772579658274733, - "loss": 0.7963, - "step": 2743 - }, - { - "epoch": 1.3476041089992004, - "grad_norm": 0.43717123793847806, - "learning_rate": 0.00014767167706910555, - "loss": 0.7434, - "step": 2744 - }, - { - "epoch": 1.3480962047118163, - "grad_norm": 0.44713356814795013, - "learning_rate": 0.00014761753948042434, - "loss": 0.7242, - "step": 2745 - }, - { - "epoch": 1.3485883004244326, - "grad_norm": 0.4453704215510461, - "learning_rate": 0.00014756338383723033, - "loss": 0.7451, - "step": 2746 - }, - { - "epoch": 1.3490803961370488, - "grad_norm": 0.43980454771431327, - "learning_rate": 0.000147509210160057, - "loss": 0.7768, - "step": 2747 - }, - { - "epoch": 1.3495724918496648, - "grad_norm": 0.4196286931419831, - "learning_rate": 0.00014745501846944462, - "loss": 0.6887, - "step": 2748 - }, - { - "epoch": 1.3500645875622808, - "grad_norm": 0.47158978701859167, - "learning_rate": 0.00014740080878594043, - "loss": 0.8065, - "step": 2749 - }, - { - "epoch": 1.350556683274897, - "grad_norm": 0.42575906936646557, - "learning_rate": 0.0001473465811300983, - "loss": 0.7393, - "step": 2750 - }, - { - "epoch": 1.3510487789875132, - "grad_norm": 0.46702662801495093, - "learning_rate": 0.00014729233552247906, - "loss": 0.7982, - "step": 2751 - }, - { - "epoch": 1.3515408747001292, - "grad_norm": 0.4687994252429321, - "learning_rate": 0.00014723807198365033, - "loss": 0.7859, - "step": 2752 - }, - { - "epoch": 1.3520329704127452, - "grad_norm": 0.40948468726502346, - "learning_rate": 0.00014718379053418643, - "loss": 0.739, - "step": 2753 - }, - { - "epoch": 1.3525250661253614, - "grad_norm": 0.45155120830076073, - "learning_rate": 0.0001471294911946686, - "loss": 0.773, - "step": 2754 - }, - { - "epoch": 1.3530171618379776, - "grad_norm": 0.43327535587765165, - "learning_rate": 0.00014707517398568478, - "loss": 0.7334, - "step": 2755 - }, - { - "epoch": 1.3535092575505936, - "grad_norm": 0.423007154634038, - "learning_rate": 0.0001470208389278297, - "loss": 0.724, - "step": 2756 - }, - { - "epoch": 1.3540013532632096, - "grad_norm": 0.4392988409483355, - "learning_rate": 0.00014696648604170488, - "loss": 0.7741, - "step": 2757 - }, - { - "epoch": 1.3544934489758258, - "grad_norm": 0.429697872741243, - "learning_rate": 0.00014691211534791857, - "loss": 0.7331, - "step": 2758 - }, - { - "epoch": 1.354985544688442, - "grad_norm": 0.463454284334484, - "learning_rate": 0.00014685772686708577, - "loss": 0.705, - "step": 2759 - }, - { - "epoch": 1.355477640401058, - "grad_norm": 0.40727469892993934, - "learning_rate": 0.0001468033206198283, - "loss": 0.7576, - "step": 2760 - }, - { - "epoch": 1.355969736113674, - "grad_norm": 0.4446907558160837, - "learning_rate": 0.00014674889662677463, - "loss": 0.7904, - "step": 2761 - }, - { - "epoch": 1.3564618318262902, - "grad_norm": 0.44088645970811163, - "learning_rate": 0.00014669445490855996, - "loss": 0.7535, - "step": 2762 - }, - { - "epoch": 1.3569539275389064, - "grad_norm": 0.41702807251086554, - "learning_rate": 0.00014663999548582623, - "loss": 0.7527, - "step": 2763 - }, - { - "epoch": 1.3574460232515224, - "grad_norm": 0.4401593184803354, - "learning_rate": 0.00014658551837922214, - "loss": 0.7713, - "step": 2764 - }, - { - "epoch": 1.3579381189641384, - "grad_norm": 0.41800494517405506, - "learning_rate": 0.00014653102360940304, - "loss": 0.767, - "step": 2765 - }, - { - "epoch": 1.3584302146767546, - "grad_norm": 0.4213390147614005, - "learning_rate": 0.00014647651119703098, - "loss": 0.7639, - "step": 2766 - }, - { - "epoch": 1.3589223103893708, - "grad_norm": 0.4264935640399931, - "learning_rate": 0.00014642198116277477, - "loss": 0.753, - "step": 2767 - }, - { - "epoch": 1.3594144061019868, - "grad_norm": 0.39343467342051713, - "learning_rate": 0.00014636743352730976, - "loss": 0.7162, - "step": 2768 - }, - { - "epoch": 1.3599065018146028, - "grad_norm": 0.4308535571127244, - "learning_rate": 0.00014631286831131806, - "loss": 0.7264, - "step": 2769 - }, - { - "epoch": 1.360398597527219, - "grad_norm": 0.4149011577472118, - "learning_rate": 0.00014625828553548853, - "loss": 0.6912, - "step": 2770 - }, - { - "epoch": 1.3608906932398352, - "grad_norm": 0.4099517014433799, - "learning_rate": 0.0001462036852205165, - "loss": 0.7752, - "step": 2771 - }, - { - "epoch": 1.3613827889524512, - "grad_norm": 0.4092203740757273, - "learning_rate": 0.00014614906738710408, - "loss": 0.7293, - "step": 2772 - }, - { - "epoch": 1.3618748846650672, - "grad_norm": 0.4549383754358427, - "learning_rate": 0.00014609443205596, - "loss": 0.7842, - "step": 2773 - }, - { - "epoch": 1.3623669803776834, - "grad_norm": 0.4306240858525313, - "learning_rate": 0.00014603977924779963, - "loss": 0.7812, - "step": 2774 - }, - { - "epoch": 1.3628590760902997, - "grad_norm": 0.45801146218745514, - "learning_rate": 0.00014598510898334498, - "loss": 0.7374, - "step": 2775 - }, - { - "epoch": 1.3633511718029157, - "grad_norm": 0.43859349099215705, - "learning_rate": 0.00014593042128332453, - "loss": 0.6934, - "step": 2776 - }, - { - "epoch": 1.3638432675155316, - "grad_norm": 0.4542107179550836, - "learning_rate": 0.00014587571616847363, - "loss": 0.7173, - "step": 2777 - }, - { - "epoch": 1.3643353632281479, - "grad_norm": 0.44054827069162666, - "learning_rate": 0.00014582099365953398, - "loss": 0.7642, - "step": 2778 - }, - { - "epoch": 1.364827458940764, - "grad_norm": 0.415389898067285, - "learning_rate": 0.0001457662537772541, - "loss": 0.7136, - "step": 2779 - }, - { - "epoch": 1.36531955465338, - "grad_norm": 0.4508678812762311, - "learning_rate": 0.0001457114965423889, - "loss": 0.679, - "step": 2780 - }, - { - "epoch": 1.365811650365996, - "grad_norm": 0.4502740150907887, - "learning_rate": 0.00014565672197570004, - "loss": 0.7419, - "step": 2781 - }, - { - "epoch": 1.3663037460786123, - "grad_norm": 0.42787059612739786, - "learning_rate": 0.00014560193009795555, - "loss": 0.6888, - "step": 2782 - }, - { - "epoch": 1.3667958417912285, - "grad_norm": 0.4194007248160413, - "learning_rate": 0.00014554712092993026, - "loss": 0.7562, - "step": 2783 - }, - { - "epoch": 1.3672879375038445, - "grad_norm": 0.46160780550536346, - "learning_rate": 0.0001454922944924054, - "loss": 0.7636, - "step": 2784 - }, - { - "epoch": 1.3677800332164605, - "grad_norm": 0.5217135724226263, - "learning_rate": 0.00014543745080616876, - "loss": 0.7046, - "step": 2785 - }, - { - "epoch": 1.3682721289290767, - "grad_norm": 2.621954437128583, - "learning_rate": 0.00014538258989201466, - "loss": 0.7991, - "step": 2786 - }, - { - "epoch": 1.368764224641693, - "grad_norm": 0.4167833300901098, - "learning_rate": 0.0001453277117707441, - "loss": 0.7142, - "step": 2787 - }, - { - "epoch": 1.3692563203543089, - "grad_norm": 0.423599670071847, - "learning_rate": 0.00014527281646316438, - "loss": 0.7201, - "step": 2788 - }, - { - "epoch": 1.3697484160669249, - "grad_norm": 0.4360138295144438, - "learning_rate": 0.0001452179039900895, - "loss": 0.7964, - "step": 2789 - }, - { - "epoch": 1.370240511779541, - "grad_norm": 0.39593837473454535, - "learning_rate": 0.00014516297437233987, - "loss": 0.6846, - "step": 2790 - }, - { - "epoch": 1.3707326074921573, - "grad_norm": 0.39349976449439356, - "learning_rate": 0.00014510802763074241, - "loss": 0.7268, - "step": 2791 - }, - { - "epoch": 1.3712247032047733, - "grad_norm": 0.46673403957282444, - "learning_rate": 0.00014505306378613062, - "loss": 0.7293, - "step": 2792 - }, - { - "epoch": 1.3717167989173895, - "grad_norm": 0.4306549813980889, - "learning_rate": 0.00014499808285934433, - "loss": 0.7627, - "step": 2793 - }, - { - "epoch": 1.3722088946300055, - "grad_norm": 0.426697419320064, - "learning_rate": 0.00014494308487123, - "loss": 0.7223, - "step": 2794 - }, - { - "epoch": 1.3727009903426217, - "grad_norm": 0.42243755494952273, - "learning_rate": 0.00014488806984264038, - "loss": 0.7061, - "step": 2795 - }, - { - "epoch": 1.3731930860552377, - "grad_norm": 0.42098719038053645, - "learning_rate": 0.0001448330377944349, - "loss": 0.74, - "step": 2796 - }, - { - "epoch": 1.373685181767854, - "grad_norm": 0.4381137328460556, - "learning_rate": 0.00014477798874747933, - "loss": 0.6986, - "step": 2797 - }, - { - "epoch": 1.37417727748047, - "grad_norm": 0.4377979248161887, - "learning_rate": 0.00014472292272264584, - "loss": 0.6892, - "step": 2798 - }, - { - "epoch": 1.3746693731930861, - "grad_norm": 0.4628397260824491, - "learning_rate": 0.0001446678397408131, - "loss": 0.8194, - "step": 2799 - }, - { - "epoch": 1.3751614689057021, - "grad_norm": 0.4105910026261328, - "learning_rate": 0.00014461273982286618, - "loss": 0.7701, - "step": 2800 - }, - { - "epoch": 1.3756535646183183, - "grad_norm": 0.4052498504584658, - "learning_rate": 0.00014455762298969663, - "loss": 0.7477, - "step": 2801 - }, - { - "epoch": 1.3761456603309343, - "grad_norm": 0.4301997582693896, - "learning_rate": 0.00014450248926220236, - "loss": 0.7358, - "step": 2802 - }, - { - "epoch": 1.3766377560435505, - "grad_norm": 0.40260835089540903, - "learning_rate": 0.00014444733866128765, - "loss": 0.7107, - "step": 2803 - }, - { - "epoch": 1.3771298517561665, - "grad_norm": 0.4319017994896772, - "learning_rate": 0.0001443921712078633, - "loss": 0.7418, - "step": 2804 - }, - { - "epoch": 1.3776219474687827, - "grad_norm": 0.4364926584592031, - "learning_rate": 0.00014433698692284635, - "loss": 0.7856, - "step": 2805 - }, - { - "epoch": 1.3781140431813987, - "grad_norm": 0.4118175977294697, - "learning_rate": 0.00014428178582716035, - "loss": 0.7492, - "step": 2806 - }, - { - "epoch": 1.378606138894015, - "grad_norm": 0.4102828788466361, - "learning_rate": 0.00014422656794173513, - "loss": 0.7743, - "step": 2807 - }, - { - "epoch": 1.379098234606631, - "grad_norm": 0.429336876318765, - "learning_rate": 0.00014417133328750693, - "loss": 0.8042, - "step": 2808 - }, - { - "epoch": 1.3795903303192472, - "grad_norm": 0.40914285687385166, - "learning_rate": 0.00014411608188541838, - "loss": 0.7258, - "step": 2809 - }, - { - "epoch": 1.3800824260318632, - "grad_norm": 0.4592385228100208, - "learning_rate": 0.0001440608137564184, - "loss": 0.7655, - "step": 2810 - }, - { - "epoch": 1.3805745217444794, - "grad_norm": 0.40524311159529497, - "learning_rate": 0.0001440055289214623, - "loss": 0.7451, - "step": 2811 - }, - { - "epoch": 1.3810666174570954, - "grad_norm": 0.42070709054702543, - "learning_rate": 0.00014395022740151163, - "loss": 0.7721, - "step": 2812 - }, - { - "epoch": 1.3815587131697116, - "grad_norm": 0.4398688206824279, - "learning_rate": 0.0001438949092175344, - "loss": 0.7318, - "step": 2813 - }, - { - "epoch": 1.3820508088823276, - "grad_norm": 0.46987366202392017, - "learning_rate": 0.00014383957439050485, - "loss": 0.7147, - "step": 2814 - }, - { - "epoch": 1.3825429045949438, - "grad_norm": 0.44418643849007833, - "learning_rate": 0.00014378422294140358, - "loss": 0.7462, - "step": 2815 - }, - { - "epoch": 1.3830350003075598, - "grad_norm": 0.4402508658792176, - "learning_rate": 0.00014372885489121744, - "loss": 0.7566, - "step": 2816 - }, - { - "epoch": 1.383527096020176, - "grad_norm": 0.46917452097201523, - "learning_rate": 0.00014367347026093965, - "loss": 0.8237, - "step": 2817 - }, - { - "epoch": 1.384019191732792, - "grad_norm": 0.4308436511988418, - "learning_rate": 0.00014361806907156957, - "loss": 0.7602, - "step": 2818 - }, - { - "epoch": 1.3845112874454082, - "grad_norm": 0.43453500343053675, - "learning_rate": 0.000143562651344113, - "loss": 0.7695, - "step": 2819 - }, - { - "epoch": 1.3850033831580242, - "grad_norm": 0.40580720584426183, - "learning_rate": 0.00014350721709958196, - "loss": 0.7215, - "step": 2820 - }, - { - "epoch": 1.3854954788706404, - "grad_norm": 0.4304814434001006, - "learning_rate": 0.00014345176635899466, - "loss": 0.7581, - "step": 2821 - }, - { - "epoch": 1.3859875745832564, - "grad_norm": 0.43924056411036916, - "learning_rate": 0.00014339629914337571, - "loss": 0.7778, - "step": 2822 - }, - { - "epoch": 1.3864796702958726, - "grad_norm": 0.4318623964899569, - "learning_rate": 0.00014334081547375584, - "loss": 0.773, - "step": 2823 - }, - { - "epoch": 1.3869717660084886, - "grad_norm": 0.415662725821242, - "learning_rate": 0.00014328531537117204, - "loss": 0.7721, - "step": 2824 - }, - { - "epoch": 1.3874638617211048, - "grad_norm": 0.4232390291083941, - "learning_rate": 0.00014322979885666756, - "loss": 0.7137, - "step": 2825 - }, - { - "epoch": 1.3879559574337208, - "grad_norm": 0.43954365193076456, - "learning_rate": 0.0001431742659512919, - "loss": 0.7975, - "step": 2826 - }, - { - "epoch": 1.388448053146337, - "grad_norm": 0.42311931954421095, - "learning_rate": 0.00014311871667610067, - "loss": 0.703, - "step": 2827 - }, - { - "epoch": 1.388940148858953, - "grad_norm": 0.40842248284601124, - "learning_rate": 0.00014306315105215578, - "loss": 0.7509, - "step": 2828 - }, - { - "epoch": 1.3894322445715692, - "grad_norm": 0.417261173584133, - "learning_rate": 0.00014300756910052534, - "loss": 0.6907, - "step": 2829 - }, - { - "epoch": 1.3899243402841852, - "grad_norm": 0.4056303091245692, - "learning_rate": 0.0001429519708422836, - "loss": 0.756, - "step": 2830 - }, - { - "epoch": 1.3904164359968014, - "grad_norm": 0.4087623746617413, - "learning_rate": 0.000142896356298511, - "loss": 0.6662, - "step": 2831 - }, - { - "epoch": 1.3909085317094174, - "grad_norm": 0.41966032125022756, - "learning_rate": 0.00014284072549029423, - "loss": 0.7825, - "step": 2832 - }, - { - "epoch": 1.3914006274220336, - "grad_norm": 0.40466274275782793, - "learning_rate": 0.00014278507843872604, - "loss": 0.7306, - "step": 2833 - }, - { - "epoch": 1.3918927231346496, - "grad_norm": 0.46833029296545736, - "learning_rate": 0.0001427294151649054, - "loss": 0.8599, - "step": 2834 - }, - { - "epoch": 1.3923848188472658, - "grad_norm": 0.4247522298363245, - "learning_rate": 0.00014267373568993742, - "loss": 0.7241, - "step": 2835 - }, - { - "epoch": 1.3928769145598818, - "grad_norm": 0.4720901230149452, - "learning_rate": 0.00014261804003493333, - "loss": 0.7737, - "step": 2836 - }, - { - "epoch": 1.393369010272498, - "grad_norm": 0.47239587469693006, - "learning_rate": 0.00014256232822101053, - "loss": 0.7604, - "step": 2837 - }, - { - "epoch": 1.393861105985114, - "grad_norm": 0.48548609967352985, - "learning_rate": 0.00014250660026929256, - "loss": 0.7775, - "step": 2838 - }, - { - "epoch": 1.3943532016977302, - "grad_norm": 0.4443374125802025, - "learning_rate": 0.00014245085620090902, - "loss": 0.7482, - "step": 2839 - }, - { - "epoch": 1.3948452974103462, - "grad_norm": 0.4111894277871565, - "learning_rate": 0.00014239509603699562, - "loss": 0.7135, - "step": 2840 - }, - { - "epoch": 1.3953373931229625, - "grad_norm": 0.44508601998564107, - "learning_rate": 0.00014233931979869426, - "loss": 0.8062, - "step": 2841 - }, - { - "epoch": 1.3958294888355784, - "grad_norm": 0.43154416020043423, - "learning_rate": 0.00014228352750715286, - "loss": 0.7587, - "step": 2842 - }, - { - "epoch": 1.3963215845481947, - "grad_norm": 0.48828566395571754, - "learning_rate": 0.00014222771918352543, - "loss": 0.6708, - "step": 2843 - }, - { - "epoch": 1.3968136802608107, - "grad_norm": 0.4353818605607434, - "learning_rate": 0.0001421718948489721, - "loss": 0.7412, - "step": 2844 - }, - { - "epoch": 1.3973057759734269, - "grad_norm": 0.42988995848635425, - "learning_rate": 0.000142116054524659, - "loss": 0.6936, - "step": 2845 - }, - { - "epoch": 1.3977978716860429, - "grad_norm": 0.4167451646577703, - "learning_rate": 0.00014206019823175843, - "loss": 0.7439, - "step": 2846 - }, - { - "epoch": 1.398289967398659, - "grad_norm": 0.4194091490877988, - "learning_rate": 0.00014200432599144867, - "loss": 0.7945, - "step": 2847 - }, - { - "epoch": 1.398782063111275, - "grad_norm": 0.40440317420121163, - "learning_rate": 0.00014194843782491402, - "loss": 0.7532, - "step": 2848 - }, - { - "epoch": 1.3992741588238913, - "grad_norm": 0.42397793268354184, - "learning_rate": 0.00014189253375334487, - "loss": 0.6868, - "step": 2849 - }, - { - "epoch": 1.3997662545365073, - "grad_norm": 0.4348167217634709, - "learning_rate": 0.00014183661379793764, - "loss": 0.7858, - "step": 2850 - }, - { - "epoch": 1.4002583502491235, - "grad_norm": 0.39554208001514635, - "learning_rate": 0.00014178067797989473, - "loss": 0.7384, - "step": 2851 - }, - { - "epoch": 1.4007504459617395, - "grad_norm": 0.42910325009244044, - "learning_rate": 0.00014172472632042465, - "loss": 0.7856, - "step": 2852 - }, - { - "epoch": 1.4012425416743557, - "grad_norm": 0.40517337832430766, - "learning_rate": 0.0001416687588407418, - "loss": 0.7217, - "step": 2853 - }, - { - "epoch": 1.401734637386972, - "grad_norm": 0.4348992153207263, - "learning_rate": 0.0001416127755620666, - "loss": 0.7943, - "step": 2854 - }, - { - "epoch": 1.402226733099588, - "grad_norm": 0.4120607437044399, - "learning_rate": 0.00014155677650562554, - "loss": 0.7881, - "step": 2855 - }, - { - "epoch": 1.4027188288122039, - "grad_norm": 0.4208840397746761, - "learning_rate": 0.00014150076169265106, - "loss": 0.7014, - "step": 2856 - }, - { - "epoch": 1.40321092452482, - "grad_norm": 0.44500297874848843, - "learning_rate": 0.0001414447311443815, - "loss": 0.7544, - "step": 2857 - }, - { - "epoch": 1.4037030202374363, - "grad_norm": 0.4518710615369706, - "learning_rate": 0.00014138868488206127, - "loss": 0.6722, - "step": 2858 - }, - { - "epoch": 1.4041951159500523, - "grad_norm": 0.4579671823678311, - "learning_rate": 0.00014133262292694067, - "loss": 0.7084, - "step": 2859 - }, - { - "epoch": 1.4046872116626683, - "grad_norm": 0.46593325559466814, - "learning_rate": 0.00014127654530027596, - "loss": 0.7568, - "step": 2860 - }, - { - "epoch": 1.4051793073752845, - "grad_norm": 0.42065468000896555, - "learning_rate": 0.0001412204520233294, - "loss": 0.7637, - "step": 2861 - }, - { - "epoch": 1.4056714030879007, - "grad_norm": 0.4351068289757589, - "learning_rate": 0.00014116434311736904, - "loss": 0.7539, - "step": 2862 - }, - { - "epoch": 1.4061634988005167, - "grad_norm": 0.4371758917451206, - "learning_rate": 0.00014110821860366903, - "loss": 0.7676, - "step": 2863 - }, - { - "epoch": 1.4066555945131327, - "grad_norm": 0.4219230357329842, - "learning_rate": 0.00014105207850350932, - "loss": 0.838, - "step": 2864 - }, - { - "epoch": 1.407147690225749, - "grad_norm": 0.4216272512315438, - "learning_rate": 0.00014099592283817585, - "loss": 0.7818, - "step": 2865 - }, - { - "epoch": 1.4076397859383651, - "grad_norm": 0.4396059343719567, - "learning_rate": 0.00014093975162896038, - "loss": 0.7467, - "step": 2866 - }, - { - "epoch": 1.4081318816509811, - "grad_norm": 0.42006314652049326, - "learning_rate": 0.00014088356489716064, - "loss": 0.7703, - "step": 2867 - }, - { - "epoch": 1.4086239773635971, - "grad_norm": 0.4224713073924144, - "learning_rate": 0.0001408273626640802, - "loss": 0.7167, - "step": 2868 - }, - { - "epoch": 1.4091160730762133, - "grad_norm": 0.4343128075983513, - "learning_rate": 0.00014077114495102849, - "loss": 0.7962, - "step": 2869 - }, - { - "epoch": 1.4096081687888296, - "grad_norm": 0.40606316906369183, - "learning_rate": 0.0001407149117793209, - "loss": 0.694, - "step": 2870 - }, - { - "epoch": 1.4101002645014455, - "grad_norm": 0.4508075061782764, - "learning_rate": 0.00014065866317027853, - "loss": 0.8228, - "step": 2871 - }, - { - "epoch": 1.4105923602140615, - "grad_norm": 0.4070111815672579, - "learning_rate": 0.0001406023991452285, - "loss": 0.7674, - "step": 2872 - }, - { - "epoch": 1.4110844559266777, - "grad_norm": 0.4171067618610642, - "learning_rate": 0.00014054611972550365, - "loss": 0.7455, - "step": 2873 - }, - { - "epoch": 1.411576551639294, - "grad_norm": 0.43382133531872263, - "learning_rate": 0.00014048982493244268, - "loss": 0.7755, - "step": 2874 - }, - { - "epoch": 1.41206864735191, - "grad_norm": 0.4363850746655965, - "learning_rate": 0.00014043351478739024, - "loss": 0.743, - "step": 2875 - }, - { - "epoch": 1.412560743064526, - "grad_norm": 0.4360042913510568, - "learning_rate": 0.00014037718931169662, - "loss": 0.7641, - "step": 2876 - }, - { - "epoch": 1.4130528387771422, - "grad_norm": 0.45730795402467167, - "learning_rate": 0.00014032084852671803, - "loss": 0.7505, - "step": 2877 - }, - { - "epoch": 1.4135449344897584, - "grad_norm": 0.4022597947003752, - "learning_rate": 0.00014026449245381647, - "loss": 0.6883, - "step": 2878 - }, - { - "epoch": 1.4140370302023744, - "grad_norm": 0.44552551333264356, - "learning_rate": 0.0001402081211143597, - "loss": 0.717, - "step": 2879 - }, - { - "epoch": 1.4145291259149904, - "grad_norm": 0.47377949637491557, - "learning_rate": 0.0001401517345297213, - "loss": 0.7866, - "step": 2880 - }, - { - "epoch": 1.4150212216276066, - "grad_norm": 0.4371521549322124, - "learning_rate": 0.00014009533272128067, - "loss": 0.7803, - "step": 2881 - }, - { - "epoch": 1.4155133173402228, - "grad_norm": 0.4200894722489111, - "learning_rate": 0.0001400389157104229, - "loss": 0.6691, - "step": 2882 - }, - { - "epoch": 1.4160054130528388, - "grad_norm": 0.4165235110057634, - "learning_rate": 0.0001399824835185389, - "loss": 0.746, - "step": 2883 - }, - { - "epoch": 1.4164975087654548, - "grad_norm": 0.431662953834368, - "learning_rate": 0.00013992603616702525, - "loss": 0.6527, - "step": 2884 - }, - { - "epoch": 1.416989604478071, - "grad_norm": 0.43850081677439445, - "learning_rate": 0.00013986957367728445, - "loss": 0.8324, - "step": 2885 - }, - { - "epoch": 1.4174817001906872, - "grad_norm": 0.4187732141177877, - "learning_rate": 0.0001398130960707246, - "loss": 0.7149, - "step": 2886 - }, - { - "epoch": 1.4179737959033032, - "grad_norm": 0.40634513468351635, - "learning_rate": 0.0001397566033687595, - "loss": 0.6807, - "step": 2887 - }, - { - "epoch": 1.4184658916159192, - "grad_norm": 0.4583126497511633, - "learning_rate": 0.00013970009559280882, - "loss": 0.7505, - "step": 2888 - }, - { - "epoch": 1.4189579873285354, - "grad_norm": 0.4891573664815128, - "learning_rate": 0.00013964357276429788, - "loss": 0.7477, - "step": 2889 - }, - { - "epoch": 1.4194500830411516, - "grad_norm": 0.4137821213859619, - "learning_rate": 0.00013958703490465758, - "loss": 0.7773, - "step": 2890 - }, - { - "epoch": 1.4199421787537676, - "grad_norm": 0.4460968509297009, - "learning_rate": 0.00013953048203532476, - "loss": 0.6913, - "step": 2891 - }, - { - "epoch": 1.4204342744663836, - "grad_norm": 0.4005465440971991, - "learning_rate": 0.00013947391417774176, - "loss": 0.7472, - "step": 2892 - }, - { - "epoch": 1.4209263701789998, - "grad_norm": 0.44251907002578933, - "learning_rate": 0.0001394173313533567, - "loss": 0.7393, - "step": 2893 - }, - { - "epoch": 1.421418465891616, - "grad_norm": 0.41224985680157783, - "learning_rate": 0.00013936073358362328, - "loss": 0.7421, - "step": 2894 - }, - { - "epoch": 1.421910561604232, - "grad_norm": 0.42618771413527917, - "learning_rate": 0.00013930412089000098, - "loss": 0.7587, - "step": 2895 - }, - { - "epoch": 1.422402657316848, - "grad_norm": 0.45540669872859213, - "learning_rate": 0.00013924749329395487, - "loss": 0.8174, - "step": 2896 - }, - { - "epoch": 1.4228947530294642, - "grad_norm": 0.4359599281544839, - "learning_rate": 0.0001391908508169557, - "loss": 0.7311, - "step": 2897 - }, - { - "epoch": 1.4233868487420804, - "grad_norm": 0.43010465875363985, - "learning_rate": 0.00013913419348047983, - "loss": 0.7768, - "step": 2898 - }, - { - "epoch": 1.4238789444546964, - "grad_norm": 0.435110098412238, - "learning_rate": 0.0001390775213060093, - "loss": 0.7449, - "step": 2899 - }, - { - "epoch": 1.4243710401673124, - "grad_norm": 0.40177833018764403, - "learning_rate": 0.0001390208343150317, - "loss": 0.7227, - "step": 2900 - }, - { - "epoch": 1.4248631358799286, - "grad_norm": 0.8512405579703662, - "learning_rate": 0.00013896413252904036, - "loss": 0.7107, - "step": 2901 - }, - { - "epoch": 1.4253552315925448, - "grad_norm": 0.403879588650031, - "learning_rate": 0.00013890741596953406, - "loss": 0.712, - "step": 2902 - }, - { - "epoch": 1.4258473273051608, - "grad_norm": 0.3964710248800138, - "learning_rate": 0.00013885068465801733, - "loss": 0.6513, - "step": 2903 - }, - { - "epoch": 1.4263394230177768, - "grad_norm": 0.4318649470861009, - "learning_rate": 0.00013879393861600023, - "loss": 0.7263, - "step": 2904 - }, - { - "epoch": 1.426831518730393, - "grad_norm": 0.442240871740813, - "learning_rate": 0.0001387371778649984, - "loss": 0.7094, - "step": 2905 - }, - { - "epoch": 1.4273236144430093, - "grad_norm": 0.4552714815809044, - "learning_rate": 0.000138680402426533, - "loss": 0.7924, - "step": 2906 - }, - { - "epoch": 1.4278157101556253, - "grad_norm": 0.481478122154512, - "learning_rate": 0.00013862361232213095, - "loss": 0.7331, - "step": 2907 - }, - { - "epoch": 1.4283078058682412, - "grad_norm": 0.42710383729499984, - "learning_rate": 0.0001385668075733245, - "loss": 0.7276, - "step": 2908 - }, - { - "epoch": 1.4287999015808575, - "grad_norm": 0.4280297934009003, - "learning_rate": 0.00013850998820165157, - "loss": 0.7365, - "step": 2909 - }, - { - "epoch": 1.4292919972934737, - "grad_norm": 0.46622341113853055, - "learning_rate": 0.00013845315422865561, - "loss": 0.8161, - "step": 2910 - }, - { - "epoch": 1.4297840930060897, - "grad_norm": 0.46592536759745623, - "learning_rate": 0.00013839630567588564, - "loss": 0.7298, - "step": 2911 - }, - { - "epoch": 1.4302761887187057, - "grad_norm": 0.49640539779225473, - "learning_rate": 0.00013833944256489615, - "loss": 0.7563, - "step": 2912 - }, - { - "epoch": 1.4307682844313219, - "grad_norm": 0.45185106365996336, - "learning_rate": 0.0001382825649172472, - "loss": 0.8016, - "step": 2913 - }, - { - "epoch": 1.431260380143938, - "grad_norm": 0.4428266892150101, - "learning_rate": 0.00013822567275450427, - "loss": 0.794, - "step": 2914 - }, - { - "epoch": 1.431752475856554, - "grad_norm": 0.45759508801690096, - "learning_rate": 0.00013816876609823849, - "loss": 0.765, - "step": 2915 - }, - { - "epoch": 1.43224457156917, - "grad_norm": 0.544503508687506, - "learning_rate": 0.00013811184497002635, - "loss": 0.7612, - "step": 2916 - }, - { - "epoch": 1.4327366672817863, - "grad_norm": 0.42121930764428955, - "learning_rate": 0.00013805490939144992, - "loss": 0.7171, - "step": 2917 - }, - { - "epoch": 1.4332287629944025, - "grad_norm": 0.4381253918038311, - "learning_rate": 0.0001379979593840967, - "loss": 0.7915, - "step": 2918 - }, - { - "epoch": 1.4337208587070185, - "grad_norm": 0.4145705065574621, - "learning_rate": 0.0001379409949695596, - "loss": 0.7264, - "step": 2919 - }, - { - "epoch": 1.4342129544196347, - "grad_norm": 0.42024556353558573, - "learning_rate": 0.00013788401616943716, - "loss": 0.6993, - "step": 2920 - }, - { - "epoch": 1.4347050501322507, - "grad_norm": 0.43110288012934284, - "learning_rate": 0.00013782702300533324, - "loss": 0.7126, - "step": 2921 - }, - { - "epoch": 1.435197145844867, - "grad_norm": 0.4349493312847473, - "learning_rate": 0.0001377700154988572, - "loss": 0.7444, - "step": 2922 - }, - { - "epoch": 1.435689241557483, - "grad_norm": 0.41663983120576736, - "learning_rate": 0.00013771299367162378, - "loss": 0.7253, - "step": 2923 - }, - { - "epoch": 1.4361813372700991, - "grad_norm": 0.4303844427833147, - "learning_rate": 0.00013765595754525325, - "loss": 0.7297, - "step": 2924 - }, - { - "epoch": 1.436673432982715, - "grad_norm": 0.48581012699068177, - "learning_rate": 0.0001375989071413712, - "loss": 0.7889, - "step": 2925 - }, - { - "epoch": 1.4371655286953313, - "grad_norm": 0.4284137641486597, - "learning_rate": 0.00013754184248160868, - "loss": 0.7166, - "step": 2926 - }, - { - "epoch": 1.4376576244079473, - "grad_norm": 0.7831283240149731, - "learning_rate": 0.0001374847635876022, - "loss": 0.8392, - "step": 2927 - }, - { - "epoch": 1.4381497201205635, - "grad_norm": 0.4580485811164869, - "learning_rate": 0.00013742767048099353, - "loss": 0.7546, - "step": 2928 - }, - { - "epoch": 1.4386418158331795, - "grad_norm": 0.42604106634937855, - "learning_rate": 0.00013737056318342995, - "loss": 0.7532, - "step": 2929 - }, - { - "epoch": 1.4391339115457957, - "grad_norm": 0.4308574750819755, - "learning_rate": 0.0001373134417165641, - "loss": 0.6814, - "step": 2930 - }, - { - "epoch": 1.4396260072584117, - "grad_norm": 0.5332424690162715, - "learning_rate": 0.0001372563061020539, - "loss": 0.8342, - "step": 2931 - }, - { - "epoch": 1.440118102971028, - "grad_norm": 0.46080850919005695, - "learning_rate": 0.00013719915636156276, - "loss": 0.7292, - "step": 2932 - }, - { - "epoch": 1.440610198683644, - "grad_norm": 0.4261657382522858, - "learning_rate": 0.0001371419925167594, - "loss": 0.7263, - "step": 2933 - }, - { - "epoch": 1.4411022943962601, - "grad_norm": 0.4248726170760586, - "learning_rate": 0.00013708481458931784, - "loss": 0.7546, - "step": 2934 - }, - { - "epoch": 1.4415943901088761, - "grad_norm": 0.4410965585873861, - "learning_rate": 0.0001370276226009175, - "loss": 0.8106, - "step": 2935 - }, - { - "epoch": 1.4420864858214923, - "grad_norm": 0.45333982658390326, - "learning_rate": 0.0001369704165732431, - "loss": 0.758, - "step": 2936 - }, - { - "epoch": 1.4425785815341083, - "grad_norm": 0.39423850287590273, - "learning_rate": 0.0001369131965279847, - "loss": 0.6921, - "step": 2937 - }, - { - "epoch": 1.4430706772467246, - "grad_norm": 0.44768422955250525, - "learning_rate": 0.00013685596248683772, - "loss": 0.7737, - "step": 2938 - }, - { - "epoch": 1.4435627729593405, - "grad_norm": 0.42327666978599154, - "learning_rate": 0.00013679871447150275, - "loss": 0.7247, - "step": 2939 - }, - { - "epoch": 1.4440548686719568, - "grad_norm": 0.43022591934590015, - "learning_rate": 0.0001367414525036858, - "loss": 0.7282, - "step": 2940 - }, - { - "epoch": 1.4445469643845728, - "grad_norm": 0.5328276382722564, - "learning_rate": 0.00013668417660509812, - "loss": 0.86, - "step": 2941 - }, - { - "epoch": 1.445039060097189, - "grad_norm": 0.42019164310891927, - "learning_rate": 0.00013662688679745626, - "loss": 0.7572, - "step": 2942 - }, - { - "epoch": 1.445531155809805, - "grad_norm": 0.45317888939026624, - "learning_rate": 0.00013656958310248206, - "loss": 0.7606, - "step": 2943 - }, - { - "epoch": 1.4460232515224212, - "grad_norm": 0.4159749206257165, - "learning_rate": 0.00013651226554190258, - "loss": 0.7374, - "step": 2944 - }, - { - "epoch": 1.4465153472350372, - "grad_norm": 0.4167892666138694, - "learning_rate": 0.00013645493413745016, - "loss": 0.7534, - "step": 2945 - }, - { - "epoch": 1.4470074429476534, - "grad_norm": 0.4180296030672964, - "learning_rate": 0.0001363975889108624, - "loss": 0.683, - "step": 2946 - }, - { - "epoch": 1.4474995386602694, - "grad_norm": 0.4097577876076059, - "learning_rate": 0.00013634022988388215, - "loss": 0.713, - "step": 2947 - }, - { - "epoch": 1.4479916343728856, - "grad_norm": 0.3988887096595805, - "learning_rate": 0.00013628285707825745, - "loss": 0.696, - "step": 2948 - }, - { - "epoch": 1.4484837300855016, - "grad_norm": 0.43352472144513926, - "learning_rate": 0.00013622547051574157, - "loss": 0.7573, - "step": 2949 - }, - { - "epoch": 1.4489758257981178, - "grad_norm": 0.43484806320664127, - "learning_rate": 0.00013616807021809305, - "loss": 0.7093, - "step": 2950 - }, - { - "epoch": 1.4494679215107338, - "grad_norm": 0.3975537927628502, - "learning_rate": 0.00013611065620707563, - "loss": 0.6672, - "step": 2951 - }, - { - "epoch": 1.44996001722335, - "grad_norm": 0.4149696034740369, - "learning_rate": 0.00013605322850445813, - "loss": 0.7438, - "step": 2952 - }, - { - "epoch": 1.450452112935966, - "grad_norm": 0.4096247946570659, - "learning_rate": 0.00013599578713201473, - "loss": 0.7311, - "step": 2953 - }, - { - "epoch": 1.4509442086485822, - "grad_norm": 0.45218011553335336, - "learning_rate": 0.0001359383321115247, - "loss": 0.8454, - "step": 2954 - }, - { - "epoch": 1.4514363043611982, - "grad_norm": 0.41679761193630593, - "learning_rate": 0.00013588086346477245, - "loss": 0.7589, - "step": 2955 - }, - { - "epoch": 1.4519284000738144, - "grad_norm": 0.42096420998370043, - "learning_rate": 0.00013582338121354768, - "loss": 0.8439, - "step": 2956 - }, - { - "epoch": 1.4524204957864304, - "grad_norm": 0.4213610590768471, - "learning_rate": 0.00013576588537964513, - "loss": 0.771, - "step": 2957 - }, - { - "epoch": 1.4529125914990466, - "grad_norm": 0.4346284009249143, - "learning_rate": 0.00013570837598486475, - "loss": 0.8156, - "step": 2958 - }, - { - "epoch": 1.4534046872116626, - "grad_norm": 0.42241315752609315, - "learning_rate": 0.0001356508530510116, - "loss": 0.7452, - "step": 2959 - }, - { - "epoch": 1.4538967829242788, - "grad_norm": 0.42923311696960353, - "learning_rate": 0.0001355933165998959, - "loss": 0.7415, - "step": 2960 - }, - { - "epoch": 1.4543888786368948, - "grad_norm": 0.4669393188409626, - "learning_rate": 0.00013553576665333302, - "loss": 0.8085, - "step": 2961 - }, - { - "epoch": 1.454880974349511, - "grad_norm": 0.47428268942239843, - "learning_rate": 0.00013547820323314336, - "loss": 0.7517, - "step": 2962 - }, - { - "epoch": 1.455373070062127, - "grad_norm": 0.4167283198912779, - "learning_rate": 0.0001354206263611525, - "loss": 0.6868, - "step": 2963 - }, - { - "epoch": 1.4558651657747432, - "grad_norm": 0.41549655527914137, - "learning_rate": 0.0001353630360591911, - "loss": 0.7469, - "step": 2964 - }, - { - "epoch": 1.4563572614873592, - "grad_norm": 0.3832641047494742, - "learning_rate": 0.00013530543234909493, - "loss": 0.7121, - "step": 2965 - }, - { - "epoch": 1.4568493571999754, - "grad_norm": 0.4294927014503728, - "learning_rate": 0.0001352478152527048, - "loss": 0.7719, - "step": 2966 - }, - { - "epoch": 1.4573414529125914, - "grad_norm": 2.7573070317840536, - "learning_rate": 0.00013519018479186666, - "loss": 0.7394, - "step": 2967 - }, - { - "epoch": 1.4578335486252076, - "grad_norm": 0.4299412409158559, - "learning_rate": 0.00013513254098843143, - "loss": 0.7342, - "step": 2968 - }, - { - "epoch": 1.4583256443378236, - "grad_norm": 0.43663358819270803, - "learning_rate": 0.00013507488386425522, - "loss": 0.7065, - "step": 2969 - }, - { - "epoch": 1.4588177400504398, - "grad_norm": 0.41952467955875783, - "learning_rate": 0.00013501721344119907, - "loss": 0.7074, - "step": 2970 - }, - { - "epoch": 1.4593098357630558, - "grad_norm": 0.4018966441407418, - "learning_rate": 0.00013495952974112914, - "loss": 0.7468, - "step": 2971 - }, - { - "epoch": 1.459801931475672, - "grad_norm": 0.4460412406846254, - "learning_rate": 0.00013490183278591652, - "loss": 0.7875, - "step": 2972 - }, - { - "epoch": 1.460294027188288, - "grad_norm": 0.3995218655936246, - "learning_rate": 0.00013484412259743753, - "loss": 0.7061, - "step": 2973 - }, - { - "epoch": 1.4607861229009043, - "grad_norm": 0.3902152859646245, - "learning_rate": 0.0001347863991975733, - "loss": 0.7033, - "step": 2974 - }, - { - "epoch": 1.4612782186135203, - "grad_norm": 0.45560542563149636, - "learning_rate": 0.00013472866260821006, - "loss": 0.7665, - "step": 2975 - }, - { - "epoch": 1.4617703143261365, - "grad_norm": 1.0765117525058105, - "learning_rate": 0.00013467091285123903, - "loss": 0.7584, - "step": 2976 - }, - { - "epoch": 1.4622624100387525, - "grad_norm": 0.588424029730406, - "learning_rate": 0.0001346131499485564, - "loss": 0.7788, - "step": 2977 - }, - { - "epoch": 1.4627545057513687, - "grad_norm": 0.42299824952303533, - "learning_rate": 0.00013455537392206339, - "loss": 0.7062, - "step": 2978 - }, - { - "epoch": 1.4632466014639847, - "grad_norm": 0.42037663156928856, - "learning_rate": 0.00013449758479366618, - "loss": 0.7981, - "step": 2979 - }, - { - "epoch": 1.4637386971766009, - "grad_norm": 0.46299269963475975, - "learning_rate": 0.0001344397825852759, - "loss": 0.7638, - "step": 2980 - }, - { - "epoch": 1.4642307928892169, - "grad_norm": 0.39361158967557547, - "learning_rate": 0.00013438196731880867, - "loss": 0.7406, - "step": 2981 - }, - { - "epoch": 1.464722888601833, - "grad_norm": 0.42228860720601463, - "learning_rate": 0.00013432413901618548, - "loss": 0.7183, - "step": 2982 - }, - { - "epoch": 1.465214984314449, - "grad_norm": 0.42235631862864903, - "learning_rate": 0.00013426629769933238, - "loss": 0.7768, - "step": 2983 - }, - { - "epoch": 1.4657070800270653, - "grad_norm": 0.43676713475216566, - "learning_rate": 0.00013420844339018028, - "loss": 0.6861, - "step": 2984 - }, - { - "epoch": 1.4661991757396815, - "grad_norm": 0.4250906194418104, - "learning_rate": 0.00013415057611066504, - "loss": 0.6991, - "step": 2985 - }, - { - "epoch": 1.4666912714522975, - "grad_norm": 0.4800896798565744, - "learning_rate": 0.0001340926958827274, - "loss": 0.7703, - "step": 2986 - }, - { - "epoch": 1.4671833671649135, - "grad_norm": 0.4387660488504194, - "learning_rate": 0.0001340348027283131, - "loss": 0.7642, - "step": 2987 - }, - { - "epoch": 1.4676754628775297, - "grad_norm": 0.4185327189884391, - "learning_rate": 0.00013397689666937266, - "loss": 0.7397, - "step": 2988 - }, - { - "epoch": 1.468167558590146, - "grad_norm": 0.40837776294476474, - "learning_rate": 0.00013391897772786158, - "loss": 0.7363, - "step": 2989 - }, - { - "epoch": 1.468659654302762, - "grad_norm": 0.4043044621180528, - "learning_rate": 0.00013386104592574022, - "loss": 0.7346, - "step": 2990 - }, - { - "epoch": 1.469151750015378, - "grad_norm": 0.41509835595502276, - "learning_rate": 0.0001338031012849738, - "loss": 0.7581, - "step": 2991 - }, - { - "epoch": 1.4696438457279941, - "grad_norm": 0.40084265083532755, - "learning_rate": 0.00013374514382753246, - "loss": 0.7346, - "step": 2992 - }, - { - "epoch": 1.4701359414406103, - "grad_norm": 0.4364298452244065, - "learning_rate": 0.00013368717357539116, - "loss": 0.6604, - "step": 2993 - }, - { - "epoch": 1.4706280371532263, - "grad_norm": 0.4305777408560004, - "learning_rate": 0.00013362919055052966, - "loss": 0.6825, - "step": 2994 - }, - { - "epoch": 1.4711201328658423, - "grad_norm": 0.4096676372200456, - "learning_rate": 0.00013357119477493265, - "loss": 0.7291, - "step": 2995 - }, - { - "epoch": 1.4716122285784585, - "grad_norm": 0.4398855482135969, - "learning_rate": 0.00013351318627058964, - "loss": 0.7369, - "step": 2996 - }, - { - "epoch": 1.4721043242910747, - "grad_norm": 0.42490468698913875, - "learning_rate": 0.00013345516505949492, - "loss": 0.7583, - "step": 2997 - }, - { - "epoch": 1.4725964200036907, - "grad_norm": 0.4521087201321453, - "learning_rate": 0.00013339713116364768, - "loss": 0.731, - "step": 2998 - }, - { - "epoch": 1.4730885157163067, - "grad_norm": 0.4558298349166806, - "learning_rate": 0.00013333908460505178, - "loss": 0.7345, - "step": 2999 - }, - { - "epoch": 1.473580611428923, - "grad_norm": 0.43819505834952743, - "learning_rate": 0.000133281025405716, - "loss": 0.7375, - "step": 3000 - }, - { - "epoch": 1.4740727071415392, - "grad_norm": 0.41865292773313334, - "learning_rate": 0.0001332229535876539, - "loss": 0.6631, - "step": 3001 - }, - { - "epoch": 1.4745648028541551, - "grad_norm": 0.4120333157523723, - "learning_rate": 0.00013316486917288377, - "loss": 0.6795, - "step": 3002 - }, - { - "epoch": 1.4750568985667711, - "grad_norm": 0.4021473198346083, - "learning_rate": 0.00013310677218342874, - "loss": 0.6411, - "step": 3003 - }, - { - "epoch": 1.4755489942793873, - "grad_norm": 0.4728250527841678, - "learning_rate": 0.00013304866264131669, - "loss": 0.7827, - "step": 3004 - }, - { - "epoch": 1.4760410899920036, - "grad_norm": 0.44633656756790946, - "learning_rate": 0.0001329905405685802, - "loss": 0.7199, - "step": 3005 - }, - { - "epoch": 1.4765331857046196, - "grad_norm": 0.4948855099340994, - "learning_rate": 0.00013293240598725666, - "loss": 0.7533, - "step": 3006 - }, - { - "epoch": 1.4770252814172355, - "grad_norm": 0.4162448508051315, - "learning_rate": 0.00013287425891938824, - "loss": 0.7467, - "step": 3007 - }, - { - "epoch": 1.4775173771298518, - "grad_norm": 0.4543701745347057, - "learning_rate": 0.00013281609938702173, - "loss": 0.7436, - "step": 3008 - }, - { - "epoch": 1.478009472842468, - "grad_norm": 0.457882069485982, - "learning_rate": 0.00013275792741220875, - "loss": 0.7581, - "step": 3009 - }, - { - "epoch": 1.478501568555084, - "grad_norm": 0.39603707426970053, - "learning_rate": 0.00013269974301700557, - "loss": 0.7293, - "step": 3010 - }, - { - "epoch": 1.4789936642677, - "grad_norm": 0.43248034847450656, - "learning_rate": 0.0001326415462234732, - "loss": 0.7342, - "step": 3011 - }, - { - "epoch": 1.4794857599803162, - "grad_norm": 0.46342484194924166, - "learning_rate": 0.0001325833370536774, - "loss": 0.769, - "step": 3012 - }, - { - "epoch": 1.4799778556929324, - "grad_norm": 0.4066229020918127, - "learning_rate": 0.00013252511552968853, - "loss": 0.719, - "step": 3013 - }, - { - "epoch": 1.4804699514055484, - "grad_norm": 0.41616605393800105, - "learning_rate": 0.00013246688167358164, - "loss": 0.7519, - "step": 3014 - }, - { - "epoch": 1.4809620471181644, - "grad_norm": 0.4219271358945579, - "learning_rate": 0.00013240863550743653, - "loss": 0.7224, - "step": 3015 - }, - { - "epoch": 1.4814541428307806, - "grad_norm": 0.36729337510638593, - "learning_rate": 0.00013235037705333765, - "loss": 0.716, - "step": 3016 - }, - { - "epoch": 1.4819462385433968, - "grad_norm": 0.4215228606656191, - "learning_rate": 0.00013229210633337404, - "loss": 0.741, - "step": 3017 - }, - { - "epoch": 1.4824383342560128, - "grad_norm": 0.39460507590391153, - "learning_rate": 0.00013223382336963952, - "loss": 0.7464, - "step": 3018 - }, - { - "epoch": 1.4829304299686288, - "grad_norm": 0.39184879004971424, - "learning_rate": 0.00013217552818423238, - "loss": 0.7545, - "step": 3019 - }, - { - "epoch": 1.483422525681245, - "grad_norm": 0.41510153203765787, - "learning_rate": 0.00013211722079925568, - "loss": 0.7773, - "step": 3020 - }, - { - "epoch": 1.4839146213938612, - "grad_norm": 0.4255275634596563, - "learning_rate": 0.00013205890123681706, - "loss": 0.7504, - "step": 3021 - }, - { - "epoch": 1.4844067171064772, - "grad_norm": 0.466119135799236, - "learning_rate": 0.00013200056951902876, - "loss": 0.7264, - "step": 3022 - }, - { - "epoch": 1.4848988128190932, - "grad_norm": 0.42155715395882765, - "learning_rate": 0.00013194222566800766, - "loss": 0.758, - "step": 3023 - }, - { - "epoch": 1.4853909085317094, - "grad_norm": 0.4180660786457265, - "learning_rate": 0.00013188386970587517, - "loss": 0.7597, - "step": 3024 - }, - { - "epoch": 1.4858830042443256, - "grad_norm": 0.4257204204565261, - "learning_rate": 0.00013182550165475744, - "loss": 0.7086, - "step": 3025 - }, - { - "epoch": 1.4863750999569416, - "grad_norm": 0.43132732910529703, - "learning_rate": 0.00013176712153678509, - "loss": 0.7341, - "step": 3026 - }, - { - "epoch": 1.4868671956695576, - "grad_norm": 0.42438460081333756, - "learning_rate": 0.00013170872937409326, - "loss": 0.7845, - "step": 3027 - }, - { - "epoch": 1.4873592913821738, - "grad_norm": 0.44307121545989486, - "learning_rate": 0.00013165032518882184, - "loss": 0.782, - "step": 3028 - }, - { - "epoch": 1.48785138709479, - "grad_norm": 0.4044677607381762, - "learning_rate": 0.00013159190900311512, - "loss": 0.6836, - "step": 3029 - }, - { - "epoch": 1.488343482807406, - "grad_norm": 0.3973183220334357, - "learning_rate": 0.000131533480839122, - "loss": 0.6734, - "step": 3030 - }, - { - "epoch": 1.488835578520022, - "grad_norm": 0.40780023312514757, - "learning_rate": 0.00013147504071899587, - "loss": 0.727, - "step": 3031 - }, - { - "epoch": 1.4893276742326382, - "grad_norm": 0.46018741993093354, - "learning_rate": 0.00013141658866489477, - "loss": 0.8059, - "step": 3032 - }, - { - "epoch": 1.4898197699452544, - "grad_norm": 0.42135487675244954, - "learning_rate": 0.00013135812469898116, - "loss": 0.694, - "step": 3033 - }, - { - "epoch": 1.4903118656578704, - "grad_norm": 0.4068258129416411, - "learning_rate": 0.00013129964884342206, - "loss": 0.6834, - "step": 3034 - }, - { - "epoch": 1.4908039613704864, - "grad_norm": 0.4095911790891875, - "learning_rate": 0.00013124116112038896, - "loss": 0.6842, - "step": 3035 - }, - { - "epoch": 1.4912960570831026, - "grad_norm": 0.38707934469090177, - "learning_rate": 0.0001311826615520579, - "loss": 0.6721, - "step": 3036 - }, - { - "epoch": 1.4917881527957189, - "grad_norm": 0.4227456592517028, - "learning_rate": 0.00013112415016060938, - "loss": 0.6808, - "step": 3037 - }, - { - "epoch": 1.4922802485083349, - "grad_norm": 0.40541024584666174, - "learning_rate": 0.0001310656269682284, - "loss": 0.6862, - "step": 3038 - }, - { - "epoch": 1.4927723442209508, - "grad_norm": 0.3940813292989956, - "learning_rate": 0.00013100709199710442, - "loss": 0.7146, - "step": 3039 - }, - { - "epoch": 1.493264439933567, - "grad_norm": 0.42566679643736666, - "learning_rate": 0.00013094854526943134, - "loss": 0.796, - "step": 3040 - }, - { - "epoch": 1.4937565356461833, - "grad_norm": 0.46101186943081, - "learning_rate": 0.0001308899868074076, - "loss": 0.761, - "step": 3041 - }, - { - "epoch": 1.4942486313587993, - "grad_norm": 0.4181233516858649, - "learning_rate": 0.00013083141663323603, - "loss": 0.6625, - "step": 3042 - }, - { - "epoch": 1.4947407270714153, - "grad_norm": 0.41896694413954866, - "learning_rate": 0.0001307728347691239, - "loss": 0.8141, - "step": 3043 - }, - { - "epoch": 1.4952328227840315, - "grad_norm": 0.4091474992052696, - "learning_rate": 0.0001307142412372829, - "loss": 0.7523, - "step": 3044 - }, - { - "epoch": 1.4957249184966477, - "grad_norm": 0.4365804852716206, - "learning_rate": 0.00013065563605992918, - "loss": 0.7722, - "step": 3045 - }, - { - "epoch": 1.4962170142092637, - "grad_norm": 0.45798639122756435, - "learning_rate": 0.00013059701925928328, - "loss": 0.7476, - "step": 3046 - }, - { - "epoch": 1.4967091099218799, - "grad_norm": 0.39903783066877924, - "learning_rate": 0.00013053839085757013, - "loss": 0.6797, - "step": 3047 - }, - { - "epoch": 1.4972012056344959, - "grad_norm": 0.41403278268558885, - "learning_rate": 0.00013047975087701917, - "loss": 0.7381, - "step": 3048 - }, - { - "epoch": 1.497693301347112, - "grad_norm": 0.41042336037569394, - "learning_rate": 0.0001304210993398641, - "loss": 0.7862, - "step": 3049 - }, - { - "epoch": 1.498185397059728, - "grad_norm": 0.39440513575421626, - "learning_rate": 0.000130362436268343, - "loss": 0.7004, - "step": 3050 - }, - { - "epoch": 1.4986774927723443, - "grad_norm": 0.40701229150870666, - "learning_rate": 0.00013030376168469848, - "loss": 0.6817, - "step": 3051 - }, - { - "epoch": 1.4986774927723443, - "eval_loss": 0.8062564730644226, - "eval_runtime": 6668.7735, - "eval_samples_per_second": 4.273, - "eval_steps_per_second": 2.137, - "step": 3051 - }, - { - "epoch": 1.4991695884849603, - "grad_norm": 0.4012959059324339, - "learning_rate": 0.0001302450756111773, - "loss": 0.697, - "step": 3052 - }, - { - "epoch": 1.4996616841975765, - "grad_norm": 0.42031391099745824, - "learning_rate": 0.00013018637807003078, - "loss": 0.75, - "step": 3053 - }, - { - "epoch": 1.5001537799101925, - "grad_norm": 0.3785034515960198, - "learning_rate": 0.00013012766908351438, - "loss": 0.7043, - "step": 3054 - }, - { - "epoch": 1.5006458756228085, - "grad_norm": 0.4330691443923283, - "learning_rate": 0.0001300689486738881, - "loss": 0.8017, - "step": 3055 - }, - { - "epoch": 1.5011379713354247, - "grad_norm": 0.39835898188657426, - "learning_rate": 0.00013001021686341615, - "loss": 0.7541, - "step": 3056 - }, - { - "epoch": 1.501630067048041, - "grad_norm": 0.4766175640336095, - "learning_rate": 0.00012995147367436704, - "loss": 0.7813, - "step": 3057 - }, - { - "epoch": 1.502122162760657, - "grad_norm": 0.41451668275342507, - "learning_rate": 0.00012989271912901374, - "loss": 0.7684, - "step": 3058 - }, - { - "epoch": 1.502614258473273, - "grad_norm": 0.4153085818390717, - "learning_rate": 0.0001298339532496334, - "loss": 0.7648, - "step": 3059 - }, - { - "epoch": 1.5031063541858891, - "grad_norm": 0.40939606435324155, - "learning_rate": 0.00012977517605850745, - "loss": 0.7622, - "step": 3060 - }, - { - "epoch": 1.5035984498985053, - "grad_norm": 0.4097088626024703, - "learning_rate": 0.00012971638757792176, - "loss": 0.7138, - "step": 3061 - }, - { - "epoch": 1.5040905456111213, - "grad_norm": 0.40471514832795313, - "learning_rate": 0.00012965758783016633, - "loss": 0.7048, - "step": 3062 - }, - { - "epoch": 1.5045826413237373, - "grad_norm": 0.42223220004450407, - "learning_rate": 0.00012959877683753544, - "loss": 0.7556, - "step": 3063 - }, - { - "epoch": 1.5050747370363535, - "grad_norm": 0.43361994525438585, - "learning_rate": 0.00012953995462232771, - "loss": 0.7584, - "step": 3064 - }, - { - "epoch": 1.5055668327489697, - "grad_norm": 0.4123106850196711, - "learning_rate": 0.00012948112120684602, - "loss": 0.7816, - "step": 3065 - }, - { - "epoch": 1.5060589284615857, - "grad_norm": 0.4254681923207696, - "learning_rate": 0.00012942227661339744, - "loss": 0.7001, - "step": 3066 - }, - { - "epoch": 1.5065510241742017, - "grad_norm": 0.4118680636606478, - "learning_rate": 0.00012936342086429325, - "loss": 0.752, - "step": 3067 - }, - { - "epoch": 1.507043119886818, - "grad_norm": 0.42605811058729165, - "learning_rate": 0.00012930455398184904, - "loss": 0.7462, - "step": 3068 - }, - { - "epoch": 1.5075352155994342, - "grad_norm": 0.44241327862069835, - "learning_rate": 0.0001292456759883846, - "loss": 0.7251, - "step": 3069 - }, - { - "epoch": 1.5080273113120501, - "grad_norm": 0.41674748185876725, - "learning_rate": 0.00012918678690622388, - "loss": 0.8003, - "step": 3070 - }, - { - "epoch": 1.5085194070246661, - "grad_norm": 0.47726405246653525, - "learning_rate": 0.00012912788675769512, - "loss": 0.7392, - "step": 3071 - }, - { - "epoch": 1.5090115027372824, - "grad_norm": 0.4012505019853829, - "learning_rate": 0.0001290689755651307, - "loss": 0.6901, - "step": 3072 - }, - { - "epoch": 1.5095035984498986, - "grad_norm": 0.4152165138620225, - "learning_rate": 0.00012901005335086717, - "loss": 0.7833, - "step": 3073 - }, - { - "epoch": 1.5099956941625146, - "grad_norm": 0.3854227271538217, - "learning_rate": 0.00012895112013724532, - "loss": 0.6685, - "step": 3074 - }, - { - "epoch": 1.5104877898751305, - "grad_norm": 0.3968242727990472, - "learning_rate": 0.00012889217594661006, - "loss": 0.7435, - "step": 3075 - }, - { - "epoch": 1.5109798855877468, - "grad_norm": 0.40764086246986103, - "learning_rate": 0.00012883322080131047, - "loss": 0.6891, - "step": 3076 - }, - { - "epoch": 1.511471981300363, - "grad_norm": 0.460514775082263, - "learning_rate": 0.00012877425472369983, - "loss": 0.7618, - "step": 3077 - }, - { - "epoch": 1.511964077012979, - "grad_norm": 0.40059181460704496, - "learning_rate": 0.00012871527773613547, - "loss": 0.7208, - "step": 3078 - }, - { - "epoch": 1.5124561727255952, - "grad_norm": 0.44988655894234547, - "learning_rate": 0.00012865628986097897, - "loss": 0.778, - "step": 3079 - }, - { - "epoch": 1.5129482684382114, - "grad_norm": 0.4048849756938812, - "learning_rate": 0.00012859729112059596, - "loss": 0.7492, - "step": 3080 - }, - { - "epoch": 1.5134403641508274, - "grad_norm": 0.4191796085659502, - "learning_rate": 0.00012853828153735618, - "loss": 0.7296, - "step": 3081 - }, - { - "epoch": 1.5139324598634434, - "grad_norm": 0.41812779771448, - "learning_rate": 0.0001284792611336336, - "loss": 0.7048, - "step": 3082 - }, - { - "epoch": 1.5144245555760596, - "grad_norm": 0.4797066515574714, - "learning_rate": 0.00012842022993180612, - "loss": 0.828, - "step": 3083 - }, - { - "epoch": 1.5149166512886758, - "grad_norm": 0.41106092005251393, - "learning_rate": 0.00012836118795425585, - "loss": 0.7813, - "step": 3084 - }, - { - "epoch": 1.5154087470012918, - "grad_norm": 0.40817359434708045, - "learning_rate": 0.00012830213522336897, - "loss": 0.686, - "step": 3085 - }, - { - "epoch": 1.5159008427139078, - "grad_norm": 0.4076667748465361, - "learning_rate": 0.0001282430717615357, - "loss": 0.7094, - "step": 3086 - }, - { - "epoch": 1.516392938426524, - "grad_norm": 0.4386133393504196, - "learning_rate": 0.0001281839975911504, - "loss": 0.7551, - "step": 3087 - }, - { - "epoch": 1.5168850341391402, - "grad_norm": 0.429260417303817, - "learning_rate": 0.00012812491273461136, - "loss": 0.6982, - "step": 3088 - }, - { - "epoch": 1.5173771298517562, - "grad_norm": 0.41918902030986416, - "learning_rate": 0.00012806581721432108, - "loss": 0.7249, - "step": 3089 - }, - { - "epoch": 1.5178692255643722, - "grad_norm": 0.4215972943876984, - "learning_rate": 0.00012800671105268598, - "loss": 0.7339, - "step": 3090 - }, - { - "epoch": 1.5183613212769884, - "grad_norm": 0.4038371270111055, - "learning_rate": 0.00012794759427211657, - "loss": 0.7138, - "step": 3091 - }, - { - "epoch": 1.5188534169896046, - "grad_norm": 0.41956476832990003, - "learning_rate": 0.0001278884668950274, - "loss": 0.7477, - "step": 3092 - }, - { - "epoch": 1.5193455127022206, - "grad_norm": 0.44100240467913016, - "learning_rate": 0.000127829328943837, - "loss": 0.7658, - "step": 3093 - }, - { - "epoch": 1.5198376084148366, - "grad_norm": 0.4178678387214205, - "learning_rate": 0.00012777018044096792, - "loss": 0.6944, - "step": 3094 - }, - { - "epoch": 1.5203297041274528, - "grad_norm": 0.4053607057062803, - "learning_rate": 0.00012771102140884675, - "loss": 0.6927, - "step": 3095 - }, - { - "epoch": 1.520821799840069, - "grad_norm": 0.4350538092655992, - "learning_rate": 0.00012765185186990396, - "loss": 0.6907, - "step": 3096 - }, - { - "epoch": 1.521313895552685, - "grad_norm": 0.40854114917475026, - "learning_rate": 0.00012759267184657416, - "loss": 0.7095, - "step": 3097 - }, - { - "epoch": 1.521805991265301, - "grad_norm": 0.4724031030864346, - "learning_rate": 0.00012753348136129583, - "loss": 0.7489, - "step": 3098 - }, - { - "epoch": 1.5222980869779172, - "grad_norm": 0.4337096882195965, - "learning_rate": 0.00012747428043651145, - "loss": 0.7324, - "step": 3099 - }, - { - "epoch": 1.5227901826905335, - "grad_norm": 0.40855131995965477, - "learning_rate": 0.00012741506909466743, - "loss": 0.7754, - "step": 3100 - }, - { - "epoch": 1.5232822784031494, - "grad_norm": 0.3953499045725943, - "learning_rate": 0.0001273558473582142, - "loss": 0.6775, - "step": 3101 - }, - { - "epoch": 1.5237743741157654, - "grad_norm": 0.41147652055069633, - "learning_rate": 0.00012729661524960598, - "loss": 0.752, - "step": 3102 - }, - { - "epoch": 1.5242664698283817, - "grad_norm": 0.44204125317371185, - "learning_rate": 0.00012723737279130115, - "loss": 0.6875, - "step": 3103 - }, - { - "epoch": 1.5247585655409979, - "grad_norm": 0.4012087670532406, - "learning_rate": 0.00012717812000576182, - "loss": 0.7021, - "step": 3104 - }, - { - "epoch": 1.5252506612536139, - "grad_norm": 0.4071975517738948, - "learning_rate": 0.0001271188569154541, - "loss": 0.7156, - "step": 3105 - }, - { - "epoch": 1.5257427569662299, - "grad_norm": 0.48279921604761133, - "learning_rate": 0.00012705958354284797, - "loss": 0.8309, - "step": 3106 - }, - { - "epoch": 1.526234852678846, - "grad_norm": 0.4496843301558887, - "learning_rate": 0.00012700029991041738, - "loss": 0.7971, - "step": 3107 - }, - { - "epoch": 1.5267269483914623, - "grad_norm": 0.41853871282158156, - "learning_rate": 0.0001269410060406401, - "loss": 0.7736, - "step": 3108 - }, - { - "epoch": 1.5272190441040783, - "grad_norm": 0.394324717785068, - "learning_rate": 0.00012688170195599774, - "loss": 0.7316, - "step": 3109 - }, - { - "epoch": 1.5277111398166943, - "grad_norm": 0.4361928198770574, - "learning_rate": 0.00012682238767897596, - "loss": 0.7639, - "step": 3110 - }, - { - "epoch": 1.5282032355293105, - "grad_norm": 0.4032868357652532, - "learning_rate": 0.0001267630632320641, - "loss": 0.7107, - "step": 3111 - }, - { - "epoch": 1.5286953312419267, - "grad_norm": 0.43093832486121686, - "learning_rate": 0.00012670372863775545, - "loss": 0.7767, - "step": 3112 - }, - { - "epoch": 1.5291874269545427, - "grad_norm": 0.3990443330477586, - "learning_rate": 0.00012664438391854708, - "loss": 0.7084, - "step": 3113 - }, - { - "epoch": 1.5296795226671587, - "grad_norm": 0.4078595316209392, - "learning_rate": 0.00012658502909694, - "loss": 0.7367, - "step": 3114 - }, - { - "epoch": 1.530171618379775, - "grad_norm": 0.4842332587104113, - "learning_rate": 0.0001265256641954389, - "loss": 0.7251, - "step": 3115 - }, - { - "epoch": 1.530663714092391, - "grad_norm": 0.41205273022195266, - "learning_rate": 0.00012646628923655253, - "loss": 0.7531, - "step": 3116 - }, - { - "epoch": 1.531155809805007, - "grad_norm": 0.4129857433946101, - "learning_rate": 0.0001264069042427932, - "loss": 0.7627, - "step": 3117 - }, - { - "epoch": 1.531647905517623, - "grad_norm": 0.4608841180298976, - "learning_rate": 0.00012634750923667717, - "loss": 0.7522, - "step": 3118 - }, - { - "epoch": 1.5321400012302393, - "grad_norm": 0.4007114571605506, - "learning_rate": 0.0001262881042407244, - "loss": 0.7688, - "step": 3119 - }, - { - "epoch": 1.5326320969428555, - "grad_norm": 0.40875220062558654, - "learning_rate": 0.00012622868927745882, - "loss": 0.7977, - "step": 3120 - }, - { - "epoch": 1.5331241926554715, - "grad_norm": 0.4363642818563895, - "learning_rate": 0.00012616926436940793, - "loss": 0.8101, - "step": 3121 - }, - { - "epoch": 1.5336162883680875, - "grad_norm": 0.3932820988657897, - "learning_rate": 0.00012610982953910308, - "loss": 0.6781, - "step": 3122 - }, - { - "epoch": 1.5341083840807037, - "grad_norm": 0.4351803023940728, - "learning_rate": 0.00012605038480907943, - "loss": 0.7682, - "step": 3123 - }, - { - "epoch": 1.53460047979332, - "grad_norm": 0.3946445702138645, - "learning_rate": 0.00012599093020187582, - "loss": 0.6606, - "step": 3124 - }, - { - "epoch": 1.535092575505936, - "grad_norm": 0.4513592470917633, - "learning_rate": 0.00012593146574003486, - "loss": 0.7405, - "step": 3125 - }, - { - "epoch": 1.535584671218552, - "grad_norm": 0.4341125634658012, - "learning_rate": 0.00012587199144610292, - "loss": 0.75, - "step": 3126 - }, - { - "epoch": 1.5360767669311681, - "grad_norm": 0.9598121480047209, - "learning_rate": 0.00012581250734263011, - "loss": 0.7155, - "step": 3127 - }, - { - "epoch": 1.5365688626437843, - "grad_norm": 0.4059330316147248, - "learning_rate": 0.00012575301345217022, - "loss": 0.7371, - "step": 3128 - }, - { - "epoch": 1.5370609583564003, - "grad_norm": 0.41905133678894035, - "learning_rate": 0.00012569350979728072, - "loss": 0.7472, - "step": 3129 - }, - { - "epoch": 1.5375530540690163, - "grad_norm": 0.39863870202992024, - "learning_rate": 0.00012563399640052288, - "loss": 0.6779, - "step": 3130 - }, - { - "epoch": 1.5380451497816325, - "grad_norm": 0.4372611193678232, - "learning_rate": 0.00012557447328446155, - "loss": 0.6678, - "step": 3131 - }, - { - "epoch": 1.5385372454942488, - "grad_norm": 0.44284328012292046, - "learning_rate": 0.00012551494047166533, - "loss": 0.7023, - "step": 3132 - }, - { - "epoch": 1.5390293412068647, - "grad_norm": 0.4368631008394386, - "learning_rate": 0.00012545539798470654, - "loss": 0.775, - "step": 3133 - }, - { - "epoch": 1.5395214369194807, - "grad_norm": 0.40955854086198185, - "learning_rate": 0.00012539584584616109, - "loss": 0.6759, - "step": 3134 - }, - { - "epoch": 1.540013532632097, - "grad_norm": 0.399361340268841, - "learning_rate": 0.00012533628407860857, - "loss": 0.6799, - "step": 3135 - }, - { - "epoch": 1.5405056283447132, - "grad_norm": 0.44816478436128276, - "learning_rate": 0.00012527671270463225, - "loss": 0.7643, - "step": 3136 - }, - { - "epoch": 1.5409977240573292, - "grad_norm": 0.4469735527379651, - "learning_rate": 0.000125217131746819, - "loss": 0.8086, - "step": 3137 - }, - { - "epoch": 1.5414898197699451, - "grad_norm": 0.44165998397296513, - "learning_rate": 0.00012515754122775931, - "loss": 0.7714, - "step": 3138 - }, - { - "epoch": 1.5419819154825614, - "grad_norm": 0.4009003684739982, - "learning_rate": 0.00012509794117004737, - "loss": 0.6879, - "step": 3139 - }, - { - "epoch": 1.5424740111951776, - "grad_norm": 0.41411732986544114, - "learning_rate": 0.000125038331596281, - "loss": 0.7548, - "step": 3140 - }, - { - "epoch": 1.5429661069077936, - "grad_norm": 0.40094591741964813, - "learning_rate": 0.00012497871252906148, - "loss": 0.7728, - "step": 3141 - }, - { - "epoch": 1.5434582026204096, - "grad_norm": 0.3836291781594014, - "learning_rate": 0.0001249190839909938, - "loss": 0.6485, - "step": 3142 - }, - { - "epoch": 1.5439502983330258, - "grad_norm": 0.4181964897836226, - "learning_rate": 0.00012485944600468658, - "loss": 0.7176, - "step": 3143 - }, - { - "epoch": 1.544442394045642, - "grad_norm": 0.3900476893534145, - "learning_rate": 0.0001247997985927519, - "loss": 0.6686, - "step": 3144 - }, - { - "epoch": 1.544934489758258, - "grad_norm": 0.4125968693070065, - "learning_rate": 0.00012474014177780553, - "loss": 0.7701, - "step": 3145 - }, - { - "epoch": 1.545426585470874, - "grad_norm": 0.40477972245023536, - "learning_rate": 0.00012468047558246675, - "loss": 0.7071, - "step": 3146 - }, - { - "epoch": 1.5459186811834902, - "grad_norm": 0.4831825795027475, - "learning_rate": 0.00012462080002935836, - "loss": 0.7199, - "step": 3147 - }, - { - "epoch": 1.5464107768961064, - "grad_norm": 0.41862299991321594, - "learning_rate": 0.00012456111514110677, - "loss": 0.722, - "step": 3148 - }, - { - "epoch": 1.5469028726087224, - "grad_norm": 0.4123363747171097, - "learning_rate": 0.00012450142094034194, - "loss": 0.7545, - "step": 3149 - }, - { - "epoch": 1.5473949683213384, - "grad_norm": 0.40525575830553767, - "learning_rate": 0.00012444171744969732, - "loss": 0.7153, - "step": 3150 - }, - { - "epoch": 1.5478870640339546, - "grad_norm": 0.4424817544085473, - "learning_rate": 0.00012438200469180985, - "loss": 0.7219, - "step": 3151 - }, - { - "epoch": 1.5483791597465708, - "grad_norm": 0.4624706482322748, - "learning_rate": 0.0001243222826893201, - "loss": 0.7537, - "step": 3152 - }, - { - "epoch": 1.5488712554591868, - "grad_norm": 0.3996180166604787, - "learning_rate": 0.00012426255146487201, - "loss": 0.75, - "step": 3153 - }, - { - "epoch": 1.5493633511718028, - "grad_norm": 0.4002862540823509, - "learning_rate": 0.0001242028110411131, - "loss": 0.7402, - "step": 3154 - }, - { - "epoch": 1.549855446884419, - "grad_norm": 0.4052397801704882, - "learning_rate": 0.00012414306144069436, - "loss": 0.7441, - "step": 3155 - }, - { - "epoch": 1.5503475425970352, - "grad_norm": 0.39188769814813246, - "learning_rate": 0.00012408330268627027, - "loss": 0.6907, - "step": 3156 - }, - { - "epoch": 1.5508396383096512, - "grad_norm": 0.4703358633800996, - "learning_rate": 0.00012402353480049874, - "loss": 0.8066, - "step": 3157 - }, - { - "epoch": 1.5513317340222672, - "grad_norm": 0.49725706722828167, - "learning_rate": 0.00012396375780604116, - "loss": 0.7568, - "step": 3158 - }, - { - "epoch": 1.5518238297348834, - "grad_norm": 0.3889181561017834, - "learning_rate": 0.00012390397172556242, - "loss": 0.7377, - "step": 3159 - }, - { - "epoch": 1.5523159254474996, - "grad_norm": 0.4146720624581034, - "learning_rate": 0.0001238441765817308, - "loss": 0.7054, - "step": 3160 - }, - { - "epoch": 1.5528080211601156, - "grad_norm": 0.40078899808712837, - "learning_rate": 0.000123784372397218, - "loss": 0.7031, - "step": 3161 - }, - { - "epoch": 1.5533001168727316, - "grad_norm": 0.39964674244438475, - "learning_rate": 0.00012372455919469925, - "loss": 0.7717, - "step": 3162 - }, - { - "epoch": 1.5537922125853478, - "grad_norm": 0.39330947490961, - "learning_rate": 0.00012366473699685309, - "loss": 0.7092, - "step": 3163 - }, - { - "epoch": 1.554284308297964, - "grad_norm": 0.38671419115056166, - "learning_rate": 0.0001236049058263615, - "loss": 0.6819, - "step": 3164 - }, - { - "epoch": 1.55477640401058, - "grad_norm": 0.41233618348051926, - "learning_rate": 0.00012354506570590992, - "loss": 0.751, - "step": 3165 - }, - { - "epoch": 1.555268499723196, - "grad_norm": 0.40554561359199665, - "learning_rate": 0.00012348521665818708, - "loss": 0.7965, - "step": 3166 - }, - { - "epoch": 1.5557605954358122, - "grad_norm": 0.4255750503354243, - "learning_rate": 0.0001234253587058852, - "loss": 0.7032, - "step": 3167 - }, - { - "epoch": 1.5562526911484285, - "grad_norm": 0.4243341887658289, - "learning_rate": 0.00012336549187169982, - "loss": 0.7741, - "step": 3168 - }, - { - "epoch": 1.5567447868610444, - "grad_norm": 0.41913378225345127, - "learning_rate": 0.00012330561617832984, - "loss": 0.7246, - "step": 3169 - }, - { - "epoch": 1.5572368825736604, - "grad_norm": 0.39463951391533864, - "learning_rate": 0.0001232457316484775, - "loss": 0.6937, - "step": 3170 - }, - { - "epoch": 1.5577289782862767, - "grad_norm": 0.4432901611924005, - "learning_rate": 0.0001231858383048485, - "loss": 0.7259, - "step": 3171 - }, - { - "epoch": 1.5582210739988929, - "grad_norm": 0.42505712154884673, - "learning_rate": 0.00012312593617015176, - "loss": 0.7723, - "step": 3172 - }, - { - "epoch": 1.5587131697115089, - "grad_norm": 0.4017494696724237, - "learning_rate": 0.0001230660252670996, - "loss": 0.7465, - "step": 3173 - }, - { - "epoch": 1.5592052654241249, - "grad_norm": 0.4457221849906673, - "learning_rate": 0.00012300610561840762, - "loss": 0.7829, - "step": 3174 - }, - { - "epoch": 1.559697361136741, - "grad_norm": 0.43366350686788174, - "learning_rate": 0.0001229461772467948, - "loss": 0.7266, - "step": 3175 - }, - { - "epoch": 1.5601894568493573, - "grad_norm": 0.4133768951049731, - "learning_rate": 0.00012288624017498336, - "loss": 0.7523, - "step": 3176 - }, - { - "epoch": 1.5606815525619733, - "grad_norm": 0.4226671272286967, - "learning_rate": 0.00012282629442569886, - "loss": 0.7377, - "step": 3177 - }, - { - "epoch": 1.5611736482745893, - "grad_norm": 0.4233081981379191, - "learning_rate": 0.0001227663400216701, - "loss": 0.8065, - "step": 3178 - }, - { - "epoch": 1.5616657439872055, - "grad_norm": 0.4116049168948208, - "learning_rate": 0.00012270637698562925, - "loss": 0.7369, - "step": 3179 - }, - { - "epoch": 1.5621578396998217, - "grad_norm": 0.4085878673645208, - "learning_rate": 0.0001226464053403117, - "loss": 0.7834, - "step": 3180 - }, - { - "epoch": 1.5626499354124377, - "grad_norm": 0.44664794329108926, - "learning_rate": 0.00012258642510845608, - "loss": 0.8174, - "step": 3181 - }, - { - "epoch": 1.5631420311250537, - "grad_norm": 0.414396505533701, - "learning_rate": 0.0001225264363128043, - "loss": 0.7368, - "step": 3182 - }, - { - "epoch": 1.56363412683767, - "grad_norm": 0.4337292987659127, - "learning_rate": 0.00012246643897610154, - "loss": 0.7606, - "step": 3183 - }, - { - "epoch": 1.564126222550286, - "grad_norm": 0.37977316865869715, - "learning_rate": 0.00012240643312109615, - "loss": 0.6468, - "step": 3184 - }, - { - "epoch": 1.564618318262902, - "grad_norm": 0.42540973031554663, - "learning_rate": 0.0001223464187705398, - "loss": 0.7432, - "step": 3185 - }, - { - "epoch": 1.565110413975518, - "grad_norm": 0.40031097511476715, - "learning_rate": 0.00012228639594718735, - "loss": 0.6835, - "step": 3186 - }, - { - "epoch": 1.5656025096881343, - "grad_norm": 0.4423994790045654, - "learning_rate": 0.0001222263646737968, - "loss": 0.7433, - "step": 3187 - }, - { - "epoch": 1.5660946054007505, - "grad_norm": 0.39284839259840126, - "learning_rate": 0.00012216632497312948, - "loss": 0.7109, - "step": 3188 - }, - { - "epoch": 1.5665867011133665, - "grad_norm": 0.38096019470032755, - "learning_rate": 0.00012210627686794982, - "loss": 0.7217, - "step": 3189 - }, - { - "epoch": 1.5670787968259825, - "grad_norm": 0.43208719127571726, - "learning_rate": 0.00012204622038102547, - "loss": 0.7379, - "step": 3190 - }, - { - "epoch": 1.5675708925385987, - "grad_norm": 0.5125978311629931, - "learning_rate": 0.00012198615553512724, - "loss": 0.7665, - "step": 3191 - }, - { - "epoch": 1.568062988251215, - "grad_norm": 0.39537945211758196, - "learning_rate": 0.00012192608235302914, - "loss": 0.7575, - "step": 3192 - }, - { - "epoch": 1.568555083963831, - "grad_norm": 0.40283597554945677, - "learning_rate": 0.00012186600085750832, - "loss": 0.6771, - "step": 3193 - }, - { - "epoch": 1.569047179676447, - "grad_norm": 0.445836777284328, - "learning_rate": 0.00012180591107134507, - "loss": 0.7641, - "step": 3194 - }, - { - "epoch": 1.5695392753890631, - "grad_norm": 0.4066461003156354, - "learning_rate": 0.00012174581301732288, - "loss": 0.7844, - "step": 3195 - }, - { - "epoch": 1.5700313711016793, - "grad_norm": 0.4013926668665078, - "learning_rate": 0.0001216857067182283, - "loss": 0.7591, - "step": 3196 - }, - { - "epoch": 1.5705234668142953, - "grad_norm": 0.43761053795693694, - "learning_rate": 0.00012162559219685103, - "loss": 0.7691, - "step": 3197 - }, - { - "epoch": 1.5710155625269113, - "grad_norm": 0.4364570057829297, - "learning_rate": 0.00012156546947598393, - "loss": 0.7213, - "step": 3198 - }, - { - "epoch": 1.5715076582395275, - "grad_norm": 0.40217805746179286, - "learning_rate": 0.00012150533857842294, - "loss": 0.6977, - "step": 3199 - }, - { - "epoch": 1.5719997539521438, - "grad_norm": 0.4059245290376753, - "learning_rate": 0.00012144519952696707, - "loss": 0.7809, - "step": 3200 - }, - { - "epoch": 1.5724918496647597, - "grad_norm": 0.43773026575758556, - "learning_rate": 0.00012138505234441846, - "loss": 0.7461, - "step": 3201 - }, - { - "epoch": 1.5729839453773757, - "grad_norm": 0.5257924412805737, - "learning_rate": 0.00012132489705358234, - "loss": 0.6759, - "step": 3202 - }, - { - "epoch": 1.573476041089992, - "grad_norm": 0.4015383996173517, - "learning_rate": 0.00012126473367726697, - "loss": 0.7324, - "step": 3203 - }, - { - "epoch": 1.5739681368026082, - "grad_norm": 0.4228274459204497, - "learning_rate": 0.00012120456223828371, - "loss": 0.6825, - "step": 3204 - }, - { - "epoch": 1.5744602325152242, - "grad_norm": 0.41065820678004206, - "learning_rate": 0.00012114438275944697, - "loss": 0.7421, - "step": 3205 - }, - { - "epoch": 1.5749523282278404, - "grad_norm": 0.4096908137103904, - "learning_rate": 0.00012108419526357421, - "loss": 0.6982, - "step": 3206 - }, - { - "epoch": 1.5754444239404566, - "grad_norm": 0.3902471204861861, - "learning_rate": 0.0001210239997734859, - "loss": 0.7206, - "step": 3207 - }, - { - "epoch": 1.5759365196530726, - "grad_norm": 0.394550442380344, - "learning_rate": 0.00012096379631200563, - "loss": 0.7049, - "step": 3208 - }, - { - "epoch": 1.5764286153656886, - "grad_norm": 0.3980228960206703, - "learning_rate": 0.0001209035849019599, - "loss": 0.7079, - "step": 3209 - }, - { - "epoch": 1.5769207110783048, - "grad_norm": 0.3884921384266705, - "learning_rate": 0.00012084336556617826, - "loss": 0.7638, - "step": 3210 - }, - { - "epoch": 1.577412806790921, - "grad_norm": 0.388694547737168, - "learning_rate": 0.00012078313832749335, - "loss": 0.6976, - "step": 3211 - }, - { - "epoch": 1.577904902503537, - "grad_norm": 0.4106725701987002, - "learning_rate": 0.00012072290320874067, - "loss": 0.7455, - "step": 3212 - }, - { - "epoch": 1.578396998216153, - "grad_norm": 0.379571478980294, - "learning_rate": 0.00012066266023275881, - "loss": 0.7182, - "step": 3213 - }, - { - "epoch": 1.5788890939287692, - "grad_norm": 0.3781914515938951, - "learning_rate": 0.00012060240942238927, - "loss": 0.746, - "step": 3214 - }, - { - "epoch": 1.5793811896413854, - "grad_norm": 0.4094939873193998, - "learning_rate": 0.0001205421508004766, - "loss": 0.7251, - "step": 3215 - }, - { - "epoch": 1.5798732853540014, - "grad_norm": 0.406974633221615, - "learning_rate": 0.00012048188438986821, - "loss": 0.725, - "step": 3216 - }, - { - "epoch": 1.5803653810666174, - "grad_norm": 0.41874649043239853, - "learning_rate": 0.00012042161021341454, - "loss": 0.7302, - "step": 3217 - }, - { - "epoch": 1.5808574767792336, - "grad_norm": 0.4372105347499653, - "learning_rate": 0.00012036132829396895, - "loss": 0.7779, - "step": 3218 - }, - { - "epoch": 1.5813495724918498, - "grad_norm": 0.4613183652990205, - "learning_rate": 0.00012030103865438778, - "loss": 0.7261, - "step": 3219 - }, - { - "epoch": 1.5818416682044658, - "grad_norm": 0.42093631785720165, - "learning_rate": 0.00012024074131753018, - "loss": 0.7896, - "step": 3220 - }, - { - "epoch": 1.5823337639170818, - "grad_norm": 0.4302448082721251, - "learning_rate": 0.00012018043630625835, - "loss": 0.6971, - "step": 3221 - }, - { - "epoch": 1.582825859629698, - "grad_norm": 0.46152397247781207, - "learning_rate": 0.00012012012364343735, - "loss": 0.7125, - "step": 3222 - }, - { - "epoch": 1.5833179553423142, - "grad_norm": 0.39258700347927744, - "learning_rate": 0.00012005980335193507, - "loss": 0.7238, - "step": 3223 - }, - { - "epoch": 1.5838100510549302, - "grad_norm": 0.42986712350839656, - "learning_rate": 0.00011999947545462242, - "loss": 0.745, - "step": 3224 - }, - { - "epoch": 1.5843021467675462, - "grad_norm": 0.4333873903179454, - "learning_rate": 0.00011993913997437313, - "loss": 0.7584, - "step": 3225 - }, - { - "epoch": 1.5847942424801624, - "grad_norm": 0.4131821348341676, - "learning_rate": 0.00011987879693406379, - "loss": 0.7382, - "step": 3226 - }, - { - "epoch": 1.5852863381927786, - "grad_norm": 0.4036706507775948, - "learning_rate": 0.00011981844635657386, - "loss": 0.7521, - "step": 3227 - }, - { - "epoch": 1.5857784339053946, - "grad_norm": 0.43449693180274446, - "learning_rate": 0.00011975808826478567, - "loss": 0.7233, - "step": 3228 - }, - { - "epoch": 1.5862705296180106, - "grad_norm": 0.4115064959380051, - "learning_rate": 0.00011969772268158443, - "loss": 0.6833, - "step": 3229 - }, - { - "epoch": 1.5867626253306268, - "grad_norm": 0.47992814892736324, - "learning_rate": 0.00011963734962985811, - "loss": 0.7624, - "step": 3230 - }, - { - "epoch": 1.587254721043243, - "grad_norm": 0.4157417072109434, - "learning_rate": 0.00011957696913249761, - "loss": 0.7168, - "step": 3231 - }, - { - "epoch": 1.587746816755859, - "grad_norm": 0.4044420894643676, - "learning_rate": 0.0001195165812123966, - "loss": 0.7936, - "step": 3232 - }, - { - "epoch": 1.588238912468475, - "grad_norm": 0.39319196370554976, - "learning_rate": 0.00011945618589245151, - "loss": 0.7117, - "step": 3233 - }, - { - "epoch": 1.5887310081810913, - "grad_norm": 0.4432426510736256, - "learning_rate": 0.00011939578319556173, - "loss": 0.7836, - "step": 3234 - }, - { - "epoch": 1.5892231038937075, - "grad_norm": 0.4244378022492606, - "learning_rate": 0.00011933537314462929, - "loss": 0.6452, - "step": 3235 - }, - { - "epoch": 1.5897151996063235, - "grad_norm": 0.4241822686502179, - "learning_rate": 0.00011927495576255907, - "loss": 0.7163, - "step": 3236 - }, - { - "epoch": 1.5902072953189395, - "grad_norm": 0.43134795646055973, - "learning_rate": 0.00011921453107225877, - "loss": 0.7867, - "step": 3237 - }, - { - "epoch": 1.5906993910315557, - "grad_norm": 0.3737104615007287, - "learning_rate": 0.00011915409909663878, - "loss": 0.6735, - "step": 3238 - }, - { - "epoch": 1.5911914867441719, - "grad_norm": 0.4068521661983082, - "learning_rate": 0.0001190936598586123, - "loss": 0.7293, - "step": 3239 - }, - { - "epoch": 1.5916835824567879, - "grad_norm": 0.3995051886846135, - "learning_rate": 0.00011903321338109527, - "loss": 0.653, - "step": 3240 - }, - { - "epoch": 1.5921756781694039, - "grad_norm": 0.4328633281994684, - "learning_rate": 0.0001189727596870064, - "loss": 0.7238, - "step": 3241 - }, - { - "epoch": 1.59266777388202, - "grad_norm": 0.390485099531083, - "learning_rate": 0.00011891229879926715, - "loss": 0.6991, - "step": 3242 - }, - { - "epoch": 1.5931598695946363, - "grad_norm": 0.4334795235601448, - "learning_rate": 0.0001188518307408016, - "loss": 0.7513, - "step": 3243 - }, - { - "epoch": 1.5936519653072523, - "grad_norm": 0.3978157461790713, - "learning_rate": 0.00011879135553453666, - "loss": 0.7426, - "step": 3244 - }, - { - "epoch": 1.5941440610198683, - "grad_norm": 0.450012737682133, - "learning_rate": 0.00011873087320340194, - "loss": 0.795, - "step": 3245 - }, - { - "epoch": 1.5946361567324845, - "grad_norm": 0.4339573027096241, - "learning_rate": 0.00011867038377032968, - "loss": 0.7899, - "step": 3246 - }, - { - "epoch": 1.5951282524451007, - "grad_norm": 0.4315192870884105, - "learning_rate": 0.0001186098872582549, - "loss": 0.7264, - "step": 3247 - }, - { - "epoch": 1.5956203481577167, - "grad_norm": 0.4356228528840322, - "learning_rate": 0.00011854938369011524, - "loss": 0.7265, - "step": 3248 - }, - { - "epoch": 1.5961124438703327, - "grad_norm": 0.4186532600742828, - "learning_rate": 0.00011848887308885103, - "loss": 0.8307, - "step": 3249 - }, - { - "epoch": 1.596604539582949, - "grad_norm": 0.4389404768825083, - "learning_rate": 0.00011842835547740532, - "loss": 0.6989, - "step": 3250 - }, - { - "epoch": 1.5970966352955651, - "grad_norm": 0.4282987131053194, - "learning_rate": 0.00011836783087872372, - "loss": 0.8298, - "step": 3251 - }, - { - "epoch": 1.597588731008181, - "grad_norm": 0.3709300892850735, - "learning_rate": 0.00011830729931575455, - "loss": 0.7213, - "step": 3252 - }, - { - "epoch": 1.598080826720797, - "grad_norm": 0.39262352567043607, - "learning_rate": 0.00011824676081144876, - "loss": 0.7402, - "step": 3253 - }, - { - "epoch": 1.5985729224334133, - "grad_norm": 0.42026054831734694, - "learning_rate": 0.00011818621538875998, - "loss": 0.7269, - "step": 3254 - }, - { - "epoch": 1.5990650181460295, - "grad_norm": 0.4423013018477197, - "learning_rate": 0.00011812566307064437, - "loss": 0.7616, - "step": 3255 - }, - { - "epoch": 1.5995571138586455, - "grad_norm": 0.38771548614502505, - "learning_rate": 0.00011806510388006074, - "loss": 0.6695, - "step": 3256 - }, - { - "epoch": 1.6000492095712615, - "grad_norm": 0.4293862443775147, - "learning_rate": 0.00011800453783997056, - "loss": 0.7205, - "step": 3257 - }, - { - "epoch": 1.6005413052838777, - "grad_norm": 0.40091041932693333, - "learning_rate": 0.00011794396497333787, - "loss": 0.7765, - "step": 3258 - }, - { - "epoch": 1.601033400996494, - "grad_norm": 0.5402956068930469, - "learning_rate": 0.00011788338530312921, - "loss": 0.7701, - "step": 3259 - }, - { - "epoch": 1.60152549670911, - "grad_norm": 0.3947668363909619, - "learning_rate": 0.00011782279885231385, - "loss": 0.6962, - "step": 3260 - }, - { - "epoch": 1.602017592421726, - "grad_norm": 0.4243497046931594, - "learning_rate": 0.00011776220564386348, - "loss": 0.7828, - "step": 3261 - }, - { - "epoch": 1.6025096881343421, - "grad_norm": 0.3909881441241604, - "learning_rate": 0.00011770160570075248, - "loss": 0.7384, - "step": 3262 - }, - { - "epoch": 1.6030017838469584, - "grad_norm": 0.4110875557112792, - "learning_rate": 0.00011764099904595772, - "loss": 0.7701, - "step": 3263 - }, - { - "epoch": 1.6034938795595743, - "grad_norm": 0.3912785966690083, - "learning_rate": 0.0001175803857024586, - "loss": 0.7465, - "step": 3264 - }, - { - "epoch": 1.6039859752721903, - "grad_norm": 0.3911070561680225, - "learning_rate": 0.00011751976569323711, - "loss": 0.6896, - "step": 3265 - }, - { - "epoch": 1.6044780709848065, - "grad_norm": 0.4177901406196027, - "learning_rate": 0.00011745913904127769, - "loss": 0.7255, - "step": 3266 - }, - { - "epoch": 1.6049701666974228, - "grad_norm": 0.4128201313649831, - "learning_rate": 0.00011739850576956741, - "loss": 0.7424, - "step": 3267 - }, - { - "epoch": 1.6054622624100388, - "grad_norm": 0.4081329442940276, - "learning_rate": 0.00011733786590109577, - "loss": 0.764, - "step": 3268 - }, - { - "epoch": 1.6059543581226547, - "grad_norm": 0.4209126311839199, - "learning_rate": 0.00011727721945885473, - "loss": 0.7276, - "step": 3269 - }, - { - "epoch": 1.606446453835271, - "grad_norm": 0.39826639801393804, - "learning_rate": 0.00011721656646583885, - "loss": 0.7158, - "step": 3270 - }, - { - "epoch": 1.6069385495478872, - "grad_norm": 0.3878353242093959, - "learning_rate": 0.00011715590694504515, - "loss": 0.7083, - "step": 3271 - }, - { - "epoch": 1.6074306452605032, - "grad_norm": 0.4164322323191292, - "learning_rate": 0.00011709524091947304, - "loss": 0.7355, - "step": 3272 - }, - { - "epoch": 1.6079227409731192, - "grad_norm": 0.40521745639168855, - "learning_rate": 0.00011703456841212448, - "loss": 0.7585, - "step": 3273 - }, - { - "epoch": 1.6084148366857354, - "grad_norm": 0.42530553004267596, - "learning_rate": 0.00011697388944600385, - "loss": 0.8443, - "step": 3274 - }, - { - "epoch": 1.6089069323983516, - "grad_norm": 0.3811248910807747, - "learning_rate": 0.00011691320404411801, - "loss": 0.6883, - "step": 3275 - }, - { - "epoch": 1.6093990281109676, - "grad_norm": 0.40406970060832814, - "learning_rate": 0.00011685251222947621, - "loss": 0.7003, - "step": 3276 - }, - { - "epoch": 1.6098911238235836, - "grad_norm": 0.4323123527635906, - "learning_rate": 0.00011679181402509023, - "loss": 0.7701, - "step": 3277 - }, - { - "epoch": 1.6103832195361998, - "grad_norm": 0.4069240672567787, - "learning_rate": 0.00011673110945397414, - "loss": 0.6808, - "step": 3278 - }, - { - "epoch": 1.610875315248816, - "grad_norm": 0.42143949186525315, - "learning_rate": 0.0001166703985391445, - "loss": 0.7189, - "step": 3279 - }, - { - "epoch": 1.611367410961432, - "grad_norm": 0.4304924244856845, - "learning_rate": 0.00011660968130362029, - "loss": 0.7689, - "step": 3280 - }, - { - "epoch": 1.611859506674048, - "grad_norm": 0.46127782547072554, - "learning_rate": 0.00011654895777042285, - "loss": 0.8438, - "step": 3281 - }, - { - "epoch": 1.6123516023866642, - "grad_norm": 0.4029495862792948, - "learning_rate": 0.0001164882279625759, - "loss": 0.7118, - "step": 3282 - }, - { - "epoch": 1.6128436980992804, - "grad_norm": 0.40606752911623156, - "learning_rate": 0.0001164274919031056, - "loss": 0.7395, - "step": 3283 - }, - { - "epoch": 1.6133357938118964, - "grad_norm": 0.4299344055067732, - "learning_rate": 0.0001163667496150404, - "loss": 0.7283, - "step": 3284 - }, - { - "epoch": 1.6138278895245124, - "grad_norm": 0.41832702246026865, - "learning_rate": 0.00011630600112141112, - "loss": 0.817, - "step": 3285 - }, - { - "epoch": 1.6143199852371286, - "grad_norm": 0.3814569894630706, - "learning_rate": 0.00011624524644525108, - "loss": 0.7179, - "step": 3286 - }, - { - "epoch": 1.6148120809497448, - "grad_norm": 0.40109249265628677, - "learning_rate": 0.00011618448560959572, - "loss": 0.68, - "step": 3287 - }, - { - "epoch": 1.6153041766623608, - "grad_norm": 0.415736511427573, - "learning_rate": 0.00011612371863748295, - "loss": 0.7296, - "step": 3288 - }, - { - "epoch": 1.6157962723749768, - "grad_norm": 0.4701272415745234, - "learning_rate": 0.00011606294555195297, - "loss": 0.8022, - "step": 3289 - }, - { - "epoch": 1.616288368087593, - "grad_norm": 0.4110937527985462, - "learning_rate": 0.00011600216637604835, - "loss": 0.7109, - "step": 3290 - }, - { - "epoch": 1.6167804638002092, - "grad_norm": 0.3855358948624381, - "learning_rate": 0.0001159413811328139, - "loss": 0.6798, - "step": 3291 - }, - { - "epoch": 1.6172725595128252, - "grad_norm": 0.4043043048790532, - "learning_rate": 0.00011588058984529673, - "loss": 0.7102, - "step": 3292 - }, - { - "epoch": 1.6177646552254412, - "grad_norm": 0.41123227564716486, - "learning_rate": 0.00011581979253654632, - "loss": 0.7546, - "step": 3293 - }, - { - "epoch": 1.6182567509380574, - "grad_norm": 0.38828934945244903, - "learning_rate": 0.00011575898922961435, - "loss": 0.7667, - "step": 3294 - }, - { - "epoch": 1.6187488466506736, - "grad_norm": 0.3828094938101845, - "learning_rate": 0.00011569817994755481, - "loss": 0.6885, - "step": 3295 - }, - { - "epoch": 1.6192409423632896, - "grad_norm": 0.4440985368095448, - "learning_rate": 0.00011563736471342395, - "loss": 0.8214, - "step": 3296 - }, - { - "epoch": 1.6197330380759056, - "grad_norm": 0.408667806656832, - "learning_rate": 0.00011557654355028029, - "loss": 0.7572, - "step": 3297 - }, - { - "epoch": 1.6202251337885218, - "grad_norm": 0.4243654636971181, - "learning_rate": 0.00011551571648118456, - "loss": 0.7392, - "step": 3298 - }, - { - "epoch": 1.620717229501138, - "grad_norm": 0.38966391468078304, - "learning_rate": 0.0001154548835291998, - "loss": 0.785, - "step": 3299 - }, - { - "epoch": 1.621209325213754, - "grad_norm": 0.44030242312298823, - "learning_rate": 0.0001153940447173912, - "loss": 0.7674, - "step": 3300 - }, - { - "epoch": 1.62170142092637, - "grad_norm": 0.6004304936722245, - "learning_rate": 0.00011533320006882621, - "loss": 0.7561, - "step": 3301 - }, - { - "epoch": 1.6221935166389863, - "grad_norm": 0.4168449461682847, - "learning_rate": 0.00011527234960657449, - "loss": 0.7368, - "step": 3302 - }, - { - "epoch": 1.6226856123516025, - "grad_norm": 0.3858236200316087, - "learning_rate": 0.00011521149335370794, - "loss": 0.7126, - "step": 3303 - }, - { - "epoch": 1.6231777080642185, - "grad_norm": 0.3980531870370931, - "learning_rate": 0.00011515063133330057, - "loss": 0.7081, - "step": 3304 - }, - { - "epoch": 1.6236698037768345, - "grad_norm": 0.424134351707148, - "learning_rate": 0.00011508976356842867, - "loss": 0.7767, - "step": 3305 - }, - { - "epoch": 1.6241618994894507, - "grad_norm": 0.4138559895405696, - "learning_rate": 0.00011502889008217063, - "loss": 0.7079, - "step": 3306 - }, - { - "epoch": 1.6246539952020669, - "grad_norm": 0.4301282248555545, - "learning_rate": 0.00011496801089760709, - "loss": 0.7661, - "step": 3307 - }, - { - "epoch": 1.6251460909146829, - "grad_norm": 0.39714112078295116, - "learning_rate": 0.00011490712603782073, - "loss": 0.7219, - "step": 3308 - }, - { - "epoch": 1.6256381866272989, - "grad_norm": 0.40018756939891, - "learning_rate": 0.00011484623552589653, - "loss": 0.7125, - "step": 3309 - }, - { - "epoch": 1.626130282339915, - "grad_norm": 0.40254911696783763, - "learning_rate": 0.00011478533938492153, - "loss": 0.7989, - "step": 3310 - }, - { - "epoch": 1.6266223780525313, - "grad_norm": 0.38957602831300703, - "learning_rate": 0.00011472443763798486, - "loss": 0.7428, - "step": 3311 - }, - { - "epoch": 1.6271144737651473, - "grad_norm": 0.38243558218122387, - "learning_rate": 0.00011466353030817791, - "loss": 0.7082, - "step": 3312 - }, - { - "epoch": 1.6276065694777633, - "grad_norm": 0.42587027710463005, - "learning_rate": 0.00011460261741859406, - "loss": 0.7277, - "step": 3313 - }, - { - "epoch": 1.6280986651903795, - "grad_norm": 0.38962302245061, - "learning_rate": 0.00011454169899232885, - "loss": 0.7066, - "step": 3314 - }, - { - "epoch": 1.6285907609029957, - "grad_norm": 0.42216413584744017, - "learning_rate": 0.00011448077505247989, - "loss": 0.7611, - "step": 3315 - }, - { - "epoch": 1.6290828566156117, - "grad_norm": 0.40613777396095335, - "learning_rate": 0.00011441984562214693, - "loss": 0.6904, - "step": 3316 - }, - { - "epoch": 1.6295749523282277, - "grad_norm": 0.4074139553910867, - "learning_rate": 0.00011435891072443181, - "loss": 0.7217, - "step": 3317 - }, - { - "epoch": 1.630067048040844, - "grad_norm": 0.38582421711941, - "learning_rate": 0.00011429797038243836, - "loss": 0.7161, - "step": 3318 - }, - { - "epoch": 1.6305591437534601, - "grad_norm": 0.4309490690294988, - "learning_rate": 0.00011423702461927255, - "loss": 0.7732, - "step": 3319 - }, - { - "epoch": 1.631051239466076, - "grad_norm": 0.3907965510498859, - "learning_rate": 0.00011417607345804238, - "loss": 0.7222, - "step": 3320 - }, - { - "epoch": 1.631543335178692, - "grad_norm": 0.38648927105318526, - "learning_rate": 0.00011411511692185783, - "loss": 0.729, - "step": 3321 - }, - { - "epoch": 1.6320354308913083, - "grad_norm": 0.39061194647669145, - "learning_rate": 0.00011405415503383107, - "loss": 0.7122, - "step": 3322 - }, - { - "epoch": 1.6325275266039245, - "grad_norm": 0.4054147916082172, - "learning_rate": 0.0001139931878170762, - "loss": 0.6943, - "step": 3323 - }, - { - "epoch": 1.6330196223165405, - "grad_norm": 0.4217960383778778, - "learning_rate": 0.0001139322152947093, - "loss": 0.7283, - "step": 3324 - }, - { - "epoch": 1.6335117180291565, - "grad_norm": 0.410729409117959, - "learning_rate": 0.00011387123748984855, - "loss": 0.7121, - "step": 3325 - }, - { - "epoch": 1.6340038137417727, - "grad_norm": 0.38439346326153434, - "learning_rate": 0.00011381025442561415, - "loss": 0.6958, - "step": 3326 - }, - { - "epoch": 1.634495909454389, - "grad_norm": 0.3952011191182585, - "learning_rate": 0.00011374926612512814, - "loss": 0.6614, - "step": 3327 - }, - { - "epoch": 1.634988005167005, - "grad_norm": 0.39460101798607156, - "learning_rate": 0.00011368827261151473, - "loss": 0.7238, - "step": 3328 - }, - { - "epoch": 1.635480100879621, - "grad_norm": 0.4342295559628419, - "learning_rate": 0.00011362727390789998, - "loss": 0.7837, - "step": 3329 - }, - { - "epoch": 1.6359721965922371, - "grad_norm": 0.41323209537972694, - "learning_rate": 0.00011356627003741198, - "loss": 0.7662, - "step": 3330 - }, - { - "epoch": 1.6364642923048534, - "grad_norm": 0.461378329922537, - "learning_rate": 0.00011350526102318071, - "loss": 0.7566, - "step": 3331 - }, - { - "epoch": 1.6369563880174693, - "grad_norm": 0.5254097987571077, - "learning_rate": 0.00011344424688833823, - "loss": 0.7061, - "step": 3332 - }, - { - "epoch": 1.6374484837300853, - "grad_norm": 0.4000376526024257, - "learning_rate": 0.00011338322765601845, - "loss": 0.7018, - "step": 3333 - }, - { - "epoch": 1.6379405794427018, - "grad_norm": 0.43837849244177035, - "learning_rate": 0.00011332220334935715, - "loss": 0.6869, - "step": 3334 - }, - { - "epoch": 1.6384326751553178, - "grad_norm": 0.3881624402368388, - "learning_rate": 0.0001132611739914922, - "loss": 0.6675, - "step": 3335 - }, - { - "epoch": 1.6389247708679338, - "grad_norm": 0.39920920733537274, - "learning_rate": 0.00011320013960556326, - "loss": 0.7165, - "step": 3336 - }, - { - "epoch": 1.63941686658055, - "grad_norm": 0.39521704754594184, - "learning_rate": 0.00011313910021471192, - "loss": 0.7171, - "step": 3337 - }, - { - "epoch": 1.6399089622931662, - "grad_norm": 0.40696160903551065, - "learning_rate": 0.00011307805584208167, - "loss": 0.7165, - "step": 3338 - }, - { - "epoch": 1.6404010580057822, - "grad_norm": 0.40674244664924436, - "learning_rate": 0.00011301700651081795, - "loss": 0.713, - "step": 3339 - }, - { - "epoch": 1.6408931537183982, - "grad_norm": 0.39380716623228934, - "learning_rate": 0.00011295595224406796, - "loss": 0.7077, - "step": 3340 - }, - { - "epoch": 1.6413852494310144, - "grad_norm": 0.4170624213001939, - "learning_rate": 0.00011289489306498091, - "loss": 0.755, - "step": 3341 - }, - { - "epoch": 1.6418773451436306, - "grad_norm": 0.40302670170896127, - "learning_rate": 0.00011283382899670774, - "loss": 0.7103, - "step": 3342 - }, - { - "epoch": 1.6423694408562466, - "grad_norm": 0.40330042413991274, - "learning_rate": 0.00011277276006240133, - "loss": 0.7599, - "step": 3343 - }, - { - "epoch": 1.6428615365688626, - "grad_norm": 0.38296817165820113, - "learning_rate": 0.00011271168628521636, - "loss": 0.6903, - "step": 3344 - }, - { - "epoch": 1.6433536322814788, - "grad_norm": 0.38312626311902653, - "learning_rate": 0.00011265060768830942, - "loss": 0.6818, - "step": 3345 - }, - { - "epoch": 1.643845727994095, - "grad_norm": 0.3881725372107873, - "learning_rate": 0.00011258952429483882, - "loss": 0.7075, - "step": 3346 - }, - { - "epoch": 1.644337823706711, - "grad_norm": 0.4541579945274235, - "learning_rate": 0.00011252843612796475, - "loss": 0.7642, - "step": 3347 - }, - { - "epoch": 1.644829919419327, - "grad_norm": 0.4033663840772927, - "learning_rate": 0.00011246734321084925, - "loss": 0.7589, - "step": 3348 - }, - { - "epoch": 1.6453220151319432, - "grad_norm": 0.4523417049892899, - "learning_rate": 0.00011240624556665605, - "loss": 0.8577, - "step": 3349 - }, - { - "epoch": 1.6458141108445594, - "grad_norm": 0.41245399605827937, - "learning_rate": 0.00011234514321855078, - "loss": 0.7973, - "step": 3350 - }, - { - "epoch": 1.6463062065571754, - "grad_norm": 0.4043051870315983, - "learning_rate": 0.00011228403618970078, - "loss": 0.7732, - "step": 3351 - }, - { - "epoch": 1.6467983022697914, - "grad_norm": 0.4375866898033138, - "learning_rate": 0.00011222292450327523, - "loss": 0.7484, - "step": 3352 - }, - { - "epoch": 1.6472903979824076, - "grad_norm": 0.42413886901311715, - "learning_rate": 0.000112161808182445, - "loss": 0.6895, - "step": 3353 - }, - { - "epoch": 1.6477824936950238, - "grad_norm": 0.3601379139559961, - "learning_rate": 0.00011210068725038277, - "loss": 0.6597, - "step": 3354 - }, - { - "epoch": 1.6482745894076398, - "grad_norm": 0.41712540386337577, - "learning_rate": 0.00011203956173026297, - "loss": 0.7566, - "step": 3355 - }, - { - "epoch": 1.6487666851202558, - "grad_norm": 0.40304211283122093, - "learning_rate": 0.00011197843164526173, - "loss": 0.6943, - "step": 3356 - }, - { - "epoch": 1.649258780832872, - "grad_norm": 0.4060560773043631, - "learning_rate": 0.00011191729701855696, - "loss": 0.7507, - "step": 3357 - }, - { - "epoch": 1.6497508765454882, - "grad_norm": 0.4200077621192976, - "learning_rate": 0.00011185615787332826, - "loss": 0.7193, - "step": 3358 - }, - { - "epoch": 1.6502429722581042, - "grad_norm": 0.4135040350138502, - "learning_rate": 0.00011179501423275698, - "loss": 0.6496, - "step": 3359 - }, - { - "epoch": 1.6507350679707202, - "grad_norm": 0.4157681663944609, - "learning_rate": 0.00011173386612002605, - "loss": 0.7406, - "step": 3360 - }, - { - "epoch": 1.6512271636833364, - "grad_norm": 0.45892053218899187, - "learning_rate": 0.00011167271355832033, - "loss": 0.7491, - "step": 3361 - }, - { - "epoch": 1.6517192593959527, - "grad_norm": 0.4739170942789628, - "learning_rate": 0.00011161155657082611, - "loss": 0.716, - "step": 3362 - }, - { - "epoch": 1.6522113551085686, - "grad_norm": 0.38768998241486385, - "learning_rate": 0.00011155039518073156, - "loss": 0.6988, - "step": 3363 - }, - { - "epoch": 1.6527034508211846, - "grad_norm": 0.3778780511974089, - "learning_rate": 0.00011148922941122637, - "loss": 0.7272, - "step": 3364 - }, - { - "epoch": 1.6531955465338009, - "grad_norm": 0.4137502099541195, - "learning_rate": 0.000111428059285502, - "loss": 0.7088, - "step": 3365 - }, - { - "epoch": 1.653687642246417, - "grad_norm": 0.4033041644605808, - "learning_rate": 0.0001113668848267515, - "loss": 0.6829, - "step": 3366 - }, - { - "epoch": 1.654179737959033, - "grad_norm": 0.4126681957030245, - "learning_rate": 0.00011130570605816956, - "loss": 0.7165, - "step": 3367 - }, - { - "epoch": 1.654671833671649, - "grad_norm": 0.4190034080828862, - "learning_rate": 0.00011124452300295256, - "loss": 0.7627, - "step": 3368 - }, - { - "epoch": 1.6551639293842653, - "grad_norm": 0.428151834297233, - "learning_rate": 0.00011118333568429848, - "loss": 0.787, - "step": 3369 - }, - { - "epoch": 1.6556560250968815, - "grad_norm": 0.3905762729161534, - "learning_rate": 0.00011112214412540685, - "loss": 0.677, - "step": 3370 - }, - { - "epoch": 1.6561481208094975, - "grad_norm": 0.40359898466168803, - "learning_rate": 0.00011106094834947891, - "loss": 0.7339, - "step": 3371 - }, - { - "epoch": 1.6566402165221135, - "grad_norm": 0.4019980640244289, - "learning_rate": 0.00011099974837971745, - "loss": 0.7641, - "step": 3372 - }, - { - "epoch": 1.6571323122347297, - "grad_norm": 0.4012060101976718, - "learning_rate": 0.00011093854423932684, - "loss": 0.7098, - "step": 3373 - }, - { - "epoch": 1.657624407947346, - "grad_norm": 0.3829638774866954, - "learning_rate": 0.00011087733595151306, - "loss": 0.715, - "step": 3374 - }, - { - "epoch": 1.6581165036599619, - "grad_norm": 0.4257124339227231, - "learning_rate": 0.00011081612353948363, - "loss": 0.7411, - "step": 3375 - }, - { - "epoch": 1.6586085993725779, - "grad_norm": 0.4234409047536631, - "learning_rate": 0.00011075490702644765, - "loss": 0.7731, - "step": 3376 - }, - { - "epoch": 1.659100695085194, - "grad_norm": 0.3861076102536674, - "learning_rate": 0.00011069368643561578, - "loss": 0.6559, - "step": 3377 - }, - { - "epoch": 1.6595927907978103, - "grad_norm": 0.40596536069740446, - "learning_rate": 0.00011063246179020022, - "loss": 0.7371, - "step": 3378 - }, - { - "epoch": 1.6600848865104263, - "grad_norm": 0.41494050533596094, - "learning_rate": 0.00011057123311341473, - "loss": 0.6786, - "step": 3379 - }, - { - "epoch": 1.6605769822230423, - "grad_norm": 0.5018810497425636, - "learning_rate": 0.00011051000042847453, - "loss": 0.8102, - "step": 3380 - }, - { - "epoch": 1.6610690779356585, - "grad_norm": 0.43553815752747455, - "learning_rate": 0.00011044876375859647, - "loss": 0.752, - "step": 3381 - }, - { - "epoch": 1.6615611736482747, - "grad_norm": 0.42698662255119313, - "learning_rate": 0.00011038752312699883, - "loss": 0.6795, - "step": 3382 - }, - { - "epoch": 1.6620532693608907, - "grad_norm": 0.4173636835720084, - "learning_rate": 0.00011032627855690137, - "loss": 0.7322, - "step": 3383 - }, - { - "epoch": 1.6625453650735067, - "grad_norm": 0.3806345058818375, - "learning_rate": 0.00011026503007152542, - "loss": 0.7413, - "step": 3384 - }, - { - "epoch": 1.663037460786123, - "grad_norm": 0.4151686021688595, - "learning_rate": 0.00011020377769409376, - "loss": 0.7111, - "step": 3385 - }, - { - "epoch": 1.6635295564987391, - "grad_norm": 0.43133613729928794, - "learning_rate": 0.00011014252144783061, - "loss": 0.7359, - "step": 3386 - }, - { - "epoch": 1.6640216522113551, - "grad_norm": 0.4201512584519894, - "learning_rate": 0.00011008126135596175, - "loss": 0.6755, - "step": 3387 - }, - { - "epoch": 1.664513747923971, - "grad_norm": 0.4089224483162079, - "learning_rate": 0.00011001999744171431, - "loss": 0.7419, - "step": 3388 - }, - { - "epoch": 1.6650058436365873, - "grad_norm": 0.4260506493443712, - "learning_rate": 0.00010995872972831694, - "loss": 0.7322, - "step": 3389 - }, - { - "epoch": 1.6654979393492035, - "grad_norm": 0.3964593546037636, - "learning_rate": 0.00010989745823899968, - "loss": 0.742, - "step": 3390 - }, - { - "epoch": 1.6659900350618195, - "grad_norm": 0.4138708548288306, - "learning_rate": 0.00010983618299699407, - "loss": 0.7447, - "step": 3391 - }, - { - "epoch": 1.6664821307744355, - "grad_norm": 0.3918383348638912, - "learning_rate": 0.000109774904025533, - "loss": 0.683, - "step": 3392 - }, - { - "epoch": 1.6669742264870517, - "grad_norm": 0.42073281948385244, - "learning_rate": 0.00010971362134785081, - "loss": 0.7421, - "step": 3393 - }, - { - "epoch": 1.667466322199668, - "grad_norm": 0.46714175568654714, - "learning_rate": 0.0001096523349871833, - "loss": 0.7444, - "step": 3394 - }, - { - "epoch": 1.667958417912284, - "grad_norm": 0.4192828537110579, - "learning_rate": 0.00010959104496676753, - "loss": 0.7158, - "step": 3395 - }, - { - "epoch": 1.6684505136249, - "grad_norm": 0.43638900066541814, - "learning_rate": 0.00010952975130984209, - "loss": 0.6868, - "step": 3396 - }, - { - "epoch": 1.6689426093375161, - "grad_norm": 0.40577543224584955, - "learning_rate": 0.00010946845403964683, - "loss": 0.7062, - "step": 3397 - }, - { - "epoch": 1.6694347050501324, - "grad_norm": 0.40755905873082676, - "learning_rate": 0.00010940715317942308, - "loss": 0.7403, - "step": 3398 - }, - { - "epoch": 1.6699268007627484, - "grad_norm": 0.4089856597537415, - "learning_rate": 0.00010934584875241342, - "loss": 0.7309, - "step": 3399 - }, - { - "epoch": 1.6704188964753643, - "grad_norm": 0.41081620165600885, - "learning_rate": 0.00010928454078186186, - "loss": 0.722, - "step": 3400 - }, - { - "epoch": 1.6709109921879806, - "grad_norm": 0.4135880756035975, - "learning_rate": 0.00010922322929101377, - "loss": 0.6873, - "step": 3401 - }, - { - "epoch": 1.6714030879005968, - "grad_norm": 0.41057211036682234, - "learning_rate": 0.00010916191430311576, - "loss": 0.737, - "step": 3402 - }, - { - "epoch": 1.6718951836132128, - "grad_norm": 0.4495536689809271, - "learning_rate": 0.00010910059584141585, - "loss": 0.8352, - "step": 3403 - }, - { - "epoch": 1.6723872793258288, - "grad_norm": 0.42069116659953226, - "learning_rate": 0.00010903927392916335, - "loss": 0.6848, - "step": 3404 - }, - { - "epoch": 1.672879375038445, - "grad_norm": 0.4062013235084829, - "learning_rate": 0.0001089779485896089, - "loss": 0.7541, - "step": 3405 - }, - { - "epoch": 1.6733714707510612, - "grad_norm": 0.40290597719805843, - "learning_rate": 0.00010891661984600437, - "loss": 0.7067, - "step": 3406 - }, - { - "epoch": 1.6738635664636772, - "grad_norm": 0.3858918378569007, - "learning_rate": 0.00010885528772160303, - "loss": 0.6757, - "step": 3407 - }, - { - "epoch": 1.6743556621762932, - "grad_norm": 0.4071178562868276, - "learning_rate": 0.00010879395223965932, - "loss": 0.7594, - "step": 3408 - }, - { - "epoch": 1.6748477578889094, - "grad_norm": 0.42233551115816875, - "learning_rate": 0.00010873261342342902, - "loss": 0.7572, - "step": 3409 - }, - { - "epoch": 1.6753398536015256, - "grad_norm": 0.3697017015625988, - "learning_rate": 0.00010867127129616917, - "loss": 0.7018, - "step": 3410 - }, - { - "epoch": 1.6758319493141416, - "grad_norm": 0.38368958954840904, - "learning_rate": 0.00010860992588113802, - "loss": 0.7188, - "step": 3411 - }, - { - "epoch": 1.6763240450267576, - "grad_norm": 0.391557735397661, - "learning_rate": 0.00010854857720159515, - "loss": 0.6693, - "step": 3412 - }, - { - "epoch": 1.6768161407393738, - "grad_norm": 0.432049371194674, - "learning_rate": 0.00010848722528080124, - "loss": 0.7832, - "step": 3413 - }, - { - "epoch": 1.67730823645199, - "grad_norm": 0.39263819815769896, - "learning_rate": 0.0001084258701420184, - "loss": 0.7177, - "step": 3414 - }, - { - "epoch": 1.677800332164606, - "grad_norm": 0.4150764571185619, - "learning_rate": 0.00010836451180850977, - "loss": 0.6902, - "step": 3415 - }, - { - "epoch": 1.678292427877222, - "grad_norm": 0.4143147513945901, - "learning_rate": 0.00010830315030353979, - "loss": 0.7163, - "step": 3416 - }, - { - "epoch": 1.6787845235898382, - "grad_norm": 0.3921533202636694, - "learning_rate": 0.00010824178565037413, - "loss": 0.6945, - "step": 3417 - }, - { - "epoch": 1.6792766193024544, - "grad_norm": 0.4343821634061841, - "learning_rate": 0.00010818041787227957, - "loss": 0.7652, - "step": 3418 - }, - { - "epoch": 1.6797687150150704, - "grad_norm": 0.6397532413548314, - "learning_rate": 0.00010811904699252416, - "loss": 0.674, - "step": 3419 - }, - { - "epoch": 1.6802608107276864, - "grad_norm": 0.43034617515641505, - "learning_rate": 0.00010805767303437702, - "loss": 0.762, - "step": 3420 - }, - { - "epoch": 1.6807529064403026, - "grad_norm": 0.4197674143293288, - "learning_rate": 0.00010799629602110857, - "loss": 0.6872, - "step": 3421 - }, - { - "epoch": 1.6812450021529188, - "grad_norm": 0.40073736756931694, - "learning_rate": 0.00010793491597599026, - "loss": 0.7064, - "step": 3422 - }, - { - "epoch": 1.6817370978655348, - "grad_norm": 0.384927613805364, - "learning_rate": 0.00010787353292229478, - "loss": 0.7071, - "step": 3423 - }, - { - "epoch": 1.6822291935781508, - "grad_norm": 0.4107789427659322, - "learning_rate": 0.00010781214688329598, - "loss": 0.7093, - "step": 3424 - }, - { - "epoch": 1.682721289290767, - "grad_norm": 0.39193366996824297, - "learning_rate": 0.00010775075788226872, - "loss": 0.6758, - "step": 3425 - }, - { - "epoch": 1.6832133850033832, - "grad_norm": 0.4041036729796495, - "learning_rate": 0.00010768936594248904, - "loss": 0.7216, - "step": 3426 - }, - { - "epoch": 1.6837054807159992, - "grad_norm": 0.40704820513533807, - "learning_rate": 0.00010762797108723419, - "loss": 0.7579, - "step": 3427 - }, - { - "epoch": 1.6841975764286152, - "grad_norm": 0.40699307998332807, - "learning_rate": 0.00010756657333978242, - "loss": 0.6755, - "step": 3428 - }, - { - "epoch": 1.6846896721412314, - "grad_norm": 0.4152553427668556, - "learning_rate": 0.00010750517272341305, - "loss": 0.7296, - "step": 3429 - }, - { - "epoch": 1.6851817678538477, - "grad_norm": 0.3861353882469795, - "learning_rate": 0.0001074437692614066, - "loss": 0.69, - "step": 3430 - }, - { - "epoch": 1.6856738635664636, - "grad_norm": 0.38661979479447617, - "learning_rate": 0.00010738236297704458, - "loss": 0.7068, - "step": 3431 - }, - { - "epoch": 1.6861659592790796, - "grad_norm": 0.3830334849957862, - "learning_rate": 0.0001073209538936096, - "loss": 0.6544, - "step": 3432 - }, - { - "epoch": 1.6866580549916959, - "grad_norm": 0.4037450767030282, - "learning_rate": 0.00010725954203438538, - "loss": 0.7059, - "step": 3433 - }, - { - "epoch": 1.687150150704312, - "grad_norm": 0.42276141430010444, - "learning_rate": 0.00010719812742265656, - "loss": 0.7198, - "step": 3434 - }, - { - "epoch": 1.687642246416928, - "grad_norm": 0.41011513013201595, - "learning_rate": 0.00010713671008170895, - "loss": 0.6751, - "step": 3435 - }, - { - "epoch": 1.688134342129544, - "grad_norm": 0.42142869323810805, - "learning_rate": 0.00010707529003482932, - "loss": 0.7365, - "step": 3436 - }, - { - "epoch": 1.6886264378421603, - "grad_norm": 0.41207768141580325, - "learning_rate": 0.00010701386730530556, - "loss": 0.7399, - "step": 3437 - }, - { - "epoch": 1.6891185335547765, - "grad_norm": 0.37682752812936443, - "learning_rate": 0.00010695244191642648, - "loss": 0.6606, - "step": 3438 - }, - { - "epoch": 1.6896106292673925, - "grad_norm": 0.37990615917430404, - "learning_rate": 0.00010689101389148188, - "loss": 0.6944, - "step": 3439 - }, - { - "epoch": 1.6901027249800085, - "grad_norm": 0.3965579495629195, - "learning_rate": 0.00010682958325376271, - "loss": 0.7264, - "step": 3440 - }, - { - "epoch": 1.6905948206926247, - "grad_norm": 0.41776268306578, - "learning_rate": 0.00010676815002656076, - "loss": 0.7259, - "step": 3441 - }, - { - "epoch": 1.691086916405241, - "grad_norm": 0.3739672904934821, - "learning_rate": 0.00010670671423316884, - "loss": 0.6501, - "step": 3442 - }, - { - "epoch": 1.6915790121178569, - "grad_norm": 0.39365934144674114, - "learning_rate": 0.00010664527589688079, - "loss": 0.723, - "step": 3443 - }, - { - "epoch": 1.6920711078304729, - "grad_norm": 0.43340793664309574, - "learning_rate": 0.00010658383504099134, - "loss": 0.6439, - "step": 3444 - }, - { - "epoch": 1.692563203543089, - "grad_norm": 0.3905102487641378, - "learning_rate": 0.00010652239168879619, - "loss": 0.7088, - "step": 3445 - }, - { - "epoch": 1.6930552992557053, - "grad_norm": 0.4149420738539436, - "learning_rate": 0.00010646094586359203, - "loss": 0.765, - "step": 3446 - }, - { - "epoch": 1.6935473949683213, - "grad_norm": 0.4111554063228533, - "learning_rate": 0.00010639949758867649, - "loss": 0.7387, - "step": 3447 - }, - { - "epoch": 1.6940394906809373, - "grad_norm": 0.4051040945649876, - "learning_rate": 0.00010633804688734806, - "loss": 0.6392, - "step": 3448 - }, - { - "epoch": 1.6945315863935535, - "grad_norm": 0.42401110785586005, - "learning_rate": 0.00010627659378290617, - "loss": 0.7706, - "step": 3449 - }, - { - "epoch": 1.6950236821061697, - "grad_norm": 0.4098836282687331, - "learning_rate": 0.00010621513829865124, - "loss": 0.7824, - "step": 3450 - }, - { - "epoch": 1.6955157778187857, - "grad_norm": 0.3953800504291363, - "learning_rate": 0.0001061536804578845, - "loss": 0.7369, - "step": 3451 - }, - { - "epoch": 1.6960078735314017, - "grad_norm": 0.3879011551239534, - "learning_rate": 0.00010609222028390808, - "loss": 0.674, - "step": 3452 - }, - { - "epoch": 1.696499969244018, - "grad_norm": 0.3936995377938904, - "learning_rate": 0.00010603075780002507, - "loss": 0.7182, - "step": 3453 - }, - { - "epoch": 1.6969920649566341, - "grad_norm": 0.4124503683356637, - "learning_rate": 0.00010596929302953937, - "loss": 0.7175, - "step": 3454 - }, - { - "epoch": 1.6974841606692501, - "grad_norm": 0.4221111801284014, - "learning_rate": 0.00010590782599575578, - "loss": 0.6894, - "step": 3455 - }, - { - "epoch": 1.6979762563818661, - "grad_norm": 0.38030934929246407, - "learning_rate": 0.0001058463567219799, - "loss": 0.6647, - "step": 3456 - }, - { - "epoch": 1.6984683520944823, - "grad_norm": 0.4356937652188898, - "learning_rate": 0.00010578488523151829, - "loss": 0.734, - "step": 3457 - }, - { - "epoch": 1.6989604478070985, - "grad_norm": 0.4079031161575368, - "learning_rate": 0.00010572341154767817, - "loss": 0.6947, - "step": 3458 - }, - { - "epoch": 1.6994525435197145, - "grad_norm": 0.40368267187106555, - "learning_rate": 0.0001056619356937678, - "loss": 0.6886, - "step": 3459 - }, - { - "epoch": 1.6999446392323305, - "grad_norm": 0.39113338766393774, - "learning_rate": 0.00010560045769309617, - "loss": 0.6906, - "step": 3460 - }, - { - "epoch": 1.7004367349449467, - "grad_norm": 0.38863425958578685, - "learning_rate": 0.00010553897756897304, - "loss": 0.771, - "step": 3461 - }, - { - "epoch": 1.700928830657563, - "grad_norm": 0.39353367669803496, - "learning_rate": 0.00010547749534470898, - "loss": 0.6912, - "step": 3462 - }, - { - "epoch": 1.701420926370179, - "grad_norm": 0.382638751415916, - "learning_rate": 0.0001054160110436155, - "loss": 0.6611, - "step": 3463 - }, - { - "epoch": 1.7019130220827952, - "grad_norm": 0.4416567025751971, - "learning_rate": 0.00010535452468900471, - "loss": 0.6856, - "step": 3464 - }, - { - "epoch": 1.7024051177954114, - "grad_norm": 0.3989790595531368, - "learning_rate": 0.00010529303630418959, - "loss": 0.7047, - "step": 3465 - }, - { - "epoch": 1.7028972135080274, - "grad_norm": 0.4302125927914887, - "learning_rate": 0.00010523154591248387, - "loss": 0.6971, - "step": 3466 - }, - { - "epoch": 1.7033893092206434, - "grad_norm": 0.3897444973557634, - "learning_rate": 0.00010517005353720208, - "loss": 0.6806, - "step": 3467 - }, - { - "epoch": 1.7038814049332596, - "grad_norm": 0.3998831439045781, - "learning_rate": 0.00010510855920165944, - "loss": 0.7455, - "step": 3468 - }, - { - "epoch": 1.7043735006458758, - "grad_norm": 0.434049063496724, - "learning_rate": 0.00010504706292917196, - "loss": 0.7863, - "step": 3469 - }, - { - "epoch": 1.7048655963584918, - "grad_norm": 0.362344809714173, - "learning_rate": 0.00010498556474305638, - "loss": 0.6478, - "step": 3470 - }, - { - "epoch": 1.7053576920711078, - "grad_norm": 0.4004932249867723, - "learning_rate": 0.00010492406466663012, - "loss": 0.7207, - "step": 3471 - }, - { - "epoch": 1.705849787783724, - "grad_norm": 0.3847313072495366, - "learning_rate": 0.00010486256272321137, - "loss": 0.7137, - "step": 3472 - }, - { - "epoch": 1.7063418834963402, - "grad_norm": 0.40582667619910506, - "learning_rate": 0.00010480105893611902, - "loss": 0.6809, - "step": 3473 - }, - { - "epoch": 1.7068339792089562, - "grad_norm": 0.4081347903126302, - "learning_rate": 0.00010473955332867265, - "loss": 0.7758, - "step": 3474 - }, - { - "epoch": 1.7073260749215722, - "grad_norm": 0.39818872530915195, - "learning_rate": 0.00010467804592419248, - "loss": 0.6788, - "step": 3475 - }, - { - "epoch": 1.7078181706341884, - "grad_norm": 0.4315985147326263, - "learning_rate": 0.00010461653674599951, - "loss": 0.7231, - "step": 3476 - }, - { - "epoch": 1.7083102663468046, - "grad_norm": 0.36912180977733466, - "learning_rate": 0.00010455502581741536, - "loss": 0.6004, - "step": 3477 - }, - { - "epoch": 1.7088023620594206, - "grad_norm": 0.4252613710701617, - "learning_rate": 0.0001044935131617623, - "loss": 0.7596, - "step": 3478 - }, - { - "epoch": 1.7092944577720366, - "grad_norm": 0.411811627204728, - "learning_rate": 0.00010443199880236325, - "loss": 0.7626, - "step": 3479 - }, - { - "epoch": 1.7097865534846528, - "grad_norm": 0.4087737533711953, - "learning_rate": 0.00010437048276254185, - "loss": 0.7217, - "step": 3480 - }, - { - "epoch": 1.710278649197269, - "grad_norm": 0.3867689710603881, - "learning_rate": 0.00010430896506562224, - "loss": 0.697, - "step": 3481 - }, - { - "epoch": 1.710770744909885, - "grad_norm": 0.39001765816253, - "learning_rate": 0.00010424744573492937, - "loss": 0.7491, - "step": 3482 - }, - { - "epoch": 1.711262840622501, - "grad_norm": 0.4040324344323463, - "learning_rate": 0.00010418592479378863, - "loss": 0.6807, - "step": 3483 - }, - { - "epoch": 1.7117549363351172, - "grad_norm": 0.39205894436891514, - "learning_rate": 0.00010412440226552618, - "loss": 0.7686, - "step": 3484 - }, - { - "epoch": 1.7122470320477334, - "grad_norm": 0.39573002089340803, - "learning_rate": 0.0001040628781734686, - "loss": 0.6872, - "step": 3485 - }, - { - "epoch": 1.7127391277603494, - "grad_norm": 0.38844382559662977, - "learning_rate": 0.00010400135254094328, - "loss": 0.6386, - "step": 3486 - }, - { - "epoch": 1.7132312234729654, - "grad_norm": 0.4053088198277832, - "learning_rate": 0.000103939825391278, - "loss": 0.7674, - "step": 3487 - }, - { - "epoch": 1.7137233191855816, - "grad_norm": 0.41086119397989745, - "learning_rate": 0.00010387829674780123, - "loss": 0.7798, - "step": 3488 - }, - { - "epoch": 1.7142154148981978, - "grad_norm": 0.40896189748626177, - "learning_rate": 0.00010381676663384196, - "loss": 0.7271, - "step": 3489 - }, - { - "epoch": 1.7147075106108138, - "grad_norm": 0.38509716197342253, - "learning_rate": 0.00010375523507272975, - "loss": 0.6816, - "step": 3490 - }, - { - "epoch": 1.7151996063234298, - "grad_norm": 0.3822168242330676, - "learning_rate": 0.00010369370208779469, - "loss": 0.689, - "step": 3491 - }, - { - "epoch": 1.715691702036046, - "grad_norm": 0.39191773645131417, - "learning_rate": 0.0001036321677023675, - "loss": 0.7319, - "step": 3492 - }, - { - "epoch": 1.7161837977486623, - "grad_norm": 0.43689208804666463, - "learning_rate": 0.0001035706319397793, - "loss": 0.7913, - "step": 3493 - }, - { - "epoch": 1.7166758934612782, - "grad_norm": 0.40220947564620113, - "learning_rate": 0.00010350909482336176, - "loss": 0.7456, - "step": 3494 - }, - { - "epoch": 1.7171679891738942, - "grad_norm": 0.4026476252829011, - "learning_rate": 0.00010344755637644717, - "loss": 0.7621, - "step": 3495 - }, - { - "epoch": 1.7176600848865105, - "grad_norm": 0.4354533643323663, - "learning_rate": 0.00010338601662236823, - "loss": 0.7264, - "step": 3496 - }, - { - "epoch": 1.7181521805991267, - "grad_norm": 0.4465269393370047, - "learning_rate": 0.0001033244755844581, - "loss": 0.7096, - "step": 3497 - }, - { - "epoch": 1.7186442763117427, - "grad_norm": 0.38531188475617795, - "learning_rate": 0.00010326293328605052, - "loss": 0.7259, - "step": 3498 - }, - { - "epoch": 1.7191363720243586, - "grad_norm": 0.3955474469345509, - "learning_rate": 0.00010320138975047971, - "loss": 0.6948, - "step": 3499 - }, - { - "epoch": 1.7196284677369749, - "grad_norm": 0.372532556838744, - "learning_rate": 0.00010313984500108025, - "loss": 0.653, - "step": 3500 - }, - { - "epoch": 1.720120563449591, - "grad_norm": 0.3817651953050749, - "learning_rate": 0.0001030782990611873, - "loss": 0.7308, - "step": 3501 - }, - { - "epoch": 1.720612659162207, - "grad_norm": 0.43360979815950806, - "learning_rate": 0.0001030167519541364, - "loss": 0.7916, - "step": 3502 - }, - { - "epoch": 1.721104754874823, - "grad_norm": 0.3670332825002412, - "learning_rate": 0.00010295520370326355, - "loss": 0.734, - "step": 3503 - }, - { - "epoch": 1.7215968505874393, - "grad_norm": 0.41344384343051993, - "learning_rate": 0.00010289365433190514, - "loss": 0.7197, - "step": 3504 - }, - { - "epoch": 1.7220889463000555, - "grad_norm": 0.38374499673397267, - "learning_rate": 0.00010283210386339813, - "loss": 0.7351, - "step": 3505 - }, - { - "epoch": 1.7225810420126715, - "grad_norm": 0.3941941455094037, - "learning_rate": 0.00010277055232107975, - "loss": 0.7782, - "step": 3506 - }, - { - "epoch": 1.7230731377252875, - "grad_norm": 0.3933031831370362, - "learning_rate": 0.00010270899972828764, - "loss": 0.7382, - "step": 3507 - }, - { - "epoch": 1.7235652334379037, - "grad_norm": 0.40792635676036754, - "learning_rate": 0.00010264744610835995, - "loss": 0.7309, - "step": 3508 - }, - { - "epoch": 1.72405732915052, - "grad_norm": 0.4037259709765141, - "learning_rate": 0.00010258589148463513, - "loss": 0.7129, - "step": 3509 - }, - { - "epoch": 1.724549424863136, - "grad_norm": 0.40434860022010893, - "learning_rate": 0.00010252433588045203, - "loss": 0.7048, - "step": 3510 - }, - { - "epoch": 1.7250415205757519, - "grad_norm": 0.3930347623977529, - "learning_rate": 0.00010246277931914987, - "loss": 0.6624, - "step": 3511 - }, - { - "epoch": 1.725533616288368, - "grad_norm": 0.41578831003075, - "learning_rate": 0.00010240122182406824, - "loss": 0.7506, - "step": 3512 - }, - { - "epoch": 1.7260257120009843, - "grad_norm": 0.4079183364657208, - "learning_rate": 0.00010233966341854708, - "loss": 0.7897, - "step": 3513 - }, - { - "epoch": 1.7265178077136003, - "grad_norm": 0.4006686873888735, - "learning_rate": 0.00010227810412592667, - "loss": 0.7007, - "step": 3514 - }, - { - "epoch": 1.7270099034262163, - "grad_norm": 0.41546339244659536, - "learning_rate": 0.00010221654396954765, - "loss": 0.6996, - "step": 3515 - }, - { - "epoch": 1.7275019991388325, - "grad_norm": 0.3612698481311449, - "learning_rate": 0.00010215498297275095, - "loss": 0.6563, - "step": 3516 - }, - { - "epoch": 1.7279940948514487, - "grad_norm": 0.3788470095424007, - "learning_rate": 0.00010209342115887786, - "loss": 0.7211, - "step": 3517 - }, - { - "epoch": 1.7284861905640647, - "grad_norm": 0.38345562832385494, - "learning_rate": 0.00010203185855126995, - "loss": 0.7401, - "step": 3518 - }, - { - "epoch": 1.7289782862766807, - "grad_norm": 0.4389762480016236, - "learning_rate": 0.0001019702951732691, - "loss": 0.7652, - "step": 3519 - }, - { - "epoch": 1.729470381989297, - "grad_norm": 0.44322486526136545, - "learning_rate": 0.00010190873104821747, - "loss": 0.7873, - "step": 3520 - }, - { - "epoch": 1.7299624777019131, - "grad_norm": 0.4107555332048576, - "learning_rate": 0.00010184716619945753, - "loss": 0.7415, - "step": 3521 - }, - { - "epoch": 1.7304545734145291, - "grad_norm": 0.4175407609007106, - "learning_rate": 0.00010178560065033202, - "loss": 0.7055, - "step": 3522 - }, - { - "epoch": 1.7309466691271451, - "grad_norm": 0.3824759288659564, - "learning_rate": 0.00010172403442418394, - "loss": 0.6757, - "step": 3523 - }, - { - "epoch": 1.7314387648397613, - "grad_norm": 0.3842239327936125, - "learning_rate": 0.0001016624675443565, - "loss": 0.7057, - "step": 3524 - }, - { - "epoch": 1.7319308605523775, - "grad_norm": 0.442323995185006, - "learning_rate": 0.00010160090003419324, - "loss": 0.7685, - "step": 3525 - }, - { - "epoch": 1.7324229562649935, - "grad_norm": 0.3791806550187048, - "learning_rate": 0.00010153933191703789, - "loss": 0.6874, - "step": 3526 - }, - { - "epoch": 1.7329150519776095, - "grad_norm": 0.400614393343862, - "learning_rate": 0.00010147776321623437, - "loss": 0.7042, - "step": 3527 - }, - { - "epoch": 1.7334071476902257, - "grad_norm": 0.401074185035461, - "learning_rate": 0.00010141619395512694, - "loss": 0.6988, - "step": 3528 - }, - { - "epoch": 1.733899243402842, - "grad_norm": 0.389821659209944, - "learning_rate": 0.00010135462415705996, - "loss": 0.7341, - "step": 3529 - }, - { - "epoch": 1.734391339115458, - "grad_norm": 0.41109927618522046, - "learning_rate": 0.00010129305384537803, - "loss": 0.713, - "step": 3530 - }, - { - "epoch": 1.734883434828074, - "grad_norm": 0.4303110985782416, - "learning_rate": 0.000101231483043426, - "loss": 0.7744, - "step": 3531 - }, - { - "epoch": 1.7353755305406902, - "grad_norm": 0.3926575069143071, - "learning_rate": 0.00010116991177454884, - "loss": 0.6485, - "step": 3532 - }, - { - "epoch": 1.7358676262533064, - "grad_norm": 0.3829580495426352, - "learning_rate": 0.0001011083400620917, - "loss": 0.6795, - "step": 3533 - }, - { - "epoch": 1.7363597219659224, - "grad_norm": 0.38757407855706405, - "learning_rate": 0.00010104676792939991, - "loss": 0.6671, - "step": 3534 - }, - { - "epoch": 1.7368518176785384, - "grad_norm": 0.40191137525061993, - "learning_rate": 0.00010098519539981895, - "loss": 0.7354, - "step": 3535 - }, - { - "epoch": 1.7373439133911546, - "grad_norm": 0.4192264881292666, - "learning_rate": 0.00010092362249669449, - "loss": 0.6715, - "step": 3536 - }, - { - "epoch": 1.7378360091037708, - "grad_norm": 0.41467412279185634, - "learning_rate": 0.00010086204924337228, - "loss": 0.7166, - "step": 3537 - }, - { - "epoch": 1.7383281048163868, - "grad_norm": 0.4271854923288818, - "learning_rate": 0.00010080047566319828, - "loss": 0.7443, - "step": 3538 - }, - { - "epoch": 1.7388202005290028, - "grad_norm": 0.411345068577774, - "learning_rate": 0.0001007389017795185, - "loss": 0.7376, - "step": 3539 - }, - { - "epoch": 1.739312296241619, - "grad_norm": 0.39822703509584123, - "learning_rate": 0.00010067732761567909, - "loss": 0.7129, - "step": 3540 - }, - { - "epoch": 1.7398043919542352, - "grad_norm": 0.4109190607654912, - "learning_rate": 0.00010061575319502634, - "loss": 0.7226, - "step": 3541 - }, - { - "epoch": 1.7402964876668512, - "grad_norm": 0.3960863412071096, - "learning_rate": 0.00010055417854090661, - "loss": 0.6881, - "step": 3542 - }, - { - "epoch": 1.7407885833794672, - "grad_norm": 0.3691519032507848, - "learning_rate": 0.00010049260367666629, - "loss": 0.6281, - "step": 3543 - }, - { - "epoch": 1.7412806790920834, - "grad_norm": 0.47692918908217197, - "learning_rate": 0.00010043102862565197, - "loss": 0.7108, - "step": 3544 - }, - { - "epoch": 1.7417727748046996, - "grad_norm": 0.36731186336445404, - "learning_rate": 0.00010036945341121023, - "loss": 0.6829, - "step": 3545 - }, - { - "epoch": 1.7422648705173156, - "grad_norm": 0.41197569915970433, - "learning_rate": 0.00010030787805668772, - "loss": 0.7398, - "step": 3546 - }, - { - "epoch": 1.7427569662299316, - "grad_norm": 0.3956203084461126, - "learning_rate": 0.00010024630258543115, - "loss": 0.7136, - "step": 3547 - }, - { - "epoch": 1.7432490619425478, - "grad_norm": 0.3764832927847127, - "learning_rate": 0.00010018472702078731, - "loss": 0.6258, - "step": 3548 - }, - { - "epoch": 1.743741157655164, - "grad_norm": 0.39374042961756567, - "learning_rate": 0.00010012315138610296, - "loss": 0.6854, - "step": 3549 - }, - { - "epoch": 1.74423325336778, - "grad_norm": 0.3718920513883015, - "learning_rate": 0.0001000615757047249, - "loss": 0.7036, - "step": 3550 - }, - { - "epoch": 1.744725349080396, - "grad_norm": 0.3840694493698443, - "learning_rate": 0.0001, - "loss": 0.6737, - "step": 3551 - }, - { - "epoch": 1.7452174447930122, - "grad_norm": 0.37453480334956263, - "learning_rate": 9.993842429527511e-05, - "loss": 0.6893, - "step": 3552 - }, - { - "epoch": 1.7457095405056284, - "grad_norm": 0.3855906756896309, - "learning_rate": 9.98768486138971e-05, - "loss": 0.6951, - "step": 3553 - }, - { - "epoch": 1.7462016362182444, - "grad_norm": 0.4039952146641488, - "learning_rate": 9.981527297921271e-05, - "loss": 0.7419, - "step": 3554 - }, - { - "epoch": 1.7466937319308604, - "grad_norm": 0.374779436380444, - "learning_rate": 9.975369741456886e-05, - "loss": 0.7277, - "step": 3555 - }, - { - "epoch": 1.7471858276434766, - "grad_norm": 0.4017553648948238, - "learning_rate": 9.96921219433123e-05, - "loss": 0.7364, - "step": 3556 - }, - { - "epoch": 1.7476779233560928, - "grad_norm": 0.3874209994250295, - "learning_rate": 9.963054658878979e-05, - "loss": 0.6758, - "step": 3557 - }, - { - "epoch": 1.7481700190687088, - "grad_norm": 0.4008569565521517, - "learning_rate": 9.956897137434803e-05, - "loss": 0.6873, - "step": 3558 - }, - { - "epoch": 1.7486621147813248, - "grad_norm": 0.4084491154396771, - "learning_rate": 9.950739632333374e-05, - "loss": 0.7869, - "step": 3559 - }, - { - "epoch": 1.749154210493941, - "grad_norm": 0.3842445774848982, - "learning_rate": 9.944582145909342e-05, - "loss": 0.775, - "step": 3560 - }, - { - "epoch": 1.7496463062065573, - "grad_norm": 0.38670512594838846, - "learning_rate": 9.938424680497366e-05, - "loss": 0.7093, - "step": 3561 - }, - { - "epoch": 1.7501384019191732, - "grad_norm": 0.4232140184965088, - "learning_rate": 9.932267238432092e-05, - "loss": 0.6848, - "step": 3562 - }, - { - "epoch": 1.7506304976317892, - "grad_norm": 0.3967726417745073, - "learning_rate": 9.926109822048152e-05, - "loss": 0.6915, - "step": 3563 - }, - { - "epoch": 1.7511225933444055, - "grad_norm": 0.3992389662750598, - "learning_rate": 9.919952433680176e-05, - "loss": 0.7599, - "step": 3564 - }, - { - "epoch": 1.7516146890570217, - "grad_norm": 0.4069296621023427, - "learning_rate": 9.913795075662773e-05, - "loss": 0.7919, - "step": 3565 - }, - { - "epoch": 1.7521067847696377, - "grad_norm": 0.40169081802961726, - "learning_rate": 9.907637750330552e-05, - "loss": 0.7502, - "step": 3566 - }, - { - "epoch": 1.7525988804822537, - "grad_norm": 0.44601310963902435, - "learning_rate": 9.901480460018109e-05, - "loss": 0.6969, - "step": 3567 - }, - { - "epoch": 1.7530909761948699, - "grad_norm": 0.43521158507426405, - "learning_rate": 9.895323207060012e-05, - "loss": 0.7, - "step": 3568 - }, - { - "epoch": 1.753583071907486, - "grad_norm": 0.40027753321132364, - "learning_rate": 9.889165993790833e-05, - "loss": 0.7111, - "step": 3569 - }, - { - "epoch": 1.754075167620102, - "grad_norm": 0.40321945384277336, - "learning_rate": 9.883008822545118e-05, - "loss": 0.7506, - "step": 3570 - }, - { - "epoch": 1.754567263332718, - "grad_norm": 0.41366983129488605, - "learning_rate": 9.876851695657401e-05, - "loss": 0.7994, - "step": 3571 - }, - { - "epoch": 1.7550593590453343, - "grad_norm": 0.3958610074652765, - "learning_rate": 9.870694615462196e-05, - "loss": 0.7408, - "step": 3572 - }, - { - "epoch": 1.7555514547579505, - "grad_norm": 0.40465987135097947, - "learning_rate": 9.864537584294009e-05, - "loss": 0.7041, - "step": 3573 - }, - { - "epoch": 1.7560435504705665, - "grad_norm": 0.4131957896563108, - "learning_rate": 9.85838060448731e-05, - "loss": 0.6722, - "step": 3574 - }, - { - "epoch": 1.7565356461831825, - "grad_norm": 0.4070671999796951, - "learning_rate": 9.852223678376564e-05, - "loss": 0.6942, - "step": 3575 - }, - { - "epoch": 1.7570277418957987, - "grad_norm": 0.40961386529538774, - "learning_rate": 9.846066808296216e-05, - "loss": 0.7206, - "step": 3576 - }, - { - "epoch": 1.757519837608415, - "grad_norm": 0.3957878469143598, - "learning_rate": 9.839909996580678e-05, - "loss": 0.6698, - "step": 3577 - }, - { - "epoch": 1.758011933321031, - "grad_norm": 0.43925914277772965, - "learning_rate": 9.83375324556435e-05, - "loss": 0.7104, - "step": 3578 - }, - { - "epoch": 1.7585040290336469, - "grad_norm": 0.41004679098200747, - "learning_rate": 9.827596557581608e-05, - "loss": 0.6971, - "step": 3579 - }, - { - "epoch": 1.758996124746263, - "grad_norm": 0.40025436562277616, - "learning_rate": 9.821439934966799e-05, - "loss": 0.7186, - "step": 3580 - }, - { - "epoch": 1.7594882204588793, - "grad_norm": 0.4013237402959138, - "learning_rate": 9.815283380054245e-05, - "loss": 0.7392, - "step": 3581 - }, - { - "epoch": 1.7599803161714953, - "grad_norm": 0.39082561983900566, - "learning_rate": 9.809126895178255e-05, - "loss": 0.7119, - "step": 3582 - }, - { - "epoch": 1.7604724118841113, - "grad_norm": 0.44344168165424497, - "learning_rate": 9.802970482673092e-05, - "loss": 0.7736, - "step": 3583 - }, - { - "epoch": 1.7609645075967275, - "grad_norm": 0.4265648361327368, - "learning_rate": 9.796814144873006e-05, - "loss": 0.7046, - "step": 3584 - }, - { - "epoch": 1.7614566033093437, - "grad_norm": 0.3885664868990184, - "learning_rate": 9.790657884112218e-05, - "loss": 0.7191, - "step": 3585 - }, - { - "epoch": 1.7619486990219597, - "grad_norm": 0.43208693779897167, - "learning_rate": 9.784501702724906e-05, - "loss": 0.7569, - "step": 3586 - }, - { - "epoch": 1.7624407947345757, - "grad_norm": 6.769422811998193, - "learning_rate": 9.778345603045236e-05, - "loss": 0.9136, - "step": 3587 - }, - { - "epoch": 1.762932890447192, - "grad_norm": 0.42892805162230574, - "learning_rate": 9.772189587407337e-05, - "loss": 0.687, - "step": 3588 - }, - { - "epoch": 1.7634249861598081, - "grad_norm": 0.39870719955427314, - "learning_rate": 9.766033658145293e-05, - "loss": 0.6832, - "step": 3589 - }, - { - "epoch": 1.7639170818724241, - "grad_norm": 0.41477009627463773, - "learning_rate": 9.759877817593181e-05, - "loss": 0.7545, - "step": 3590 - }, - { - "epoch": 1.7644091775850403, - "grad_norm": 0.4338401142547945, - "learning_rate": 9.753722068085017e-05, - "loss": 0.7534, - "step": 3591 - }, - { - "epoch": 1.7649012732976566, - "grad_norm": 0.4310440685314093, - "learning_rate": 9.7475664119548e-05, - "loss": 0.7866, - "step": 3592 - }, - { - "epoch": 1.7653933690102726, - "grad_norm": 0.4006500381452929, - "learning_rate": 9.741410851536489e-05, - "loss": 0.7156, - "step": 3593 - }, - { - "epoch": 1.7658854647228885, - "grad_norm": 0.3983126074848176, - "learning_rate": 9.735255389164007e-05, - "loss": 0.7056, - "step": 3594 - }, - { - "epoch": 1.7663775604355048, - "grad_norm": 0.41122094306894436, - "learning_rate": 9.729100027171237e-05, - "loss": 0.6845, - "step": 3595 - }, - { - "epoch": 1.766869656148121, - "grad_norm": 0.43211998328105466, - "learning_rate": 9.72294476789203e-05, - "loss": 0.726, - "step": 3596 - }, - { - "epoch": 1.767361751860737, - "grad_norm": 0.4370230378403562, - "learning_rate": 9.716789613660188e-05, - "loss": 0.7546, - "step": 3597 - }, - { - "epoch": 1.767853847573353, - "grad_norm": 0.45340938619946647, - "learning_rate": 9.710634566809484e-05, - "loss": 0.7013, - "step": 3598 - }, - { - "epoch": 1.7683459432859692, - "grad_norm": 0.4086115283344419, - "learning_rate": 9.70447962967365e-05, - "loss": 0.7106, - "step": 3599 - }, - { - "epoch": 1.7688380389985854, - "grad_norm": 0.41458569303006004, - "learning_rate": 9.698324804586362e-05, - "loss": 0.6908, - "step": 3600 - }, - { - "epoch": 1.7693301347112014, - "grad_norm": 0.3746455260273381, - "learning_rate": 9.692170093881272e-05, - "loss": 0.6919, - "step": 3601 - }, - { - "epoch": 1.7698222304238174, - "grad_norm": 0.38263056436158766, - "learning_rate": 9.686015499891976e-05, - "loss": 0.7329, - "step": 3602 - }, - { - "epoch": 1.7703143261364336, - "grad_norm": 0.39690722756054714, - "learning_rate": 9.679861024952031e-05, - "loss": 0.7098, - "step": 3603 - }, - { - "epoch": 1.7708064218490498, - "grad_norm": 0.4288916707528441, - "learning_rate": 9.673706671394947e-05, - "loss": 0.7258, - "step": 3604 - }, - { - "epoch": 1.7712985175616658, - "grad_norm": 0.40693943074181127, - "learning_rate": 9.667552441554193e-05, - "loss": 0.6631, - "step": 3605 - }, - { - "epoch": 1.7717906132742818, - "grad_norm": 0.3943991617454967, - "learning_rate": 9.661398337763181e-05, - "loss": 0.6899, - "step": 3606 - }, - { - "epoch": 1.772282708986898, - "grad_norm": 0.46438425466168803, - "learning_rate": 9.655244362355283e-05, - "loss": 0.7604, - "step": 3607 - }, - { - "epoch": 1.7727748046995142, - "grad_norm": 0.39289139588856387, - "learning_rate": 9.649090517663825e-05, - "loss": 0.6647, - "step": 3608 - }, - { - "epoch": 1.7732669004121302, - "grad_norm": 0.37589813982708253, - "learning_rate": 9.642936806022074e-05, - "loss": 0.7453, - "step": 3609 - }, - { - "epoch": 1.7737589961247462, - "grad_norm": 0.43598144036229847, - "learning_rate": 9.63678322976325e-05, - "loss": 0.752, - "step": 3610 - }, - { - "epoch": 1.7742510918373624, - "grad_norm": 0.4202884364136635, - "learning_rate": 9.630629791220532e-05, - "loss": 0.7024, - "step": 3611 - }, - { - "epoch": 1.7747431875499786, - "grad_norm": 0.41047943093238265, - "learning_rate": 9.624476492727026e-05, - "loss": 0.7001, - "step": 3612 - }, - { - "epoch": 1.7752352832625946, - "grad_norm": 0.38733211562690373, - "learning_rate": 9.618323336615809e-05, - "loss": 0.6984, - "step": 3613 - }, - { - "epoch": 1.7757273789752106, - "grad_norm": 0.3915299895644845, - "learning_rate": 9.61217032521988e-05, - "loss": 0.7614, - "step": 3614 - }, - { - "epoch": 1.7762194746878268, - "grad_norm": 0.4864737172546369, - "learning_rate": 9.606017460872202e-05, - "loss": 0.702, - "step": 3615 - }, - { - "epoch": 1.776711570400443, - "grad_norm": 0.4287295052493886, - "learning_rate": 9.599864745905676e-05, - "loss": 0.7666, - "step": 3616 - }, - { - "epoch": 1.777203666113059, - "grad_norm": 0.37777073408162265, - "learning_rate": 9.593712182653142e-05, - "loss": 0.7069, - "step": 3617 - }, - { - "epoch": 1.777695761825675, - "grad_norm": 0.4041294321789745, - "learning_rate": 9.587559773447386e-05, - "loss": 0.7171, - "step": 3618 - }, - { - "epoch": 1.7781878575382912, - "grad_norm": 0.4045073679131275, - "learning_rate": 9.581407520621139e-05, - "loss": 0.7409, - "step": 3619 - }, - { - "epoch": 1.7786799532509074, - "grad_norm": 0.4203245957136642, - "learning_rate": 9.575255426507066e-05, - "loss": 0.7248, - "step": 3620 - }, - { - "epoch": 1.7791720489635234, - "grad_norm": 0.41894224530100027, - "learning_rate": 9.569103493437775e-05, - "loss": 0.7337, - "step": 3621 - }, - { - "epoch": 1.7796641446761394, - "grad_norm": 0.4010499604903825, - "learning_rate": 9.56295172374582e-05, - "loss": 0.7348, - "step": 3622 - }, - { - "epoch": 1.7801562403887556, - "grad_norm": 0.3976124715314281, - "learning_rate": 9.556800119763676e-05, - "loss": 0.7038, - "step": 3623 - }, - { - "epoch": 1.7806483361013719, - "grad_norm": 0.4316771595023527, - "learning_rate": 9.550648683823774e-05, - "loss": 0.7351, - "step": 3624 - }, - { - "epoch": 1.7811404318139878, - "grad_norm": 0.39222661297048333, - "learning_rate": 9.544497418258466e-05, - "loss": 0.7287, - "step": 3625 - }, - { - "epoch": 1.7816325275266038, - "grad_norm": 0.41841392270252, - "learning_rate": 9.53834632540005e-05, - "loss": 0.7575, - "step": 3626 - }, - { - "epoch": 1.78212462323922, - "grad_norm": 0.3927915030903514, - "learning_rate": 9.532195407580751e-05, - "loss": 0.7238, - "step": 3627 - }, - { - "epoch": 1.7826167189518363, - "grad_norm": 0.39068585990983884, - "learning_rate": 9.52604466713274e-05, - "loss": 0.7126, - "step": 3628 - }, - { - "epoch": 1.7831088146644523, - "grad_norm": 0.37218797397745434, - "learning_rate": 9.5198941063881e-05, - "loss": 0.661, - "step": 3629 - }, - { - "epoch": 1.7836009103770682, - "grad_norm": 0.4231800035280616, - "learning_rate": 9.513743727678862e-05, - "loss": 0.7152, - "step": 3630 - }, - { - "epoch": 1.7840930060896845, - "grad_norm": 0.4109899483833016, - "learning_rate": 9.507593533336991e-05, - "loss": 0.7397, - "step": 3631 - }, - { - "epoch": 1.7845851018023007, - "grad_norm": 0.4048869671348017, - "learning_rate": 9.501443525694364e-05, - "loss": 0.7574, - "step": 3632 - }, - { - "epoch": 1.7850771975149167, - "grad_norm": 0.42718661700723454, - "learning_rate": 9.495293707082803e-05, - "loss": 0.6622, - "step": 3633 - }, - { - "epoch": 1.7855692932275327, - "grad_norm": 0.40615682533588254, - "learning_rate": 9.489144079834057e-05, - "loss": 0.7172, - "step": 3634 - }, - { - "epoch": 1.7860613889401489, - "grad_norm": 0.38567327017355046, - "learning_rate": 9.482994646279794e-05, - "loss": 0.6907, - "step": 3635 - }, - { - "epoch": 1.786553484652765, - "grad_norm": 0.3898865033843525, - "learning_rate": 9.476845408751614e-05, - "loss": 0.7232, - "step": 3636 - }, - { - "epoch": 1.787045580365381, - "grad_norm": 0.38392837452996853, - "learning_rate": 9.470696369581043e-05, - "loss": 0.661, - "step": 3637 - }, - { - "epoch": 1.787537676077997, - "grad_norm": 0.3957580491721606, - "learning_rate": 9.464547531099531e-05, - "loss": 0.7026, - "step": 3638 - }, - { - "epoch": 1.7880297717906133, - "grad_norm": 0.3841754582784987, - "learning_rate": 9.458398895638453e-05, - "loss": 0.6594, - "step": 3639 - }, - { - "epoch": 1.7885218675032295, - "grad_norm": 0.381540839072192, - "learning_rate": 9.452250465529103e-05, - "loss": 0.7105, - "step": 3640 - }, - { - "epoch": 1.7890139632158455, - "grad_norm": 0.4075211291646927, - "learning_rate": 9.446102243102698e-05, - "loss": 0.7512, - "step": 3641 - }, - { - "epoch": 1.7895060589284615, - "grad_norm": 0.4110434646998583, - "learning_rate": 9.439954230690387e-05, - "loss": 0.7843, - "step": 3642 - }, - { - "epoch": 1.7899981546410777, - "grad_norm": 0.3821810793023431, - "learning_rate": 9.43380643062322e-05, - "loss": 0.7083, - "step": 3643 - }, - { - "epoch": 1.790490250353694, - "grad_norm": 0.43800897842754943, - "learning_rate": 9.427658845232183e-05, - "loss": 0.7994, - "step": 3644 - }, - { - "epoch": 1.79098234606631, - "grad_norm": 0.4060559657117266, - "learning_rate": 9.421511476848176e-05, - "loss": 0.7054, - "step": 3645 - }, - { - "epoch": 1.791474441778926, - "grad_norm": 0.4251741558666256, - "learning_rate": 9.41536432780201e-05, - "loss": 0.6839, - "step": 3646 - }, - { - "epoch": 1.791966537491542, - "grad_norm": 0.3967139786790515, - "learning_rate": 9.409217400424425e-05, - "loss": 0.6599, - "step": 3647 - }, - { - "epoch": 1.7924586332041583, - "grad_norm": 0.3844356170739326, - "learning_rate": 9.403070697046064e-05, - "loss": 0.6614, - "step": 3648 - }, - { - "epoch": 1.7929507289167743, - "grad_norm": 0.405097692915681, - "learning_rate": 9.396924219997495e-05, - "loss": 0.6524, - "step": 3649 - }, - { - "epoch": 1.7934428246293903, - "grad_norm": 0.4015999545623788, - "learning_rate": 9.390777971609192e-05, - "loss": 0.7098, - "step": 3650 - }, - { - "epoch": 1.7939349203420065, - "grad_norm": 0.39157952962502157, - "learning_rate": 9.384631954211556e-05, - "loss": 0.7294, - "step": 3651 - }, - { - "epoch": 1.7944270160546227, - "grad_norm": 0.4053712472251571, - "learning_rate": 9.37848617013488e-05, - "loss": 0.7188, - "step": 3652 - }, - { - "epoch": 1.7949191117672387, - "grad_norm": 0.42118321856598334, - "learning_rate": 9.372340621709384e-05, - "loss": 0.7789, - "step": 3653 - }, - { - "epoch": 1.7954112074798547, - "grad_norm": 0.3736988990314272, - "learning_rate": 9.366195311265199e-05, - "loss": 0.7104, - "step": 3654 - }, - { - "epoch": 1.795903303192471, - "grad_norm": 0.3800296898417442, - "learning_rate": 9.360050241132353e-05, - "loss": 0.6743, - "step": 3655 - }, - { - "epoch": 1.7963953989050871, - "grad_norm": 0.4087234152549692, - "learning_rate": 9.353905413640795e-05, - "loss": 0.716, - "step": 3656 - }, - { - "epoch": 1.7968874946177031, - "grad_norm": 0.38169405337252094, - "learning_rate": 9.347760831120384e-05, - "loss": 0.7012, - "step": 3657 - }, - { - "epoch": 1.7973795903303191, - "grad_norm": 0.39749285717983823, - "learning_rate": 9.34161649590087e-05, - "loss": 0.6815, - "step": 3658 - }, - { - "epoch": 1.7978716860429353, - "grad_norm": 0.42167945023720504, - "learning_rate": 9.335472410311924e-05, - "loss": 0.7164, - "step": 3659 - }, - { - "epoch": 1.7983637817555516, - "grad_norm": 0.3938649420809012, - "learning_rate": 9.329328576683117e-05, - "loss": 0.6846, - "step": 3660 - }, - { - "epoch": 1.7988558774681676, - "grad_norm": 0.41802174604661035, - "learning_rate": 9.323184997343926e-05, - "loss": 0.7196, - "step": 3661 - }, - { - "epoch": 1.7993479731807835, - "grad_norm": 0.39438780140295476, - "learning_rate": 9.317041674623731e-05, - "loss": 0.7902, - "step": 3662 - }, - { - "epoch": 1.7998400688933998, - "grad_norm": 0.421907736746234, - "learning_rate": 9.310898610851814e-05, - "loss": 0.6881, - "step": 3663 - }, - { - "epoch": 1.800332164606016, - "grad_norm": 0.39614091012821945, - "learning_rate": 9.304755808357355e-05, - "loss": 0.6844, - "step": 3664 - }, - { - "epoch": 1.800824260318632, - "grad_norm": 0.3733740097174482, - "learning_rate": 9.298613269469449e-05, - "loss": 0.6513, - "step": 3665 - }, - { - "epoch": 1.801316356031248, - "grad_norm": 0.3712862644921941, - "learning_rate": 9.292470996517069e-05, - "loss": 0.6777, - "step": 3666 - }, - { - "epoch": 1.8018084517438642, - "grad_norm": 0.41059073322424766, - "learning_rate": 9.286328991829107e-05, - "loss": 0.7188, - "step": 3667 - }, - { - "epoch": 1.8023005474564804, - "grad_norm": 0.3898080484982208, - "learning_rate": 9.280187257734349e-05, - "loss": 0.6804, - "step": 3668 - }, - { - "epoch": 1.8027926431690964, - "grad_norm": 0.38677746790954753, - "learning_rate": 9.274045796561466e-05, - "loss": 0.6653, - "step": 3669 - }, - { - "epoch": 1.8032847388817124, - "grad_norm": 0.4165005809660249, - "learning_rate": 9.26790461063904e-05, - "loss": 0.7119, - "step": 3670 - }, - { - "epoch": 1.8037768345943286, - "grad_norm": 0.3824033248879199, - "learning_rate": 9.261763702295543e-05, - "loss": 0.7022, - "step": 3671 - }, - { - "epoch": 1.8042689303069448, - "grad_norm": 0.4023884022227293, - "learning_rate": 9.255623073859343e-05, - "loss": 0.6696, - "step": 3672 - }, - { - "epoch": 1.8047610260195608, - "grad_norm": 0.40136840583425243, - "learning_rate": 9.249482727658696e-05, - "loss": 0.6904, - "step": 3673 - }, - { - "epoch": 1.8052531217321768, - "grad_norm": 0.3798428338020864, - "learning_rate": 9.243342666021764e-05, - "loss": 0.6783, - "step": 3674 - }, - { - "epoch": 1.805745217444793, - "grad_norm": 0.38874449867040906, - "learning_rate": 9.237202891276583e-05, - "loss": 0.6736, - "step": 3675 - }, - { - "epoch": 1.8062373131574092, - "grad_norm": 0.4108747113060928, - "learning_rate": 9.231063405751095e-05, - "loss": 0.6866, - "step": 3676 - }, - { - "epoch": 1.8067294088700252, - "grad_norm": 0.4502609393034804, - "learning_rate": 9.224924211773134e-05, - "loss": 0.7519, - "step": 3677 - }, - { - "epoch": 1.8072215045826412, - "grad_norm": 0.3796156629232663, - "learning_rate": 9.218785311670406e-05, - "loss": 0.6691, - "step": 3678 - }, - { - "epoch": 1.8077136002952574, - "grad_norm": 0.40886274368335873, - "learning_rate": 9.21264670777052e-05, - "loss": 0.7112, - "step": 3679 - }, - { - "epoch": 1.8082056960078736, - "grad_norm": 0.42355144311951637, - "learning_rate": 9.206508402400978e-05, - "loss": 0.6871, - "step": 3680 - }, - { - "epoch": 1.8086977917204896, - "grad_norm": 0.3853752762561231, - "learning_rate": 9.200370397889145e-05, - "loss": 0.7104, - "step": 3681 - }, - { - "epoch": 1.8091898874331056, - "grad_norm": 0.39264909739973025, - "learning_rate": 9.194232696562299e-05, - "loss": 0.7307, - "step": 3682 - }, - { - "epoch": 1.8096819831457218, - "grad_norm": 0.4102083225264434, - "learning_rate": 9.188095300747588e-05, - "loss": 0.7301, - "step": 3683 - }, - { - "epoch": 1.810174078858338, - "grad_norm": 0.39726478647459396, - "learning_rate": 9.181958212772045e-05, - "loss": 0.6958, - "step": 3684 - }, - { - "epoch": 1.810666174570954, - "grad_norm": 0.4021112806581545, - "learning_rate": 9.175821434962587e-05, - "loss": 0.6997, - "step": 3685 - }, - { - "epoch": 1.81115827028357, - "grad_norm": 0.41951606160623856, - "learning_rate": 9.169684969646022e-05, - "loss": 0.6895, - "step": 3686 - }, - { - "epoch": 1.8116503659961862, - "grad_norm": 0.3845688009685872, - "learning_rate": 9.163548819149024e-05, - "loss": 0.6505, - "step": 3687 - }, - { - "epoch": 1.8121424617088024, - "grad_norm": 0.37000457667998365, - "learning_rate": 9.157412985798164e-05, - "loss": 0.6849, - "step": 3688 - }, - { - "epoch": 1.8126345574214184, - "grad_norm": 0.38963423725786017, - "learning_rate": 9.151277471919877e-05, - "loss": 0.729, - "step": 3689 - }, - { - "epoch": 1.8131266531340344, - "grad_norm": 0.39149433659239297, - "learning_rate": 9.145142279840489e-05, - "loss": 0.6916, - "step": 3690 - }, - { - "epoch": 1.8136187488466506, - "grad_norm": 0.4241517404854112, - "learning_rate": 9.139007411886203e-05, - "loss": 0.7773, - "step": 3691 - }, - { - "epoch": 1.8141108445592669, - "grad_norm": 0.38811482772973754, - "learning_rate": 9.132872870383086e-05, - "loss": 0.6882, - "step": 3692 - }, - { - "epoch": 1.8146029402718828, - "grad_norm": 0.38972974305922936, - "learning_rate": 9.1267386576571e-05, - "loss": 0.7575, - "step": 3693 - }, - { - "epoch": 1.8150950359844988, - "grad_norm": 0.3707112981210964, - "learning_rate": 9.12060477603407e-05, - "loss": 0.7315, - "step": 3694 - }, - { - "epoch": 1.815587131697115, - "grad_norm": 0.37287843944915056, - "learning_rate": 9.114471227839701e-05, - "loss": 0.7211, - "step": 3695 - }, - { - "epoch": 1.8160792274097313, - "grad_norm": 0.39947783415704224, - "learning_rate": 9.108338015399563e-05, - "loss": 0.6665, - "step": 3696 - }, - { - "epoch": 1.8165713231223473, - "grad_norm": 0.36954024004603747, - "learning_rate": 9.102205141039115e-05, - "loss": 0.6781, - "step": 3697 - }, - { - "epoch": 1.8170634188349633, - "grad_norm": 0.40546968582768905, - "learning_rate": 9.096072607083667e-05, - "loss": 0.7207, - "step": 3698 - }, - { - "epoch": 1.8175555145475795, - "grad_norm": 0.42605154676674606, - "learning_rate": 9.089940415858416e-05, - "loss": 0.7944, - "step": 3699 - }, - { - "epoch": 1.8180476102601957, - "grad_norm": 0.4107813840978002, - "learning_rate": 9.083808569688428e-05, - "loss": 0.7439, - "step": 3700 - }, - { - "epoch": 1.8185397059728117, - "grad_norm": 0.4112317216098252, - "learning_rate": 9.077677070898627e-05, - "loss": 0.7473, - "step": 3701 - }, - { - "epoch": 1.8190318016854277, - "grad_norm": 0.39892042348833323, - "learning_rate": 9.071545921813814e-05, - "loss": 0.7187, - "step": 3702 - }, - { - "epoch": 1.8195238973980439, - "grad_norm": 0.39043805453135716, - "learning_rate": 9.065415124758663e-05, - "loss": 0.7001, - "step": 3703 - }, - { - "epoch": 1.82001599311066, - "grad_norm": 0.4079832837617063, - "learning_rate": 9.059284682057695e-05, - "loss": 0.6899, - "step": 3704 - }, - { - "epoch": 1.820508088823276, - "grad_norm": 0.40226960278619833, - "learning_rate": 9.053154596035318e-05, - "loss": 0.7256, - "step": 3705 - }, - { - "epoch": 1.821000184535892, - "grad_norm": 0.39845675735152963, - "learning_rate": 9.047024869015794e-05, - "loss": 0.7302, - "step": 3706 - }, - { - "epoch": 1.8214922802485083, - "grad_norm": 0.3754735430182928, - "learning_rate": 9.040895503323248e-05, - "loss": 0.7277, - "step": 3707 - }, - { - "epoch": 1.8219843759611245, - "grad_norm": 0.39474050952105155, - "learning_rate": 9.034766501281671e-05, - "loss": 0.6996, - "step": 3708 - }, - { - "epoch": 1.8224764716737405, - "grad_norm": 0.3896056496771847, - "learning_rate": 9.02863786521492e-05, - "loss": 0.7312, - "step": 3709 - }, - { - "epoch": 1.8229685673863565, - "grad_norm": 0.3696055463080941, - "learning_rate": 9.0225095974467e-05, - "loss": 0.7103, - "step": 3710 - }, - { - "epoch": 1.8234606630989727, - "grad_norm": 0.3869019189330015, - "learning_rate": 9.016381700300598e-05, - "loss": 0.793, - "step": 3711 - }, - { - "epoch": 1.823952758811589, - "grad_norm": 0.4083584908249029, - "learning_rate": 9.010254176100034e-05, - "loss": 0.7622, - "step": 3712 - }, - { - "epoch": 1.824444854524205, - "grad_norm": 0.3877063911433168, - "learning_rate": 9.004127027168307e-05, - "loss": 0.6907, - "step": 3713 - }, - { - "epoch": 1.824936950236821, - "grad_norm": 0.4029664066401883, - "learning_rate": 8.998000255828573e-05, - "loss": 0.6933, - "step": 3714 - }, - { - "epoch": 1.8254290459494371, - "grad_norm": 0.3770977332100248, - "learning_rate": 8.991873864403827e-05, - "loss": 0.6523, - "step": 3715 - }, - { - "epoch": 1.8259211416620533, - "grad_norm": 0.43050634881317557, - "learning_rate": 8.98574785521694e-05, - "loss": 0.7379, - "step": 3716 - }, - { - "epoch": 1.8264132373746693, - "grad_norm": 0.381995881375049, - "learning_rate": 8.979622230590627e-05, - "loss": 0.699, - "step": 3717 - }, - { - "epoch": 1.8269053330872855, - "grad_norm": 0.39348691077067494, - "learning_rate": 8.97349699284746e-05, - "loss": 0.7172, - "step": 3718 - }, - { - "epoch": 1.8273974287999017, - "grad_norm": 0.4081967569670485, - "learning_rate": 8.967372144309864e-05, - "loss": 0.7371, - "step": 3719 - }, - { - "epoch": 1.8278895245125177, - "grad_norm": 0.3833825353998454, - "learning_rate": 8.961247687300122e-05, - "loss": 0.7411, - "step": 3720 - }, - { - "epoch": 1.8283816202251337, - "grad_norm": 0.39388029814116815, - "learning_rate": 8.955123624140355e-05, - "loss": 0.7949, - "step": 3721 - }, - { - "epoch": 1.82887371593775, - "grad_norm": 0.4108304786282939, - "learning_rate": 8.948999957152547e-05, - "loss": 0.7347, - "step": 3722 - }, - { - "epoch": 1.8293658116503662, - "grad_norm": 0.6030504547135287, - "learning_rate": 8.942876688658531e-05, - "loss": 0.6942, - "step": 3723 - }, - { - "epoch": 1.8298579073629822, - "grad_norm": 0.4350371316300042, - "learning_rate": 8.936753820979981e-05, - "loss": 0.7535, - "step": 3724 - }, - { - "epoch": 1.8303500030755981, - "grad_norm": 0.41955100355530994, - "learning_rate": 8.930631356438423e-05, - "loss": 0.7797, - "step": 3725 - }, - { - "epoch": 1.8308420987882144, - "grad_norm": 0.4347420898352567, - "learning_rate": 8.924509297355239e-05, - "loss": 0.7939, - "step": 3726 - }, - { - "epoch": 1.8313341945008306, - "grad_norm": 0.381354374019232, - "learning_rate": 8.91838764605164e-05, - "loss": 0.6642, - "step": 3727 - }, - { - "epoch": 1.8318262902134466, - "grad_norm": 0.3718501686471073, - "learning_rate": 8.912266404848697e-05, - "loss": 0.666, - "step": 3728 - }, - { - "epoch": 1.8323183859260626, - "grad_norm": 0.3714097611358155, - "learning_rate": 8.906145576067319e-05, - "loss": 0.6676, - "step": 3729 - }, - { - "epoch": 1.8328104816386788, - "grad_norm": 0.41357898781941393, - "learning_rate": 8.900025162028257e-05, - "loss": 0.747, - "step": 3730 - }, - { - "epoch": 1.833302577351295, - "grad_norm": 0.4031385619511016, - "learning_rate": 8.893905165052108e-05, - "loss": 0.7013, - "step": 3731 - }, - { - "epoch": 1.833794673063911, - "grad_norm": 0.36808070463648157, - "learning_rate": 8.887785587459319e-05, - "loss": 0.6565, - "step": 3732 - }, - { - "epoch": 1.834286768776527, - "grad_norm": 0.40993267655499366, - "learning_rate": 8.881666431570155e-05, - "loss": 0.657, - "step": 3733 - }, - { - "epoch": 1.8347788644891432, - "grad_norm": 0.38779860392045135, - "learning_rate": 8.875547699704742e-05, - "loss": 0.7153, - "step": 3734 - }, - { - "epoch": 1.8352709602017594, - "grad_norm": 0.39033315281503617, - "learning_rate": 8.869429394183046e-05, - "loss": 0.7381, - "step": 3735 - }, - { - "epoch": 1.8357630559143754, - "grad_norm": 0.38994489830429646, - "learning_rate": 8.863311517324852e-05, - "loss": 0.7355, - "step": 3736 - }, - { - "epoch": 1.8362551516269914, - "grad_norm": 0.455438199517176, - "learning_rate": 8.857194071449804e-05, - "loss": 0.706, - "step": 3737 - }, - { - "epoch": 1.8367472473396076, - "grad_norm": 0.3922967608564001, - "learning_rate": 8.851077058877364e-05, - "loss": 0.732, - "step": 3738 - }, - { - "epoch": 1.8372393430522238, - "grad_norm": 0.38887486284354605, - "learning_rate": 8.844960481926847e-05, - "loss": 0.7134, - "step": 3739 - }, - { - "epoch": 1.8377314387648398, - "grad_norm": 0.43176106329419306, - "learning_rate": 8.83884434291739e-05, - "loss": 0.6663, - "step": 3740 - }, - { - "epoch": 1.8382235344774558, - "grad_norm": 0.3941384918487974, - "learning_rate": 8.83272864416797e-05, - "loss": 0.7481, - "step": 3741 - }, - { - "epoch": 1.838715630190072, - "grad_norm": 0.40064854172022374, - "learning_rate": 8.826613387997393e-05, - "loss": 0.6401, - "step": 3742 - }, - { - "epoch": 1.8392077259026882, - "grad_norm": 0.38157431859627394, - "learning_rate": 8.820498576724307e-05, - "loss": 0.6739, - "step": 3743 - }, - { - "epoch": 1.8396998216153042, - "grad_norm": 0.3710958900036419, - "learning_rate": 8.814384212667175e-05, - "loss": 0.6977, - "step": 3744 - }, - { - "epoch": 1.8401919173279202, - "grad_norm": 0.4127225948441869, - "learning_rate": 8.808270298144304e-05, - "loss": 0.6942, - "step": 3745 - }, - { - "epoch": 1.8406840130405364, - "grad_norm": 0.3945045820758828, - "learning_rate": 8.80215683547383e-05, - "loss": 0.7707, - "step": 3746 - }, - { - "epoch": 1.8411761087531526, - "grad_norm": 0.37529300969754065, - "learning_rate": 8.796043826973705e-05, - "loss": 0.6929, - "step": 3747 - }, - { - "epoch": 1.8416682044657686, - "grad_norm": 0.3803235173782925, - "learning_rate": 8.789931274961724e-05, - "loss": 0.6737, - "step": 3748 - }, - { - "epoch": 1.8421603001783846, - "grad_norm": 0.40410222428430514, - "learning_rate": 8.783819181755504e-05, - "loss": 0.6884, - "step": 3749 - }, - { - "epoch": 1.8426523958910008, - "grad_norm": 0.38811489242499914, - "learning_rate": 8.77770754967248e-05, - "loss": 0.7584, - "step": 3750 - }, - { - "epoch": 1.843144491603617, - "grad_norm": 0.6251943676212821, - "learning_rate": 8.771596381029923e-05, - "loss": 0.7277, - "step": 3751 - }, - { - "epoch": 1.843636587316233, - "grad_norm": 0.4208576032096547, - "learning_rate": 8.765485678144925e-05, - "loss": 0.7618, - "step": 3752 - }, - { - "epoch": 1.844128683028849, - "grad_norm": 0.4008461840171299, - "learning_rate": 8.759375443334396e-05, - "loss": 0.7259, - "step": 3753 - }, - { - "epoch": 1.8446207787414652, - "grad_norm": 0.4146321017072665, - "learning_rate": 8.753265678915076e-05, - "loss": 0.6809, - "step": 3754 - }, - { - "epoch": 1.8451128744540815, - "grad_norm": 0.42344388837986163, - "learning_rate": 8.747156387203528e-05, - "loss": 0.6932, - "step": 3755 - }, - { - "epoch": 1.8456049701666974, - "grad_norm": 0.42379754679967974, - "learning_rate": 8.74104757051612e-05, - "loss": 0.771, - "step": 3756 - }, - { - "epoch": 1.8460970658793134, - "grad_norm": 0.39649121838114987, - "learning_rate": 8.734939231169059e-05, - "loss": 0.6894, - "step": 3757 - }, - { - "epoch": 1.8465891615919297, - "grad_norm": 0.40998816993359005, - "learning_rate": 8.728831371478365e-05, - "loss": 0.6918, - "step": 3758 - }, - { - "epoch": 1.8470812573045459, - "grad_norm": 0.37188395049459383, - "learning_rate": 8.722723993759869e-05, - "loss": 0.6585, - "step": 3759 - }, - { - "epoch": 1.8475733530171619, - "grad_norm": 0.39116038860538377, - "learning_rate": 8.716617100329231e-05, - "loss": 0.7328, - "step": 3760 - }, - { - "epoch": 1.8480654487297778, - "grad_norm": 0.3932310254133637, - "learning_rate": 8.710510693501912e-05, - "loss": 0.6873, - "step": 3761 - }, - { - "epoch": 1.848557544442394, - "grad_norm": 0.3913915052836477, - "learning_rate": 8.704404775593205e-05, - "loss": 0.6605, - "step": 3762 - }, - { - "epoch": 1.8490496401550103, - "grad_norm": 0.4031210169561732, - "learning_rate": 8.698299348918209e-05, - "loss": 0.707, - "step": 3763 - }, - { - "epoch": 1.8495417358676263, - "grad_norm": 0.37107837027361323, - "learning_rate": 8.692194415791834e-05, - "loss": 0.6348, - "step": 3764 - }, - { - "epoch": 1.8500338315802423, - "grad_norm": 0.42508606082857237, - "learning_rate": 8.68608997852881e-05, - "loss": 0.8115, - "step": 3765 - }, - { - "epoch": 1.8505259272928585, - "grad_norm": 0.42053473150393383, - "learning_rate": 8.679986039443679e-05, - "loss": 0.6626, - "step": 3766 - }, - { - "epoch": 1.8510180230054747, - "grad_norm": 0.4355712014350185, - "learning_rate": 8.673882600850782e-05, - "loss": 0.6569, - "step": 3767 - }, - { - "epoch": 1.8515101187180907, - "grad_norm": 0.41190927012735623, - "learning_rate": 8.667779665064284e-05, - "loss": 0.8091, - "step": 3768 - }, - { - "epoch": 1.8520022144307067, - "grad_norm": 0.3898814652389425, - "learning_rate": 8.66167723439816e-05, - "loss": 0.7222, - "step": 3769 - }, - { - "epoch": 1.8524943101433229, - "grad_norm": 0.3614475994894846, - "learning_rate": 8.655575311166178e-05, - "loss": 0.7334, - "step": 3770 - }, - { - "epoch": 1.852986405855939, - "grad_norm": 0.4777743120470902, - "learning_rate": 8.649473897681927e-05, - "loss": 0.7197, - "step": 3771 - }, - { - "epoch": 1.853478501568555, - "grad_norm": 0.35562573345109555, - "learning_rate": 8.643372996258807e-05, - "loss": 0.6453, - "step": 3772 - }, - { - "epoch": 1.853970597281171, - "grad_norm": 0.4039928739211572, - "learning_rate": 8.637272609210004e-05, - "loss": 0.6872, - "step": 3773 - }, - { - "epoch": 1.8544626929937873, - "grad_norm": 0.38841040229625956, - "learning_rate": 8.63117273884853e-05, - "loss": 0.6808, - "step": 3774 - }, - { - "epoch": 1.8549547887064035, - "grad_norm": 0.4096237991473063, - "learning_rate": 8.625073387487187e-05, - "loss": 0.7119, - "step": 3775 - }, - { - "epoch": 1.8554468844190195, - "grad_norm": 0.38596576523341325, - "learning_rate": 8.618974557438588e-05, - "loss": 0.6666, - "step": 3776 - }, - { - "epoch": 1.8559389801316355, - "grad_norm": 0.39409470141726155, - "learning_rate": 8.612876251015143e-05, - "loss": 0.7382, - "step": 3777 - }, - { - "epoch": 1.8564310758442517, - "grad_norm": 0.38532395588137713, - "learning_rate": 8.606778470529072e-05, - "loss": 0.7215, - "step": 3778 - }, - { - "epoch": 1.856923171556868, - "grad_norm": 0.3928612260370393, - "learning_rate": 8.600681218292382e-05, - "loss": 0.7244, - "step": 3779 - }, - { - "epoch": 1.857415267269484, - "grad_norm": 0.41470702589972863, - "learning_rate": 8.594584496616892e-05, - "loss": 0.6866, - "step": 3780 - }, - { - "epoch": 1.8579073629821, - "grad_norm": 0.40288101987634306, - "learning_rate": 8.588488307814219e-05, - "loss": 0.6973, - "step": 3781 - }, - { - "epoch": 1.8583994586947161, - "grad_norm": 0.40182217552033306, - "learning_rate": 8.582392654195765e-05, - "loss": 0.7019, - "step": 3782 - }, - { - "epoch": 1.8588915544073323, - "grad_norm": 0.38297590120797004, - "learning_rate": 8.576297538072745e-05, - "loss": 0.6397, - "step": 3783 - }, - { - "epoch": 1.8593836501199483, - "grad_norm": 0.4485398175862643, - "learning_rate": 8.570202961756166e-05, - "loss": 0.7402, - "step": 3784 - }, - { - "epoch": 1.8598757458325643, - "grad_norm": 0.4066828438215999, - "learning_rate": 8.564108927556821e-05, - "loss": 0.7403, - "step": 3785 - }, - { - "epoch": 1.8603678415451805, - "grad_norm": 0.44161212245553355, - "learning_rate": 8.558015437785307e-05, - "loss": 0.7408, - "step": 3786 - }, - { - "epoch": 1.8608599372577967, - "grad_norm": 0.40158467712899054, - "learning_rate": 8.551922494752014e-05, - "loss": 0.7313, - "step": 3787 - }, - { - "epoch": 1.8613520329704127, - "grad_norm": 0.39374026714890287, - "learning_rate": 8.545830100767119e-05, - "loss": 0.8011, - "step": 3788 - }, - { - "epoch": 1.8618441286830287, - "grad_norm": 0.40035521905149524, - "learning_rate": 8.539738258140599e-05, - "loss": 0.7178, - "step": 3789 - }, - { - "epoch": 1.862336224395645, - "grad_norm": 0.385540295090378, - "learning_rate": 8.533646969182212e-05, - "loss": 0.7086, - "step": 3790 - }, - { - "epoch": 1.8628283201082612, - "grad_norm": 0.3939001451499724, - "learning_rate": 8.527556236201512e-05, - "loss": 0.7987, - "step": 3791 - }, - { - "epoch": 1.8633204158208772, - "grad_norm": 0.38453740296294187, - "learning_rate": 8.521466061507851e-05, - "loss": 0.7068, - "step": 3792 - }, - { - "epoch": 1.8638125115334931, - "grad_norm": 0.35669861262940133, - "learning_rate": 8.515376447410348e-05, - "loss": 0.652, - "step": 3793 - }, - { - "epoch": 1.8643046072461094, - "grad_norm": 0.4445744310608242, - "learning_rate": 8.509287396217927e-05, - "loss": 0.6823, - "step": 3794 - }, - { - "epoch": 1.8647967029587256, - "grad_norm": 0.4148131315069323, - "learning_rate": 8.503198910239296e-05, - "loss": 0.7272, - "step": 3795 - }, - { - "epoch": 1.8652887986713416, - "grad_norm": 0.37908432290668054, - "learning_rate": 8.497110991782938e-05, - "loss": 0.6588, - "step": 3796 - }, - { - "epoch": 1.8657808943839576, - "grad_norm": 0.36803127144774506, - "learning_rate": 8.491023643157135e-05, - "loss": 0.6546, - "step": 3797 - }, - { - "epoch": 1.8662729900965738, - "grad_norm": 0.38550432925741684, - "learning_rate": 8.484936866669945e-05, - "loss": 0.6906, - "step": 3798 - }, - { - "epoch": 1.86676508580919, - "grad_norm": 0.3773157261334979, - "learning_rate": 8.47885066462921e-05, - "loss": 0.7224, - "step": 3799 - }, - { - "epoch": 1.867257181521806, - "grad_norm": 0.3961376302003408, - "learning_rate": 8.472765039342551e-05, - "loss": 0.7273, - "step": 3800 - }, - { - "epoch": 1.867749277234422, - "grad_norm": 0.3946762702992294, - "learning_rate": 8.466679993117384e-05, - "loss": 0.7254, - "step": 3801 - }, - { - "epoch": 1.8682413729470382, - "grad_norm": 0.37808628750504814, - "learning_rate": 8.460595528260883e-05, - "loss": 0.7114, - "step": 3802 - }, - { - "epoch": 1.8687334686596544, - "grad_norm": 0.38643394808877063, - "learning_rate": 8.45451164708002e-05, - "loss": 0.7016, - "step": 3803 - }, - { - "epoch": 1.8692255643722704, - "grad_norm": 0.429300538303249, - "learning_rate": 8.448428351881545e-05, - "loss": 0.7641, - "step": 3804 - }, - { - "epoch": 1.8697176600848864, - "grad_norm": 0.43625152503785475, - "learning_rate": 8.442345644971972e-05, - "loss": 0.7163, - "step": 3805 - }, - { - "epoch": 1.8702097557975026, - "grad_norm": 0.36460530301815747, - "learning_rate": 8.436263528657605e-05, - "loss": 0.6496, - "step": 3806 - }, - { - "epoch": 1.8707018515101188, - "grad_norm": 0.38492339749305676, - "learning_rate": 8.430182005244521e-05, - "loss": 0.6765, - "step": 3807 - }, - { - "epoch": 1.8711939472227348, - "grad_norm": 0.3807975636968924, - "learning_rate": 8.424101077038568e-05, - "loss": 0.681, - "step": 3808 - }, - { - "epoch": 1.8716860429353508, - "grad_norm": 0.3732510035840216, - "learning_rate": 8.418020746345371e-05, - "loss": 0.6569, - "step": 3809 - }, - { - "epoch": 1.872178138647967, - "grad_norm": 0.40355547433002115, - "learning_rate": 8.411941015470329e-05, - "loss": 0.6905, - "step": 3810 - }, - { - "epoch": 1.8726702343605832, - "grad_norm": 0.38860278114060753, - "learning_rate": 8.405861886718614e-05, - "loss": 0.7857, - "step": 3811 - }, - { - "epoch": 1.8731623300731992, - "grad_norm": 0.3625925085463039, - "learning_rate": 8.399783362395169e-05, - "loss": 0.5675, - "step": 3812 - }, - { - "epoch": 1.8736544257858152, - "grad_norm": 0.4011507589155444, - "learning_rate": 8.393705444804704e-05, - "loss": 0.6947, - "step": 3813 - }, - { - "epoch": 1.8741465214984314, - "grad_norm": 0.3914846994760786, - "learning_rate": 8.387628136251707e-05, - "loss": 0.6686, - "step": 3814 - }, - { - "epoch": 1.8746386172110476, - "grad_norm": 0.44190174275672733, - "learning_rate": 8.381551439040433e-05, - "loss": 0.7964, - "step": 3815 - }, - { - "epoch": 1.8751307129236636, - "grad_norm": 0.3691434321689682, - "learning_rate": 8.375475355474895e-05, - "loss": 0.6825, - "step": 3816 - }, - { - "epoch": 1.8756228086362796, - "grad_norm": 0.39352120912167615, - "learning_rate": 8.369399887858886e-05, - "loss": 0.6894, - "step": 3817 - }, - { - "epoch": 1.8761149043488958, - "grad_norm": 0.378502360716238, - "learning_rate": 8.363325038495965e-05, - "loss": 0.6574, - "step": 3818 - }, - { - "epoch": 1.876607000061512, - "grad_norm": 0.40311105925092716, - "learning_rate": 8.357250809689442e-05, - "loss": 0.7525, - "step": 3819 - }, - { - "epoch": 1.877099095774128, - "grad_norm": 0.39149881189310953, - "learning_rate": 8.351177203742412e-05, - "loss": 0.7102, - "step": 3820 - }, - { - "epoch": 1.877591191486744, - "grad_norm": 0.37918226075791184, - "learning_rate": 8.345104222957719e-05, - "loss": 0.6966, - "step": 3821 - }, - { - "epoch": 1.8780832871993602, - "grad_norm": 0.4225687066624157, - "learning_rate": 8.339031869637974e-05, - "loss": 0.8542, - "step": 3822 - }, - { - "epoch": 1.8785753829119765, - "grad_norm": 0.39847575621164844, - "learning_rate": 8.332960146085551e-05, - "loss": 0.7559, - "step": 3823 - }, - { - "epoch": 1.8790674786245924, - "grad_norm": 0.385056347103929, - "learning_rate": 8.32688905460259e-05, - "loss": 0.7425, - "step": 3824 - }, - { - "epoch": 1.8795595743372084, - "grad_norm": 0.38433150729937143, - "learning_rate": 8.32081859749098e-05, - "loss": 0.7067, - "step": 3825 - }, - { - "epoch": 1.8800516700498247, - "grad_norm": 0.4001505978656989, - "learning_rate": 8.314748777052377e-05, - "loss": 0.6428, - "step": 3826 - }, - { - "epoch": 1.8805437657624409, - "grad_norm": 0.3692886739999643, - "learning_rate": 8.308679595588203e-05, - "loss": 0.6267, - "step": 3827 - }, - { - "epoch": 1.8810358614750569, - "grad_norm": 0.41017986026468556, - "learning_rate": 8.302611055399616e-05, - "loss": 0.7414, - "step": 3828 - }, - { - "epoch": 1.8815279571876729, - "grad_norm": 0.39691819101663334, - "learning_rate": 8.296543158787553e-05, - "loss": 0.6985, - "step": 3829 - }, - { - "epoch": 1.882020052900289, - "grad_norm": 0.4163231228785293, - "learning_rate": 8.2904759080527e-05, - "loss": 0.7806, - "step": 3830 - }, - { - "epoch": 1.8825121486129053, - "grad_norm": 0.3916218754190283, - "learning_rate": 8.284409305495489e-05, - "loss": 0.7054, - "step": 3831 - }, - { - "epoch": 1.8830042443255213, - "grad_norm": 0.38331071958594176, - "learning_rate": 8.278343353416114e-05, - "loss": 0.7215, - "step": 3832 - }, - { - "epoch": 1.8834963400381373, - "grad_norm": 0.39088909814304723, - "learning_rate": 8.27227805411453e-05, - "loss": 0.6823, - "step": 3833 - }, - { - "epoch": 1.8839884357507535, - "grad_norm": 0.3974172902358765, - "learning_rate": 8.266213409890427e-05, - "loss": 0.6934, - "step": 3834 - }, - { - "epoch": 1.8844805314633697, - "grad_norm": 0.3782369679192204, - "learning_rate": 8.260149423043263e-05, - "loss": 0.6891, - "step": 3835 - }, - { - "epoch": 1.8849726271759857, - "grad_norm": 0.392966638649635, - "learning_rate": 8.254086095872232e-05, - "loss": 0.7512, - "step": 3836 - }, - { - "epoch": 1.8854647228886017, - "grad_norm": 0.394558787642243, - "learning_rate": 8.24802343067629e-05, - "loss": 0.7267, - "step": 3837 - }, - { - "epoch": 1.8859568186012179, - "grad_norm": 0.3792863680994872, - "learning_rate": 8.241961429754144e-05, - "loss": 0.6789, - "step": 3838 - }, - { - "epoch": 1.886448914313834, - "grad_norm": 0.38164515637351726, - "learning_rate": 8.235900095404231e-05, - "loss": 0.7319, - "step": 3839 - }, - { - "epoch": 1.88694101002645, - "grad_norm": 0.4099016397638299, - "learning_rate": 8.229839429924753e-05, - "loss": 0.7304, - "step": 3840 - }, - { - "epoch": 1.887433105739066, - "grad_norm": 0.38415307587009967, - "learning_rate": 8.223779435613654e-05, - "loss": 0.7236, - "step": 3841 - }, - { - "epoch": 1.8879252014516823, - "grad_norm": 0.4408150407896303, - "learning_rate": 8.217720114768618e-05, - "loss": 0.7462, - "step": 3842 - }, - { - "epoch": 1.8884172971642985, - "grad_norm": 0.43569175301612517, - "learning_rate": 8.21166146968708e-05, - "loss": 0.7277, - "step": 3843 - }, - { - "epoch": 1.8889093928769145, - "grad_norm": 0.4092180401833967, - "learning_rate": 8.205603502666216e-05, - "loss": 0.7862, - "step": 3844 - }, - { - "epoch": 1.8894014885895305, - "grad_norm": 0.39599779906032656, - "learning_rate": 8.199546216002945e-05, - "loss": 0.628, - "step": 3845 - }, - { - "epoch": 1.889893584302147, - "grad_norm": 0.3979126754120914, - "learning_rate": 8.193489611993926e-05, - "loss": 0.7038, - "step": 3846 - }, - { - "epoch": 1.890385680014763, - "grad_norm": 0.4117737153450986, - "learning_rate": 8.18743369293557e-05, - "loss": 0.6989, - "step": 3847 - }, - { - "epoch": 1.890877775727379, - "grad_norm": 0.4046921419899561, - "learning_rate": 8.181378461124006e-05, - "loss": 0.6912, - "step": 3848 - }, - { - "epoch": 1.8913698714399951, - "grad_norm": 0.3711360123633831, - "learning_rate": 8.175323918855125e-05, - "loss": 0.6544, - "step": 3849 - }, - { - "epoch": 1.8918619671526113, - "grad_norm": 0.40231408623806025, - "learning_rate": 8.169270068424549e-05, - "loss": 0.6882, - "step": 3850 - }, - { - "epoch": 1.8923540628652273, - "grad_norm": 0.40330329724477676, - "learning_rate": 8.163216912127632e-05, - "loss": 0.7932, - "step": 3851 - }, - { - "epoch": 1.8928461585778433, - "grad_norm": 0.37383738566453517, - "learning_rate": 8.157164452259469e-05, - "loss": 0.718, - "step": 3852 - }, - { - "epoch": 1.8933382542904595, - "grad_norm": 0.3831247990844017, - "learning_rate": 8.151112691114899e-05, - "loss": 0.7086, - "step": 3853 - }, - { - "epoch": 1.8938303500030758, - "grad_norm": 0.35959035294624553, - "learning_rate": 8.145061630988479e-05, - "loss": 0.7009, - "step": 3854 - }, - { - "epoch": 1.8943224457156917, - "grad_norm": 0.3918304765662465, - "learning_rate": 8.13901127417451e-05, - "loss": 0.7179, - "step": 3855 - }, - { - "epoch": 1.8948145414283077, - "grad_norm": 1.0671166540530197, - "learning_rate": 8.132961622967035e-05, - "loss": 0.7583, - "step": 3856 - }, - { - "epoch": 1.895306637140924, - "grad_norm": 0.41056659421614605, - "learning_rate": 8.126912679659808e-05, - "loss": 0.7938, - "step": 3857 - }, - { - "epoch": 1.8957987328535402, - "grad_norm": 0.3940836886653735, - "learning_rate": 8.120864446546338e-05, - "loss": 0.7456, - "step": 3858 - }, - { - "epoch": 1.8962908285661562, - "grad_norm": 0.3958673026190194, - "learning_rate": 8.114816925919844e-05, - "loss": 0.6397, - "step": 3859 - }, - { - "epoch": 1.8967829242787722, - "grad_norm": 0.4154842629060033, - "learning_rate": 8.108770120073289e-05, - "loss": 0.7005, - "step": 3860 - }, - { - "epoch": 1.8972750199913884, - "grad_norm": 0.41679603606848203, - "learning_rate": 8.102724031299362e-05, - "loss": 0.6887, - "step": 3861 - }, - { - "epoch": 1.8977671157040046, - "grad_norm": 0.37960669953571574, - "learning_rate": 8.096678661890475e-05, - "loss": 0.6616, - "step": 3862 - }, - { - "epoch": 1.8982592114166206, - "grad_norm": 0.41898795118992954, - "learning_rate": 8.090634014138771e-05, - "loss": 0.7603, - "step": 3863 - }, - { - "epoch": 1.8987513071292366, - "grad_norm": 0.3901178471684261, - "learning_rate": 8.084590090336127e-05, - "loss": 0.7369, - "step": 3864 - }, - { - "epoch": 1.8992434028418528, - "grad_norm": 0.4001074664480027, - "learning_rate": 8.078546892774126e-05, - "loss": 0.7181, - "step": 3865 - }, - { - "epoch": 1.899735498554469, - "grad_norm": 0.412856381844074, - "learning_rate": 8.072504423744094e-05, - "loss": 0.7099, - "step": 3866 - }, - { - "epoch": 1.900227594267085, - "grad_norm": 0.3960452658443214, - "learning_rate": 8.066462685537074e-05, - "loss": 0.7229, - "step": 3867 - }, - { - "epoch": 1.900719689979701, - "grad_norm": 0.6320535315638518, - "learning_rate": 8.060421680443831e-05, - "loss": 0.7684, - "step": 3868 - }, - { - "epoch": 1.9012117856923172, - "grad_norm": 0.4146782766110037, - "learning_rate": 8.054381410754848e-05, - "loss": 0.7029, - "step": 3869 - }, - { - "epoch": 1.9017038814049334, - "grad_norm": 0.40409425232709, - "learning_rate": 8.048341878760345e-05, - "loss": 0.7093, - "step": 3870 - }, - { - "epoch": 1.9021959771175494, - "grad_norm": 0.3892420486298809, - "learning_rate": 8.042303086750241e-05, - "loss": 0.6532, - "step": 3871 - }, - { - "epoch": 1.9026880728301654, - "grad_norm": 0.389899415150913, - "learning_rate": 8.036265037014188e-05, - "loss": 0.7346, - "step": 3872 - }, - { - "epoch": 1.9031801685427816, - "grad_norm": 0.4252139440574835, - "learning_rate": 8.030227731841562e-05, - "loss": 0.7112, - "step": 3873 - }, - { - "epoch": 1.9036722642553978, - "grad_norm": 0.3870187866545771, - "learning_rate": 8.024191173521435e-05, - "loss": 0.7001, - "step": 3874 - }, - { - "epoch": 1.9041643599680138, - "grad_norm": 0.3896495111815748, - "learning_rate": 8.018155364342615e-05, - "loss": 0.688, - "step": 3875 - }, - { - "epoch": 1.9046564556806298, - "grad_norm": 0.37281523882922024, - "learning_rate": 8.012120306593623e-05, - "loss": 0.6826, - "step": 3876 - }, - { - "epoch": 1.905148551393246, - "grad_norm": 0.3976998963384889, - "learning_rate": 8.006086002562689e-05, - "loss": 0.749, - "step": 3877 - }, - { - "epoch": 1.9056406471058622, - "grad_norm": 0.37228780960957175, - "learning_rate": 8.000052454537756e-05, - "loss": 0.6921, - "step": 3878 - }, - { - "epoch": 1.9061327428184782, - "grad_norm": 0.4232935279839646, - "learning_rate": 7.994019664806495e-05, - "loss": 0.7563, - "step": 3879 - }, - { - "epoch": 1.9066248385310942, - "grad_norm": 0.3939288567329543, - "learning_rate": 7.987987635656267e-05, - "loss": 0.6897, - "step": 3880 - }, - { - "epoch": 1.9071169342437104, - "grad_norm": 0.39392641292806546, - "learning_rate": 7.981956369374164e-05, - "loss": 0.7314, - "step": 3881 - }, - { - "epoch": 1.9076090299563266, - "grad_norm": 0.3731836872853506, - "learning_rate": 7.975925868246985e-05, - "loss": 0.7107, - "step": 3882 - }, - { - "epoch": 1.9081011256689426, - "grad_norm": 0.39577720252997817, - "learning_rate": 7.969896134561226e-05, - "loss": 0.7301, - "step": 3883 - }, - { - "epoch": 1.9085932213815586, - "grad_norm": 0.407424447362973, - "learning_rate": 7.963867170603109e-05, - "loss": 0.7504, - "step": 3884 - }, - { - "epoch": 1.9090853170941748, - "grad_norm": 0.4036401411701683, - "learning_rate": 7.95783897865855e-05, - "loss": 0.7475, - "step": 3885 - }, - { - "epoch": 1.909577412806791, - "grad_norm": 0.4103005449659771, - "learning_rate": 7.951811561013181e-05, - "loss": 0.77, - "step": 3886 - }, - { - "epoch": 1.910069508519407, - "grad_norm": 0.38510200906900166, - "learning_rate": 7.945784919952345e-05, - "loss": 0.7327, - "step": 3887 - }, - { - "epoch": 1.910561604232023, - "grad_norm": 0.41653972275927953, - "learning_rate": 7.939759057761075e-05, - "loss": 0.7525, - "step": 3888 - }, - { - "epoch": 1.9110536999446393, - "grad_norm": 0.3832676976187471, - "learning_rate": 7.933733976724121e-05, - "loss": 0.6463, - "step": 3889 - }, - { - "epoch": 1.9115457956572555, - "grad_norm": 0.3727547330165016, - "learning_rate": 7.927709679125935e-05, - "loss": 0.6828, - "step": 3890 - }, - { - "epoch": 1.9120378913698715, - "grad_norm": 0.37988777052469785, - "learning_rate": 7.921686167250668e-05, - "loss": 0.743, - "step": 3891 - }, - { - "epoch": 1.9125299870824874, - "grad_norm": 0.39145387833034767, - "learning_rate": 7.915663443382173e-05, - "loss": 0.6608, - "step": 3892 - }, - { - "epoch": 1.9130220827951037, - "grad_norm": 0.4043528265935617, - "learning_rate": 7.909641509804015e-05, - "loss": 0.6648, - "step": 3893 - }, - { - "epoch": 1.9135141785077199, - "grad_norm": 0.4082358905888069, - "learning_rate": 7.903620368799439e-05, - "loss": 0.7488, - "step": 3894 - }, - { - "epoch": 1.9140062742203359, - "grad_norm": 0.4009678832066102, - "learning_rate": 7.897600022651409e-05, - "loss": 0.7247, - "step": 3895 - }, - { - "epoch": 1.9144983699329519, - "grad_norm": 0.38337578975659437, - "learning_rate": 7.891580473642582e-05, - "loss": 0.7509, - "step": 3896 - }, - { - "epoch": 1.914990465645568, - "grad_norm": 0.40866859945165235, - "learning_rate": 7.885561724055305e-05, - "loss": 0.69, - "step": 3897 - }, - { - "epoch": 1.9154825613581843, - "grad_norm": 0.4183788383491655, - "learning_rate": 7.87954377617163e-05, - "loss": 0.7324, - "step": 3898 - }, - { - "epoch": 1.9159746570708003, - "grad_norm": 0.37721860259448187, - "learning_rate": 7.873526632273304e-05, - "loss": 0.7288, - "step": 3899 - }, - { - "epoch": 1.9164667527834163, - "grad_norm": 0.36384014459340297, - "learning_rate": 7.86751029464177e-05, - "loss": 0.6763, - "step": 3900 - }, - { - "epoch": 1.9169588484960325, - "grad_norm": 0.3981900478021887, - "learning_rate": 7.861494765558153e-05, - "loss": 0.7455, - "step": 3901 - }, - { - "epoch": 1.9174509442086487, - "grad_norm": 0.37327464484386186, - "learning_rate": 7.855480047303296e-05, - "loss": 0.6973, - "step": 3902 - }, - { - "epoch": 1.9179430399212647, - "grad_norm": 0.3981322933258857, - "learning_rate": 7.849466142157708e-05, - "loss": 0.7177, - "step": 3903 - }, - { - "epoch": 1.9184351356338807, - "grad_norm": 0.4064397438066983, - "learning_rate": 7.843453052401606e-05, - "loss": 0.7306, - "step": 3904 - }, - { - "epoch": 1.918927231346497, - "grad_norm": 0.3880128323398334, - "learning_rate": 7.8374407803149e-05, - "loss": 0.7034, - "step": 3905 - }, - { - "epoch": 1.9194193270591131, - "grad_norm": 0.3880484616530206, - "learning_rate": 7.831429328177172e-05, - "loss": 0.6771, - "step": 3906 - }, - { - "epoch": 1.919911422771729, - "grad_norm": 0.41919304250603023, - "learning_rate": 7.825418698267717e-05, - "loss": 0.6921, - "step": 3907 - }, - { - "epoch": 1.920403518484345, - "grad_norm": 0.4092797013272745, - "learning_rate": 7.819408892865496e-05, - "loss": 0.7241, - "step": 3908 - }, - { - "epoch": 1.9208956141969613, - "grad_norm": 0.37058491846323266, - "learning_rate": 7.81339991424917e-05, - "loss": 0.606, - "step": 3909 - }, - { - "epoch": 1.9213877099095775, - "grad_norm": 0.39136055759246124, - "learning_rate": 7.80739176469709e-05, - "loss": 0.7127, - "step": 3910 - }, - { - "epoch": 1.9218798056221935, - "grad_norm": 0.37185122565731515, - "learning_rate": 7.801384446487278e-05, - "loss": 0.6649, - "step": 3911 - }, - { - "epoch": 1.9223719013348095, - "grad_norm": 0.36242614478350127, - "learning_rate": 7.795377961897457e-05, - "loss": 0.6752, - "step": 3912 - }, - { - "epoch": 1.9228639970474257, - "grad_norm": 0.38954244258229026, - "learning_rate": 7.78937231320502e-05, - "loss": 0.7113, - "step": 3913 - }, - { - "epoch": 1.923356092760042, - "grad_norm": 0.6042183187900263, - "learning_rate": 7.783367502687055e-05, - "loss": 0.6591, - "step": 3914 - }, - { - "epoch": 1.923848188472658, - "grad_norm": 0.37240936186016527, - "learning_rate": 7.777363532620322e-05, - "loss": 0.7206, - "step": 3915 - }, - { - "epoch": 1.924340284185274, - "grad_norm": 0.3908463141441568, - "learning_rate": 7.77136040528127e-05, - "loss": 0.6938, - "step": 3916 - }, - { - "epoch": 1.9248323798978901, - "grad_norm": 0.36547252043384504, - "learning_rate": 7.765358122946023e-05, - "loss": 0.681, - "step": 3917 - }, - { - "epoch": 1.9253244756105063, - "grad_norm": 0.36236724630938455, - "learning_rate": 7.759356687890387e-05, - "loss": 0.7331, - "step": 3918 - }, - { - "epoch": 1.9258165713231223, - "grad_norm": 0.3858341600511175, - "learning_rate": 7.753356102389851e-05, - "loss": 0.6595, - "step": 3919 - }, - { - "epoch": 1.9263086670357383, - "grad_norm": 0.3648928111427161, - "learning_rate": 7.747356368719572e-05, - "loss": 0.6407, - "step": 3920 - }, - { - "epoch": 1.9268007627483545, - "grad_norm": 0.35891837542860777, - "learning_rate": 7.741357489154393e-05, - "loss": 0.6502, - "step": 3921 - }, - { - "epoch": 1.9272928584609708, - "grad_norm": 0.3511748862513752, - "learning_rate": 7.735359465968833e-05, - "loss": 0.6547, - "step": 3922 - }, - { - "epoch": 1.9277849541735868, - "grad_norm": 0.38882962060119547, - "learning_rate": 7.729362301437076e-05, - "loss": 0.7529, - "step": 3923 - }, - { - "epoch": 1.9282770498862027, - "grad_norm": 0.41045132673897666, - "learning_rate": 7.723365997832989e-05, - "loss": 0.7183, - "step": 3924 - }, - { - "epoch": 1.928769145598819, - "grad_norm": 0.36737314744463456, - "learning_rate": 7.717370557430119e-05, - "loss": 0.6562, - "step": 3925 - }, - { - "epoch": 1.9292612413114352, - "grad_norm": 0.35477206335805433, - "learning_rate": 7.711375982501666e-05, - "loss": 0.6714, - "step": 3926 - }, - { - "epoch": 1.9297533370240512, - "grad_norm": 0.3644726863724075, - "learning_rate": 7.70538227532052e-05, - "loss": 0.5821, - "step": 3927 - }, - { - "epoch": 1.9302454327366672, - "grad_norm": 0.37967583596356574, - "learning_rate": 7.69938943815924e-05, - "loss": 0.6873, - "step": 3928 - }, - { - "epoch": 1.9307375284492834, - "grad_norm": 0.40669346566849485, - "learning_rate": 7.693397473290042e-05, - "loss": 0.7506, - "step": 3929 - }, - { - "epoch": 1.9312296241618996, - "grad_norm": 0.35485559877989564, - "learning_rate": 7.687406382984824e-05, - "loss": 0.7025, - "step": 3930 - }, - { - "epoch": 1.9317217198745156, - "grad_norm": 0.3874316845253912, - "learning_rate": 7.681416169515153e-05, - "loss": 0.7445, - "step": 3931 - }, - { - "epoch": 1.9322138155871316, - "grad_norm": 0.36698998710942177, - "learning_rate": 7.675426835152251e-05, - "loss": 0.6624, - "step": 3932 - }, - { - "epoch": 1.9327059112997478, - "grad_norm": 0.5317326558138038, - "learning_rate": 7.669438382167023e-05, - "loss": 0.724, - "step": 3933 - }, - { - "epoch": 1.933198007012364, - "grad_norm": 0.3952290965426452, - "learning_rate": 7.663450812830022e-05, - "loss": 0.6268, - "step": 3934 - }, - { - "epoch": 1.93369010272498, - "grad_norm": 0.3691924253171679, - "learning_rate": 7.657464129411482e-05, - "loss": 0.6754, - "step": 3935 - }, - { - "epoch": 1.934182198437596, - "grad_norm": 0.38390491352597483, - "learning_rate": 7.651478334181294e-05, - "loss": 0.6864, - "step": 3936 - }, - { - "epoch": 1.9346742941502122, - "grad_norm": 0.38709566127974504, - "learning_rate": 7.645493429409012e-05, - "loss": 0.7239, - "step": 3937 - }, - { - "epoch": 1.9351663898628284, - "grad_norm": 0.3930552781239274, - "learning_rate": 7.639509417363851e-05, - "loss": 0.6669, - "step": 3938 - }, - { - "epoch": 1.9356584855754444, - "grad_norm": 0.419724994671056, - "learning_rate": 7.633526300314695e-05, - "loss": 0.7072, - "step": 3939 - }, - { - "epoch": 1.9361505812880604, - "grad_norm": 0.3919255622772456, - "learning_rate": 7.627544080530077e-05, - "loss": 0.6986, - "step": 3940 - }, - { - "epoch": 1.9366426770006766, - "grad_norm": 0.3618611807972929, - "learning_rate": 7.6215627602782e-05, - "loss": 0.6982, - "step": 3941 - }, - { - "epoch": 1.9371347727132928, - "grad_norm": 0.384737341889195, - "learning_rate": 7.615582341826924e-05, - "loss": 0.6407, - "step": 3942 - }, - { - "epoch": 1.9376268684259088, - "grad_norm": 0.39658939521182923, - "learning_rate": 7.60960282744376e-05, - "loss": 0.6992, - "step": 3943 - }, - { - "epoch": 1.9381189641385248, - "grad_norm": 0.3862929418739502, - "learning_rate": 7.603624219395886e-05, - "loss": 0.6689, - "step": 3944 - }, - { - "epoch": 1.938611059851141, - "grad_norm": 0.358821061476227, - "learning_rate": 7.59764651995013e-05, - "loss": 0.652, - "step": 3945 - }, - { - "epoch": 1.9391031555637572, - "grad_norm": 0.38679309562512887, - "learning_rate": 7.591669731372977e-05, - "loss": 0.7125, - "step": 3946 - }, - { - "epoch": 1.9395952512763732, - "grad_norm": 0.40047442982785675, - "learning_rate": 7.585693855930565e-05, - "loss": 0.6988, - "step": 3947 - }, - { - "epoch": 1.9400873469889892, - "grad_norm": 0.40516081465275683, - "learning_rate": 7.579718895888693e-05, - "loss": 0.7243, - "step": 3948 - }, - { - "epoch": 1.9405794427016054, - "grad_norm": 0.38918640243870417, - "learning_rate": 7.573744853512801e-05, - "loss": 0.7002, - "step": 3949 - }, - { - "epoch": 1.9410715384142216, - "grad_norm": 0.4081989388987209, - "learning_rate": 7.56777173106799e-05, - "loss": 0.7325, - "step": 3950 - }, - { - "epoch": 1.9415636341268376, - "grad_norm": 0.38965796642259787, - "learning_rate": 7.561799530819016e-05, - "loss": 0.6553, - "step": 3951 - }, - { - "epoch": 1.9420557298394536, - "grad_norm": 0.4329902070473435, - "learning_rate": 7.555828255030269e-05, - "loss": 0.7718, - "step": 3952 - }, - { - "epoch": 1.9425478255520698, - "grad_norm": 0.3851929182372548, - "learning_rate": 7.549857905965805e-05, - "loss": 0.6929, - "step": 3953 - }, - { - "epoch": 1.943039921264686, - "grad_norm": 0.41348069716240204, - "learning_rate": 7.543888485889325e-05, - "loss": 0.7604, - "step": 3954 - }, - { - "epoch": 1.943532016977302, - "grad_norm": 0.3659746623034429, - "learning_rate": 7.537919997064166e-05, - "loss": 0.6784, - "step": 3955 - }, - { - "epoch": 1.944024112689918, - "grad_norm": 0.38655369915998505, - "learning_rate": 7.53195244175333e-05, - "loss": 0.7017, - "step": 3956 - }, - { - "epoch": 1.9445162084025343, - "grad_norm": 0.4381754236216023, - "learning_rate": 7.525985822219449e-05, - "loss": 0.7443, - "step": 3957 - }, - { - "epoch": 1.9450083041151505, - "grad_norm": 0.3703580602730962, - "learning_rate": 7.520020140724812e-05, - "loss": 0.6762, - "step": 3958 - }, - { - "epoch": 1.9455003998277665, - "grad_norm": 0.384051493397988, - "learning_rate": 7.514055399531345e-05, - "loss": 0.6779, - "step": 3959 - }, - { - "epoch": 1.9459924955403824, - "grad_norm": 0.42371486958296867, - "learning_rate": 7.508091600900622e-05, - "loss": 0.7155, - "step": 3960 - }, - { - "epoch": 1.9464845912529987, - "grad_norm": 0.4242852107890427, - "learning_rate": 7.502128747093855e-05, - "loss": 0.7045, - "step": 3961 - }, - { - "epoch": 1.9469766869656149, - "grad_norm": 0.3833034508840907, - "learning_rate": 7.496166840371905e-05, - "loss": 0.716, - "step": 3962 - }, - { - "epoch": 1.9474687826782309, - "grad_norm": 0.3931053734634782, - "learning_rate": 7.490205882995262e-05, - "loss": 0.693, - "step": 3963 - }, - { - "epoch": 1.9479608783908469, - "grad_norm": 0.37142856120811557, - "learning_rate": 7.48424587722407e-05, - "loss": 0.709, - "step": 3964 - }, - { - "epoch": 1.948452974103463, - "grad_norm": 0.3836266535930692, - "learning_rate": 7.478286825318107e-05, - "loss": 0.6681, - "step": 3965 - }, - { - "epoch": 1.9489450698160793, - "grad_norm": 0.38418787540701044, - "learning_rate": 7.472328729536778e-05, - "loss": 0.6837, - "step": 3966 - }, - { - "epoch": 1.9494371655286953, - "grad_norm": 0.3958444618673081, - "learning_rate": 7.466371592139144e-05, - "loss": 0.6669, - "step": 3967 - }, - { - "epoch": 1.9499292612413113, - "grad_norm": 0.4139091323368251, - "learning_rate": 7.460415415383892e-05, - "loss": 0.7679, - "step": 3968 - }, - { - "epoch": 1.9504213569539275, - "grad_norm": 0.3916373108439504, - "learning_rate": 7.454460201529347e-05, - "loss": 0.6784, - "step": 3969 - }, - { - "epoch": 1.9509134526665437, - "grad_norm": 0.3888059304432373, - "learning_rate": 7.448505952833467e-05, - "loss": 0.7274, - "step": 3970 - }, - { - "epoch": 1.9514055483791597, - "grad_norm": 0.3962591604311054, - "learning_rate": 7.44255267155385e-05, - "loss": 0.7075, - "step": 3971 - }, - { - "epoch": 1.9518976440917757, - "grad_norm": 0.3986261270565527, - "learning_rate": 7.436600359947716e-05, - "loss": 0.6868, - "step": 3972 - }, - { - "epoch": 1.952389739804392, - "grad_norm": 0.3753923584160657, - "learning_rate": 7.430649020271928e-05, - "loss": 0.6913, - "step": 3973 - }, - { - "epoch": 1.9528818355170081, - "grad_norm": 0.5451048466962539, - "learning_rate": 7.424698654782982e-05, - "loss": 0.7778, - "step": 3974 - }, - { - "epoch": 1.953373931229624, - "grad_norm": 0.38129838315942727, - "learning_rate": 7.418749265736988e-05, - "loss": 0.6884, - "step": 3975 - }, - { - "epoch": 1.9538660269422403, - "grad_norm": 0.3717298725102167, - "learning_rate": 7.412800855389706e-05, - "loss": 0.634, - "step": 3976 - }, - { - "epoch": 1.9543581226548565, - "grad_norm": 0.3753359427054715, - "learning_rate": 7.406853425996516e-05, - "loss": 0.6536, - "step": 3977 - }, - { - "epoch": 1.9548502183674725, - "grad_norm": 0.3792311434661509, - "learning_rate": 7.400906979812419e-05, - "loss": 0.6589, - "step": 3978 - }, - { - "epoch": 1.9553423140800885, - "grad_norm": 0.40045467173202365, - "learning_rate": 7.394961519092059e-05, - "loss": 0.7486, - "step": 3979 - }, - { - "epoch": 1.9558344097927047, - "grad_norm": 0.40348379740070833, - "learning_rate": 7.389017046089693e-05, - "loss": 0.7228, - "step": 3980 - }, - { - "epoch": 1.956326505505321, - "grad_norm": 0.36871799311639814, - "learning_rate": 7.38307356305921e-05, - "loss": 0.639, - "step": 3981 - }, - { - "epoch": 1.956818601217937, - "grad_norm": 0.3712312310517344, - "learning_rate": 7.37713107225412e-05, - "loss": 0.6222, - "step": 3982 - }, - { - "epoch": 1.957310696930553, - "grad_norm": 0.39149582583187675, - "learning_rate": 7.371189575927559e-05, - "loss": 0.7135, - "step": 3983 - }, - { - "epoch": 1.9578027926431691, - "grad_norm": 0.378984951027961, - "learning_rate": 7.365249076332286e-05, - "loss": 0.6857, - "step": 3984 - }, - { - "epoch": 1.9582948883557854, - "grad_norm": 0.3939353646696863, - "learning_rate": 7.359309575720684e-05, - "loss": 0.7188, - "step": 3985 - }, - { - "epoch": 1.9587869840684013, - "grad_norm": 0.37732842848980086, - "learning_rate": 7.35337107634475e-05, - "loss": 0.6829, - "step": 3986 - }, - { - "epoch": 1.9592790797810173, - "grad_norm": 0.4133237097160675, - "learning_rate": 7.347433580456109e-05, - "loss": 0.7565, - "step": 3987 - }, - { - "epoch": 1.9597711754936336, - "grad_norm": 0.41302993387558545, - "learning_rate": 7.341497090306007e-05, - "loss": 0.7545, - "step": 3988 - }, - { - "epoch": 1.9602632712062498, - "grad_norm": 0.37239730229404544, - "learning_rate": 7.335561608145295e-05, - "loss": 0.6516, - "step": 3989 - }, - { - "epoch": 1.9607553669188658, - "grad_norm": 0.3832871384393314, - "learning_rate": 7.329627136224459e-05, - "loss": 0.729, - "step": 3990 - }, - { - "epoch": 1.9612474626314818, - "grad_norm": 0.3851560377513374, - "learning_rate": 7.323693676793593e-05, - "loss": 0.7525, - "step": 3991 - }, - { - "epoch": 1.961739558344098, - "grad_norm": 0.3827623689533237, - "learning_rate": 7.317761232102407e-05, - "loss": 0.6951, - "step": 3992 - }, - { - "epoch": 1.9622316540567142, - "grad_norm": 0.37162400806550305, - "learning_rate": 7.311829804400225e-05, - "loss": 0.7051, - "step": 3993 - }, - { - "epoch": 1.9627237497693302, - "grad_norm": 0.3600410323263964, - "learning_rate": 7.305899395935996e-05, - "loss": 0.6315, - "step": 3994 - }, - { - "epoch": 1.9632158454819462, - "grad_norm": 0.38586231066607346, - "learning_rate": 7.299970008958263e-05, - "loss": 0.7077, - "step": 3995 - }, - { - "epoch": 1.9637079411945624, - "grad_norm": 0.3790292346339804, - "learning_rate": 7.294041645715202e-05, - "loss": 0.6864, - "step": 3996 - }, - { - "epoch": 1.9642000369071786, - "grad_norm": 0.39286744520815214, - "learning_rate": 7.288114308454594e-05, - "loss": 0.7155, - "step": 3997 - }, - { - "epoch": 1.9646921326197946, - "grad_norm": 0.3888345515951847, - "learning_rate": 7.282187999423819e-05, - "loss": 0.7233, - "step": 3998 - }, - { - "epoch": 1.9651842283324106, - "grad_norm": 0.3587376427302989, - "learning_rate": 7.276262720869886e-05, - "loss": 0.6629, - "step": 3999 - }, - { - "epoch": 1.9656763240450268, - "grad_norm": 0.3660232540426246, - "learning_rate": 7.270338475039403e-05, - "loss": 0.6734, - "step": 4000 - }, - { - "epoch": 1.966168419757643, - "grad_norm": 0.3699736422164832, - "learning_rate": 7.264415264178584e-05, - "loss": 0.7058, - "step": 4001 - }, - { - "epoch": 1.966660515470259, - "grad_norm": 0.3883997786079121, - "learning_rate": 7.258493090533258e-05, - "loss": 0.7026, - "step": 4002 - }, - { - "epoch": 1.967152611182875, - "grad_norm": 0.36329654656760146, - "learning_rate": 7.252571956348857e-05, - "loss": 0.6876, - "step": 4003 - }, - { - "epoch": 1.9676447068954912, - "grad_norm": 0.3934790306062663, - "learning_rate": 7.246651863870419e-05, - "loss": 0.7143, - "step": 4004 - }, - { - "epoch": 1.9681368026081074, - "grad_norm": 0.3667473853117635, - "learning_rate": 7.240732815342586e-05, - "loss": 0.698, - "step": 4005 - }, - { - "epoch": 1.9686288983207234, - "grad_norm": 0.4013537302536723, - "learning_rate": 7.234814813009607e-05, - "loss": 0.6995, - "step": 4006 - }, - { - "epoch": 1.9691209940333394, - "grad_norm": 0.41091932113589824, - "learning_rate": 7.228897859115328e-05, - "loss": 0.6696, - "step": 4007 - }, - { - "epoch": 1.9696130897459556, - "grad_norm": 0.3665492709345682, - "learning_rate": 7.222981955903212e-05, - "loss": 0.6631, - "step": 4008 - }, - { - "epoch": 1.9701051854585718, - "grad_norm": 0.39965118014858425, - "learning_rate": 7.217067105616303e-05, - "loss": 0.6935, - "step": 4009 - }, - { - "epoch": 1.9705972811711878, - "grad_norm": 0.40863817469155167, - "learning_rate": 7.21115331049726e-05, - "loss": 0.7634, - "step": 4010 - }, - { - "epoch": 1.9710893768838038, - "grad_norm": 0.3957593416304825, - "learning_rate": 7.205240572788347e-05, - "loss": 0.6962, - "step": 4011 - }, - { - "epoch": 1.97158147259642, - "grad_norm": 0.4081654308335122, - "learning_rate": 7.199328894731405e-05, - "loss": 0.6983, - "step": 4012 - }, - { - "epoch": 1.9720735683090362, - "grad_norm": 0.39040802513376455, - "learning_rate": 7.193418278567896e-05, - "loss": 0.7076, - "step": 4013 - }, - { - "epoch": 1.9725656640216522, - "grad_norm": 0.4008045733607616, - "learning_rate": 7.187508726538868e-05, - "loss": 0.7273, - "step": 4014 - }, - { - "epoch": 1.9730577597342682, - "grad_norm": 0.38090762300162073, - "learning_rate": 7.181600240884964e-05, - "loss": 0.6886, - "step": 4015 - }, - { - "epoch": 1.9735498554468844, - "grad_norm": 0.37013891486174255, - "learning_rate": 7.17569282384643e-05, - "loss": 0.6878, - "step": 4016 - }, - { - "epoch": 1.9740419511595007, - "grad_norm": 0.38334532397209775, - "learning_rate": 7.169786477663107e-05, - "loss": 0.6934, - "step": 4017 - }, - { - "epoch": 1.9745340468721166, - "grad_norm": 0.3789189230561965, - "learning_rate": 7.163881204574416e-05, - "loss": 0.6783, - "step": 4018 - }, - { - "epoch": 1.9750261425847326, - "grad_norm": 0.36729184015092375, - "learning_rate": 7.157977006819389e-05, - "loss": 0.7182, - "step": 4019 - }, - { - "epoch": 1.9755182382973488, - "grad_norm": 0.3919399646984472, - "learning_rate": 7.152073886636644e-05, - "loss": 0.6721, - "step": 4020 - }, - { - "epoch": 1.976010334009965, - "grad_norm": 0.3762141305174364, - "learning_rate": 7.146171846264383e-05, - "loss": 0.706, - "step": 4021 - }, - { - "epoch": 1.976502429722581, - "grad_norm": 0.3764993270301451, - "learning_rate": 7.140270887940406e-05, - "loss": 0.703, - "step": 4022 - }, - { - "epoch": 1.976994525435197, - "grad_norm": 0.367886036134554, - "learning_rate": 7.134371013902106e-05, - "loss": 0.6753, - "step": 4023 - }, - { - "epoch": 1.9774866211478133, - "grad_norm": 0.3848459021046618, - "learning_rate": 7.128472226386455e-05, - "loss": 0.6684, - "step": 4024 - }, - { - "epoch": 1.9779787168604295, - "grad_norm": 0.39550051877055614, - "learning_rate": 7.122574527630021e-05, - "loss": 0.6941, - "step": 4025 - }, - { - "epoch": 1.9784708125730455, - "grad_norm": 0.38016213285326045, - "learning_rate": 7.116677919868954e-05, - "loss": 0.6346, - "step": 4026 - }, - { - "epoch": 1.9789629082856615, - "grad_norm": 0.3894246527271591, - "learning_rate": 7.110782405338999e-05, - "loss": 0.6878, - "step": 4027 - }, - { - "epoch": 1.9794550039982777, - "grad_norm": 0.4133635692174051, - "learning_rate": 7.10488798627547e-05, - "loss": 0.7232, - "step": 4028 - }, - { - "epoch": 1.9799470997108939, - "grad_norm": 0.40156391540366365, - "learning_rate": 7.098994664913287e-05, - "loss": 0.696, - "step": 4029 - }, - { - "epoch": 1.9804391954235099, - "grad_norm": 0.4015682966013915, - "learning_rate": 7.093102443486932e-05, - "loss": 0.6298, - "step": 4030 - }, - { - "epoch": 1.9809312911361259, - "grad_norm": 0.42187283731355024, - "learning_rate": 7.087211324230492e-05, - "loss": 0.7534, - "step": 4031 - }, - { - "epoch": 1.981423386848742, - "grad_norm": 0.40001052591898884, - "learning_rate": 7.081321309377615e-05, - "loss": 0.7422, - "step": 4032 - }, - { - "epoch": 1.9819154825613583, - "grad_norm": 0.4018306730114361, - "learning_rate": 7.075432401161541e-05, - "loss": 0.6845, - "step": 4033 - }, - { - "epoch": 1.9824075782739743, - "grad_norm": 0.4006821583956409, - "learning_rate": 7.069544601815099e-05, - "loss": 0.6996, - "step": 4034 - }, - { - "epoch": 1.9828996739865903, - "grad_norm": 0.4330809786874021, - "learning_rate": 7.063657913570678e-05, - "loss": 0.7038, - "step": 4035 - }, - { - "epoch": 1.9833917696992065, - "grad_norm": 0.3801958588467038, - "learning_rate": 7.05777233866026e-05, - "loss": 0.6982, - "step": 4036 - }, - { - "epoch": 1.9838838654118227, - "grad_norm": 0.3947886852134686, - "learning_rate": 7.0518878793154e-05, - "loss": 0.679, - "step": 4037 - }, - { - "epoch": 1.9843759611244387, - "grad_norm": 0.38529311731196236, - "learning_rate": 7.04600453776723e-05, - "loss": 0.6694, - "step": 4038 - }, - { - "epoch": 1.9848680568370547, - "grad_norm": 0.3706573317319643, - "learning_rate": 7.040122316246457e-05, - "loss": 0.6912, - "step": 4039 - }, - { - "epoch": 1.985360152549671, - "grad_norm": 0.3844736105471381, - "learning_rate": 7.034241216983373e-05, - "loss": 0.6732, - "step": 4040 - }, - { - "epoch": 1.9858522482622871, - "grad_norm": 0.3882127364280242, - "learning_rate": 7.028361242207826e-05, - "loss": 0.749, - "step": 4041 - }, - { - "epoch": 1.9863443439749031, - "grad_norm": 0.3815591229369493, - "learning_rate": 7.022482394149252e-05, - "loss": 0.6357, - "step": 4042 - }, - { - "epoch": 1.986836439687519, - "grad_norm": 0.36098206448794196, - "learning_rate": 7.016604675036664e-05, - "loss": 0.5993, - "step": 4043 - }, - { - "epoch": 1.9873285354001353, - "grad_norm": 0.37414818025183944, - "learning_rate": 7.010728087098627e-05, - "loss": 0.6918, - "step": 4044 - }, - { - "epoch": 1.9878206311127515, - "grad_norm": 0.3925281535031267, - "learning_rate": 7.004852632563294e-05, - "loss": 0.6839, - "step": 4045 - }, - { - "epoch": 1.9883127268253675, - "grad_norm": 0.42591815350432277, - "learning_rate": 6.998978313658391e-05, - "loss": 0.7108, - "step": 4046 - }, - { - "epoch": 1.9888048225379835, - "grad_norm": 0.3888517155200917, - "learning_rate": 6.993105132611192e-05, - "loss": 0.6691, - "step": 4047 - }, - { - "epoch": 1.9892969182505997, - "grad_norm": 0.42597976309535457, - "learning_rate": 6.987233091648563e-05, - "loss": 0.6791, - "step": 4048 - }, - { - "epoch": 1.989789013963216, - "grad_norm": 0.3580610621691536, - "learning_rate": 6.981362192996925e-05, - "loss": 0.6681, - "step": 4049 - }, - { - "epoch": 1.990281109675832, - "grad_norm": 0.3862945615520119, - "learning_rate": 6.97549243888227e-05, - "loss": 0.6996, - "step": 4050 - }, - { - "epoch": 1.990773205388448, - "grad_norm": 0.3946234123343332, - "learning_rate": 6.969623831530153e-05, - "loss": 0.6948, - "step": 4051 - }, - { - "epoch": 1.9912653011010641, - "grad_norm": 0.37424551565435027, - "learning_rate": 6.9637563731657e-05, - "loss": 0.6788, - "step": 4052 - }, - { - "epoch": 1.9917573968136804, - "grad_norm": 0.40721138742140944, - "learning_rate": 6.957890066013591e-05, - "loss": 0.6949, - "step": 4053 - }, - { - "epoch": 1.9922494925262964, - "grad_norm": 0.4124145540393745, - "learning_rate": 6.952024912298087e-05, - "loss": 0.7289, - "step": 4054 - }, - { - "epoch": 1.9927415882389123, - "grad_norm": 0.37934479535897286, - "learning_rate": 6.946160914242987e-05, - "loss": 0.6843, - "step": 4055 - }, - { - "epoch": 1.9932336839515286, - "grad_norm": 0.3968682923956738, - "learning_rate": 6.940298074071674e-05, - "loss": 0.6528, - "step": 4056 - }, - { - "epoch": 1.9937257796641448, - "grad_norm": 0.39753013317271285, - "learning_rate": 6.934436394007088e-05, - "loss": 0.6873, - "step": 4057 - }, - { - "epoch": 1.9942178753767608, - "grad_norm": 0.4000299686554215, - "learning_rate": 6.928575876271714e-05, - "loss": 0.6579, - "step": 4058 - }, - { - "epoch": 1.9947099710893768, - "grad_norm": 0.4166155384433662, - "learning_rate": 6.922716523087613e-05, - "loss": 0.713, - "step": 4059 - }, - { - "epoch": 1.995202066801993, - "grad_norm": 0.3893089265316482, - "learning_rate": 6.916858336676399e-05, - "loss": 0.7017, - "step": 4060 - }, - { - "epoch": 1.9956941625146092, - "grad_norm": 0.3715116087865237, - "learning_rate": 6.91100131925924e-05, - "loss": 0.6517, - "step": 4061 - }, - { - "epoch": 1.9961862582272252, - "grad_norm": 0.40996160393797904, - "learning_rate": 6.905145473056866e-05, - "loss": 0.6874, - "step": 4062 - }, - { - "epoch": 1.9966783539398412, - "grad_norm": 0.4342963666090466, - "learning_rate": 6.899290800289562e-05, - "loss": 0.6562, - "step": 4063 - }, - { - "epoch": 1.9971704496524574, - "grad_norm": 0.3842098815912704, - "learning_rate": 6.893437303177162e-05, - "loss": 0.6818, - "step": 4064 - }, - { - "epoch": 1.9976625453650736, - "grad_norm": 0.38851779653997165, - "learning_rate": 6.887584983939063e-05, - "loss": 0.6541, - "step": 4065 - }, - { - "epoch": 1.9981546410776896, - "grad_norm": 0.4006724039006498, - "learning_rate": 6.881733844794213e-05, - "loss": 0.7806, - "step": 4066 - }, - { - "epoch": 1.9986467367903056, - "grad_norm": 0.36687676440230677, - "learning_rate": 6.875883887961105e-05, - "loss": 0.6418, - "step": 4067 - }, - { - "epoch": 1.9991388325029218, - "grad_norm": 0.3740451711557724, - "learning_rate": 6.870035115657795e-05, - "loss": 0.697, - "step": 4068 - }, - { - "epoch": 1.9991388325029218, - "eval_loss": 0.7580295205116272, - "eval_runtime": 6652.6164, - "eval_samples_per_second": 4.284, - "eval_steps_per_second": 2.142, - "step": 4068 - }, - { - "epoch": 1.999630928215538, - "grad_norm": 0.3894925405323842, - "learning_rate": 6.864187530101886e-05, - "loss": 0.6709, - "step": 4069 - }, - { - "epoch": 2.000123023928154, - "grad_norm": 0.3755117315889235, - "learning_rate": 6.858341133510524e-05, - "loss": 0.6201, - "step": 4070 - }, - { - "epoch": 2.00061511964077, - "grad_norm": 0.4014505915180703, - "learning_rate": 6.852495928100416e-05, - "loss": 0.7471, - "step": 4071 - }, - { - "epoch": 2.001107215353386, - "grad_norm": 0.5370893198510371, - "learning_rate": 6.846651916087806e-05, - "loss": 0.7051, - "step": 4072 - }, - { - "epoch": 2.0015993110660024, - "grad_norm": 0.4014359983059753, - "learning_rate": 6.840809099688493e-05, - "loss": 0.7049, - "step": 4073 - }, - { - "epoch": 2.0020914067786184, - "grad_norm": 0.3743263437640882, - "learning_rate": 6.834967481117817e-05, - "loss": 0.7083, - "step": 4074 - }, - { - "epoch": 2.000122887864823, - "grad_norm": 0.3815542448086518, - "learning_rate": 6.829127062590676e-05, - "loss": 0.6448, - "step": 4075 - }, - { - "epoch": 2.0006144393241168, - "grad_norm": 0.6708925998664091, - "learning_rate": 6.823287846321495e-05, - "loss": 0.4752, - "step": 4076 - }, - { - "epoch": 2.0011059907834103, - "grad_norm": 0.5173586392729002, - "learning_rate": 6.817449834524256e-05, - "loss": 0.429, - "step": 4077 - }, - { - "epoch": 2.0015975422427035, - "grad_norm": 0.4707257381318483, - "learning_rate": 6.811613029412485e-05, - "loss": 0.4924, - "step": 4078 - }, - { - "epoch": 2.002089093701997, - "grad_norm": 0.5106438786967059, - "learning_rate": 6.805777433199237e-05, - "loss": 0.4694, - "step": 4079 - }, - { - "epoch": 2.00258064516129, - "grad_norm": 0.6033965393914016, - "learning_rate": 6.799943048097129e-05, - "loss": 0.4622, - "step": 4080 - }, - { - "epoch": 2.003072196620584, - "grad_norm": 0.6244085622434824, - "learning_rate": 6.794109876318297e-05, - "loss": 0.4839, - "step": 4081 - }, - { - "epoch": 2.003563748079877, - "grad_norm": 0.4313257222073283, - "learning_rate": 6.788277920074433e-05, - "loss": 0.4298, - "step": 4082 - }, - { - "epoch": 2.0040552995391705, - "grad_norm": 0.42524213292992213, - "learning_rate": 6.782447181576763e-05, - "loss": 0.4372, - "step": 4083 - }, - { - "epoch": 2.004546850998464, - "grad_norm": 0.474606247022978, - "learning_rate": 6.77661766303605e-05, - "loss": 0.4498, - "step": 4084 - }, - { - "epoch": 2.005038402457757, - "grad_norm": 0.42730997305705487, - "learning_rate": 6.770789366662594e-05, - "loss": 0.4186, - "step": 4085 - }, - { - "epoch": 2.005529953917051, - "grad_norm": 0.4790604818251201, - "learning_rate": 6.764962294666237e-05, - "loss": 0.4768, - "step": 4086 - }, - { - "epoch": 2.006021505376344, - "grad_norm": 0.4358449485807311, - "learning_rate": 6.759136449256348e-05, - "loss": 0.4195, - "step": 4087 - }, - { - "epoch": 2.0065130568356375, - "grad_norm": 0.4194938080241657, - "learning_rate": 6.753311832641837e-05, - "loss": 0.4529, - "step": 4088 - }, - { - "epoch": 2.0070046082949307, - "grad_norm": 0.4863607881419129, - "learning_rate": 6.747488447031154e-05, - "loss": 0.4839, - "step": 4089 - }, - { - "epoch": 2.0074961597542242, - "grad_norm": 0.4160476895074089, - "learning_rate": 6.741666294632263e-05, - "loss": 0.4004, - "step": 4090 - }, - { - "epoch": 2.007987711213518, - "grad_norm": 0.4391113923613935, - "learning_rate": 6.735845377652679e-05, - "loss": 0.4228, - "step": 4091 - }, - { - "epoch": 2.008479262672811, - "grad_norm": 0.4693568766600367, - "learning_rate": 6.730025698299446e-05, - "loss": 0.4587, - "step": 4092 - }, - { - "epoch": 2.0089708141321045, - "grad_norm": 0.4813398186533998, - "learning_rate": 6.724207258779128e-05, - "loss": 0.4913, - "step": 4093 - }, - { - "epoch": 2.0094623655913977, - "grad_norm": 0.42630600115753076, - "learning_rate": 6.718390061297829e-05, - "loss": 0.4622, - "step": 4094 - }, - { - "epoch": 2.0099539170506913, - "grad_norm": 0.39257238097213143, - "learning_rate": 6.712574108061179e-05, - "loss": 0.4041, - "step": 4095 - }, - { - "epoch": 2.010445468509985, - "grad_norm": 0.4074983610993362, - "learning_rate": 6.706759401274334e-05, - "loss": 0.4451, - "step": 4096 - }, - { - "epoch": 2.010937019969278, - "grad_norm": 0.43739407714550654, - "learning_rate": 6.700945943141981e-05, - "loss": 0.4827, - "step": 4097 - }, - { - "epoch": 2.0114285714285716, - "grad_norm": 0.4085917973021729, - "learning_rate": 6.695133735868335e-05, - "loss": 0.4069, - "step": 4098 - }, - { - "epoch": 2.0119201228878647, - "grad_norm": 0.4208115463076437, - "learning_rate": 6.689322781657126e-05, - "loss": 0.4232, - "step": 4099 - }, - { - "epoch": 2.0124116743471583, - "grad_norm": 0.4510441341424042, - "learning_rate": 6.683513082711622e-05, - "loss": 0.5122, - "step": 4100 - }, - { - "epoch": 2.0129032258064514, - "grad_norm": 0.40602687554070094, - "learning_rate": 6.677704641234614e-05, - "loss": 0.4196, - "step": 4101 - }, - { - "epoch": 2.013394777265745, - "grad_norm": 0.4322923711717843, - "learning_rate": 6.671897459428403e-05, - "loss": 0.4353, - "step": 4102 - }, - { - "epoch": 2.0138863287250386, - "grad_norm": 0.4294115463406626, - "learning_rate": 6.666091539494828e-05, - "loss": 0.4143, - "step": 4103 - }, - { - "epoch": 2.0143778801843317, - "grad_norm": 0.4101529983890213, - "learning_rate": 6.660286883635236e-05, - "loss": 0.4321, - "step": 4104 - }, - { - "epoch": 2.0148694316436253, - "grad_norm": 0.4249185882895841, - "learning_rate": 6.65448349405051e-05, - "loss": 0.4331, - "step": 4105 - }, - { - "epoch": 2.0153609831029184, - "grad_norm": 0.4258412576860847, - "learning_rate": 6.648681372941038e-05, - "loss": 0.4293, - "step": 4106 - }, - { - "epoch": 2.015852534562212, - "grad_norm": 0.3912177759782342, - "learning_rate": 6.642880522506736e-05, - "loss": 0.4505, - "step": 4107 - }, - { - "epoch": 2.016344086021505, - "grad_norm": 0.41628918720304675, - "learning_rate": 6.637080944947036e-05, - "loss": 0.4119, - "step": 4108 - }, - { - "epoch": 2.0168356374807987, - "grad_norm": 0.3965045620637507, - "learning_rate": 6.631282642460889e-05, - "loss": 0.4146, - "step": 4109 - }, - { - "epoch": 2.0173271889400923, - "grad_norm": 0.42006382185977537, - "learning_rate": 6.625485617246756e-05, - "loss": 0.4523, - "step": 4110 - }, - { - "epoch": 2.0178187403993855, - "grad_norm": 0.4247212776906086, - "learning_rate": 6.619689871502619e-05, - "loss": 0.4342, - "step": 4111 - }, - { - "epoch": 2.018310291858679, - "grad_norm": 0.4275327236859406, - "learning_rate": 6.613895407425982e-05, - "loss": 0.4481, - "step": 4112 - }, - { - "epoch": 2.018801843317972, - "grad_norm": 0.3919230359262144, - "learning_rate": 6.608102227213843e-05, - "loss": 0.4319, - "step": 4113 - }, - { - "epoch": 2.0192933947772658, - "grad_norm": 0.4050965827800886, - "learning_rate": 6.602310333062735e-05, - "loss": 0.4438, - "step": 4114 - }, - { - "epoch": 2.0197849462365594, - "grad_norm": 0.4114104262486101, - "learning_rate": 6.596519727168692e-05, - "loss": 0.4382, - "step": 4115 - }, - { - "epoch": 2.0202764976958525, - "grad_norm": 0.3786829710539903, - "learning_rate": 6.590730411727259e-05, - "loss": 0.4309, - "step": 4116 - }, - { - "epoch": 2.020768049155146, - "grad_norm": 0.4706133872636162, - "learning_rate": 6.584942388933497e-05, - "loss": 0.4708, - "step": 4117 - }, - { - "epoch": 2.021259600614439, - "grad_norm": 0.3900931651516358, - "learning_rate": 6.579155660981973e-05, - "loss": 0.3856, - "step": 4118 - }, - { - "epoch": 2.021751152073733, - "grad_norm": 0.4139423552177058, - "learning_rate": 6.573370230066763e-05, - "loss": 0.4211, - "step": 4119 - }, - { - "epoch": 2.022242703533026, - "grad_norm": 0.41903910589701315, - "learning_rate": 6.567586098381451e-05, - "loss": 0.4387, - "step": 4120 - }, - { - "epoch": 2.0227342549923195, - "grad_norm": 0.427809401375755, - "learning_rate": 6.561803268119137e-05, - "loss": 0.4286, - "step": 4121 - }, - { - "epoch": 2.023225806451613, - "grad_norm": 0.3994268707226468, - "learning_rate": 6.55602174147241e-05, - "loss": 0.4321, - "step": 4122 - }, - { - "epoch": 2.0237173579109062, - "grad_norm": 0.39016984389351117, - "learning_rate": 6.55024152063338e-05, - "loss": 0.4261, - "step": 4123 - }, - { - "epoch": 2.0242089093702, - "grad_norm": 0.4411645189937886, - "learning_rate": 6.544462607793662e-05, - "loss": 0.4278, - "step": 4124 - }, - { - "epoch": 2.024700460829493, - "grad_norm": 0.4017328097731629, - "learning_rate": 6.538685005144361e-05, - "loss": 0.4039, - "step": 4125 - }, - { - "epoch": 2.0251920122887865, - "grad_norm": 0.42563631870851926, - "learning_rate": 6.532908714876098e-05, - "loss": 0.428, - "step": 4126 - }, - { - "epoch": 2.0256835637480797, - "grad_norm": 0.4075830650325921, - "learning_rate": 6.527133739178997e-05, - "loss": 0.4409, - "step": 4127 - }, - { - "epoch": 2.0261751152073733, - "grad_norm": 0.43540133618542143, - "learning_rate": 6.521360080242672e-05, - "loss": 0.456, - "step": 4128 - }, - { - "epoch": 2.026666666666667, - "grad_norm": 0.439879167469121, - "learning_rate": 6.515587740256249e-05, - "loss": 0.4744, - "step": 4129 - }, - { - "epoch": 2.02715821812596, - "grad_norm": 0.411680677061892, - "learning_rate": 6.509816721408349e-05, - "loss": 0.4405, - "step": 4130 - }, - { - "epoch": 2.0276497695852536, - "grad_norm": 0.394497213937786, - "learning_rate": 6.504047025887091e-05, - "loss": 0.4281, - "step": 4131 - }, - { - "epoch": 2.0281413210445467, - "grad_norm": 0.4120193361553592, - "learning_rate": 6.498278655880098e-05, - "loss": 0.4166, - "step": 4132 - }, - { - "epoch": 2.0286328725038403, - "grad_norm": 0.4346158339098155, - "learning_rate": 6.492511613574481e-05, - "loss": 0.4342, - "step": 4133 - }, - { - "epoch": 2.029124423963134, - "grad_norm": 0.4439111452882697, - "learning_rate": 6.486745901156857e-05, - "loss": 0.4336, - "step": 4134 - }, - { - "epoch": 2.029615975422427, - "grad_norm": 0.4208865777916461, - "learning_rate": 6.480981520813339e-05, - "loss": 0.4369, - "step": 4135 - }, - { - "epoch": 2.0301075268817206, - "grad_norm": 0.4384899856213821, - "learning_rate": 6.475218474729521e-05, - "loss": 0.4465, - "step": 4136 - }, - { - "epoch": 2.0305990783410137, - "grad_norm": 0.4295474685785935, - "learning_rate": 6.469456765090507e-05, - "loss": 0.4473, - "step": 4137 - }, - { - "epoch": 2.0310906298003073, - "grad_norm": 0.4261412429004471, - "learning_rate": 6.463696394080892e-05, - "loss": 0.4405, - "step": 4138 - }, - { - "epoch": 2.0315821812596004, - "grad_norm": 0.4498202985404661, - "learning_rate": 6.457937363884752e-05, - "loss": 0.4515, - "step": 4139 - }, - { - "epoch": 2.032073732718894, - "grad_norm": 0.43715641660298266, - "learning_rate": 6.452179676685666e-05, - "loss": 0.4435, - "step": 4140 - }, - { - "epoch": 2.0325652841781876, - "grad_norm": 0.4286604499497547, - "learning_rate": 6.446423334666702e-05, - "loss": 0.4887, - "step": 4141 - }, - { - "epoch": 2.0330568356374807, - "grad_norm": 0.420967234444815, - "learning_rate": 6.440668340010412e-05, - "loss": 0.457, - "step": 4142 - }, - { - "epoch": 2.0335483870967743, - "grad_norm": 0.415186780716354, - "learning_rate": 6.434914694898841e-05, - "loss": 0.4173, - "step": 4143 - }, - { - "epoch": 2.0340399385560675, - "grad_norm": 0.4137964320774288, - "learning_rate": 6.42916240151353e-05, - "loss": 0.4587, - "step": 4144 - }, - { - "epoch": 2.034531490015361, - "grad_norm": 0.41415684066741026, - "learning_rate": 6.42341146203549e-05, - "loss": 0.4821, - "step": 4145 - }, - { - "epoch": 2.035023041474654, - "grad_norm": 0.39270470579534494, - "learning_rate": 6.417661878645234e-05, - "loss": 0.4483, - "step": 4146 - }, - { - "epoch": 2.0355145929339478, - "grad_norm": 0.40117532015060886, - "learning_rate": 6.411913653522757e-05, - "loss": 0.4819, - "step": 4147 - }, - { - "epoch": 2.0360061443932413, - "grad_norm": 0.42198643835245975, - "learning_rate": 6.406166788847531e-05, - "loss": 0.4686, - "step": 4148 - }, - { - "epoch": 2.0364976958525345, - "grad_norm": 0.42417085484863365, - "learning_rate": 6.400421286798526e-05, - "loss": 0.4558, - "step": 4149 - }, - { - "epoch": 2.036989247311828, - "grad_norm": 0.40938232295129534, - "learning_rate": 6.394677149554188e-05, - "loss": 0.3957, - "step": 4150 - }, - { - "epoch": 2.037480798771121, - "grad_norm": 0.4081222904677301, - "learning_rate": 6.38893437929244e-05, - "loss": 0.4292, - "step": 4151 - }, - { - "epoch": 2.037972350230415, - "grad_norm": 0.4285141261745488, - "learning_rate": 6.383192978190695e-05, - "loss": 0.4418, - "step": 4152 - }, - { - "epoch": 2.038463901689708, - "grad_norm": 0.4222072895102476, - "learning_rate": 6.377452948425844e-05, - "loss": 0.4238, - "step": 4153 - }, - { - "epoch": 2.0389554531490015, - "grad_norm": 0.4457859828251274, - "learning_rate": 6.371714292174257e-05, - "loss": 0.4727, - "step": 4154 - }, - { - "epoch": 2.039447004608295, - "grad_norm": 0.4138136085922726, - "learning_rate": 6.36597701161179e-05, - "loss": 0.438, - "step": 4155 - }, - { - "epoch": 2.0399385560675882, - "grad_norm": 0.4219842851486012, - "learning_rate": 6.360241108913763e-05, - "loss": 0.4429, - "step": 4156 - }, - { - "epoch": 2.040430107526882, - "grad_norm": 0.3948076248943635, - "learning_rate": 6.354506586254985e-05, - "loss": 0.429, - "step": 4157 - }, - { - "epoch": 2.040921658986175, - "grad_norm": 0.40035883095024877, - "learning_rate": 6.348773445809747e-05, - "loss": 0.4521, - "step": 4158 - }, - { - "epoch": 2.0414132104454685, - "grad_norm": 0.4017427790558212, - "learning_rate": 6.343041689751798e-05, - "loss": 0.4331, - "step": 4159 - }, - { - "epoch": 2.041904761904762, - "grad_norm": 0.3954016603152219, - "learning_rate": 6.337311320254375e-05, - "loss": 0.3992, - "step": 4160 - }, - { - "epoch": 2.0423963133640552, - "grad_norm": 0.45375102420223135, - "learning_rate": 6.331582339490192e-05, - "loss": 0.4469, - "step": 4161 - }, - { - "epoch": 2.042887864823349, - "grad_norm": 0.4123656788480944, - "learning_rate": 6.325854749631423e-05, - "loss": 0.4169, - "step": 4162 - }, - { - "epoch": 2.043379416282642, - "grad_norm": 0.3974255647849046, - "learning_rate": 6.320128552849728e-05, - "loss": 0.4142, - "step": 4163 - }, - { - "epoch": 2.0438709677419356, - "grad_norm": 0.42228387487327623, - "learning_rate": 6.314403751316231e-05, - "loss": 0.4263, - "step": 4164 - }, - { - "epoch": 2.0443625192012287, - "grad_norm": 0.43566106954297074, - "learning_rate": 6.30868034720153e-05, - "loss": 0.4418, - "step": 4165 - }, - { - "epoch": 2.0448540706605223, - "grad_norm": 0.4506847981089266, - "learning_rate": 6.30295834267569e-05, - "loss": 0.4457, - "step": 4166 - }, - { - "epoch": 2.045345622119816, - "grad_norm": 0.368505391234191, - "learning_rate": 6.297237739908253e-05, - "loss": 0.3901, - "step": 4167 - }, - { - "epoch": 2.045837173579109, - "grad_norm": 0.39309283243786297, - "learning_rate": 6.291518541068217e-05, - "loss": 0.4152, - "step": 4168 - }, - { - "epoch": 2.0463287250384026, - "grad_norm": 0.412713494958705, - "learning_rate": 6.285800748324062e-05, - "loss": 0.4032, - "step": 4169 - }, - { - "epoch": 2.0468202764976957, - "grad_norm": 0.433805358942553, - "learning_rate": 6.280084363843726e-05, - "loss": 0.4546, - "step": 4170 - }, - { - "epoch": 2.0473118279569893, - "grad_norm": 0.4054641455222668, - "learning_rate": 6.274369389794612e-05, - "loss": 0.4543, - "step": 4171 - }, - { - "epoch": 2.0478033794162824, - "grad_norm": 0.4479040896677571, - "learning_rate": 6.268655828343591e-05, - "loss": 0.4247, - "step": 4172 - }, - { - "epoch": 2.048294930875576, - "grad_norm": 0.43764567536186005, - "learning_rate": 6.262943681657007e-05, - "loss": 0.4978, - "step": 4173 - }, - { - "epoch": 2.0487864823348696, - "grad_norm": 0.4114390085492157, - "learning_rate": 6.257232951900649e-05, - "loss": 0.4618, - "step": 4174 - }, - { - "epoch": 2.0492780337941627, - "grad_norm": 0.44117863226371595, - "learning_rate": 6.251523641239782e-05, - "loss": 0.4459, - "step": 4175 - }, - { - "epoch": 2.0497695852534563, - "grad_norm": 0.4221198902147319, - "learning_rate": 6.245815751839133e-05, - "loss": 0.4464, - "step": 4176 - }, - { - "epoch": 2.0502611367127495, - "grad_norm": 0.4102347342983765, - "learning_rate": 6.240109285862881e-05, - "loss": 0.4379, - "step": 4177 - }, - { - "epoch": 2.050752688172043, - "grad_norm": 0.4564328383403766, - "learning_rate": 6.23440424547468e-05, - "loss": 0.465, - "step": 4178 - }, - { - "epoch": 2.0512442396313366, - "grad_norm": 0.4385686966186336, - "learning_rate": 6.228700632837624e-05, - "loss": 0.4769, - "step": 4179 - }, - { - "epoch": 2.0517357910906298, - "grad_norm": 0.415179595540576, - "learning_rate": 6.222998450114283e-05, - "loss": 0.442, - "step": 4180 - }, - { - "epoch": 2.0522273425499233, - "grad_norm": 0.44994559714934196, - "learning_rate": 6.21729769946668e-05, - "loss": 0.4903, - "step": 4181 - }, - { - "epoch": 2.0527188940092165, - "grad_norm": 0.4293575390240816, - "learning_rate": 6.211598383056287e-05, - "loss": 0.4171, - "step": 4182 - }, - { - "epoch": 2.05321044546851, - "grad_norm": 0.42337786819357853, - "learning_rate": 6.20590050304404e-05, - "loss": 0.4465, - "step": 4183 - }, - { - "epoch": 2.053701996927803, - "grad_norm": 0.4287816136370031, - "learning_rate": 6.200204061590336e-05, - "loss": 0.4001, - "step": 4184 - }, - { - "epoch": 2.054193548387097, - "grad_norm": 0.44936952646255507, - "learning_rate": 6.19450906085501e-05, - "loss": 0.5181, - "step": 4185 - }, - { - "epoch": 2.0546850998463904, - "grad_norm": 0.4716491483591041, - "learning_rate": 6.188815502997367e-05, - "loss": 0.4952, - "step": 4186 - }, - { - "epoch": 2.0551766513056835, - "grad_norm": 0.41813981468267314, - "learning_rate": 6.183123390176154e-05, - "loss": 0.4456, - "step": 4187 - }, - { - "epoch": 2.055668202764977, - "grad_norm": 0.40200169789525003, - "learning_rate": 6.177432724549574e-05, - "loss": 0.4152, - "step": 4188 - }, - { - "epoch": 2.05615975422427, - "grad_norm": 0.4335209784535882, - "learning_rate": 6.171743508275283e-05, - "loss": 0.4635, - "step": 4189 - }, - { - "epoch": 2.056651305683564, - "grad_norm": 0.4479894470101658, - "learning_rate": 6.166055743510388e-05, - "loss": 0.4534, - "step": 4190 - }, - { - "epoch": 2.057142857142857, - "grad_norm": 0.39543833767960374, - "learning_rate": 6.160369432411438e-05, - "loss": 0.4368, - "step": 4191 - }, - { - "epoch": 2.0576344086021505, - "grad_norm": 0.42126283772504136, - "learning_rate": 6.15468457713444e-05, - "loss": 0.4553, - "step": 4192 - }, - { - "epoch": 2.058125960061444, - "grad_norm": 0.4179272616502525, - "learning_rate": 6.149001179834848e-05, - "loss": 0.4902, - "step": 4193 - }, - { - "epoch": 2.0586175115207372, - "grad_norm": 0.44317015319838116, - "learning_rate": 6.143319242667554e-05, - "loss": 0.4415, - "step": 4194 - }, - { - "epoch": 2.059109062980031, - "grad_norm": 0.4238443346146709, - "learning_rate": 6.137638767786906e-05, - "loss": 0.4234, - "step": 4195 - }, - { - "epoch": 2.059600614439324, - "grad_norm": 0.43151834583654336, - "learning_rate": 6.131959757346699e-05, - "loss": 0.4482, - "step": 4196 - }, - { - "epoch": 2.0600921658986175, - "grad_norm": 0.39480103202227185, - "learning_rate": 6.126282213500163e-05, - "loss": 0.4484, - "step": 4197 - }, - { - "epoch": 2.060583717357911, - "grad_norm": 0.42970638592110433, - "learning_rate": 6.120606138399977e-05, - "loss": 0.4377, - "step": 4198 - }, - { - "epoch": 2.0610752688172043, - "grad_norm": 0.47463432127306476, - "learning_rate": 6.114931534198268e-05, - "loss": 0.47, - "step": 4199 - }, - { - "epoch": 2.061566820276498, - "grad_norm": 0.4068639285001661, - "learning_rate": 6.109258403046593e-05, - "loss": 0.4311, - "step": 4200 - }, - { - "epoch": 2.062058371735791, - "grad_norm": 0.42315206102396913, - "learning_rate": 6.103586747095965e-05, - "loss": 0.505, - "step": 4201 - }, - { - "epoch": 2.0625499231950846, - "grad_norm": 0.41286570120147303, - "learning_rate": 6.097916568496831e-05, - "loss": 0.4386, - "step": 4202 - }, - { - "epoch": 2.0630414746543777, - "grad_norm": 0.4362737550070793, - "learning_rate": 6.092247869399073e-05, - "loss": 0.3975, - "step": 4203 - }, - { - "epoch": 2.0635330261136713, - "grad_norm": 0.39515381693685236, - "learning_rate": 6.086580651952021e-05, - "loss": 0.4324, - "step": 4204 - }, - { - "epoch": 2.064024577572965, - "grad_norm": 0.4517275023619289, - "learning_rate": 6.080914918304432e-05, - "loss": 0.4544, - "step": 4205 - }, - { - "epoch": 2.064516129032258, - "grad_norm": 0.42463570176493404, - "learning_rate": 6.0752506706045134e-05, - "loss": 0.449, - "step": 4206 - }, - { - "epoch": 2.0650076804915516, - "grad_norm": 0.4231749522154473, - "learning_rate": 6.069587910999905e-05, - "loss": 0.4624, - "step": 4207 - }, - { - "epoch": 2.0654992319508447, - "grad_norm": 0.40450809361098755, - "learning_rate": 6.063926641637674e-05, - "loss": 0.4774, - "step": 4208 - }, - { - "epoch": 2.0659907834101383, - "grad_norm": 0.42138351602512353, - "learning_rate": 6.058266864664335e-05, - "loss": 0.4626, - "step": 4209 - }, - { - "epoch": 2.0664823348694314, - "grad_norm": 0.4399847833526247, - "learning_rate": 6.052608582225827e-05, - "loss": 0.454, - "step": 4210 - }, - { - "epoch": 2.066973886328725, - "grad_norm": 0.3866471785643516, - "learning_rate": 6.0469517964675274e-05, - "loss": 0.3906, - "step": 4211 - }, - { - "epoch": 2.0674654377880186, - "grad_norm": 0.4091364352386822, - "learning_rate": 6.0412965095342425e-05, - "loss": 0.4211, - "step": 4212 - }, - { - "epoch": 2.0679569892473117, - "grad_norm": 0.4439567956775442, - "learning_rate": 6.035642723570218e-05, - "loss": 0.4562, - "step": 4213 - }, - { - "epoch": 2.0684485407066053, - "grad_norm": 0.42440742150518035, - "learning_rate": 6.02999044071912e-05, - "loss": 0.4332, - "step": 4214 - }, - { - "epoch": 2.0689400921658985, - "grad_norm": 0.44296274032268856, - "learning_rate": 6.02433966312405e-05, - "loss": 0.4364, - "step": 4215 - }, - { - "epoch": 2.069431643625192, - "grad_norm": 0.41404027269175936, - "learning_rate": 6.018690392927546e-05, - "loss": 0.4512, - "step": 4216 - }, - { - "epoch": 2.0699231950844856, - "grad_norm": 0.427461997274158, - "learning_rate": 6.013042632271556e-05, - "loss": 0.4112, - "step": 4217 - }, - { - "epoch": 2.0704147465437788, - "grad_norm": 0.4020576859023756, - "learning_rate": 6.0073963832974735e-05, - "loss": 0.422, - "step": 4218 - }, - { - "epoch": 2.0709062980030724, - "grad_norm": 0.4324412504665786, - "learning_rate": 6.001751648146115e-05, - "loss": 0.4869, - "step": 4219 - }, - { - "epoch": 2.0713978494623655, - "grad_norm": 0.45320776064796725, - "learning_rate": 5.996108428957713e-05, - "loss": 0.4673, - "step": 4220 - }, - { - "epoch": 2.071889400921659, - "grad_norm": 0.4075869212928584, - "learning_rate": 5.990466727871933e-05, - "loss": 0.3943, - "step": 4221 - }, - { - "epoch": 2.072380952380952, - "grad_norm": 0.43830291088572687, - "learning_rate": 5.984826547027871e-05, - "loss": 0.438, - "step": 4222 - }, - { - "epoch": 2.072872503840246, - "grad_norm": 0.4335520478543081, - "learning_rate": 5.9791878885640315e-05, - "loss": 0.4411, - "step": 4223 - }, - { - "epoch": 2.0733640552995394, - "grad_norm": 0.4044037516383188, - "learning_rate": 5.973550754618353e-05, - "loss": 0.4135, - "step": 4224 - }, - { - "epoch": 2.0738556067588325, - "grad_norm": 0.4175805564684879, - "learning_rate": 5.9679151473282e-05, - "loss": 0.4343, - "step": 4225 - }, - { - "epoch": 2.074347158218126, - "grad_norm": 0.4622345146053767, - "learning_rate": 5.96228106883034e-05, - "loss": 0.4488, - "step": 4226 - }, - { - "epoch": 2.0748387096774192, - "grad_norm": 0.4336592304310755, - "learning_rate": 5.956648521260979e-05, - "loss": 0.4369, - "step": 4227 - }, - { - "epoch": 2.075330261136713, - "grad_norm": 0.4375432816128757, - "learning_rate": 5.951017506755732e-05, - "loss": 0.4305, - "step": 4228 - }, - { - "epoch": 2.075821812596006, - "grad_norm": 0.4100065792649927, - "learning_rate": 5.945388027449636e-05, - "loss": 0.4491, - "step": 4229 - }, - { - "epoch": 2.0763133640552995, - "grad_norm": 0.4225142824113446, - "learning_rate": 5.939760085477155e-05, - "loss": 0.4647, - "step": 4230 - }, - { - "epoch": 2.076804915514593, - "grad_norm": 0.4615051490534944, - "learning_rate": 5.93413368297215e-05, - "loss": 0.4471, - "step": 4231 - }, - { - "epoch": 2.0772964669738863, - "grad_norm": 0.4386461472128062, - "learning_rate": 5.928508822067914e-05, - "loss": 0.4507, - "step": 4232 - }, - { - "epoch": 2.07778801843318, - "grad_norm": 0.43290493152027015, - "learning_rate": 5.922885504897153e-05, - "loss": 0.4214, - "step": 4233 - }, - { - "epoch": 2.078279569892473, - "grad_norm": 0.45573929418569736, - "learning_rate": 5.9172637335919834e-05, - "loss": 0.4898, - "step": 4234 - }, - { - "epoch": 2.0787711213517666, - "grad_norm": 0.40594113040743607, - "learning_rate": 5.911643510283937e-05, - "loss": 0.4013, - "step": 4235 - }, - { - "epoch": 2.0792626728110597, - "grad_norm": 0.41906807843682553, - "learning_rate": 5.906024837103965e-05, - "loss": 0.3846, - "step": 4236 - }, - { - "epoch": 2.0797542242703533, - "grad_norm": 0.40930435408629845, - "learning_rate": 5.900407716182418e-05, - "loss": 0.4135, - "step": 4237 - }, - { - "epoch": 2.080245775729647, - "grad_norm": 0.4326377182972461, - "learning_rate": 5.894792149649069e-05, - "loss": 0.4548, - "step": 4238 - }, - { - "epoch": 2.08073732718894, - "grad_norm": 0.4391786320177373, - "learning_rate": 5.889178139633101e-05, - "loss": 0.4576, - "step": 4239 - }, - { - "epoch": 2.0812288786482336, - "grad_norm": 0.44145357856165895, - "learning_rate": 5.883565688263099e-05, - "loss": 0.46, - "step": 4240 - }, - { - "epoch": 2.0817204301075267, - "grad_norm": 0.4263744073324838, - "learning_rate": 5.8779547976670625e-05, - "loss": 0.462, - "step": 4241 - }, - { - "epoch": 2.0822119815668203, - "grad_norm": 0.43403947901842116, - "learning_rate": 5.872345469972405e-05, - "loss": 0.4728, - "step": 4242 - }, - { - "epoch": 2.082703533026114, - "grad_norm": 0.40056615005581947, - "learning_rate": 5.866737707305935e-05, - "loss": 0.4019, - "step": 4243 - }, - { - "epoch": 2.083195084485407, - "grad_norm": 0.41796871662512924, - "learning_rate": 5.861131511793871e-05, - "loss": 0.4351, - "step": 4244 - }, - { - "epoch": 2.0836866359447006, - "grad_norm": 0.4335772808148304, - "learning_rate": 5.8555268855618504e-05, - "loss": 0.4506, - "step": 4245 - }, - { - "epoch": 2.0841781874039937, - "grad_norm": 0.41647793325665344, - "learning_rate": 5.849923830734895e-05, - "loss": 0.4385, - "step": 4246 - }, - { - "epoch": 2.0846697388632873, - "grad_norm": 0.44367157856839123, - "learning_rate": 5.844322349437443e-05, - "loss": 0.4189, - "step": 4247 - }, - { - "epoch": 2.0851612903225805, - "grad_norm": 0.4168511883626321, - "learning_rate": 5.8387224437933416e-05, - "loss": 0.4384, - "step": 4248 - }, - { - "epoch": 2.085652841781874, - "grad_norm": 0.4206723111717998, - "learning_rate": 5.8331241159258254e-05, - "loss": 0.4734, - "step": 4249 - }, - { - "epoch": 2.0861443932411676, - "grad_norm": 0.4906305343165687, - "learning_rate": 5.827527367957536e-05, - "loss": 0.4691, - "step": 4250 - }, - { - "epoch": 2.0866359447004608, - "grad_norm": 0.4187927938525607, - "learning_rate": 5.82193220201053e-05, - "loss": 0.4539, - "step": 4251 - }, - { - "epoch": 2.0871274961597543, - "grad_norm": 0.42085613310255154, - "learning_rate": 5.816338620206239e-05, - "loss": 0.3937, - "step": 4252 - }, - { - "epoch": 2.0876190476190475, - "grad_norm": 0.40741924993899514, - "learning_rate": 5.8107466246655194e-05, - "loss": 0.4158, - "step": 4253 - }, - { - "epoch": 2.088110599078341, - "grad_norm": 0.42475115806634434, - "learning_rate": 5.805156217508601e-05, - "loss": 0.4309, - "step": 4254 - }, - { - "epoch": 2.088602150537634, - "grad_norm": 0.44669438770800324, - "learning_rate": 5.799567400855136e-05, - "loss": 0.4448, - "step": 4255 - }, - { - "epoch": 2.089093701996928, - "grad_norm": 0.42259855388429474, - "learning_rate": 5.793980176824158e-05, - "loss": 0.437, - "step": 4256 - }, - { - "epoch": 2.0895852534562214, - "grad_norm": 0.4128000188437674, - "learning_rate": 5.7883945475341005e-05, - "loss": 0.3984, - "step": 4257 - }, - { - "epoch": 2.0900768049155145, - "grad_norm": 0.45715433167600705, - "learning_rate": 5.78281051510279e-05, - "loss": 0.4523, - "step": 4258 - }, - { - "epoch": 2.090568356374808, - "grad_norm": 0.45573679362133407, - "learning_rate": 5.777228081647461e-05, - "loss": 0.4736, - "step": 4259 - }, - { - "epoch": 2.0910599078341012, - "grad_norm": 0.4365464346634226, - "learning_rate": 5.771647249284715e-05, - "loss": 0.4547, - "step": 4260 - }, - { - "epoch": 2.091551459293395, - "grad_norm": 0.41296022755027106, - "learning_rate": 5.766068020130575e-05, - "loss": 0.4309, - "step": 4261 - }, - { - "epoch": 2.092043010752688, - "grad_norm": 0.44935687469527485, - "learning_rate": 5.760490396300443e-05, - "loss": 0.4329, - "step": 4262 - }, - { - "epoch": 2.0925345622119815, - "grad_norm": 0.3894919375689829, - "learning_rate": 5.754914379909102e-05, - "loss": 0.4411, - "step": 4263 - }, - { - "epoch": 2.093026113671275, - "grad_norm": 0.41443354525864307, - "learning_rate": 5.7493399730707464e-05, - "loss": 0.4492, - "step": 4264 - }, - { - "epoch": 2.0935176651305683, - "grad_norm": 0.41810900486078467, - "learning_rate": 5.743767177898948e-05, - "loss": 0.4372, - "step": 4265 - }, - { - "epoch": 2.094009216589862, - "grad_norm": 0.4189025385705837, - "learning_rate": 5.73819599650667e-05, - "loss": 0.4746, - "step": 4266 - }, - { - "epoch": 2.094500768049155, - "grad_norm": 0.43034018217263537, - "learning_rate": 5.7326264310062585e-05, - "loss": 0.413, - "step": 4267 - }, - { - "epoch": 2.0949923195084486, - "grad_norm": 0.39978423275874964, - "learning_rate": 5.727058483509463e-05, - "loss": 0.4372, - "step": 4268 - }, - { - "epoch": 2.095483870967742, - "grad_norm": 0.41270926386333584, - "learning_rate": 5.7214921561273945e-05, - "loss": 0.4441, - "step": 4269 - }, - { - "epoch": 2.0959754224270353, - "grad_norm": 0.4279187141373669, - "learning_rate": 5.715927450970577e-05, - "loss": 0.434, - "step": 4270 - }, - { - "epoch": 2.096466973886329, - "grad_norm": 0.4362070618606098, - "learning_rate": 5.7103643701488995e-05, - "loss": 0.4513, - "step": 4271 - }, - { - "epoch": 2.096958525345622, - "grad_norm": 0.46100877295110176, - "learning_rate": 5.704802915771642e-05, - "loss": 0.447, - "step": 4272 - }, - { - "epoch": 2.0974500768049156, - "grad_norm": 0.4176842880306042, - "learning_rate": 5.6992430899474684e-05, - "loss": 0.4289, - "step": 4273 - }, - { - "epoch": 2.0979416282642087, - "grad_norm": 0.41357265167043994, - "learning_rate": 5.6936848947844245e-05, - "loss": 0.4098, - "step": 4274 - }, - { - "epoch": 2.0984331797235023, - "grad_norm": 0.403733583839532, - "learning_rate": 5.688128332389937e-05, - "loss": 0.4539, - "step": 4275 - }, - { - "epoch": 2.098924731182796, - "grad_norm": 0.4051883313467419, - "learning_rate": 5.6825734048708155e-05, - "loss": 0.435, - "step": 4276 - }, - { - "epoch": 2.099416282642089, - "grad_norm": 0.4173185495282762, - "learning_rate": 5.6770201143332466e-05, - "loss": 0.4432, - "step": 4277 - }, - { - "epoch": 2.0999078341013826, - "grad_norm": 0.38956088264333477, - "learning_rate": 5.671468462882796e-05, - "loss": 0.4007, - "step": 4278 - }, - { - "epoch": 2.1003993855606757, - "grad_norm": 0.40783052142404524, - "learning_rate": 5.66591845262442e-05, - "loss": 0.4433, - "step": 4279 - }, - { - "epoch": 2.1008909370199693, - "grad_norm": 0.39415449939366926, - "learning_rate": 5.6603700856624276e-05, - "loss": 0.3917, - "step": 4280 - }, - { - "epoch": 2.1013824884792625, - "grad_norm": 0.4366003112326132, - "learning_rate": 5.654823364100532e-05, - "loss": 0.4607, - "step": 4281 - }, - { - "epoch": 2.101874039938556, - "grad_norm": 0.44813445324718815, - "learning_rate": 5.649278290041806e-05, - "loss": 0.4582, - "step": 4282 - }, - { - "epoch": 2.1023655913978496, - "grad_norm": 0.5816753446345644, - "learning_rate": 5.6437348655887014e-05, - "loss": 0.4849, - "step": 4283 - }, - { - "epoch": 2.1028571428571428, - "grad_norm": 0.404515910416168, - "learning_rate": 5.6381930928430474e-05, - "loss": 0.432, - "step": 4284 - }, - { - "epoch": 2.1033486943164363, - "grad_norm": 0.4720776180458684, - "learning_rate": 5.632652973906041e-05, - "loss": 0.4476, - "step": 4285 - }, - { - "epoch": 2.1038402457757295, - "grad_norm": 0.4085662152502483, - "learning_rate": 5.627114510878257e-05, - "loss": 0.4481, - "step": 4286 - }, - { - "epoch": 2.104331797235023, - "grad_norm": 0.41486082305016303, - "learning_rate": 5.6215777058596406e-05, - "loss": 0.4339, - "step": 4287 - }, - { - "epoch": 2.1048233486943166, - "grad_norm": 0.43919763438895254, - "learning_rate": 5.616042560949517e-05, - "loss": 0.4524, - "step": 4288 - }, - { - "epoch": 2.10531490015361, - "grad_norm": 0.4467773296703719, - "learning_rate": 5.6105090782465596e-05, - "loss": 0.4481, - "step": 4289 - }, - { - "epoch": 2.1058064516129034, - "grad_norm": 0.4035063773503261, - "learning_rate": 5.6049772598488385e-05, - "loss": 0.3952, - "step": 4290 - }, - { - "epoch": 2.1062980030721965, - "grad_norm": 0.40984972985964835, - "learning_rate": 5.5994471078537736e-05, - "loss": 0.3756, - "step": 4291 - }, - { - "epoch": 2.10678955453149, - "grad_norm": 0.4430332216262048, - "learning_rate": 5.5939186243581607e-05, - "loss": 0.4398, - "step": 4292 - }, - { - "epoch": 2.107281105990783, - "grad_norm": 0.7517599704205895, - "learning_rate": 5.5883918114581634e-05, - "loss": 0.4649, - "step": 4293 - }, - { - "epoch": 2.107772657450077, - "grad_norm": 0.42219048420753424, - "learning_rate": 5.58286667124931e-05, - "loss": 0.4528, - "step": 4294 - }, - { - "epoch": 2.1082642089093704, - "grad_norm": 0.41393203240996357, - "learning_rate": 5.577343205826492e-05, - "loss": 0.4224, - "step": 4295 - }, - { - "epoch": 2.1087557603686635, - "grad_norm": 0.43523521388805325, - "learning_rate": 5.5718214172839664e-05, - "loss": 0.449, - "step": 4296 - }, - { - "epoch": 2.109247311827957, - "grad_norm": 0.4588148374271314, - "learning_rate": 5.5663013077153705e-05, - "loss": 0.4492, - "step": 4297 - }, - { - "epoch": 2.1097388632872502, - "grad_norm": 0.4390774467423553, - "learning_rate": 5.560782879213673e-05, - "loss": 0.4627, - "step": 4298 - }, - { - "epoch": 2.110230414746544, - "grad_norm": 0.42403359582377603, - "learning_rate": 5.5552661338712356e-05, - "loss": 0.4335, - "step": 4299 - }, - { - "epoch": 2.110721966205837, - "grad_norm": 0.3893738533159988, - "learning_rate": 5.549751073779768e-05, - "loss": 0.3949, - "step": 4300 - }, - { - "epoch": 2.1112135176651305, - "grad_norm": 0.40235654616974886, - "learning_rate": 5.544237701030339e-05, - "loss": 0.4208, - "step": 4301 - }, - { - "epoch": 2.111705069124424, - "grad_norm": 0.40588728212682634, - "learning_rate": 5.538726017713385e-05, - "loss": 0.414, - "step": 4302 - }, - { - "epoch": 2.1121966205837173, - "grad_norm": 0.4347137345119711, - "learning_rate": 5.533216025918695e-05, - "loss": 0.4534, - "step": 4303 - }, - { - "epoch": 2.112688172043011, - "grad_norm": 0.4064793647100823, - "learning_rate": 5.527707727735416e-05, - "loss": 0.4545, - "step": 4304 - }, - { - "epoch": 2.113179723502304, - "grad_norm": 0.407632446198486, - "learning_rate": 5.522201125252071e-05, - "loss": 0.4317, - "step": 4305 - }, - { - "epoch": 2.1136712749615976, - "grad_norm": 0.37832562786643037, - "learning_rate": 5.5166962205565076e-05, - "loss": 0.4069, - "step": 4306 - }, - { - "epoch": 2.114162826420891, - "grad_norm": 0.4134656782296033, - "learning_rate": 5.511193015735962e-05, - "loss": 0.4334, - "step": 4307 - }, - { - "epoch": 2.1146543778801843, - "grad_norm": 0.4373465449969304, - "learning_rate": 5.505691512877007e-05, - "loss": 0.4182, - "step": 4308 - }, - { - "epoch": 2.115145929339478, - "grad_norm": 0.44125981660801755, - "learning_rate": 5.500191714065568e-05, - "loss": 0.4019, - "step": 4309 - }, - { - "epoch": 2.115637480798771, - "grad_norm": 0.39669666789656294, - "learning_rate": 5.4946936213869394e-05, - "loss": 0.4004, - "step": 4310 - }, - { - "epoch": 2.1161290322580646, - "grad_norm": 0.40699107590458056, - "learning_rate": 5.489197236925758e-05, - "loss": 0.4375, - "step": 4311 - }, - { - "epoch": 2.1166205837173577, - "grad_norm": 0.4426464646697577, - "learning_rate": 5.4837025627660154e-05, - "loss": 0.4613, - "step": 4312 - }, - { - "epoch": 2.1171121351766513, - "grad_norm": 0.4309200739702426, - "learning_rate": 5.478209600991049e-05, - "loss": 0.4504, - "step": 4313 - }, - { - "epoch": 2.117603686635945, - "grad_norm": 0.40561472063691545, - "learning_rate": 5.4727183536835645e-05, - "loss": 0.4159, - "step": 4314 - }, - { - "epoch": 2.118095238095238, - "grad_norm": 0.4070801765550614, - "learning_rate": 5.467228822925592e-05, - "loss": 0.4077, - "step": 4315 - }, - { - "epoch": 2.1185867895545316, - "grad_norm": 0.5851478499291302, - "learning_rate": 5.4617410107985334e-05, - "loss": 0.4621, - "step": 4316 - }, - { - "epoch": 2.1190783410138248, - "grad_norm": 0.4037222857784695, - "learning_rate": 5.456254919383128e-05, - "loss": 0.4741, - "step": 4317 - }, - { - "epoch": 2.1195698924731183, - "grad_norm": 0.454975814241343, - "learning_rate": 5.450770550759463e-05, - "loss": 0.4135, - "step": 4318 - }, - { - "epoch": 2.1200614439324115, - "grad_norm": 0.40158758582202636, - "learning_rate": 5.445287907006975e-05, - "loss": 0.446, - "step": 4319 - }, - { - "epoch": 2.120552995391705, - "grad_norm": 0.45012536558190436, - "learning_rate": 5.4398069902044456e-05, - "loss": 0.4608, - "step": 4320 - }, - { - "epoch": 2.1210445468509986, - "grad_norm": 0.43217395612265946, - "learning_rate": 5.434327802430002e-05, - "loss": 0.4506, - "step": 4321 - }, - { - "epoch": 2.1215360983102918, - "grad_norm": 0.45837058375236767, - "learning_rate": 5.428850345761107e-05, - "loss": 0.4575, - "step": 4322 - }, - { - "epoch": 2.1220276497695854, - "grad_norm": 0.43880641287499256, - "learning_rate": 5.423374622274594e-05, - "loss": 0.4221, - "step": 4323 - }, - { - "epoch": 2.1225192012288785, - "grad_norm": 0.42240569393185295, - "learning_rate": 5.4179006340466e-05, - "loss": 0.4392, - "step": 4324 - }, - { - "epoch": 2.123010752688172, - "grad_norm": 0.44762750433104964, - "learning_rate": 5.412428383152644e-05, - "loss": 0.4265, - "step": 4325 - }, - { - "epoch": 2.1235023041474657, - "grad_norm": 0.4234949194967042, - "learning_rate": 5.4069578716675486e-05, - "loss": 0.4377, - "step": 4326 - }, - { - "epoch": 2.123993855606759, - "grad_norm": 0.4049735863161532, - "learning_rate": 5.4014891016655065e-05, - "loss": 0.4474, - "step": 4327 - }, - { - "epoch": 2.1244854070660524, - "grad_norm": 0.41485118889205314, - "learning_rate": 5.396022075220037e-05, - "loss": 0.4549, - "step": 4328 - }, - { - "epoch": 2.1249769585253455, - "grad_norm": 0.42655675316538716, - "learning_rate": 5.390556794404e-05, - "loss": 0.4247, - "step": 4329 - }, - { - "epoch": 2.125468509984639, - "grad_norm": 0.4227362368948209, - "learning_rate": 5.385093261289594e-05, - "loss": 0.4455, - "step": 4330 - }, - { - "epoch": 2.1259600614439322, - "grad_norm": 0.39724971257023695, - "learning_rate": 5.379631477948355e-05, - "loss": 0.3841, - "step": 4331 - }, - { - "epoch": 2.126451612903226, - "grad_norm": 0.41390658051806384, - "learning_rate": 5.3741714464511526e-05, - "loss": 0.4298, - "step": 4332 - }, - { - "epoch": 2.1269431643625194, - "grad_norm": 0.38780469194051687, - "learning_rate": 5.3687131688681914e-05, - "loss": 0.412, - "step": 4333 - }, - { - "epoch": 2.1274347158218125, - "grad_norm": 0.435484590620052, - "learning_rate": 5.363256647269028e-05, - "loss": 0.4431, - "step": 4334 - }, - { - "epoch": 2.127926267281106, - "grad_norm": 0.42119058864037817, - "learning_rate": 5.3578018837225244e-05, - "loss": 0.4498, - "step": 4335 - }, - { - "epoch": 2.1284178187403993, - "grad_norm": 0.39102037612829293, - "learning_rate": 5.3523488802969e-05, - "loss": 0.3978, - "step": 4336 - }, - { - "epoch": 2.128909370199693, - "grad_norm": 0.4001188913866158, - "learning_rate": 5.346897639059696e-05, - "loss": 0.4239, - "step": 4337 - }, - { - "epoch": 2.129400921658986, - "grad_norm": 0.4727651469042729, - "learning_rate": 5.3414481620777867e-05, - "loss": 0.4532, - "step": 4338 - }, - { - "epoch": 2.1298924731182796, - "grad_norm": 0.41595630304601267, - "learning_rate": 5.33600045141738e-05, - "loss": 0.4467, - "step": 4339 - }, - { - "epoch": 2.130384024577573, - "grad_norm": 0.4291265838853872, - "learning_rate": 5.33055450914401e-05, - "loss": 0.4658, - "step": 4340 - }, - { - "epoch": 2.1308755760368663, - "grad_norm": 0.40854086583064586, - "learning_rate": 5.325110337322543e-05, - "loss": 0.3894, - "step": 4341 - }, - { - "epoch": 2.13136712749616, - "grad_norm": 0.4438761789467226, - "learning_rate": 5.31966793801717e-05, - "loss": 0.4918, - "step": 4342 - }, - { - "epoch": 2.131858678955453, - "grad_norm": 0.41391216359375127, - "learning_rate": 5.314227313291427e-05, - "loss": 0.4445, - "step": 4343 - }, - { - "epoch": 2.1323502304147466, - "grad_norm": 0.45122684773947314, - "learning_rate": 5.308788465208146e-05, - "loss": 0.4901, - "step": 4344 - }, - { - "epoch": 2.13284178187404, - "grad_norm": 0.416500040730317, - "learning_rate": 5.3033513958295145e-05, - "loss": 0.457, - "step": 4345 - }, - { - "epoch": 2.1333333333333333, - "grad_norm": 0.4349749020563684, - "learning_rate": 5.297916107217033e-05, - "loss": 0.4102, - "step": 4346 - }, - { - "epoch": 2.133824884792627, - "grad_norm": 0.4016279125462083, - "learning_rate": 5.2924826014315246e-05, - "loss": 0.4174, - "step": 4347 - }, - { - "epoch": 2.13431643625192, - "grad_norm": 0.4410167232425387, - "learning_rate": 5.287050880533138e-05, - "loss": 0.467, - "step": 4348 - }, - { - "epoch": 2.1348079877112136, - "grad_norm": 0.4795203199587206, - "learning_rate": 5.2816209465813595e-05, - "loss": 0.395, - "step": 4349 - }, - { - "epoch": 2.1352995391705067, - "grad_norm": 0.45030586232055475, - "learning_rate": 5.276192801634967e-05, - "loss": 0.4265, - "step": 4350 - }, - { - "epoch": 2.1357910906298003, - "grad_norm": 0.43040429273516784, - "learning_rate": 5.270766447752097e-05, - "loss": 0.3789, - "step": 4351 - }, - { - "epoch": 2.136282642089094, - "grad_norm": 0.4405157905881135, - "learning_rate": 5.2653418869901714e-05, - "loss": 0.4402, - "step": 4352 - }, - { - "epoch": 2.136774193548387, - "grad_norm": 0.41373487724637437, - "learning_rate": 5.259919121405961e-05, - "loss": 0.4064, - "step": 4353 - }, - { - "epoch": 2.1372657450076806, - "grad_norm": 0.4649562006226589, - "learning_rate": 5.2544981530555425e-05, - "loss": 0.4808, - "step": 4354 - }, - { - "epoch": 2.1377572964669738, - "grad_norm": 0.43818021032391435, - "learning_rate": 5.2490789839943036e-05, - "loss": 0.4029, - "step": 4355 - }, - { - "epoch": 2.1382488479262673, - "grad_norm": 0.4176186544370395, - "learning_rate": 5.243661616276969e-05, - "loss": 0.4538, - "step": 4356 - }, - { - "epoch": 2.1387403993855605, - "grad_norm": 0.4250411574079382, - "learning_rate": 5.2382460519575674e-05, - "loss": 0.452, - "step": 4357 - }, - { - "epoch": 2.139231950844854, - "grad_norm": 0.4255661163081559, - "learning_rate": 5.2328322930894466e-05, - "loss": 0.4673, - "step": 4358 - }, - { - "epoch": 2.1397235023041477, - "grad_norm": 0.4712447049756326, - "learning_rate": 5.227420341725267e-05, - "loss": 0.4585, - "step": 4359 - }, - { - "epoch": 2.140215053763441, - "grad_norm": 0.41687159849900807, - "learning_rate": 5.2220101999170156e-05, - "loss": 0.42, - "step": 4360 - }, - { - "epoch": 2.1407066052227344, - "grad_norm": 0.3876350752448147, - "learning_rate": 5.2166018697159714e-05, - "loss": 0.4025, - "step": 4361 - }, - { - "epoch": 2.1411981566820275, - "grad_norm": 0.4398360394424511, - "learning_rate": 5.2111953531727484e-05, - "loss": 0.5072, - "step": 4362 - }, - { - "epoch": 2.141689708141321, - "grad_norm": 0.4475877916395095, - "learning_rate": 5.205790652337264e-05, - "loss": 0.4411, - "step": 4363 - }, - { - "epoch": 2.1421812596006147, - "grad_norm": 0.42813264116827077, - "learning_rate": 5.2003877692587435e-05, - "loss": 0.4383, - "step": 4364 - }, - { - "epoch": 2.142672811059908, - "grad_norm": 0.4210238187824671, - "learning_rate": 5.1949867059857296e-05, - "loss": 0.4158, - "step": 4365 - }, - { - "epoch": 2.1431643625192014, - "grad_norm": 0.42162936077012486, - "learning_rate": 5.189587464566069e-05, - "loss": 0.4331, - "step": 4366 - }, - { - "epoch": 2.1436559139784945, - "grad_norm": 0.43042589148166177, - "learning_rate": 5.184190047046923e-05, - "loss": 0.4211, - "step": 4367 - }, - { - "epoch": 2.144147465437788, - "grad_norm": 0.45073364278941475, - "learning_rate": 5.178794455474754e-05, - "loss": 0.4512, - "step": 4368 - }, - { - "epoch": 2.1446390168970813, - "grad_norm": 0.4315029400695387, - "learning_rate": 5.173400691895349e-05, - "loss": 0.457, - "step": 4369 - }, - { - "epoch": 2.145130568356375, - "grad_norm": 0.411411302822659, - "learning_rate": 5.168008758353775e-05, - "loss": 0.416, - "step": 4370 - }, - { - "epoch": 2.145622119815668, - "grad_norm": 0.73511778409704, - "learning_rate": 5.162618656894431e-05, - "loss": 0.4794, - "step": 4371 - }, - { - "epoch": 2.1461136712749616, - "grad_norm": 0.4162527447396153, - "learning_rate": 5.1572303895610086e-05, - "loss": 0.4059, - "step": 4372 - }, - { - "epoch": 2.146605222734255, - "grad_norm": 0.44990882816501676, - "learning_rate": 5.151843958396503e-05, - "loss": 0.4622, - "step": 4373 - }, - { - "epoch": 2.1470967741935483, - "grad_norm": 0.4031846943438368, - "learning_rate": 5.146459365443217e-05, - "loss": 0.3995, - "step": 4374 - }, - { - "epoch": 2.147588325652842, - "grad_norm": 0.43375475174688627, - "learning_rate": 5.141076612742757e-05, - "loss": 0.4593, - "step": 4375 - }, - { - "epoch": 2.148079877112135, - "grad_norm": 0.4396761341089915, - "learning_rate": 5.1356957023360287e-05, - "loss": 0.4808, - "step": 4376 - }, - { - "epoch": 2.1485714285714286, - "grad_norm": 0.4409281915702271, - "learning_rate": 5.13031663626324e-05, - "loss": 0.4978, - "step": 4377 - }, - { - "epoch": 2.149062980030722, - "grad_norm": 0.44717845168424386, - "learning_rate": 5.124939416563903e-05, - "loss": 0.4443, - "step": 4378 - }, - { - "epoch": 2.1495545314900153, - "grad_norm": 0.3813846296539454, - "learning_rate": 5.119564045276821e-05, - "loss": 0.4195, - "step": 4379 - }, - { - "epoch": 2.150046082949309, - "grad_norm": 0.3996798873020046, - "learning_rate": 5.1141905244401144e-05, - "loss": 0.4108, - "step": 4380 - }, - { - "epoch": 2.150537634408602, - "grad_norm": 0.4299087809995918, - "learning_rate": 5.1088188560911733e-05, - "loss": 0.4342, - "step": 4381 - }, - { - "epoch": 2.1510291858678956, - "grad_norm": 0.4217456028047463, - "learning_rate": 5.103449042266717e-05, - "loss": 0.4045, - "step": 4382 - }, - { - "epoch": 2.1515207373271887, - "grad_norm": 0.41749867581150746, - "learning_rate": 5.0980810850027404e-05, - "loss": 0.4201, - "step": 4383 - }, - { - "epoch": 2.1520122887864823, - "grad_norm": 0.4286667184185848, - "learning_rate": 5.0927149863345416e-05, - "loss": 0.4422, - "step": 4384 - }, - { - "epoch": 2.152503840245776, - "grad_norm": 0.4219826069627986, - "learning_rate": 5.087350748296714e-05, - "loss": 0.4471, - "step": 4385 - }, - { - "epoch": 2.152995391705069, - "grad_norm": 0.41988649075766477, - "learning_rate": 5.081988372923143e-05, - "loss": 0.4347, - "step": 4386 - }, - { - "epoch": 2.1534869431643626, - "grad_norm": 0.41939660893342745, - "learning_rate": 5.076627862247012e-05, - "loss": 0.4185, - "step": 4387 - }, - { - "epoch": 2.1539784946236558, - "grad_norm": 0.41779965967946603, - "learning_rate": 5.071269218300789e-05, - "loss": 0.448, - "step": 4388 - }, - { - "epoch": 2.1544700460829493, - "grad_norm": 0.44787804981382706, - "learning_rate": 5.065912443116252e-05, - "loss": 0.4544, - "step": 4389 - }, - { - "epoch": 2.1549615975422425, - "grad_norm": 0.42412804495993484, - "learning_rate": 5.060557538724444e-05, - "loss": 0.4275, - "step": 4390 - }, - { - "epoch": 2.155453149001536, - "grad_norm": 0.4738682238040403, - "learning_rate": 5.0552045071557245e-05, - "loss": 0.4901, - "step": 4391 - }, - { - "epoch": 2.1559447004608296, - "grad_norm": 0.39120760867964455, - "learning_rate": 5.0498533504397286e-05, - "loss": 0.3948, - "step": 4392 - }, - { - "epoch": 2.156436251920123, - "grad_norm": 0.42834411621168483, - "learning_rate": 5.044504070605381e-05, - "loss": 0.4241, - "step": 4393 - }, - { - "epoch": 2.1569278033794164, - "grad_norm": 0.4292925707764463, - "learning_rate": 5.039156669680898e-05, - "loss": 0.4626, - "step": 4394 - }, - { - "epoch": 2.1574193548387095, - "grad_norm": 0.42117150287635424, - "learning_rate": 5.03381114969379e-05, - "loss": 0.4164, - "step": 4395 - }, - { - "epoch": 2.157910906298003, - "grad_norm": 0.4556973258772338, - "learning_rate": 5.028467512670834e-05, - "loss": 0.4401, - "step": 4396 - }, - { - "epoch": 2.1584024577572967, - "grad_norm": 0.4005896738237106, - "learning_rate": 5.0231257606381174e-05, - "loss": 0.4168, - "step": 4397 - }, - { - "epoch": 2.15889400921659, - "grad_norm": 0.41955820433982366, - "learning_rate": 5.017785895620999e-05, - "loss": 0.4531, - "step": 4398 - }, - { - "epoch": 2.1593855606758834, - "grad_norm": 0.44005067322569746, - "learning_rate": 5.012447919644122e-05, - "loss": 0.405, - "step": 4399 - }, - { - "epoch": 2.1598771121351765, - "grad_norm": 0.43738368249977705, - "learning_rate": 5.007111834731422e-05, - "loss": 0.4579, - "step": 4400 - }, - { - "epoch": 2.16036866359447, - "grad_norm": 0.4460818048531665, - "learning_rate": 5.0017776429061004e-05, - "loss": 0.4408, - "step": 4401 - }, - { - "epoch": 2.1608602150537632, - "grad_norm": 0.4147449048415123, - "learning_rate": 4.9964453461906626e-05, - "loss": 0.3757, - "step": 4402 - }, - { - "epoch": 2.161351766513057, - "grad_norm": 0.45692420521761984, - "learning_rate": 4.991114946606882e-05, - "loss": 0.4608, - "step": 4403 - }, - { - "epoch": 2.1618433179723504, - "grad_norm": 0.39778311288667356, - "learning_rate": 4.985786446175815e-05, - "loss": 0.3995, - "step": 4404 - }, - { - "epoch": 2.1623348694316435, - "grad_norm": 0.41081249623918414, - "learning_rate": 4.980459846917797e-05, - "loss": 0.4023, - "step": 4405 - }, - { - "epoch": 2.162826420890937, - "grad_norm": 0.4440057390082081, - "learning_rate": 4.975135150852452e-05, - "loss": 0.446, - "step": 4406 - }, - { - "epoch": 2.1633179723502303, - "grad_norm": 0.4238231167375056, - "learning_rate": 4.969812359998662e-05, - "loss": 0.4423, - "step": 4407 - }, - { - "epoch": 2.163809523809524, - "grad_norm": 0.4068182614428961, - "learning_rate": 4.964491476374611e-05, - "loss": 0.4271, - "step": 4408 - }, - { - "epoch": 2.164301075268817, - "grad_norm": 0.4110332195034115, - "learning_rate": 4.959172501997742e-05, - "loss": 0.444, - "step": 4409 - }, - { - "epoch": 2.1647926267281106, - "grad_norm": 0.4261451735273132, - "learning_rate": 4.953855438884782e-05, - "loss": 0.4235, - "step": 4410 - }, - { - "epoch": 2.165284178187404, - "grad_norm": 0.43062714070941965, - "learning_rate": 4.9485402890517306e-05, - "loss": 0.4402, - "step": 4411 - }, - { - "epoch": 2.1657757296466973, - "grad_norm": 0.4043275262104169, - "learning_rate": 4.943227054513864e-05, - "loss": 0.4418, - "step": 4412 - }, - { - "epoch": 2.166267281105991, - "grad_norm": 0.40076859148893473, - "learning_rate": 4.9379157372857295e-05, - "loss": 0.4246, - "step": 4413 - }, - { - "epoch": 2.166758832565284, - "grad_norm": 0.3996514965672806, - "learning_rate": 4.932606339381146e-05, - "loss": 0.3997, - "step": 4414 - }, - { - "epoch": 2.1672503840245776, - "grad_norm": 0.41932968565298395, - "learning_rate": 4.92729886281322e-05, - "loss": 0.4488, - "step": 4415 - }, - { - "epoch": 2.167741935483871, - "grad_norm": 0.41813494970217113, - "learning_rate": 4.9219933095943005e-05, - "loss": 0.4262, - "step": 4416 - }, - { - "epoch": 2.1682334869431643, - "grad_norm": 0.4515181929220674, - "learning_rate": 4.916689681736036e-05, - "loss": 0.4611, - "step": 4417 - }, - { - "epoch": 2.168725038402458, - "grad_norm": 0.43052490479356864, - "learning_rate": 4.911387981249329e-05, - "loss": 0.4889, - "step": 4418 - }, - { - "epoch": 2.169216589861751, - "grad_norm": 0.4144350250575182, - "learning_rate": 4.906088210144356e-05, - "loss": 0.4445, - "step": 4419 - }, - { - "epoch": 2.1697081413210446, - "grad_norm": 0.43388096264822634, - "learning_rate": 4.90079037043056e-05, - "loss": 0.4693, - "step": 4420 - }, - { - "epoch": 2.1701996927803378, - "grad_norm": 0.45783415204779754, - "learning_rate": 4.895494464116653e-05, - "loss": 0.4563, - "step": 4421 - }, - { - "epoch": 2.1706912442396313, - "grad_norm": 0.43757618312677327, - "learning_rate": 4.890200493210615e-05, - "loss": 0.4557, - "step": 4422 - }, - { - "epoch": 2.171182795698925, - "grad_norm": 0.4086225003044922, - "learning_rate": 4.8849084597196896e-05, - "loss": 0.4077, - "step": 4423 - }, - { - "epoch": 2.171674347158218, - "grad_norm": 0.42413792999109734, - "learning_rate": 4.879618365650387e-05, - "loss": 0.4107, - "step": 4424 - }, - { - "epoch": 2.1721658986175116, - "grad_norm": 0.3957085764936461, - "learning_rate": 4.87433021300848e-05, - "loss": 0.3994, - "step": 4425 - }, - { - "epoch": 2.1726574500768048, - "grad_norm": 0.4146731465061143, - "learning_rate": 4.869044003799017e-05, - "loss": 0.4397, - "step": 4426 - }, - { - "epoch": 2.1731490015360984, - "grad_norm": 0.629670447790838, - "learning_rate": 4.863759740026286e-05, - "loss": 0.5132, - "step": 4427 - }, - { - "epoch": 2.1736405529953915, - "grad_norm": 0.41846108849849256, - "learning_rate": 4.858477423693862e-05, - "loss": 0.4465, - "step": 4428 - }, - { - "epoch": 2.174132104454685, - "grad_norm": 0.4119282564612937, - "learning_rate": 4.853197056804569e-05, - "loss": 0.4068, - "step": 4429 - }, - { - "epoch": 2.1746236559139787, - "grad_norm": 0.4153153738135856, - "learning_rate": 4.8479186413604924e-05, - "loss": 0.4316, - "step": 4430 - }, - { - "epoch": 2.175115207373272, - "grad_norm": 0.4541094117283424, - "learning_rate": 4.8426421793629795e-05, - "loss": 0.3986, - "step": 4431 - }, - { - "epoch": 2.1756067588325654, - "grad_norm": 0.4350741526371811, - "learning_rate": 4.837367672812636e-05, - "loss": 0.44, - "step": 4432 - }, - { - "epoch": 2.1760983102918585, - "grad_norm": 0.4367682504678159, - "learning_rate": 4.832095123709328e-05, - "loss": 0.421, - "step": 4433 - }, - { - "epoch": 2.176589861751152, - "grad_norm": 0.42496427203676856, - "learning_rate": 4.826824534052174e-05, - "loss": 0.4454, - "step": 4434 - }, - { - "epoch": 2.1770814132104457, - "grad_norm": 0.4724552533650487, - "learning_rate": 4.8215559058395645e-05, - "loss": 0.4492, - "step": 4435 - }, - { - "epoch": 2.177572964669739, - "grad_norm": 0.4225075059301563, - "learning_rate": 4.816289241069122e-05, - "loss": 0.4374, - "step": 4436 - }, - { - "epoch": 2.1780645161290324, - "grad_norm": 0.46626577619958326, - "learning_rate": 4.811024541737747e-05, - "loss": 0.4531, - "step": 4437 - }, - { - "epoch": 2.1785560675883255, - "grad_norm": 0.4388792550230304, - "learning_rate": 4.8057618098415845e-05, - "loss": 0.4799, - "step": 4438 - }, - { - "epoch": 2.179047619047619, - "grad_norm": 0.46133312440643043, - "learning_rate": 4.800501047376034e-05, - "loss": 0.439, - "step": 4439 - }, - { - "epoch": 2.1795391705069123, - "grad_norm": 0.4578713126408955, - "learning_rate": 4.7952422563357444e-05, - "loss": 0.4662, - "step": 4440 - }, - { - "epoch": 2.180030721966206, - "grad_norm": 0.44059885691155315, - "learning_rate": 4.789985438714636e-05, - "loss": 0.4439, - "step": 4441 - }, - { - "epoch": 2.1805222734254994, - "grad_norm": 0.40775031092238395, - "learning_rate": 4.7847305965058497e-05, - "loss": 0.4289, - "step": 4442 - }, - { - "epoch": 2.1810138248847926, - "grad_norm": 0.41936920870705424, - "learning_rate": 4.779477731701806e-05, - "loss": 0.4472, - "step": 4443 - }, - { - "epoch": 2.181505376344086, - "grad_norm": 0.44219260372325364, - "learning_rate": 4.7742268462941606e-05, - "loss": 0.476, - "step": 4444 - }, - { - "epoch": 2.1819969278033793, - "grad_norm": 0.40176090187932306, - "learning_rate": 4.768977942273822e-05, - "loss": 0.4392, - "step": 4445 - }, - { - "epoch": 2.182488479262673, - "grad_norm": 0.43136033668182955, - "learning_rate": 4.763731021630949e-05, - "loss": 0.4627, - "step": 4446 - }, - { - "epoch": 2.182980030721966, - "grad_norm": 0.45410435241183505, - "learning_rate": 4.758486086354946e-05, - "loss": 0.4492, - "step": 4447 - }, - { - "epoch": 2.1834715821812596, - "grad_norm": 0.3803836135972755, - "learning_rate": 4.7532431384344666e-05, - "loss": 0.4006, - "step": 4448 - }, - { - "epoch": 2.183963133640553, - "grad_norm": 0.4361763004935449, - "learning_rate": 4.7480021798574094e-05, - "loss": 0.4412, - "step": 4449 - }, - { - "epoch": 2.1844546850998463, - "grad_norm": 0.3917833693828462, - "learning_rate": 4.7427632126109186e-05, - "loss": 0.4373, - "step": 4450 - }, - { - "epoch": 2.18494623655914, - "grad_norm": 0.40969285412603657, - "learning_rate": 4.7375262386813814e-05, - "loss": 0.4344, - "step": 4451 - }, - { - "epoch": 2.185437788018433, - "grad_norm": 0.4453853483380478, - "learning_rate": 4.7322912600544435e-05, - "loss": 0.492, - "step": 4452 - }, - { - "epoch": 2.1859293394777266, - "grad_norm": 0.4104727458845761, - "learning_rate": 4.727058278714966e-05, - "loss": 0.4271, - "step": 4453 - }, - { - "epoch": 2.18642089093702, - "grad_norm": 0.4277137440758201, - "learning_rate": 4.721827296647083e-05, - "loss": 0.4349, - "step": 4454 - }, - { - "epoch": 2.1869124423963133, - "grad_norm": 0.44356378878174674, - "learning_rate": 4.716598315834151e-05, - "loss": 0.4357, - "step": 4455 - }, - { - "epoch": 2.187403993855607, - "grad_norm": 0.4308645590911913, - "learning_rate": 4.7113713382587745e-05, - "loss": 0.4405, - "step": 4456 - }, - { - "epoch": 2.1878955453149, - "grad_norm": 0.4307110305176749, - "learning_rate": 4.706146365902796e-05, - "loss": 0.4205, - "step": 4457 - }, - { - "epoch": 2.1883870967741936, - "grad_norm": 0.44090083753213666, - "learning_rate": 4.7009234007473016e-05, - "loss": 0.4509, - "step": 4458 - }, - { - "epoch": 2.1888786482334868, - "grad_norm": 0.42396812896536706, - "learning_rate": 4.695702444772611e-05, - "loss": 0.4522, - "step": 4459 - }, - { - "epoch": 2.1893701996927803, - "grad_norm": 0.4390162544767842, - "learning_rate": 4.6904834999582834e-05, - "loss": 0.4613, - "step": 4460 - }, - { - "epoch": 2.189861751152074, - "grad_norm": 0.40877584824911817, - "learning_rate": 4.6852665682831284e-05, - "loss": 0.4188, - "step": 4461 - }, - { - "epoch": 2.190353302611367, - "grad_norm": 0.4261914855380424, - "learning_rate": 4.6800516517251644e-05, - "loss": 0.4718, - "step": 4462 - }, - { - "epoch": 2.1908448540706607, - "grad_norm": 0.43261030424633407, - "learning_rate": 4.674838752261675e-05, - "loss": 0.4257, - "step": 4463 - }, - { - "epoch": 2.191336405529954, - "grad_norm": 0.4082558536404945, - "learning_rate": 4.6696278718691635e-05, - "loss": 0.4445, - "step": 4464 - }, - { - "epoch": 2.1918279569892474, - "grad_norm": 0.41713268918343116, - "learning_rate": 4.6644190125233675e-05, - "loss": 0.4367, - "step": 4465 - }, - { - "epoch": 2.1923195084485405, - "grad_norm": 0.4205254545193984, - "learning_rate": 4.659212176199264e-05, - "loss": 0.4579, - "step": 4466 - }, - { - "epoch": 2.192811059907834, - "grad_norm": 0.4294159493533712, - "learning_rate": 4.65400736487106e-05, - "loss": 0.4867, - "step": 4467 - }, - { - "epoch": 2.1933026113671277, - "grad_norm": 0.4096270514696592, - "learning_rate": 4.6488045805121936e-05, - "loss": 0.4472, - "step": 4468 - }, - { - "epoch": 2.193794162826421, - "grad_norm": 0.4222234775429786, - "learning_rate": 4.643603825095333e-05, - "loss": 0.4012, - "step": 4469 - }, - { - "epoch": 2.1942857142857144, - "grad_norm": 0.43404196111820115, - "learning_rate": 4.63840510059239e-05, - "loss": 0.4539, - "step": 4470 - }, - { - "epoch": 2.1947772657450075, - "grad_norm": 0.4225395948168961, - "learning_rate": 4.6332084089744834e-05, - "loss": 0.451, - "step": 4471 - }, - { - "epoch": 2.195268817204301, - "grad_norm": 0.47872341738559837, - "learning_rate": 4.628013752211987e-05, - "loss": 0.4689, - "step": 4472 - }, - { - "epoch": 2.1957603686635947, - "grad_norm": 0.40990643697997525, - "learning_rate": 4.622821132274475e-05, - "loss": 0.4241, - "step": 4473 - }, - { - "epoch": 2.196251920122888, - "grad_norm": 0.4166483040482699, - "learning_rate": 4.617630551130778e-05, - "loss": 0.4133, - "step": 4474 - }, - { - "epoch": 2.1967434715821814, - "grad_norm": 0.430535564954758, - "learning_rate": 4.612442010748934e-05, - "loss": 0.4434, - "step": 4475 - }, - { - "epoch": 2.1972350230414746, - "grad_norm": 0.3999902275332258, - "learning_rate": 4.607255513096215e-05, - "loss": 0.4503, - "step": 4476 - }, - { - "epoch": 2.197726574500768, - "grad_norm": 0.41627649583279697, - "learning_rate": 4.602071060139115e-05, - "loss": 0.4418, - "step": 4477 - }, - { - "epoch": 2.1982181259600613, - "grad_norm": 0.39641632122986575, - "learning_rate": 4.596888653843354e-05, - "loss": 0.403, - "step": 4478 - }, - { - "epoch": 2.198709677419355, - "grad_norm": 0.4161578906013969, - "learning_rate": 4.59170829617388e-05, - "loss": 0.4831, - "step": 4479 - }, - { - "epoch": 2.1992012288786484, - "grad_norm": 0.45074209732039683, - "learning_rate": 4.586529989094853e-05, - "loss": 0.4994, - "step": 4480 - }, - { - "epoch": 2.1996927803379416, - "grad_norm": 0.4195719471519759, - "learning_rate": 4.581353734569678e-05, - "loss": 0.4556, - "step": 4481 - }, - { - "epoch": 2.200184331797235, - "grad_norm": 0.4364300571102807, - "learning_rate": 4.576179534560948e-05, - "loss": 0.408, - "step": 4482 - }, - { - "epoch": 2.2006758832565283, - "grad_norm": 0.4102148959058698, - "learning_rate": 4.571007391030511e-05, - "loss": 0.4293, - "step": 4483 - }, - { - "epoch": 2.201167434715822, - "grad_norm": 0.4379611186924836, - "learning_rate": 4.565837305939414e-05, - "loss": 0.4479, - "step": 4484 - }, - { - "epoch": 2.201658986175115, - "grad_norm": 0.44502080054836296, - "learning_rate": 4.560669281247931e-05, - "loss": 0.4442, - "step": 4485 - }, - { - "epoch": 2.2021505376344086, - "grad_norm": 0.44082396196571616, - "learning_rate": 4.5555033189155505e-05, - "loss": 0.4042, - "step": 4486 - }, - { - "epoch": 2.202642089093702, - "grad_norm": 0.5436717499586232, - "learning_rate": 4.550339420900992e-05, - "loss": 0.4569, - "step": 4487 - }, - { - "epoch": 2.2031336405529953, - "grad_norm": 0.43565266629182936, - "learning_rate": 4.545177589162167e-05, - "loss": 0.4196, - "step": 4488 - }, - { - "epoch": 2.203625192012289, - "grad_norm": 0.45680818945809476, - "learning_rate": 4.540017825656232e-05, - "loss": 0.4289, - "step": 4489 - }, - { - "epoch": 2.204116743471582, - "grad_norm": 0.4216188706854949, - "learning_rate": 4.5348601323395415e-05, - "loss": 0.4583, - "step": 4490 - }, - { - "epoch": 2.2046082949308756, - "grad_norm": 0.47074766549661506, - "learning_rate": 4.529704511167669e-05, - "loss": 0.4386, - "step": 4491 - }, - { - "epoch": 2.205099846390169, - "grad_norm": 0.4044953118483657, - "learning_rate": 4.5245509640954057e-05, - "loss": 0.3919, - "step": 4492 - }, - { - "epoch": 2.2055913978494623, - "grad_norm": 0.41423338895930417, - "learning_rate": 4.5193994930767526e-05, - "loss": 0.4137, - "step": 4493 - }, - { - "epoch": 2.206082949308756, - "grad_norm": 0.4021240623311567, - "learning_rate": 4.514250100064924e-05, - "loss": 0.4459, - "step": 4494 - }, - { - "epoch": 2.206574500768049, - "grad_norm": 0.4487918225338265, - "learning_rate": 4.509102787012344e-05, - "loss": 0.4128, - "step": 4495 - }, - { - "epoch": 2.2070660522273426, - "grad_norm": 0.4104860899519224, - "learning_rate": 4.5039575558706625e-05, - "loss": 0.4045, - "step": 4496 - }, - { - "epoch": 2.207557603686636, - "grad_norm": 0.40373145049891, - "learning_rate": 4.4988144085907136e-05, - "loss": 0.4309, - "step": 4497 - }, - { - "epoch": 2.2080491551459294, - "grad_norm": 0.4135194906388253, - "learning_rate": 4.493673347122572e-05, - "loss": 0.4417, - "step": 4498 - }, - { - "epoch": 2.2085407066052225, - "grad_norm": 0.4165902757659008, - "learning_rate": 4.488534373415492e-05, - "loss": 0.4521, - "step": 4499 - }, - { - "epoch": 2.209032258064516, - "grad_norm": 0.38993335048743943, - "learning_rate": 4.483397489417959e-05, - "loss": 0.4061, - "step": 4500 - }, - { - "epoch": 2.2095238095238097, - "grad_norm": 0.4524942637057196, - "learning_rate": 4.4782626970776544e-05, - "loss": 0.4763, - "step": 4501 - }, - { - "epoch": 2.210015360983103, - "grad_norm": 0.4190968947250465, - "learning_rate": 4.47312999834147e-05, - "loss": 0.4632, - "step": 4502 - }, - { - "epoch": 2.2105069124423964, - "grad_norm": 0.4170480670203764, - "learning_rate": 4.4679993951555044e-05, - "loss": 0.4108, - "step": 4503 - }, - { - "epoch": 2.2109984639016895, - "grad_norm": 0.41696081364198906, - "learning_rate": 4.462870889465058e-05, - "loss": 0.4285, - "step": 4504 - }, - { - "epoch": 2.211490015360983, - "grad_norm": 0.4686791565256886, - "learning_rate": 4.457744483214641e-05, - "loss": 0.4826, - "step": 4505 - }, - { - "epoch": 2.2119815668202767, - "grad_norm": 0.42478544683028374, - "learning_rate": 4.45262017834796e-05, - "loss": 0.4707, - "step": 4506 - }, - { - "epoch": 2.21247311827957, - "grad_norm": 0.40480018628254527, - "learning_rate": 4.447497976807942e-05, - "loss": 0.3889, - "step": 4507 - }, - { - "epoch": 2.2129646697388634, - "grad_norm": 0.441810988481185, - "learning_rate": 4.442377880536689e-05, - "loss": 0.4446, - "step": 4508 - }, - { - "epoch": 2.2134562211981565, - "grad_norm": 0.4255315141753949, - "learning_rate": 4.43725989147553e-05, - "loss": 0.4394, - "step": 4509 - }, - { - "epoch": 2.21394777265745, - "grad_norm": 0.3998457670599458, - "learning_rate": 4.4321440115649835e-05, - "loss": 0.4022, - "step": 4510 - }, - { - "epoch": 2.2144393241167433, - "grad_norm": 0.4440745234989787, - "learning_rate": 4.42703024274477e-05, - "loss": 0.4275, - "step": 4511 - }, - { - "epoch": 2.214930875576037, - "grad_norm": 0.44696329121917866, - "learning_rate": 4.421918586953808e-05, - "loss": 0.4257, - "step": 4512 - }, - { - "epoch": 2.2154224270353304, - "grad_norm": 0.8390629282171552, - "learning_rate": 4.416809046130218e-05, - "loss": 0.4722, - "step": 4513 - }, - { - "epoch": 2.2159139784946236, - "grad_norm": 0.4333858632285727, - "learning_rate": 4.411701622211316e-05, - "loss": 0.4548, - "step": 4514 - }, - { - "epoch": 2.216405529953917, - "grad_norm": 0.4540794328456241, - "learning_rate": 4.4065963171336144e-05, - "loss": 0.4199, - "step": 4515 - }, - { - "epoch": 2.2168970814132103, - "grad_norm": 0.4151006477780833, - "learning_rate": 4.401493132832832e-05, - "loss": 0.41, - "step": 4516 - }, - { - "epoch": 2.217388632872504, - "grad_norm": 0.4539538777684501, - "learning_rate": 4.396392071243864e-05, - "loss": 0.4461, - "step": 4517 - }, - { - "epoch": 2.217880184331797, - "grad_norm": 0.4232795307120172, - "learning_rate": 4.391293134300824e-05, - "loss": 0.4165, - "step": 4518 - }, - { - "epoch": 2.2183717357910906, - "grad_norm": 0.39441453701000495, - "learning_rate": 4.3861963239370007e-05, - "loss": 0.3831, - "step": 4519 - }, - { - "epoch": 2.218863287250384, - "grad_norm": 0.4153403752586208, - "learning_rate": 4.3811016420848884e-05, - "loss": 0.382, - "step": 4520 - }, - { - "epoch": 2.2193548387096773, - "grad_norm": 0.4330494209968192, - "learning_rate": 4.3760090906761686e-05, - "loss": 0.4707, - "step": 4521 - }, - { - "epoch": 2.219846390168971, - "grad_norm": 0.39644422590373857, - "learning_rate": 4.370918671641716e-05, - "loss": 0.4211, - "step": 4522 - }, - { - "epoch": 2.220337941628264, - "grad_norm": 0.4515300199036469, - "learning_rate": 4.3658303869115994e-05, - "loss": 0.474, - "step": 4523 - }, - { - "epoch": 2.2208294930875576, - "grad_norm": 0.4153569127675756, - "learning_rate": 4.360744238415075e-05, - "loss": 0.446, - "step": 4524 - }, - { - "epoch": 2.221321044546851, - "grad_norm": 0.40705322607325733, - "learning_rate": 4.3556602280805904e-05, - "loss": 0.3911, - "step": 4525 - }, - { - "epoch": 2.2218125960061443, - "grad_norm": 0.47516826610389534, - "learning_rate": 4.350578357835781e-05, - "loss": 0.4324, - "step": 4526 - }, - { - "epoch": 2.222304147465438, - "grad_norm": 0.4192287978383726, - "learning_rate": 4.3454986296074795e-05, - "loss": 0.411, - "step": 4527 - }, - { - "epoch": 2.222795698924731, - "grad_norm": 0.419728918031921, - "learning_rate": 4.340421045321688e-05, - "loss": 0.4338, - "step": 4528 - }, - { - "epoch": 2.2232872503840246, - "grad_norm": 0.5017966085307272, - "learning_rate": 4.335345606903616e-05, - "loss": 0.4972, - "step": 4529 - }, - { - "epoch": 2.2237788018433178, - "grad_norm": 0.3932223866674703, - "learning_rate": 4.330272316277648e-05, - "loss": 0.3854, - "step": 4530 - }, - { - "epoch": 2.2242703533026114, - "grad_norm": 0.4223532152873605, - "learning_rate": 4.325201175367356e-05, - "loss": 0.4434, - "step": 4531 - }, - { - "epoch": 2.224761904761905, - "grad_norm": 0.45051744969778434, - "learning_rate": 4.3201321860954943e-05, - "loss": 0.455, - "step": 4532 - }, - { - "epoch": 2.225253456221198, - "grad_norm": 0.43922627478970927, - "learning_rate": 4.3150653503840145e-05, - "loss": 0.4594, - "step": 4533 - }, - { - "epoch": 2.2257450076804917, - "grad_norm": 0.40800420038643515, - "learning_rate": 4.3100006701540274e-05, - "loss": 0.449, - "step": 4534 - }, - { - "epoch": 2.226236559139785, - "grad_norm": 0.4156611109687932, - "learning_rate": 4.3049381473258534e-05, - "loss": 0.4262, - "step": 4535 - }, - { - "epoch": 2.2267281105990784, - "grad_norm": 0.44412597623468436, - "learning_rate": 4.299877783818975e-05, - "loss": 0.4556, - "step": 4536 - }, - { - "epoch": 2.2272196620583715, - "grad_norm": 0.43052028167325, - "learning_rate": 4.2948195815520675e-05, - "loss": 0.4365, - "step": 4537 - }, - { - "epoch": 2.227711213517665, - "grad_norm": 0.44273384583205344, - "learning_rate": 4.2897635424429795e-05, - "loss": 0.4237, - "step": 4538 - }, - { - "epoch": 2.2282027649769587, - "grad_norm": 0.42175756925343066, - "learning_rate": 4.284709668408744e-05, - "loss": 0.4175, - "step": 4539 - }, - { - "epoch": 2.228694316436252, - "grad_norm": 0.4291815039613073, - "learning_rate": 4.279657961365572e-05, - "loss": 0.4359, - "step": 4540 - }, - { - "epoch": 2.2291858678955454, - "grad_norm": 0.43043977435781366, - "learning_rate": 4.2746084232288476e-05, - "loss": 0.4639, - "step": 4541 - }, - { - "epoch": 2.2296774193548385, - "grad_norm": 0.4153525016413108, - "learning_rate": 4.269561055913148e-05, - "loss": 0.4399, - "step": 4542 - }, - { - "epoch": 2.230168970814132, - "grad_norm": 0.41924135802962675, - "learning_rate": 4.2645158613322044e-05, - "loss": 0.4431, - "step": 4543 - }, - { - "epoch": 2.2306605222734257, - "grad_norm": 0.4356805792830745, - "learning_rate": 4.259472841398945e-05, - "loss": 0.4298, - "step": 4544 - }, - { - "epoch": 2.231152073732719, - "grad_norm": 0.4191414001168072, - "learning_rate": 4.254431998025462e-05, - "loss": 0.4408, - "step": 4545 - }, - { - "epoch": 2.2316436251920124, - "grad_norm": 0.425152767662572, - "learning_rate": 4.249393333123026e-05, - "loss": 0.4503, - "step": 4546 - }, - { - "epoch": 2.2321351766513056, - "grad_norm": 0.4140228911065242, - "learning_rate": 4.244356848602081e-05, - "loss": 0.4404, - "step": 4547 - }, - { - "epoch": 2.232626728110599, - "grad_norm": 0.42846747020990583, - "learning_rate": 4.239322546372244e-05, - "loss": 0.4842, - "step": 4548 - }, - { - "epoch": 2.2331182795698923, - "grad_norm": 0.41412832765079294, - "learning_rate": 4.234290428342305e-05, - "loss": 0.4338, - "step": 4549 - }, - { - "epoch": 2.233609831029186, - "grad_norm": 0.42718140381970093, - "learning_rate": 4.229260496420224e-05, - "loss": 0.4394, - "step": 4550 - }, - { - "epoch": 2.2341013824884794, - "grad_norm": 0.4269230817888805, - "learning_rate": 4.2242327525131365e-05, - "loss": 0.4593, - "step": 4551 - }, - { - "epoch": 2.2345929339477726, - "grad_norm": 0.42697239288245986, - "learning_rate": 4.219207198527339e-05, - "loss": 0.4386, - "step": 4552 - }, - { - "epoch": 2.235084485407066, - "grad_norm": 0.4297789439398567, - "learning_rate": 4.2141838363683185e-05, - "loss": 0.4379, - "step": 4553 - }, - { - "epoch": 2.2355760368663593, - "grad_norm": 0.3892118154155358, - "learning_rate": 4.2091626679407004e-05, - "loss": 0.3629, - "step": 4554 - }, - { - "epoch": 2.236067588325653, - "grad_norm": 0.4343031562397274, - "learning_rate": 4.2041436951483045e-05, - "loss": 0.4398, - "step": 4555 - }, - { - "epoch": 2.236559139784946, - "grad_norm": 0.43960752875867515, - "learning_rate": 4.1991269198941084e-05, - "loss": 0.4193, - "step": 4556 - }, - { - "epoch": 2.2370506912442396, - "grad_norm": 0.42335764116690994, - "learning_rate": 4.194112344080252e-05, - "loss": 0.4044, - "step": 4557 - }, - { - "epoch": 2.237542242703533, - "grad_norm": 0.4237042651183192, - "learning_rate": 4.189099969608049e-05, - "loss": 0.4187, - "step": 4558 - }, - { - "epoch": 2.2380337941628263, - "grad_norm": 0.45150880876275945, - "learning_rate": 4.184089798377975e-05, - "loss": 0.4465, - "step": 4559 - }, - { - "epoch": 2.23852534562212, - "grad_norm": 0.41438305293174793, - "learning_rate": 4.179081832289667e-05, - "loss": 0.4002, - "step": 4560 - }, - { - "epoch": 2.239016897081413, - "grad_norm": 0.4437423339084064, - "learning_rate": 4.17407607324193e-05, - "loss": 0.4248, - "step": 4561 - }, - { - "epoch": 2.2395084485407066, - "grad_norm": 0.44924414981628963, - "learning_rate": 4.16907252313274e-05, - "loss": 0.4421, - "step": 4562 - }, - { - "epoch": 2.24, - "grad_norm": 0.42788494613359257, - "learning_rate": 4.1640711838592114e-05, - "loss": 0.3978, - "step": 4563 - }, - { - "epoch": 2.2404915514592934, - "grad_norm": 0.4149591545549934, - "learning_rate": 4.15907205731765e-05, - "loss": 0.4634, - "step": 4564 - }, - { - "epoch": 2.240983102918587, - "grad_norm": 0.41417067877207575, - "learning_rate": 4.154075145403502e-05, - "loss": 0.4568, - "step": 4565 - }, - { - "epoch": 2.24147465437788, - "grad_norm": 0.4418227367043439, - "learning_rate": 4.149080450011382e-05, - "loss": 0.4825, - "step": 4566 - }, - { - "epoch": 2.2419662058371737, - "grad_norm": 0.40096107162784733, - "learning_rate": 4.144087973035062e-05, - "loss": 0.4264, - "step": 4567 - }, - { - "epoch": 2.242457757296467, - "grad_norm": 0.40050554535120325, - "learning_rate": 4.139097716367474e-05, - "loss": 0.452, - "step": 4568 - }, - { - "epoch": 2.2429493087557604, - "grad_norm": 0.4265216565740003, - "learning_rate": 4.1341096819007065e-05, - "loss": 0.4575, - "step": 4569 - }, - { - "epoch": 2.243440860215054, - "grad_norm": 0.4301429842592259, - "learning_rate": 4.129123871526007e-05, - "loss": 0.4515, - "step": 4570 - }, - { - "epoch": 2.243932411674347, - "grad_norm": 0.4098690448745217, - "learning_rate": 4.1241402871337806e-05, - "loss": 0.4304, - "step": 4571 - }, - { - "epoch": 2.2444239631336407, - "grad_norm": 0.4331944219648293, - "learning_rate": 4.1191589306135824e-05, - "loss": 0.42, - "step": 4572 - }, - { - "epoch": 2.244915514592934, - "grad_norm": 0.41091299019535316, - "learning_rate": 4.114179803854138e-05, - "loss": 0.4545, - "step": 4573 - }, - { - "epoch": 2.2454070660522274, - "grad_norm": 0.4053814514597819, - "learning_rate": 4.109202908743303e-05, - "loss": 0.4123, - "step": 4574 - }, - { - "epoch": 2.2458986175115205, - "grad_norm": 0.45923192428403736, - "learning_rate": 4.1042282471681114e-05, - "loss": 0.4697, - "step": 4575 - }, - { - "epoch": 2.246390168970814, - "grad_norm": 0.4430307828698487, - "learning_rate": 4.0992558210147366e-05, - "loss": 0.4332, - "step": 4576 - }, - { - "epoch": 2.2468817204301077, - "grad_norm": 0.4425697406910303, - "learning_rate": 4.094285632168507e-05, - "loss": 0.4686, - "step": 4577 - }, - { - "epoch": 2.247373271889401, - "grad_norm": 0.4217068811747446, - "learning_rate": 4.089317682513902e-05, - "loss": 0.4476, - "step": 4578 - }, - { - "epoch": 2.2478648233486944, - "grad_norm": 0.41911384986879874, - "learning_rate": 4.084351973934561e-05, - "loss": 0.422, - "step": 4579 - }, - { - "epoch": 2.2483563748079876, - "grad_norm": 0.4283120421304942, - "learning_rate": 4.079388508313255e-05, - "loss": 0.4526, - "step": 4580 - }, - { - "epoch": 2.248847926267281, - "grad_norm": 0.4345917705047502, - "learning_rate": 4.0744272875319246e-05, - "loss": 0.4741, - "step": 4581 - }, - { - "epoch": 2.2493394777265747, - "grad_norm": 0.43016562011256765, - "learning_rate": 4.069468313471646e-05, - "loss": 0.4079, - "step": 4582 - }, - { - "epoch": 2.249831029185868, - "grad_norm": 0.4346654484961386, - "learning_rate": 4.0645115880126486e-05, - "loss": 0.4169, - "step": 4583 - }, - { - "epoch": 2.2503225806451614, - "grad_norm": 0.445379207826208, - "learning_rate": 4.059557113034308e-05, - "loss": 0.4491, - "step": 4584 - }, - { - "epoch": 2.2508141321044546, - "grad_norm": 0.41233474037321344, - "learning_rate": 4.054604890415148e-05, - "loss": 0.441, - "step": 4585 - }, - { - "epoch": 2.251305683563748, - "grad_norm": 0.42230985031111795, - "learning_rate": 4.0496549220328376e-05, - "loss": 0.4399, - "step": 4586 - }, - { - "epoch": 2.2517972350230413, - "grad_norm": 0.8896807787548994, - "learning_rate": 4.044707209764185e-05, - "loss": 0.4918, - "step": 4587 - }, - { - "epoch": 2.252288786482335, - "grad_norm": 0.38927116952415103, - "learning_rate": 4.0397617554851616e-05, - "loss": 0.4003, - "step": 4588 - }, - { - "epoch": 2.252780337941628, - "grad_norm": 0.43685660049787034, - "learning_rate": 4.034818561070855e-05, - "loss": 0.4221, - "step": 4589 - }, - { - "epoch": 2.2532718894009216, - "grad_norm": 0.4354103191565989, - "learning_rate": 4.029877628395522e-05, - "loss": 0.4806, - "step": 4590 - }, - { - "epoch": 2.253763440860215, - "grad_norm": 0.41956558625499557, - "learning_rate": 4.024938959332546e-05, - "loss": 0.3949, - "step": 4591 - }, - { - "epoch": 2.2542549923195083, - "grad_norm": 0.39096443425428223, - "learning_rate": 4.020002555754459e-05, - "loss": 0.418, - "step": 4592 - }, - { - "epoch": 2.254746543778802, - "grad_norm": 0.41839045046187673, - "learning_rate": 4.015068419532929e-05, - "loss": 0.397, - "step": 4593 - }, - { - "epoch": 2.255238095238095, - "grad_norm": 0.4754808995685527, - "learning_rate": 4.010136552538769e-05, - "loss": 0.3927, - "step": 4594 - }, - { - "epoch": 2.2557296466973886, - "grad_norm": 0.4460660744439945, - "learning_rate": 4.0052069566419305e-05, - "loss": 0.4739, - "step": 4595 - }, - { - "epoch": 2.256221198156682, - "grad_norm": 0.39726997380970774, - "learning_rate": 4.000279633711501e-05, - "loss": 0.4115, - "step": 4596 - }, - { - "epoch": 2.2567127496159753, - "grad_norm": 0.40592278154823924, - "learning_rate": 3.9953545856157104e-05, - "loss": 0.435, - "step": 4597 - }, - { - "epoch": 2.257204301075269, - "grad_norm": 0.4009355216838383, - "learning_rate": 3.990431814221919e-05, - "loss": 0.454, - "step": 4598 - }, - { - "epoch": 2.257695852534562, - "grad_norm": 0.44290535146492604, - "learning_rate": 3.98551132139664e-05, - "loss": 0.4698, - "step": 4599 - }, - { - "epoch": 2.2581874039938556, - "grad_norm": 0.4460439017698404, - "learning_rate": 3.980593109005498e-05, - "loss": 0.4639, - "step": 4600 - }, - { - "epoch": 2.2586789554531492, - "grad_norm": 0.43143466802333363, - "learning_rate": 3.9756771789132776e-05, - "loss": 0.3946, - "step": 4601 - }, - { - "epoch": 2.2591705069124424, - "grad_norm": 0.41429758273657125, - "learning_rate": 3.970763532983882e-05, - "loss": 0.3789, - "step": 4602 - }, - { - "epoch": 2.259662058371736, - "grad_norm": 0.4508457716819504, - "learning_rate": 3.9658521730803544e-05, - "loss": 0.4037, - "step": 4603 - }, - { - "epoch": 2.260153609831029, - "grad_norm": 0.4139978709261018, - "learning_rate": 3.960943101064869e-05, - "loss": 0.4341, - "step": 4604 - }, - { - "epoch": 2.2606451612903227, - "grad_norm": 0.4027056905353451, - "learning_rate": 3.956036318798736e-05, - "loss": 0.4269, - "step": 4605 - }, - { - "epoch": 2.261136712749616, - "grad_norm": 0.40922513331030813, - "learning_rate": 3.9511318281423923e-05, - "loss": 0.4359, - "step": 4606 - }, - { - "epoch": 2.2616282642089094, - "grad_norm": 0.4020937804835981, - "learning_rate": 3.946229630955407e-05, - "loss": 0.4513, - "step": 4607 - }, - { - "epoch": 2.2621198156682025, - "grad_norm": 0.4611820833640951, - "learning_rate": 3.941329729096492e-05, - "loss": 0.4137, - "step": 4608 - }, - { - "epoch": 2.262611367127496, - "grad_norm": 0.4820967140485992, - "learning_rate": 3.9364321244234635e-05, - "loss": 0.4502, - "step": 4609 - }, - { - "epoch": 2.2631029185867897, - "grad_norm": 0.43654407121472033, - "learning_rate": 3.9315368187932934e-05, - "loss": 0.4428, - "step": 4610 - }, - { - "epoch": 2.263594470046083, - "grad_norm": 0.39993384927913134, - "learning_rate": 3.926643814062064e-05, - "loss": 0.407, - "step": 4611 - }, - { - "epoch": 2.2640860215053764, - "grad_norm": 0.4384874094190952, - "learning_rate": 3.921753112084995e-05, - "loss": 0.4215, - "step": 4612 - }, - { - "epoch": 2.2645775729646695, - "grad_norm": 0.41828219618852935, - "learning_rate": 3.916864714716425e-05, - "loss": 0.4317, - "step": 4613 - }, - { - "epoch": 2.265069124423963, - "grad_norm": 0.4053890493789178, - "learning_rate": 3.911978623809826e-05, - "loss": 0.4272, - "step": 4614 - }, - { - "epoch": 2.2655606758832567, - "grad_norm": 0.41603414503411545, - "learning_rate": 3.90709484121779e-05, - "loss": 0.4342, - "step": 4615 - }, - { - "epoch": 2.26605222734255, - "grad_norm": 0.40679673512646974, - "learning_rate": 3.9022133687920346e-05, - "loss": 0.4235, - "step": 4616 - }, - { - "epoch": 2.2665437788018434, - "grad_norm": 0.4114182523644404, - "learning_rate": 3.897334208383413e-05, - "loss": 0.4611, - "step": 4617 - }, - { - "epoch": 2.2670353302611366, - "grad_norm": 0.3994714572521716, - "learning_rate": 3.892457361841879e-05, - "loss": 0.3932, - "step": 4618 - }, - { - "epoch": 2.26752688172043, - "grad_norm": 0.40916287775468124, - "learning_rate": 3.8875828310165354e-05, - "loss": 0.4548, - "step": 4619 - }, - { - "epoch": 2.2680184331797237, - "grad_norm": 0.4171924973429695, - "learning_rate": 3.882710617755578e-05, - "loss": 0.4244, - "step": 4620 - }, - { - "epoch": 2.268509984639017, - "grad_norm": 0.40797104003939494, - "learning_rate": 3.8778407239063517e-05, - "loss": 0.3729, - "step": 4621 - }, - { - "epoch": 2.2690015360983105, - "grad_norm": 0.42958891391811654, - "learning_rate": 3.8729731513153065e-05, - "loss": 0.4199, - "step": 4622 - }, - { - "epoch": 2.2694930875576036, - "grad_norm": 0.40084594217473807, - "learning_rate": 3.8681079018280144e-05, - "loss": 0.4236, - "step": 4623 - }, - { - "epoch": 2.269984639016897, - "grad_norm": 0.4120488217245618, - "learning_rate": 3.863244977289165e-05, - "loss": 0.4156, - "step": 4624 - }, - { - "epoch": 2.2704761904761903, - "grad_norm": 0.4907505186219989, - "learning_rate": 3.85838437954258e-05, - "loss": 0.4791, - "step": 4625 - }, - { - "epoch": 2.270967741935484, - "grad_norm": 0.42749915402943883, - "learning_rate": 3.8535261104311725e-05, - "loss": 0.4193, - "step": 4626 - }, - { - "epoch": 2.271459293394777, - "grad_norm": 0.42235208614465153, - "learning_rate": 3.8486701717969996e-05, - "loss": 0.4127, - "step": 4627 - }, - { - "epoch": 2.2719508448540706, - "grad_norm": 0.4005869369497063, - "learning_rate": 3.8438165654812194e-05, - "loss": 0.3838, - "step": 4628 - }, - { - "epoch": 2.272442396313364, - "grad_norm": 0.433845300926815, - "learning_rate": 3.8389652933241106e-05, - "loss": 0.4477, - "step": 4629 - }, - { - "epoch": 2.2729339477726573, - "grad_norm": 0.5183877041218403, - "learning_rate": 3.834116357165064e-05, - "loss": 0.4818, - "step": 4630 - }, - { - "epoch": 2.273425499231951, - "grad_norm": 0.4270202814672601, - "learning_rate": 3.82926975884259e-05, - "loss": 0.4319, - "step": 4631 - }, - { - "epoch": 2.273917050691244, - "grad_norm": 0.565245370179597, - "learning_rate": 3.824425500194305e-05, - "loss": 0.4465, - "step": 4632 - }, - { - "epoch": 2.2744086021505376, - "grad_norm": 0.4039578902394242, - "learning_rate": 3.819583583056941e-05, - "loss": 0.4241, - "step": 4633 - }, - { - "epoch": 2.274900153609831, - "grad_norm": 0.39639954565666735, - "learning_rate": 3.814744009266356e-05, - "loss": 0.4233, - "step": 4634 - }, - { - "epoch": 2.2753917050691244, - "grad_norm": 0.3800405835264967, - "learning_rate": 3.8099067806574905e-05, - "loss": 0.3608, - "step": 4635 - }, - { - "epoch": 2.275883256528418, - "grad_norm": 0.41372590167111695, - "learning_rate": 3.805071899064424e-05, - "loss": 0.4371, - "step": 4636 - }, - { - "epoch": 2.276374807987711, - "grad_norm": 0.43731242578098184, - "learning_rate": 3.800239366320332e-05, - "loss": 0.4414, - "step": 4637 - }, - { - "epoch": 2.2768663594470047, - "grad_norm": 0.42546293065024243, - "learning_rate": 3.7954091842575004e-05, - "loss": 0.4605, - "step": 4638 - }, - { - "epoch": 2.2773579109062982, - "grad_norm": 0.41181112298545686, - "learning_rate": 3.790581354707325e-05, - "loss": 0.4259, - "step": 4639 - }, - { - "epoch": 2.2778494623655914, - "grad_norm": 0.4304807424305234, - "learning_rate": 3.785755879500312e-05, - "loss": 0.3906, - "step": 4640 - }, - { - "epoch": 2.278341013824885, - "grad_norm": 0.43232025613389385, - "learning_rate": 3.7809327604660735e-05, - "loss": 0.4393, - "step": 4641 - }, - { - "epoch": 2.278832565284178, - "grad_norm": 0.45054294034429826, - "learning_rate": 3.7761119994333215e-05, - "loss": 0.4152, - "step": 4642 - }, - { - "epoch": 2.2793241167434717, - "grad_norm": 0.4465477853112635, - "learning_rate": 3.771293598229894e-05, - "loss": 0.4478, - "step": 4643 - }, - { - "epoch": 2.279815668202765, - "grad_norm": 0.3948504344265393, - "learning_rate": 3.766477558682704e-05, - "loss": 0.4047, - "step": 4644 - }, - { - "epoch": 2.2803072196620584, - "grad_norm": 0.41226392387825544, - "learning_rate": 3.7616638826178e-05, - "loss": 0.4278, - "step": 4645 - }, - { - "epoch": 2.2807987711213515, - "grad_norm": 0.4280468876965564, - "learning_rate": 3.756852571860307e-05, - "loss": 0.4811, - "step": 4646 - }, - { - "epoch": 2.281290322580645, - "grad_norm": 0.42466977270288475, - "learning_rate": 3.752043628234474e-05, - "loss": 0.4437, - "step": 4647 - }, - { - "epoch": 2.2817818740399387, - "grad_norm": 0.44802496332099906, - "learning_rate": 3.7472370535636445e-05, - "loss": 0.4484, - "step": 4648 - }, - { - "epoch": 2.282273425499232, - "grad_norm": 0.404386504733684, - "learning_rate": 3.742432849670261e-05, - "loss": 0.4295, - "step": 4649 - }, - { - "epoch": 2.2827649769585254, - "grad_norm": 0.4353369101584708, - "learning_rate": 3.737631018375872e-05, - "loss": 0.3975, - "step": 4650 - }, - { - "epoch": 2.2832565284178186, - "grad_norm": 0.43022041895287205, - "learning_rate": 3.732831561501123e-05, - "loss": 0.426, - "step": 4651 - }, - { - "epoch": 2.283748079877112, - "grad_norm": 0.477706570306163, - "learning_rate": 3.728034480865763e-05, - "loss": 0.4867, - "step": 4652 - }, - { - "epoch": 2.2842396313364057, - "grad_norm": 0.4583047865256892, - "learning_rate": 3.7232397782886307e-05, - "loss": 0.4172, - "step": 4653 - }, - { - "epoch": 2.284731182795699, - "grad_norm": 0.404847525461319, - "learning_rate": 3.718447455587682e-05, - "loss": 0.3846, - "step": 4654 - }, - { - "epoch": 2.2852227342549924, - "grad_norm": 0.4226098979981869, - "learning_rate": 3.7136575145799454e-05, - "loss": 0.3868, - "step": 4655 - }, - { - "epoch": 2.2857142857142856, - "grad_norm": 0.4296273361161548, - "learning_rate": 3.7088699570815686e-05, - "loss": 0.46, - "step": 4656 - }, - { - "epoch": 2.286205837173579, - "grad_norm": 0.4179785530279156, - "learning_rate": 3.704084784907784e-05, - "loss": 0.4354, - "step": 4657 - }, - { - "epoch": 2.2866973886328728, - "grad_norm": 0.41141204803552384, - "learning_rate": 3.699301999872922e-05, - "loss": 0.4006, - "step": 4658 - }, - { - "epoch": 2.287188940092166, - "grad_norm": 0.44950649673149085, - "learning_rate": 3.694521603790407e-05, - "loss": 0.3993, - "step": 4659 - }, - { - "epoch": 2.2876804915514595, - "grad_norm": 0.4210103788988504, - "learning_rate": 3.6897435984727605e-05, - "loss": 0.4149, - "step": 4660 - }, - { - "epoch": 2.2881720430107526, - "grad_norm": 0.4186445614979278, - "learning_rate": 3.684967985731593e-05, - "loss": 0.3967, - "step": 4661 - }, - { - "epoch": 2.288663594470046, - "grad_norm": 0.4220170813609645, - "learning_rate": 3.680194767377609e-05, - "loss": 0.4291, - "step": 4662 - }, - { - "epoch": 2.2891551459293393, - "grad_norm": 0.4153687874063938, - "learning_rate": 3.675423945220617e-05, - "loss": 0.3952, - "step": 4663 - }, - { - "epoch": 2.289646697388633, - "grad_norm": 0.40545911262391143, - "learning_rate": 3.6706555210694914e-05, - "loss": 0.4456, - "step": 4664 - }, - { - "epoch": 2.290138248847926, - "grad_norm": 0.40532664376354205, - "learning_rate": 3.665889496732223e-05, - "loss": 0.4161, - "step": 4665 - }, - { - "epoch": 2.2906298003072196, - "grad_norm": 0.40857490132512503, - "learning_rate": 3.661125874015881e-05, - "loss": 0.4277, - "step": 4666 - }, - { - "epoch": 2.291121351766513, - "grad_norm": 0.4510073318625711, - "learning_rate": 3.6563646547266214e-05, - "loss": 0.4523, - "step": 4667 - }, - { - "epoch": 2.2916129032258064, - "grad_norm": 0.4471361763741866, - "learning_rate": 3.651605840669695e-05, - "loss": 0.4391, - "step": 4668 - }, - { - "epoch": 2.2921044546851, - "grad_norm": 0.4363126110022241, - "learning_rate": 3.64684943364944e-05, - "loss": 0.4731, - "step": 4669 - }, - { - "epoch": 2.292596006144393, - "grad_norm": 0.39810227969431883, - "learning_rate": 3.642095435469274e-05, - "loss": 0.4485, - "step": 4670 - }, - { - "epoch": 2.2930875576036867, - "grad_norm": 0.4000949470384779, - "learning_rate": 3.637343847931719e-05, - "loss": 0.4105, - "step": 4671 - }, - { - "epoch": 2.2935791090629802, - "grad_norm": 0.39800246987335286, - "learning_rate": 3.6325946728383584e-05, - "loss": 0.4192, - "step": 4672 - }, - { - "epoch": 2.2940706605222734, - "grad_norm": 0.4602780711104513, - "learning_rate": 3.6278479119898846e-05, - "loss": 0.4248, - "step": 4673 - }, - { - "epoch": 2.294562211981567, - "grad_norm": 0.39877452127112756, - "learning_rate": 3.62310356718606e-05, - "loss": 0.398, - "step": 4674 - }, - { - "epoch": 2.29505376344086, - "grad_norm": 0.40087525919646655, - "learning_rate": 3.618361640225735e-05, - "loss": 0.4321, - "step": 4675 - }, - { - "epoch": 2.2955453149001537, - "grad_norm": 0.436675040585018, - "learning_rate": 3.613622132906843e-05, - "loss": 0.4336, - "step": 4676 - }, - { - "epoch": 2.296036866359447, - "grad_norm": 0.42876675485031573, - "learning_rate": 3.6088850470264015e-05, - "loss": 0.4349, - "step": 4677 - }, - { - "epoch": 2.2965284178187404, - "grad_norm": 0.4111778213451194, - "learning_rate": 3.604150384380508e-05, - "loss": 0.4389, - "step": 4678 - }, - { - "epoch": 2.297019969278034, - "grad_norm": 0.46268139426787824, - "learning_rate": 3.59941814676434e-05, - "loss": 0.4418, - "step": 4679 - }, - { - "epoch": 2.297511520737327, - "grad_norm": 0.4609072912667764, - "learning_rate": 3.594688335972164e-05, - "loss": 0.4382, - "step": 4680 - }, - { - "epoch": 2.2980030721966207, - "grad_norm": 0.42369563482353995, - "learning_rate": 3.58996095379731e-05, - "loss": 0.4335, - "step": 4681 - }, - { - "epoch": 2.298494623655914, - "grad_norm": 0.42286306020186715, - "learning_rate": 3.585236002032205e-05, - "loss": 0.4543, - "step": 4682 - }, - { - "epoch": 2.2989861751152074, - "grad_norm": 0.42080638209148447, - "learning_rate": 3.580513482468344e-05, - "loss": 0.4184, - "step": 4683 - }, - { - "epoch": 2.2994777265745006, - "grad_norm": 0.42167806986936546, - "learning_rate": 3.575793396896303e-05, - "loss": 0.4501, - "step": 4684 - }, - { - "epoch": 2.299969278033794, - "grad_norm": 0.3912828211039465, - "learning_rate": 3.571075747105732e-05, - "loss": 0.4071, - "step": 4685 - }, - { - "epoch": 2.3004608294930877, - "grad_norm": 0.4286467302118104, - "learning_rate": 3.5663605348853625e-05, - "loss": 0.4538, - "step": 4686 - }, - { - "epoch": 2.300952380952381, - "grad_norm": 0.44100720276496774, - "learning_rate": 3.561647762022998e-05, - "loss": 0.4564, - "step": 4687 - }, - { - "epoch": 2.3014439324116744, - "grad_norm": 0.43837868754997794, - "learning_rate": 3.556937430305515e-05, - "loss": 0.4276, - "step": 4688 - }, - { - "epoch": 2.3019354838709676, - "grad_norm": 0.4203693226345875, - "learning_rate": 3.5522295415188774e-05, - "loss": 0.3944, - "step": 4689 - }, - { - "epoch": 2.302427035330261, - "grad_norm": 0.4319344765012458, - "learning_rate": 3.5475240974481006e-05, - "loss": 0.4006, - "step": 4690 - }, - { - "epoch": 2.3029185867895547, - "grad_norm": 0.404823184954575, - "learning_rate": 3.542821099877295e-05, - "loss": 0.384, - "step": 4691 - }, - { - "epoch": 2.303410138248848, - "grad_norm": 0.42483465979867385, - "learning_rate": 3.538120550589631e-05, - "loss": 0.4313, - "step": 4692 - }, - { - "epoch": 2.3039016897081415, - "grad_norm": 0.41230286891851775, - "learning_rate": 3.533422451367353e-05, - "loss": 0.4464, - "step": 4693 - }, - { - "epoch": 2.3043932411674346, - "grad_norm": 0.4325844945211286, - "learning_rate": 3.5287268039917785e-05, - "loss": 0.4464, - "step": 4694 - }, - { - "epoch": 2.304884792626728, - "grad_norm": 0.42635586169823536, - "learning_rate": 3.524033610243293e-05, - "loss": 0.4111, - "step": 4695 - }, - { - "epoch": 2.3053763440860213, - "grad_norm": 0.4301726105167208, - "learning_rate": 3.5193428719013523e-05, - "loss": 0.4086, - "step": 4696 - }, - { - "epoch": 2.305867895545315, - "grad_norm": 0.4198929305556593, - "learning_rate": 3.514654590744483e-05, - "loss": 0.4632, - "step": 4697 - }, - { - "epoch": 2.3063594470046085, - "grad_norm": 0.39637988836601756, - "learning_rate": 3.509968768550278e-05, - "loss": 0.3943, - "step": 4698 - }, - { - "epoch": 2.3068509984639016, - "grad_norm": 0.40120828939709524, - "learning_rate": 3.505285407095394e-05, - "loss": 0.3931, - "step": 4699 - }, - { - "epoch": 2.307342549923195, - "grad_norm": 0.42584379039334697, - "learning_rate": 3.500604508155571e-05, - "loss": 0.4136, - "step": 4700 - }, - { - "epoch": 2.3078341013824883, - "grad_norm": 0.3999601796476035, - "learning_rate": 3.4959260735055896e-05, - "loss": 0.4432, - "step": 4701 - }, - { - "epoch": 2.308325652841782, - "grad_norm": 0.41602318062853544, - "learning_rate": 3.491250104919321e-05, - "loss": 0.445, - "step": 4702 - }, - { - "epoch": 2.308817204301075, - "grad_norm": 0.4030289261290131, - "learning_rate": 3.486576604169686e-05, - "loss": 0.4214, - "step": 4703 - }, - { - "epoch": 2.3093087557603686, - "grad_norm": 0.41813375817643905, - "learning_rate": 3.481905573028673e-05, - "loss": 0.4051, - "step": 4704 - }, - { - "epoch": 2.3098003072196622, - "grad_norm": 0.4376203451293558, - "learning_rate": 3.477237013267337e-05, - "loss": 0.436, - "step": 4705 - }, - { - "epoch": 2.3102918586789554, - "grad_norm": 0.4035687214978179, - "learning_rate": 3.4725709266557924e-05, - "loss": 0.4085, - "step": 4706 - }, - { - "epoch": 2.310783410138249, - "grad_norm": 0.4228343437108084, - "learning_rate": 3.467907314963219e-05, - "loss": 0.428, - "step": 4707 - }, - { - "epoch": 2.311274961597542, - "grad_norm": 0.4458722455069885, - "learning_rate": 3.4632461799578534e-05, - "loss": 0.4808, - "step": 4708 - }, - { - "epoch": 2.3117665130568357, - "grad_norm": 0.42559318325555223, - "learning_rate": 3.4585875234070056e-05, - "loss": 0.4096, - "step": 4709 - }, - { - "epoch": 2.3122580645161293, - "grad_norm": 0.41919658507677926, - "learning_rate": 3.453931347077024e-05, - "loss": 0.4092, - "step": 4710 - }, - { - "epoch": 2.3127496159754224, - "grad_norm": 0.4241294540352367, - "learning_rate": 3.44927765273334e-05, - "loss": 0.4104, - "step": 4711 - }, - { - "epoch": 2.313241167434716, - "grad_norm": 0.42778857671006904, - "learning_rate": 3.444626442140428e-05, - "loss": 0.4387, - "step": 4712 - }, - { - "epoch": 2.313732718894009, - "grad_norm": 0.43551370942469136, - "learning_rate": 3.4399777170618286e-05, - "loss": 0.4587, - "step": 4713 - }, - { - "epoch": 2.3142242703533027, - "grad_norm": 0.4477057519800819, - "learning_rate": 3.435331479260133e-05, - "loss": 0.4485, - "step": 4714 - }, - { - "epoch": 2.314715821812596, - "grad_norm": 0.4277263298977427, - "learning_rate": 3.430687730497003e-05, - "loss": 0.4641, - "step": 4715 - }, - { - "epoch": 2.3152073732718894, - "grad_norm": 0.45950082107122175, - "learning_rate": 3.4260464725331345e-05, - "loss": 0.4639, - "step": 4716 - }, - { - "epoch": 2.3156989247311826, - "grad_norm": 0.4204903418651704, - "learning_rate": 3.421407707128306e-05, - "loss": 0.4543, - "step": 4717 - }, - { - "epoch": 2.316190476190476, - "grad_norm": 0.4227449257009278, - "learning_rate": 3.416771436041323e-05, - "loss": 0.4302, - "step": 4718 - }, - { - "epoch": 2.3166820276497697, - "grad_norm": 0.41669218211322623, - "learning_rate": 3.412137661030068e-05, - "loss": 0.3881, - "step": 4719 - }, - { - "epoch": 2.317173579109063, - "grad_norm": 0.41996009125661177, - "learning_rate": 3.4075063838514676e-05, - "loss": 0.4236, - "step": 4720 - }, - { - "epoch": 2.3176651305683564, - "grad_norm": 0.39659493085845643, - "learning_rate": 3.402877606261499e-05, - "loss": 0.3903, - "step": 4721 - }, - { - "epoch": 2.3181566820276496, - "grad_norm": 0.4036941455892041, - "learning_rate": 3.3982513300151964e-05, - "loss": 0.4006, - "step": 4722 - }, - { - "epoch": 2.318648233486943, - "grad_norm": 0.4123896872395089, - "learning_rate": 3.393627556866643e-05, - "loss": 0.4246, - "step": 4723 - }, - { - "epoch": 2.3191397849462367, - "grad_norm": 0.4096088359337234, - "learning_rate": 3.389006288568974e-05, - "loss": 0.4157, - "step": 4724 - }, - { - "epoch": 2.31963133640553, - "grad_norm": 0.4326586651819651, - "learning_rate": 3.384387526874371e-05, - "loss": 0.4508, - "step": 4725 - }, - { - "epoch": 2.3201228878648235, - "grad_norm": 0.40072572740826556, - "learning_rate": 3.3797712735340794e-05, - "loss": 0.4002, - "step": 4726 - }, - { - "epoch": 2.3206144393241166, - "grad_norm": 0.40810647346308376, - "learning_rate": 3.3751575302983694e-05, - "loss": 0.4288, - "step": 4727 - }, - { - "epoch": 2.32110599078341, - "grad_norm": 0.47708630588981893, - "learning_rate": 3.370546298916583e-05, - "loss": 0.4604, - "step": 4728 - }, - { - "epoch": 2.3215975422427038, - "grad_norm": 0.4502626136190057, - "learning_rate": 3.365937581137095e-05, - "loss": 0.4312, - "step": 4729 - }, - { - "epoch": 2.322089093701997, - "grad_norm": 0.40699201528340645, - "learning_rate": 3.361331378707334e-05, - "loss": 0.4288, - "step": 4730 - }, - { - "epoch": 2.3225806451612905, - "grad_norm": 0.4165295196834001, - "learning_rate": 3.3567276933737734e-05, - "loss": 0.4499, - "step": 4731 - }, - { - "epoch": 2.3230721966205836, - "grad_norm": 0.396397589918511, - "learning_rate": 3.3521265268819294e-05, - "loss": 0.4364, - "step": 4732 - }, - { - "epoch": 2.323563748079877, - "grad_norm": 0.458204883704109, - "learning_rate": 3.347527880976367e-05, - "loss": 0.4586, - "step": 4733 - }, - { - "epoch": 2.3240552995391703, - "grad_norm": 0.42073207194970696, - "learning_rate": 3.342931757400689e-05, - "loss": 0.4087, - "step": 4734 - }, - { - "epoch": 2.324546850998464, - "grad_norm": 0.4647240458644532, - "learning_rate": 3.338338157897559e-05, - "loss": 0.4255, - "step": 4735 - }, - { - "epoch": 2.325038402457757, - "grad_norm": 0.43744954731761865, - "learning_rate": 3.333747084208657e-05, - "loss": 0.4913, - "step": 4736 - }, - { - "epoch": 2.3255299539170506, - "grad_norm": 0.3982797540445961, - "learning_rate": 3.329158538074729e-05, - "loss": 0.4168, - "step": 4737 - }, - { - "epoch": 2.3260215053763442, - "grad_norm": 0.41405732467800316, - "learning_rate": 3.324572521235552e-05, - "loss": 0.4225, - "step": 4738 - }, - { - "epoch": 2.3265130568356374, - "grad_norm": 0.43401156821621784, - "learning_rate": 3.3199890354299435e-05, - "loss": 0.4473, - "step": 4739 - }, - { - "epoch": 2.327004608294931, - "grad_norm": 0.4106466316435039, - "learning_rate": 3.3154080823957635e-05, - "loss": 0.4377, - "step": 4740 - }, - { - "epoch": 2.327496159754224, - "grad_norm": 0.41976505694764593, - "learning_rate": 3.310829663869912e-05, - "loss": 0.4355, - "step": 4741 - }, - { - "epoch": 2.3279877112135177, - "grad_norm": 0.42346622278551865, - "learning_rate": 3.306253781588327e-05, - "loss": 0.428, - "step": 4742 - }, - { - "epoch": 2.3284792626728112, - "grad_norm": 0.4306519527183008, - "learning_rate": 3.3016804372859854e-05, - "loss": 0.4358, - "step": 4743 - }, - { - "epoch": 2.3289708141321044, - "grad_norm": 0.42950444364385965, - "learning_rate": 3.297109632696902e-05, - "loss": 0.4283, - "step": 4744 - }, - { - "epoch": 2.329462365591398, - "grad_norm": 0.43906553733635073, - "learning_rate": 3.2925413695541244e-05, - "loss": 0.4674, - "step": 4745 - }, - { - "epoch": 2.329953917050691, - "grad_norm": 0.4255153153818455, - "learning_rate": 3.287975649589751e-05, - "loss": 0.4244, - "step": 4746 - }, - { - "epoch": 2.3304454685099847, - "grad_norm": 0.43026277125446477, - "learning_rate": 3.283412474534893e-05, - "loss": 0.4237, - "step": 4747 - }, - { - "epoch": 2.3309370199692783, - "grad_norm": 0.4093513150854702, - "learning_rate": 3.2788518461197157e-05, - "loss": 0.433, - "step": 4748 - }, - { - "epoch": 2.3314285714285714, - "grad_norm": 0.43143713432610686, - "learning_rate": 3.274293766073413e-05, - "loss": 0.4422, - "step": 4749 - }, - { - "epoch": 2.331920122887865, - "grad_norm": 0.4196447328123866, - "learning_rate": 3.2697382361242104e-05, - "loss": 0.4036, - "step": 4750 - }, - { - "epoch": 2.332411674347158, - "grad_norm": 0.4204694343348252, - "learning_rate": 3.265185257999367e-05, - "loss": 0.459, - "step": 4751 - }, - { - "epoch": 2.3329032258064517, - "grad_norm": 0.44006084728563805, - "learning_rate": 3.2606348334251755e-05, - "loss": 0.434, - "step": 4752 - }, - { - "epoch": 2.333394777265745, - "grad_norm": 0.39821864349299974, - "learning_rate": 3.256086964126962e-05, - "loss": 0.4071, - "step": 4753 - }, - { - "epoch": 2.3338863287250384, - "grad_norm": 0.3972280648289152, - "learning_rate": 3.2515416518290774e-05, - "loss": 0.4173, - "step": 4754 - }, - { - "epoch": 2.3343778801843316, - "grad_norm": 0.41062672806796124, - "learning_rate": 3.246998898254917e-05, - "loss": 0.4248, - "step": 4755 - }, - { - "epoch": 2.334869431643625, - "grad_norm": 0.46399725882181564, - "learning_rate": 3.2424587051268837e-05, - "loss": 0.4566, - "step": 4756 - }, - { - "epoch": 2.3353609831029187, - "grad_norm": 0.4211860928938646, - "learning_rate": 3.2379210741664336e-05, - "loss": 0.3884, - "step": 4757 - }, - { - "epoch": 2.335852534562212, - "grad_norm": 0.4552154019131506, - "learning_rate": 3.233386007094036e-05, - "loss": 0.4691, - "step": 4758 - }, - { - "epoch": 2.3363440860215055, - "grad_norm": 0.42889915354629227, - "learning_rate": 3.228853505629192e-05, - "loss": 0.4218, - "step": 4759 - }, - { - "epoch": 2.3368356374807986, - "grad_norm": 0.39334507688502596, - "learning_rate": 3.224323571490428e-05, - "loss": 0.3941, - "step": 4760 - }, - { - "epoch": 2.337327188940092, - "grad_norm": 0.38969390775568347, - "learning_rate": 3.219796206395307e-05, - "loss": 0.4331, - "step": 4761 - }, - { - "epoch": 2.3378187403993858, - "grad_norm": 0.4079829619829583, - "learning_rate": 3.2152714120603986e-05, - "loss": 0.4394, - "step": 4762 - }, - { - "epoch": 2.338310291858679, - "grad_norm": 0.40411940197509744, - "learning_rate": 3.21074919020132e-05, - "loss": 0.4302, - "step": 4763 - }, - { - "epoch": 2.3388018433179725, - "grad_norm": 0.43858726812015925, - "learning_rate": 3.206229542532697e-05, - "loss": 0.4084, - "step": 4764 - }, - { - "epoch": 2.3392933947772656, - "grad_norm": 0.4156139932197252, - "learning_rate": 3.201712470768185e-05, - "loss": 0.4037, - "step": 4765 - }, - { - "epoch": 2.339784946236559, - "grad_norm": 0.43863161229391, - "learning_rate": 3.197197976620469e-05, - "loss": 0.4255, - "step": 4766 - }, - { - "epoch": 2.3402764976958528, - "grad_norm": 0.4480292358381235, - "learning_rate": 3.1926860618012346e-05, - "loss": 0.4522, - "step": 4767 - }, - { - "epoch": 2.340768049155146, - "grad_norm": 0.3890287573111709, - "learning_rate": 3.188176728021218e-05, - "loss": 0.4002, - "step": 4768 - }, - { - "epoch": 2.3412596006144395, - "grad_norm": 0.4184492254526786, - "learning_rate": 3.18366997699016e-05, - "loss": 0.4123, - "step": 4769 - }, - { - "epoch": 2.3417511520737326, - "grad_norm": 0.42506116602466937, - "learning_rate": 3.179165810416826e-05, - "loss": 0.455, - "step": 4770 - }, - { - "epoch": 2.342242703533026, - "grad_norm": 0.4122783842189617, - "learning_rate": 3.174664230008998e-05, - "loss": 0.4315, - "step": 4771 - }, - { - "epoch": 2.3427342549923194, - "grad_norm": 0.4148489845101186, - "learning_rate": 3.17016523747349e-05, - "loss": 0.422, - "step": 4772 - }, - { - "epoch": 2.343225806451613, - "grad_norm": 0.42936894470285625, - "learning_rate": 3.165668834516112e-05, - "loss": 0.4284, - "step": 4773 - }, - { - "epoch": 2.343717357910906, - "grad_norm": 0.4159236673561618, - "learning_rate": 3.161175022841717e-05, - "loss": 0.4283, - "step": 4774 - }, - { - "epoch": 2.3442089093701997, - "grad_norm": 0.42347653630964766, - "learning_rate": 3.15668380415416e-05, - "loss": 0.4114, - "step": 4775 - }, - { - "epoch": 2.3447004608294932, - "grad_norm": 0.4651548232142126, - "learning_rate": 3.152195180156317e-05, - "loss": 0.4897, - "step": 4776 - }, - { - "epoch": 2.3451920122887864, - "grad_norm": 0.40571874403564384, - "learning_rate": 3.14770915255008e-05, - "loss": 0.4521, - "step": 4777 - }, - { - "epoch": 2.34568356374808, - "grad_norm": 0.4436730221680044, - "learning_rate": 3.143225723036357e-05, - "loss": 0.4196, - "step": 4778 - }, - { - "epoch": 2.346175115207373, - "grad_norm": 0.42543159822317894, - "learning_rate": 3.13874489331507e-05, - "loss": 0.4238, - "step": 4779 - }, - { - "epoch": 2.3466666666666667, - "grad_norm": 0.4083009504906106, - "learning_rate": 3.134266665085154e-05, - "loss": 0.4262, - "step": 4780 - }, - { - "epoch": 2.3471582181259603, - "grad_norm": 0.4283724148865335, - "learning_rate": 3.1297910400445686e-05, - "loss": 0.4164, - "step": 4781 - }, - { - "epoch": 2.3476497695852534, - "grad_norm": 0.4198485592266913, - "learning_rate": 3.1253180198902655e-05, - "loss": 0.4225, - "step": 4782 - }, - { - "epoch": 2.348141321044547, - "grad_norm": 0.41970964022106316, - "learning_rate": 3.120847606318228e-05, - "loss": 0.461, - "step": 4783 - }, - { - "epoch": 2.34863287250384, - "grad_norm": 0.43024997503838885, - "learning_rate": 3.1163798010234424e-05, - "loss": 0.4452, - "step": 4784 - }, - { - "epoch": 2.3491244239631337, - "grad_norm": 0.4410145878631437, - "learning_rate": 3.111914605699906e-05, - "loss": 0.4551, - "step": 4785 - }, - { - "epoch": 2.3496159754224273, - "grad_norm": 0.4892838955368589, - "learning_rate": 3.10745202204063e-05, - "loss": 0.4335, - "step": 4786 - }, - { - "epoch": 2.3501075268817204, - "grad_norm": 0.38238948191787914, - "learning_rate": 3.102992051737631e-05, - "loss": 0.3736, - "step": 4787 - }, - { - "epoch": 2.350599078341014, - "grad_norm": 0.38302858860671996, - "learning_rate": 3.098534696481937e-05, - "loss": 0.411, - "step": 4788 - }, - { - "epoch": 2.351090629800307, - "grad_norm": 0.4084756345531803, - "learning_rate": 3.094079957963584e-05, - "loss": 0.4268, - "step": 4789 - }, - { - "epoch": 2.3515821812596007, - "grad_norm": 0.4165753600155168, - "learning_rate": 3.089627837871623e-05, - "loss": 0.4602, - "step": 4790 - }, - { - "epoch": 2.352073732718894, - "grad_norm": 0.42206661017082336, - "learning_rate": 3.085178337894093e-05, - "loss": 0.4471, - "step": 4791 - }, - { - "epoch": 2.3525652841781874, - "grad_norm": 0.3800883691971558, - "learning_rate": 3.080731459718067e-05, - "loss": 0.4186, - "step": 4792 - }, - { - "epoch": 2.3530568356374806, - "grad_norm": 0.41444170721371476, - "learning_rate": 3.0762872050295935e-05, - "loss": 0.4204, - "step": 4793 - }, - { - "epoch": 2.353548387096774, - "grad_norm": 0.40085496217796385, - "learning_rate": 3.0718455755137534e-05, - "loss": 0.3856, - "step": 4794 - }, - { - "epoch": 2.3540399385560677, - "grad_norm": 0.4189243886227632, - "learning_rate": 3.0674065728546166e-05, - "loss": 0.4074, - "step": 4795 - }, - { - "epoch": 2.354531490015361, - "grad_norm": 0.41448934031974805, - "learning_rate": 3.06297019873526e-05, - "loss": 0.4704, - "step": 4796 - }, - { - "epoch": 2.3550230414746545, - "grad_norm": 0.42468459699633776, - "learning_rate": 3.058536454837767e-05, - "loss": 0.4089, - "step": 4797 - }, - { - "epoch": 2.3555145929339476, - "grad_norm": 0.40372523180114434, - "learning_rate": 3.054105342843221e-05, - "loss": 0.38, - "step": 4798 - }, - { - "epoch": 2.356006144393241, - "grad_norm": 0.4099057107716398, - "learning_rate": 3.0496768644317077e-05, - "loss": 0.3934, - "step": 4799 - }, - { - "epoch": 2.3564976958525348, - "grad_norm": 0.41255023543990066, - "learning_rate": 3.0452510212823104e-05, - "loss": 0.3916, - "step": 4800 - }, - { - "epoch": 2.356989247311828, - "grad_norm": 0.43795605125926107, - "learning_rate": 3.0408278150731297e-05, - "loss": 0.4368, - "step": 4801 - }, - { - "epoch": 2.3574807987711215, - "grad_norm": 0.4251178192904342, - "learning_rate": 3.03640724748124e-05, - "loss": 0.4965, - "step": 4802 - }, - { - "epoch": 2.3579723502304146, - "grad_norm": 0.42830745049729313, - "learning_rate": 3.0319893201827387e-05, - "loss": 0.4128, - "step": 4803 - }, - { - "epoch": 2.358463901689708, - "grad_norm": 0.42090651381117394, - "learning_rate": 3.0275740348527093e-05, - "loss": 0.4478, - "step": 4804 - }, - { - "epoch": 2.3589554531490013, - "grad_norm": 0.3973716768544322, - "learning_rate": 3.0231613931652392e-05, - "loss": 0.3933, - "step": 4805 - }, - { - "epoch": 2.359447004608295, - "grad_norm": 0.4243238260429334, - "learning_rate": 3.0187513967934067e-05, - "loss": 0.4549, - "step": 4806 - }, - { - "epoch": 2.3599385560675885, - "grad_norm": 0.4343255673653745, - "learning_rate": 3.014344047409301e-05, - "loss": 0.438, - "step": 4807 - }, - { - "epoch": 2.3604301075268816, - "grad_norm": 0.41149175869544385, - "learning_rate": 3.0099393466839864e-05, - "loss": 0.4129, - "step": 4808 - }, - { - "epoch": 2.3609216589861752, - "grad_norm": 0.42358128335906053, - "learning_rate": 3.005537296287546e-05, - "loss": 0.4609, - "step": 4809 - }, - { - "epoch": 2.3614132104454684, - "grad_norm": 0.6738010749080955, - "learning_rate": 3.0011378978890416e-05, - "loss": 0.465, - "step": 4810 - }, - { - "epoch": 2.361904761904762, - "grad_norm": 0.41749798248955805, - "learning_rate": 2.996741153156535e-05, - "loss": 0.4288, - "step": 4811 - }, - { - "epoch": 2.362396313364055, - "grad_norm": 0.42248666624639847, - "learning_rate": 2.9923470637570827e-05, - "loss": 0.4292, - "step": 4812 - }, - { - "epoch": 2.3628878648233487, - "grad_norm": 0.4134649845494821, - "learning_rate": 2.9879556313567335e-05, - "loss": 0.42, - "step": 4813 - }, - { - "epoch": 2.3633794162826423, - "grad_norm": 0.4346405485385153, - "learning_rate": 2.9835668576205288e-05, - "loss": 0.4261, - "step": 4814 - }, - { - "epoch": 2.3638709677419354, - "grad_norm": 0.4355669902937064, - "learning_rate": 2.979180744212502e-05, - "loss": 0.4543, - "step": 4815 - }, - { - "epoch": 2.364362519201229, - "grad_norm": 0.4063349307832069, - "learning_rate": 2.9747972927956768e-05, - "loss": 0.4085, - "step": 4816 - }, - { - "epoch": 2.364854070660522, - "grad_norm": 0.4007487000617257, - "learning_rate": 2.9704165050320652e-05, - "loss": 0.4044, - "step": 4817 - }, - { - "epoch": 2.3653456221198157, - "grad_norm": 0.40464878367350376, - "learning_rate": 2.9660383825826842e-05, - "loss": 0.3812, - "step": 4818 - }, - { - "epoch": 2.3658371735791093, - "grad_norm": 0.39128615180039056, - "learning_rate": 2.9616629271075137e-05, - "loss": 0.4422, - "step": 4819 - }, - { - "epoch": 2.3663287250384024, - "grad_norm": 0.4184770545329243, - "learning_rate": 2.9572901402655484e-05, - "loss": 0.4307, - "step": 4820 - }, - { - "epoch": 2.366820276497696, - "grad_norm": 0.419322774265694, - "learning_rate": 2.9529200237147546e-05, - "loss": 0.4534, - "step": 4821 - }, - { - "epoch": 2.367311827956989, - "grad_norm": 0.48334917449392434, - "learning_rate": 2.948552579112095e-05, - "loss": 0.4712, - "step": 4822 - }, - { - "epoch": 2.3678033794162827, - "grad_norm": 0.43277034543550946, - "learning_rate": 2.944187808113512e-05, - "loss": 0.4331, - "step": 4823 - }, - { - "epoch": 2.368294930875576, - "grad_norm": 0.42527901485206754, - "learning_rate": 2.9398257123739413e-05, - "loss": 0.3937, - "step": 4824 - }, - { - "epoch": 2.3687864823348694, - "grad_norm": 0.4281167188782922, - "learning_rate": 2.9354662935472997e-05, - "loss": 0.4838, - "step": 4825 - }, - { - "epoch": 2.369278033794163, - "grad_norm": 0.4135117960722692, - "learning_rate": 2.9311095532864874e-05, - "loss": 0.4136, - "step": 4826 - }, - { - "epoch": 2.369769585253456, - "grad_norm": 0.39062068942264067, - "learning_rate": 2.9267554932434006e-05, - "loss": 0.3675, - "step": 4827 - }, - { - "epoch": 2.3702611367127497, - "grad_norm": 0.4081901507608112, - "learning_rate": 2.9224041150688997e-05, - "loss": 0.3844, - "step": 4828 - }, - { - "epoch": 2.370752688172043, - "grad_norm": 0.4312197072803686, - "learning_rate": 2.9180554204128473e-05, - "loss": 0.4536, - "step": 4829 - }, - { - "epoch": 2.3712442396313365, - "grad_norm": 0.4116497918018741, - "learning_rate": 2.913709410924078e-05, - "loss": 0.4114, - "step": 4830 - }, - { - "epoch": 2.3717357910906296, - "grad_norm": 0.422336565762676, - "learning_rate": 2.9093660882504105e-05, - "loss": 0.4422, - "step": 4831 - }, - { - "epoch": 2.372227342549923, - "grad_norm": 0.4685740962927458, - "learning_rate": 2.9050254540386457e-05, - "loss": 0.4537, - "step": 4832 - }, - { - "epoch": 2.3727188940092168, - "grad_norm": 0.4411007892285831, - "learning_rate": 2.900687509934563e-05, - "loss": 0.4403, - "step": 4833 - }, - { - "epoch": 2.37321044546851, - "grad_norm": 0.4145793060508992, - "learning_rate": 2.896352257582925e-05, - "loss": 0.438, - "step": 4834 - }, - { - "epoch": 2.3737019969278035, - "grad_norm": 0.39228570624315656, - "learning_rate": 2.892019698627467e-05, - "loss": 0.4041, - "step": 4835 - }, - { - "epoch": 2.3741935483870966, - "grad_norm": 0.43856759654947197, - "learning_rate": 2.8876898347109195e-05, - "loss": 0.4585, - "step": 4836 - }, - { - "epoch": 2.37468509984639, - "grad_norm": 0.39721503017618237, - "learning_rate": 2.883362667474967e-05, - "loss": 0.4156, - "step": 4837 - }, - { - "epoch": 2.375176651305684, - "grad_norm": 0.40519165834452425, - "learning_rate": 2.8790381985602922e-05, - "loss": 0.4371, - "step": 4838 - }, - { - "epoch": 2.375668202764977, - "grad_norm": 0.4155310705391595, - "learning_rate": 2.874716429606543e-05, - "loss": 0.4015, - "step": 4839 - }, - { - "epoch": 2.3761597542242705, - "grad_norm": 0.43033233439082563, - "learning_rate": 2.8703973622523505e-05, - "loss": 0.4733, - "step": 4840 - }, - { - "epoch": 2.3766513056835636, - "grad_norm": 0.4464480701023259, - "learning_rate": 2.8660809981353165e-05, - "loss": 0.4662, - "step": 4841 - }, - { - "epoch": 2.3771428571428572, - "grad_norm": 0.44938378423038566, - "learning_rate": 2.8617673388920197e-05, - "loss": 0.4815, - "step": 4842 - }, - { - "epoch": 2.3776344086021504, - "grad_norm": 0.4463081462483464, - "learning_rate": 2.857456386158014e-05, - "loss": 0.4135, - "step": 4843 - }, - { - "epoch": 2.378125960061444, - "grad_norm": 0.44035002552097285, - "learning_rate": 2.853148141567824e-05, - "loss": 0.4938, - "step": 4844 - }, - { - "epoch": 2.378617511520737, - "grad_norm": 0.41162086220383853, - "learning_rate": 2.8488426067549533e-05, - "loss": 0.4236, - "step": 4845 - }, - { - "epoch": 2.3791090629800307, - "grad_norm": 0.4379675645948824, - "learning_rate": 2.8445397833518695e-05, - "loss": 0.3664, - "step": 4846 - }, - { - "epoch": 2.3796006144393242, - "grad_norm": 0.4214342115439349, - "learning_rate": 2.840239672990026e-05, - "loss": 0.4378, - "step": 4847 - }, - { - "epoch": 2.3800921658986174, - "grad_norm": 0.42574262166261506, - "learning_rate": 2.8359422772998278e-05, - "loss": 0.4603, - "step": 4848 - }, - { - "epoch": 2.380583717357911, - "grad_norm": 0.4633004664942074, - "learning_rate": 2.8316475979106706e-05, - "loss": 0.4625, - "step": 4849 - }, - { - "epoch": 2.381075268817204, - "grad_norm": 0.41098575800817566, - "learning_rate": 2.8273556364509078e-05, - "loss": 0.4274, - "step": 4850 - }, - { - "epoch": 2.3815668202764977, - "grad_norm": 0.40466716665847524, - "learning_rate": 2.8230663945478663e-05, - "loss": 0.3734, - "step": 4851 - }, - { - "epoch": 2.3820583717357913, - "grad_norm": 0.4229969232897735, - "learning_rate": 2.8187798738278358e-05, - "loss": 0.4428, - "step": 4852 - }, - { - "epoch": 2.3825499231950844, - "grad_norm": 0.429880737144034, - "learning_rate": 2.8144960759160922e-05, - "loss": 0.4675, - "step": 4853 - }, - { - "epoch": 2.383041474654378, - "grad_norm": 0.38905584639360397, - "learning_rate": 2.8102150024368525e-05, - "loss": 0.4004, - "step": 4854 - }, - { - "epoch": 2.383533026113671, - "grad_norm": 0.4126523752904647, - "learning_rate": 2.8059366550133237e-05, - "loss": 0.414, - "step": 4855 - }, - { - "epoch": 2.3840245775729647, - "grad_norm": 0.4303438862748314, - "learning_rate": 2.8016610352676675e-05, - "loss": 0.4628, - "step": 4856 - }, - { - "epoch": 2.3845161290322583, - "grad_norm": 0.4148592383125127, - "learning_rate": 2.797388144821015e-05, - "loss": 0.3957, - "step": 4857 - }, - { - "epoch": 2.3850076804915514, - "grad_norm": 0.41798689874148054, - "learning_rate": 2.7931179852934596e-05, - "loss": 0.4635, - "step": 4858 - }, - { - "epoch": 2.385499231950845, - "grad_norm": 0.4044476611099365, - "learning_rate": 2.7888505583040638e-05, - "loss": 0.4393, - "step": 4859 - }, - { - "epoch": 2.385990783410138, - "grad_norm": 0.41043263527171825, - "learning_rate": 2.784585865470849e-05, - "loss": 0.4243, - "step": 4860 - }, - { - "epoch": 2.3864823348694317, - "grad_norm": 0.4677727877314856, - "learning_rate": 2.7803239084108002e-05, - "loss": 0.4973, - "step": 4861 - }, - { - "epoch": 2.386973886328725, - "grad_norm": 0.5059274749460108, - "learning_rate": 2.776064688739878e-05, - "loss": 0.5678, - "step": 4862 - }, - { - "epoch": 2.3874654377880185, - "grad_norm": 0.4197777784887138, - "learning_rate": 2.771808208072979e-05, - "loss": 0.4243, - "step": 4863 - }, - { - "epoch": 2.3879569892473116, - "grad_norm": 0.5906791477620013, - "learning_rate": 2.767554468023992e-05, - "loss": 0.4261, - "step": 4864 - }, - { - "epoch": 2.388448540706605, - "grad_norm": 0.43051146996828155, - "learning_rate": 2.763303470205738e-05, - "loss": 0.4663, - "step": 4865 - }, - { - "epoch": 2.3889400921658988, - "grad_norm": 0.43273203777980085, - "learning_rate": 2.7590552162300198e-05, - "loss": 0.3899, - "step": 4866 - }, - { - "epoch": 2.389431643625192, - "grad_norm": 0.423888044247044, - "learning_rate": 2.754809707707591e-05, - "loss": 0.4366, - "step": 4867 - }, - { - "epoch": 2.3899231950844855, - "grad_norm": 0.41197697162827823, - "learning_rate": 2.750566946248162e-05, - "loss": 0.428, - "step": 4868 - }, - { - "epoch": 2.3904147465437786, - "grad_norm": 0.48095076415614696, - "learning_rate": 2.746326933460406e-05, - "loss": 0.415, - "step": 4869 - }, - { - "epoch": 2.390906298003072, - "grad_norm": 0.4138766262651402, - "learning_rate": 2.742089670951954e-05, - "loss": 0.4291, - "step": 4870 - }, - { - "epoch": 2.3913978494623658, - "grad_norm": 0.44173556551645354, - "learning_rate": 2.73785516032939e-05, - "loss": 0.427, - "step": 4871 - }, - { - "epoch": 2.391889400921659, - "grad_norm": 0.46368609749867074, - "learning_rate": 2.7336234031982565e-05, - "loss": 0.5256, - "step": 4872 - }, - { - "epoch": 2.3923809523809525, - "grad_norm": 0.4290467557078167, - "learning_rate": 2.7293944011630613e-05, - "loss": 0.3894, - "step": 4873 - }, - { - "epoch": 2.3928725038402456, - "grad_norm": 0.5258175337993901, - "learning_rate": 2.7251681558272456e-05, - "loss": 0.4305, - "step": 4874 - }, - { - "epoch": 2.393364055299539, - "grad_norm": 0.43304886320230745, - "learning_rate": 2.7209446687932316e-05, - "loss": 0.452, - "step": 4875 - }, - { - "epoch": 2.393855606758833, - "grad_norm": 0.4961713659640867, - "learning_rate": 2.716723941662377e-05, - "loss": 0.4404, - "step": 4876 - }, - { - "epoch": 2.394347158218126, - "grad_norm": 0.4205551150015031, - "learning_rate": 2.712505976034999e-05, - "loss": 0.4314, - "step": 4877 - }, - { - "epoch": 2.3948387096774195, - "grad_norm": 0.40112376247949666, - "learning_rate": 2.7082907735103703e-05, - "loss": 0.4302, - "step": 4878 - }, - { - "epoch": 2.3953302611367127, - "grad_norm": 0.43438082954604723, - "learning_rate": 2.7040783356867104e-05, - "loss": 0.4065, - "step": 4879 - }, - { - "epoch": 2.3958218125960062, - "grad_norm": 0.44615159590308784, - "learning_rate": 2.6998686641611968e-05, - "loss": 0.4705, - "step": 4880 - }, - { - "epoch": 2.3963133640552994, - "grad_norm": 0.4319043997238457, - "learning_rate": 2.6956617605299507e-05, - "loss": 0.4343, - "step": 4881 - }, - { - "epoch": 2.396804915514593, - "grad_norm": 0.409673154227293, - "learning_rate": 2.6914576263880554e-05, - "loss": 0.4208, - "step": 4882 - }, - { - "epoch": 2.397296466973886, - "grad_norm": 0.42954057203393664, - "learning_rate": 2.6872562633295273e-05, - "loss": 0.4217, - "step": 4883 - }, - { - "epoch": 2.3977880184331797, - "grad_norm": 0.39676202093293017, - "learning_rate": 2.683057672947349e-05, - "loss": 0.4014, - "step": 4884 - }, - { - "epoch": 2.3982795698924733, - "grad_norm": 0.45852711552494085, - "learning_rate": 2.6788618568334434e-05, - "loss": 0.4446, - "step": 4885 - }, - { - "epoch": 2.3987711213517664, - "grad_norm": 0.39368541536487933, - "learning_rate": 2.6746688165786804e-05, - "loss": 0.4047, - "step": 4886 - }, - { - "epoch": 2.39926267281106, - "grad_norm": 0.4046290203797557, - "learning_rate": 2.670478553772878e-05, - "loss": 0.3784, - "step": 4887 - }, - { - "epoch": 2.399754224270353, - "grad_norm": 0.3947818780504029, - "learning_rate": 2.6662910700048115e-05, - "loss": 0.3927, - "step": 4888 - }, - { - "epoch": 2.4002457757296467, - "grad_norm": 0.3860916354336387, - "learning_rate": 2.662106366862184e-05, - "loss": 0.3936, - "step": 4889 - }, - { - "epoch": 2.4007373271889403, - "grad_norm": 0.4098875617715916, - "learning_rate": 2.6579244459316587e-05, - "loss": 0.4115, - "step": 4890 - }, - { - "epoch": 2.4012288786482334, - "grad_norm": 0.43718389732951696, - "learning_rate": 2.6537453087988372e-05, - "loss": 0.4125, - "step": 4891 - }, - { - "epoch": 2.401720430107527, - "grad_norm": 0.4886500766381444, - "learning_rate": 2.649568957048266e-05, - "loss": 0.51, - "step": 4892 - }, - { - "epoch": 2.40221198156682, - "grad_norm": 0.4578957117175013, - "learning_rate": 2.6453953922634466e-05, - "loss": 0.3931, - "step": 4893 - }, - { - "epoch": 2.4027035330261137, - "grad_norm": 0.4163101826419494, - "learning_rate": 2.6412246160268016e-05, - "loss": 0.3926, - "step": 4894 - }, - { - "epoch": 2.4031950844854073, - "grad_norm": 0.42597069878871974, - "learning_rate": 2.6370566299197175e-05, - "loss": 0.4475, - "step": 4895 - }, - { - "epoch": 2.4036866359447004, - "grad_norm": 0.39540056045529925, - "learning_rate": 2.6328914355225133e-05, - "loss": 0.3912, - "step": 4896 - }, - { - "epoch": 2.404178187403994, - "grad_norm": 0.43385914069692255, - "learning_rate": 2.6287290344144476e-05, - "loss": 0.3984, - "step": 4897 - }, - { - "epoch": 2.404669738863287, - "grad_norm": 0.42772163840712074, - "learning_rate": 2.624569428173723e-05, - "loss": 0.4776, - "step": 4898 - }, - { - "epoch": 2.4051612903225807, - "grad_norm": 0.40961723983935033, - "learning_rate": 2.6204126183774914e-05, - "loss": 0.4087, - "step": 4899 - }, - { - "epoch": 2.405652841781874, - "grad_norm": 0.40408596146818493, - "learning_rate": 2.6162586066018213e-05, - "loss": 0.4104, - "step": 4900 - }, - { - "epoch": 2.4061443932411675, - "grad_norm": 0.4038564775408092, - "learning_rate": 2.6121073944217434e-05, - "loss": 0.4047, - "step": 4901 - }, - { - "epoch": 2.4066359447004606, - "grad_norm": 0.43283620280296115, - "learning_rate": 2.607958983411217e-05, - "loss": 0.4211, - "step": 4902 - }, - { - "epoch": 2.407127496159754, - "grad_norm": 0.40441583245398277, - "learning_rate": 2.6038133751431392e-05, - "loss": 0.3765, - "step": 4903 - }, - { - "epoch": 2.4076190476190478, - "grad_norm": 0.40547721938521414, - "learning_rate": 2.5996705711893453e-05, - "loss": 0.4157, - "step": 4904 - }, - { - "epoch": 2.408110599078341, - "grad_norm": 0.41947227165717454, - "learning_rate": 2.595530573120608e-05, - "loss": 0.4461, - "step": 4905 - }, - { - "epoch": 2.4086021505376345, - "grad_norm": 0.443470010362414, - "learning_rate": 2.591393382506635e-05, - "loss": 0.4154, - "step": 4906 - }, - { - "epoch": 2.4090937019969276, - "grad_norm": 0.41808356762443494, - "learning_rate": 2.587259000916068e-05, - "loss": 0.4542, - "step": 4907 - }, - { - "epoch": 2.409585253456221, - "grad_norm": 0.3980796046816956, - "learning_rate": 2.5831274299164932e-05, - "loss": 0.4078, - "step": 4908 - }, - { - "epoch": 2.410076804915515, - "grad_norm": 0.4144579759623887, - "learning_rate": 2.578998671074414e-05, - "loss": 0.4291, - "step": 4909 - }, - { - "epoch": 2.410568356374808, - "grad_norm": 0.4202373811110107, - "learning_rate": 2.574872725955284e-05, - "loss": 0.4373, - "step": 4910 - }, - { - "epoch": 2.4110599078341015, - "grad_norm": 0.5234339238283734, - "learning_rate": 2.570749596123482e-05, - "loss": 0.5084, - "step": 4911 - }, - { - "epoch": 2.4115514592933947, - "grad_norm": 0.4141148789563729, - "learning_rate": 2.56662928314232e-05, - "loss": 0.4108, - "step": 4912 - }, - { - "epoch": 2.4120430107526882, - "grad_norm": 0.40809541573997604, - "learning_rate": 2.5625117885740425e-05, - "loss": 0.401, - "step": 4913 - }, - { - "epoch": 2.412534562211982, - "grad_norm": 0.4202059515704432, - "learning_rate": 2.5583971139798246e-05, - "loss": 0.4238, - "step": 4914 - }, - { - "epoch": 2.413026113671275, - "grad_norm": 0.41970170514797916, - "learning_rate": 2.5542852609197755e-05, - "loss": 0.4534, - "step": 4915 - }, - { - "epoch": 2.4135176651305685, - "grad_norm": 0.404516100355974, - "learning_rate": 2.5501762309529298e-05, - "loss": 0.4114, - "step": 4916 - }, - { - "epoch": 2.4140092165898617, - "grad_norm": 0.3975486909589504, - "learning_rate": 2.5460700256372548e-05, - "loss": 0.4004, - "step": 4917 - }, - { - "epoch": 2.4145007680491553, - "grad_norm": 0.41601915799679556, - "learning_rate": 2.541966646529643e-05, - "loss": 0.4211, - "step": 4918 - }, - { - "epoch": 2.4149923195084484, - "grad_norm": 0.41902679270124227, - "learning_rate": 2.537866095185929e-05, - "loss": 0.4484, - "step": 4919 - }, - { - "epoch": 2.415483870967742, - "grad_norm": 0.43915333885554253, - "learning_rate": 2.5337683731608496e-05, - "loss": 0.4654, - "step": 4920 - }, - { - "epoch": 2.415975422427035, - "grad_norm": 0.4228083435989576, - "learning_rate": 2.529673482008096e-05, - "loss": 0.4482, - "step": 4921 - }, - { - "epoch": 2.4164669738863287, - "grad_norm": 0.4086327788975475, - "learning_rate": 2.5255814232802677e-05, - "loss": 0.4104, - "step": 4922 - }, - { - "epoch": 2.4169585253456223, - "grad_norm": 0.45485819020667084, - "learning_rate": 2.5214921985288998e-05, - "loss": 0.4239, - "step": 4923 - }, - { - "epoch": 2.4174500768049154, - "grad_norm": 0.4019267618333613, - "learning_rate": 2.517405809304446e-05, - "loss": 0.4363, - "step": 4924 - }, - { - "epoch": 2.417941628264209, - "grad_norm": 0.414968583317999, - "learning_rate": 2.513322257156292e-05, - "loss": 0.4244, - "step": 4925 - }, - { - "epoch": 2.418433179723502, - "grad_norm": 0.4116238153943015, - "learning_rate": 2.5092415436327432e-05, - "loss": 0.4268, - "step": 4926 - }, - { - "epoch": 2.4189247311827957, - "grad_norm": 0.4327787732980884, - "learning_rate": 2.5051636702810254e-05, - "loss": 0.3985, - "step": 4927 - }, - { - "epoch": 2.4194162826420893, - "grad_norm": 0.42628721591466123, - "learning_rate": 2.501088638647302e-05, - "loss": 0.4237, - "step": 4928 - }, - { - "epoch": 2.4199078341013824, - "grad_norm": 0.4200807107563955, - "learning_rate": 2.497016450276637e-05, - "loss": 0.4167, - "step": 4929 - }, - { - "epoch": 2.420399385560676, - "grad_norm": 0.42466717652922603, - "learning_rate": 2.4929471067130363e-05, - "loss": 0.4053, - "step": 4930 - }, - { - "epoch": 2.420890937019969, - "grad_norm": 0.43364568944423604, - "learning_rate": 2.4888806094994167e-05, - "loss": 0.4379, - "step": 4931 - }, - { - "epoch": 2.4213824884792627, - "grad_norm": 0.4607203868890819, - "learning_rate": 2.4848169601776183e-05, - "loss": 0.4309, - "step": 4932 - }, - { - "epoch": 2.421874039938556, - "grad_norm": 0.38461586947658116, - "learning_rate": 2.480756160288402e-05, - "loss": 0.3772, - "step": 4933 - }, - { - "epoch": 2.4223655913978495, - "grad_norm": 0.461284179658862, - "learning_rate": 2.4766982113714454e-05, - "loss": 0.4538, - "step": 4934 - }, - { - "epoch": 2.422857142857143, - "grad_norm": 0.4336982695994764, - "learning_rate": 2.4726431149653496e-05, - "loss": 0.3941, - "step": 4935 - }, - { - "epoch": 2.423348694316436, - "grad_norm": 0.4253973098327933, - "learning_rate": 2.468590872607628e-05, - "loss": 0.4254, - "step": 4936 - }, - { - "epoch": 2.4238402457757298, - "grad_norm": 0.46035205327708106, - "learning_rate": 2.464541485834725e-05, - "loss": 0.5255, - "step": 4937 - }, - { - "epoch": 2.424331797235023, - "grad_norm": 0.4448176931120402, - "learning_rate": 2.4604949561819803e-05, - "loss": 0.3926, - "step": 4938 - }, - { - "epoch": 2.4248233486943165, - "grad_norm": 0.43535803154222147, - "learning_rate": 2.456451285183675e-05, - "loss": 0.4444, - "step": 4939 - }, - { - "epoch": 2.4253149001536096, - "grad_norm": 0.39337543079748555, - "learning_rate": 2.4524104743729838e-05, - "loss": 0.373, - "step": 4940 - }, - { - "epoch": 2.425806451612903, - "grad_norm": 0.41262199352475254, - "learning_rate": 2.4483725252820157e-05, - "loss": 0.3906, - "step": 4941 - }, - { - "epoch": 2.426298003072197, - "grad_norm": 0.5067843197376108, - "learning_rate": 2.444337439441783e-05, - "loss": 0.4257, - "step": 4942 - }, - { - "epoch": 2.42678955453149, - "grad_norm": 0.42402781111853366, - "learning_rate": 2.440305218382216e-05, - "loss": 0.4183, - "step": 4943 - }, - { - "epoch": 2.4272811059907835, - "grad_norm": 0.3952064600763445, - "learning_rate": 2.436275863632156e-05, - "loss": 0.3582, - "step": 4944 - }, - { - "epoch": 2.4277726574500766, - "grad_norm": 0.4141091231354883, - "learning_rate": 2.432249376719369e-05, - "loss": 0.4621, - "step": 4945 - }, - { - "epoch": 2.4282642089093702, - "grad_norm": 0.4170927380952518, - "learning_rate": 2.4282257591705127e-05, - "loss": 0.418, - "step": 4946 - }, - { - "epoch": 2.428755760368664, - "grad_norm": 0.43790661875108805, - "learning_rate": 2.424205012511178e-05, - "loss": 0.4095, - "step": 4947 - }, - { - "epoch": 2.429247311827957, - "grad_norm": 0.46353406479874043, - "learning_rate": 2.420187138265856e-05, - "loss": 0.4313, - "step": 4948 - }, - { - "epoch": 2.4297388632872505, - "grad_norm": 0.4070085035504352, - "learning_rate": 2.4161721379579493e-05, - "loss": 0.4408, - "step": 4949 - }, - { - "epoch": 2.4302304147465437, - "grad_norm": 0.4550178445002215, - "learning_rate": 2.412160013109773e-05, - "loss": 0.4178, - "step": 4950 - }, - { - "epoch": 2.4307219662058372, - "grad_norm": 0.3944676701943834, - "learning_rate": 2.4081507652425537e-05, - "loss": 0.4008, - "step": 4951 - }, - { - "epoch": 2.4312135176651304, - "grad_norm": 0.3870608732317602, - "learning_rate": 2.404144395876422e-05, - "loss": 0.3754, - "step": 4952 - }, - { - "epoch": 2.431705069124424, - "grad_norm": 0.4109331386351621, - "learning_rate": 2.4001409065304182e-05, - "loss": 0.4039, - "step": 4953 - }, - { - "epoch": 2.4321966205837176, - "grad_norm": 0.38205813098274866, - "learning_rate": 2.396140298722501e-05, - "loss": 0.424, - "step": 4954 - }, - { - "epoch": 2.4326881720430107, - "grad_norm": 0.46074408777096326, - "learning_rate": 2.392142573969516e-05, - "loss": 0.4483, - "step": 4955 - }, - { - "epoch": 2.4331797235023043, - "grad_norm": 0.45619132072538193, - "learning_rate": 2.388147733787237e-05, - "loss": 0.4124, - "step": 4956 - }, - { - "epoch": 2.4336712749615974, - "grad_norm": 0.43533052904614056, - "learning_rate": 2.3841557796903323e-05, - "loss": 0.4377, - "step": 4957 - }, - { - "epoch": 2.434162826420891, - "grad_norm": 0.45549689420536416, - "learning_rate": 2.3801667131923778e-05, - "loss": 0.4424, - "step": 4958 - }, - { - "epoch": 2.434654377880184, - "grad_norm": 0.41206127919664676, - "learning_rate": 2.376180535805854e-05, - "loss": 0.4631, - "step": 4959 - }, - { - "epoch": 2.4351459293394777, - "grad_norm": 0.4358014767790808, - "learning_rate": 2.3721972490421486e-05, - "loss": 0.4162, - "step": 4960 - }, - { - "epoch": 2.4356374807987713, - "grad_norm": 0.43791299923490723, - "learning_rate": 2.3682168544115514e-05, - "loss": 0.449, - "step": 4961 - }, - { - "epoch": 2.4361290322580644, - "grad_norm": 0.40285694468060407, - "learning_rate": 2.3642393534232543e-05, - "loss": 0.436, - "step": 4962 - }, - { - "epoch": 2.436620583717358, - "grad_norm": 0.41769793235393443, - "learning_rate": 2.3602647475853567e-05, - "loss": 0.442, - "step": 4963 - }, - { - "epoch": 2.437112135176651, - "grad_norm": 0.4412574100483954, - "learning_rate": 2.3562930384048533e-05, - "loss": 0.4162, - "step": 4964 - }, - { - "epoch": 2.4376036866359447, - "grad_norm": 0.40350597371130936, - "learning_rate": 2.352324227387651e-05, - "loss": 0.4368, - "step": 4965 - }, - { - "epoch": 2.4380952380952383, - "grad_norm": 0.43548997015869667, - "learning_rate": 2.3483583160385435e-05, - "loss": 0.4225, - "step": 4966 - }, - { - "epoch": 2.4385867895545315, - "grad_norm": 0.38334190191802703, - "learning_rate": 2.34439530586124e-05, - "loss": 0.3845, - "step": 4967 - }, - { - "epoch": 2.439078341013825, - "grad_norm": 0.41171341418336127, - "learning_rate": 2.3404351983583385e-05, - "loss": 0.4179, - "step": 4968 - }, - { - "epoch": 2.439569892473118, - "grad_norm": 0.4196890334736673, - "learning_rate": 2.336477995031342e-05, - "loss": 0.4383, - "step": 4969 - }, - { - "epoch": 2.4400614439324118, - "grad_norm": 0.45342298310994344, - "learning_rate": 2.332523697380652e-05, - "loss": 0.3981, - "step": 4970 - }, - { - "epoch": 2.440552995391705, - "grad_norm": 0.41924147643358856, - "learning_rate": 2.3285723069055644e-05, - "loss": 0.4414, - "step": 4971 - }, - { - "epoch": 2.4410445468509985, - "grad_norm": 0.4494783755277245, - "learning_rate": 2.3246238251042783e-05, - "loss": 0.4593, - "step": 4972 - }, - { - "epoch": 2.4415360983102916, - "grad_norm": 0.43869194137597284, - "learning_rate": 2.3206782534738825e-05, - "loss": 0.4298, - "step": 4973 - }, - { - "epoch": 2.442027649769585, - "grad_norm": 0.4052401911466543, - "learning_rate": 2.3167355935103775e-05, - "loss": 0.4286, - "step": 4974 - }, - { - "epoch": 2.4425192012288788, - "grad_norm": 0.418322005014125, - "learning_rate": 2.3127958467086376e-05, - "loss": 0.4126, - "step": 4975 - }, - { - "epoch": 2.443010752688172, - "grad_norm": 0.4400683935885187, - "learning_rate": 2.3088590145624524e-05, - "loss": 0.4453, - "step": 4976 - }, - { - "epoch": 2.4435023041474655, - "grad_norm": 0.44435139050927097, - "learning_rate": 2.3049250985644956e-05, - "loss": 0.4142, - "step": 4977 - }, - { - "epoch": 2.4439938556067586, - "grad_norm": 0.40584398439709796, - "learning_rate": 2.3009941002063386e-05, - "loss": 0.3947, - "step": 4978 - }, - { - "epoch": 2.444485407066052, - "grad_norm": 0.41334247645487726, - "learning_rate": 2.2970660209784468e-05, - "loss": 0.4072, - "step": 4979 - }, - { - "epoch": 2.444976958525346, - "grad_norm": 0.41956178081404166, - "learning_rate": 2.2931408623701768e-05, - "loss": 0.4147, - "step": 4980 - }, - { - "epoch": 2.445468509984639, - "grad_norm": 0.4500041655607618, - "learning_rate": 2.289218625869779e-05, - "loss": 0.4524, - "step": 4981 - }, - { - "epoch": 2.4459600614439325, - "grad_norm": 0.42882187483367806, - "learning_rate": 2.2852993129643953e-05, - "loss": 0.4342, - "step": 4982 - }, - { - "epoch": 2.4464516129032257, - "grad_norm": 0.39609811982163823, - "learning_rate": 2.2813829251400654e-05, - "loss": 0.4188, - "step": 4983 - }, - { - "epoch": 2.4469431643625192, - "grad_norm": 0.39380472899425994, - "learning_rate": 2.277469463881704e-05, - "loss": 0.3837, - "step": 4984 - }, - { - "epoch": 2.447434715821813, - "grad_norm": 0.39798016820271653, - "learning_rate": 2.273558930673135e-05, - "loss": 0.4118, - "step": 4985 - }, - { - "epoch": 2.447926267281106, - "grad_norm": 0.41092609830192445, - "learning_rate": 2.269651326997062e-05, - "loss": 0.4136, - "step": 4986 - }, - { - "epoch": 2.4484178187403995, - "grad_norm": 0.4307680149634275, - "learning_rate": 2.265746654335078e-05, - "loss": 0.4662, - "step": 4987 - }, - { - "epoch": 2.4489093701996927, - "grad_norm": 0.42927589692719964, - "learning_rate": 2.2618449141676666e-05, - "loss": 0.4253, - "step": 4988 - }, - { - "epoch": 2.4494009216589863, - "grad_norm": 0.40368383133013636, - "learning_rate": 2.2579461079741982e-05, - "loss": 0.427, - "step": 4989 - }, - { - "epoch": 2.4498924731182794, - "grad_norm": 0.41741848226983125, - "learning_rate": 2.2540502372329298e-05, - "loss": 0.4367, - "step": 4990 - }, - { - "epoch": 2.450384024577573, - "grad_norm": 0.42486087819110996, - "learning_rate": 2.2501573034210154e-05, - "loss": 0.4553, - "step": 4991 - }, - { - "epoch": 2.450875576036866, - "grad_norm": 0.40558882107282096, - "learning_rate": 2.2462673080144747e-05, - "loss": 0.3978, - "step": 4992 - }, - { - "epoch": 2.4513671274961597, - "grad_norm": 0.4294212555649955, - "learning_rate": 2.2423802524882366e-05, - "loss": 0.4097, - "step": 4993 - }, - { - "epoch": 2.4518586789554533, - "grad_norm": 0.42746809790960366, - "learning_rate": 2.238496138316101e-05, - "loss": 0.4217, - "step": 4994 - }, - { - "epoch": 2.4523502304147464, - "grad_norm": 0.4060989520545923, - "learning_rate": 2.234614966970754e-05, - "loss": 0.4027, - "step": 4995 - }, - { - "epoch": 2.45284178187404, - "grad_norm": 0.45248182842262413, - "learning_rate": 2.230736739923771e-05, - "loss": 0.4473, - "step": 4996 - }, - { - "epoch": 2.453333333333333, - "grad_norm": 0.3980380260404687, - "learning_rate": 2.2268614586456062e-05, - "loss": 0.4282, - "step": 4997 - }, - { - "epoch": 2.4538248847926267, - "grad_norm": 0.4112502253597491, - "learning_rate": 2.222989124605599e-05, - "loss": 0.4188, - "step": 4998 - }, - { - "epoch": 2.4543164362519203, - "grad_norm": 0.41725701212807187, - "learning_rate": 2.2191197392719688e-05, - "loss": 0.4205, - "step": 4999 - }, - { - "epoch": 2.4548079877112134, - "grad_norm": 0.40087848677582605, - "learning_rate": 2.2152533041118275e-05, - "loss": 0.3713, - "step": 5000 - }, - { - "epoch": 2.455299539170507, - "grad_norm": 0.4170737879851873, - "learning_rate": 2.2113898205911487e-05, - "loss": 0.4103, - "step": 5001 - }, - { - "epoch": 2.4557910906298, - "grad_norm": 0.3938070235299884, - "learning_rate": 2.207529290174808e-05, - "loss": 0.3661, - "step": 5002 - }, - { - "epoch": 2.4562826420890937, - "grad_norm": 0.39246320315860966, - "learning_rate": 2.2036717143265474e-05, - "loss": 0.3649, - "step": 5003 - }, - { - "epoch": 2.4567741935483873, - "grad_norm": 0.4113108642558232, - "learning_rate": 2.1998170945089923e-05, - "loss": 0.4207, - "step": 5004 - }, - { - "epoch": 2.4572657450076805, - "grad_norm": 0.44280160720408823, - "learning_rate": 2.1959654321836497e-05, - "loss": 0.4618, - "step": 5005 - }, - { - "epoch": 2.457757296466974, - "grad_norm": 0.4350799927654252, - "learning_rate": 2.1921167288109034e-05, - "loss": 0.4383, - "step": 5006 - }, - { - "epoch": 2.458248847926267, - "grad_norm": 0.4096536750530943, - "learning_rate": 2.188270985850015e-05, - "loss": 0.426, - "step": 5007 - }, - { - "epoch": 2.4587403993855608, - "grad_norm": 0.47453302718344237, - "learning_rate": 2.18442820475912e-05, - "loss": 0.4694, - "step": 5008 - }, - { - "epoch": 2.459231950844854, - "grad_norm": 0.4447458622489744, - "learning_rate": 2.1805883869952447e-05, - "loss": 0.4309, - "step": 5009 - }, - { - "epoch": 2.4597235023041475, - "grad_norm": 0.41288963387212224, - "learning_rate": 2.1767515340142708e-05, - "loss": 0.4322, - "step": 5010 - }, - { - "epoch": 2.4602150537634406, - "grad_norm": 0.3970000208134351, - "learning_rate": 2.172917647270977e-05, - "loss": 0.3888, - "step": 5011 - }, - { - "epoch": 2.460706605222734, - "grad_norm": 0.3975775174626306, - "learning_rate": 2.1690867282189974e-05, - "loss": 0.3799, - "step": 5012 - }, - { - "epoch": 2.461198156682028, - "grad_norm": 0.43998952557391197, - "learning_rate": 2.165258778310859e-05, - "loss": 0.4556, - "step": 5013 - }, - { - "epoch": 2.461689708141321, - "grad_norm": 0.4266765045152995, - "learning_rate": 2.1614337989979527e-05, - "loss": 0.482, - "step": 5014 - }, - { - "epoch": 2.4621812596006145, - "grad_norm": 0.42075334461583025, - "learning_rate": 2.1576117917305448e-05, - "loss": 0.4073, - "step": 5015 - }, - { - "epoch": 2.4626728110599077, - "grad_norm": 0.8023675217745048, - "learning_rate": 2.1537927579577754e-05, - "loss": 0.4906, - "step": 5016 - }, - { - "epoch": 2.4631643625192012, - "grad_norm": 0.4306678896059732, - "learning_rate": 2.149976699127657e-05, - "loss": 0.4357, - "step": 5017 - }, - { - "epoch": 2.463655913978495, - "grad_norm": 0.4093322198679003, - "learning_rate": 2.1461636166870735e-05, - "loss": 0.3864, - "step": 5018 - }, - { - "epoch": 2.464147465437788, - "grad_norm": 0.4519684061952061, - "learning_rate": 2.1423535120817796e-05, - "loss": 0.438, - "step": 5019 - }, - { - "epoch": 2.4646390168970815, - "grad_norm": 0.430634076064964, - "learning_rate": 2.1385463867564093e-05, - "loss": 0.4399, - "step": 5020 - }, - { - "epoch": 2.4651305683563747, - "grad_norm": 0.3930196163060272, - "learning_rate": 2.1347422421544495e-05, - "loss": 0.3756, - "step": 5021 - }, - { - "epoch": 2.4656221198156683, - "grad_norm": 0.4506395296409869, - "learning_rate": 2.1309410797182748e-05, - "loss": 0.5219, - "step": 5022 - }, - { - "epoch": 2.466113671274962, - "grad_norm": 0.4139398596955898, - "learning_rate": 2.1271429008891207e-05, - "loss": 0.4328, - "step": 5023 - }, - { - "epoch": 2.466605222734255, - "grad_norm": 0.43207514947020714, - "learning_rate": 2.123347707107092e-05, - "loss": 0.4358, - "step": 5024 - }, - { - "epoch": 2.4670967741935486, - "grad_norm": 0.4162246279419766, - "learning_rate": 2.11955549981116e-05, - "loss": 0.424, - "step": 5025 - }, - { - "epoch": 2.4675883256528417, - "grad_norm": 0.41748950764840775, - "learning_rate": 2.1157662804391688e-05, - "loss": 0.4327, - "step": 5026 - }, - { - "epoch": 2.4680798771121353, - "grad_norm": 0.43412189455031686, - "learning_rate": 2.1119800504278243e-05, - "loss": 0.4454, - "step": 5027 - }, - { - "epoch": 2.4685714285714284, - "grad_norm": 0.4022725907507483, - "learning_rate": 2.1081968112127004e-05, - "loss": 0.4442, - "step": 5028 - }, - { - "epoch": 2.469062980030722, - "grad_norm": 0.4098521047997809, - "learning_rate": 2.1044165642282443e-05, - "loss": 0.4076, - "step": 5029 - }, - { - "epoch": 2.469554531490015, - "grad_norm": 0.4404444318518348, - "learning_rate": 2.1006393109077525e-05, - "loss": 0.4215, - "step": 5030 - }, - { - "epoch": 2.4700460829493087, - "grad_norm": 0.3985616449976521, - "learning_rate": 2.0968650526834044e-05, - "loss": 0.4096, - "step": 5031 - }, - { - "epoch": 2.4705376344086023, - "grad_norm": 0.42915733479069945, - "learning_rate": 2.0930937909862315e-05, - "loss": 0.43, - "step": 5032 - }, - { - "epoch": 2.4710291858678954, - "grad_norm": 0.42843053935322273, - "learning_rate": 2.0893255272461353e-05, - "loss": 0.4468, - "step": 5033 - }, - { - "epoch": 2.471520737327189, - "grad_norm": 0.416855017295221, - "learning_rate": 2.0855602628918747e-05, - "loss": 0.402, - "step": 5034 - }, - { - "epoch": 2.472012288786482, - "grad_norm": 0.4420058552315824, - "learning_rate": 2.0817979993510828e-05, - "loss": 0.4171, - "step": 5035 - }, - { - "epoch": 2.4725038402457757, - "grad_norm": 0.4178801901237165, - "learning_rate": 2.0780387380502375e-05, - "loss": 0.4329, - "step": 5036 - }, - { - "epoch": 2.4729953917050693, - "grad_norm": 0.411539190274043, - "learning_rate": 2.0742824804146978e-05, - "loss": 0.4401, - "step": 5037 - }, - { - "epoch": 2.4734869431643625, - "grad_norm": 0.44661260544536424, - "learning_rate": 2.070529227868665e-05, - "loss": 0.4192, - "step": 5038 - }, - { - "epoch": 2.473978494623656, - "grad_norm": 0.41256879609161523, - "learning_rate": 2.066778981835218e-05, - "loss": 0.4402, - "step": 5039 - }, - { - "epoch": 2.474470046082949, - "grad_norm": 0.42217334590518796, - "learning_rate": 2.0630317437362834e-05, - "loss": 0.4154, - "step": 5040 - }, - { - "epoch": 2.4749615975422428, - "grad_norm": 0.43713094034517735, - "learning_rate": 2.059287514992655e-05, - "loss": 0.4334, - "step": 5041 - }, - { - "epoch": 2.4754531490015363, - "grad_norm": 0.4104594007405608, - "learning_rate": 2.05554629702398e-05, - "loss": 0.4299, - "step": 5042 - }, - { - "epoch": 2.4759447004608295, - "grad_norm": 0.4178253665500614, - "learning_rate": 2.051808091248768e-05, - "loss": 0.4353, - "step": 5043 - }, - { - "epoch": 2.476436251920123, - "grad_norm": 0.3866877583558031, - "learning_rate": 2.0480728990843833e-05, - "loss": 0.4092, - "step": 5044 - }, - { - "epoch": 2.476927803379416, - "grad_norm": 0.42417960596452214, - "learning_rate": 2.044340721947049e-05, - "loss": 0.4131, - "step": 5045 - }, - { - "epoch": 2.47741935483871, - "grad_norm": 0.4258239623041988, - "learning_rate": 2.040611561251853e-05, - "loss": 0.4147, - "step": 5046 - }, - { - "epoch": 2.477910906298003, - "grad_norm": 0.3956867903698109, - "learning_rate": 2.0368854184127183e-05, - "loss": 0.386, - "step": 5047 - }, - { - "epoch": 2.4784024577572965, - "grad_norm": 0.43681721120071376, - "learning_rate": 2.033162294842449e-05, - "loss": 0.4384, - "step": 5048 - }, - { - "epoch": 2.4788940092165896, - "grad_norm": 0.4070485063242609, - "learning_rate": 2.0294421919526873e-05, - "loss": 0.4249, - "step": 5049 - }, - { - "epoch": 2.4793855606758832, - "grad_norm": 0.3981539220893719, - "learning_rate": 2.0257251111539365e-05, - "loss": 0.4029, - "step": 5050 - }, - { - "epoch": 2.479877112135177, - "grad_norm": 0.4231654410800758, - "learning_rate": 2.0220110538555536e-05, - "loss": 0.4184, - "step": 5051 - }, - { - "epoch": 2.48036866359447, - "grad_norm": 0.41648044030596626, - "learning_rate": 2.0183000214657478e-05, - "loss": 0.4409, - "step": 5052 - }, - { - "epoch": 2.4808602150537635, - "grad_norm": 0.45193386894941884, - "learning_rate": 2.0145920153915808e-05, - "loss": 0.438, - "step": 5053 - }, - { - "epoch": 2.4813517665130567, - "grad_norm": 0.4392599785531329, - "learning_rate": 2.0108870370389687e-05, - "loss": 0.4267, - "step": 5054 - }, - { - "epoch": 2.4818433179723502, - "grad_norm": 0.45213477948595066, - "learning_rate": 2.0071850878126842e-05, - "loss": 0.4416, - "step": 5055 - }, - { - "epoch": 2.482334869431644, - "grad_norm": 0.40661863824003847, - "learning_rate": 2.0034861691163374e-05, - "loss": 0.4043, - "step": 5056 - }, - { - "epoch": 2.482826420890937, - "grad_norm": 0.40849375501249074, - "learning_rate": 1.9997902823524072e-05, - "loss": 0.4292, - "step": 5057 - }, - { - "epoch": 2.4833179723502306, - "grad_norm": 0.4119444960928005, - "learning_rate": 1.996097428922209e-05, - "loss": 0.3844, - "step": 5058 - }, - { - "epoch": 2.4838095238095237, - "grad_norm": 0.41662555624285125, - "learning_rate": 1.992407610225915e-05, - "loss": 0.4067, - "step": 5059 - }, - { - "epoch": 2.4843010752688173, - "grad_norm": 0.40819255656770076, - "learning_rate": 1.9887208276625446e-05, - "loss": 0.4199, - "step": 5060 - }, - { - "epoch": 2.4847926267281104, - "grad_norm": 0.41579911013234894, - "learning_rate": 1.9850370826299658e-05, - "loss": 0.4456, - "step": 5061 - }, - { - "epoch": 2.485284178187404, - "grad_norm": 0.38752784168055804, - "learning_rate": 1.9813563765248953e-05, - "loss": 0.3971, - "step": 5062 - }, - { - "epoch": 2.4857757296466976, - "grad_norm": 0.5381697267195698, - "learning_rate": 1.9776787107428997e-05, - "loss": 0.4279, - "step": 5063 - }, - { - "epoch": 2.4862672811059907, - "grad_norm": 0.41004036314598546, - "learning_rate": 1.9740040866783892e-05, - "loss": 0.3947, - "step": 5064 - }, - { - "epoch": 2.4867588325652843, - "grad_norm": 0.4254939276978934, - "learning_rate": 1.9703325057246203e-05, - "loss": 0.4036, - "step": 5065 - }, - { - "epoch": 2.4872503840245774, - "grad_norm": 0.4488112548607266, - "learning_rate": 1.9666639692737055e-05, - "loss": 0.436, - "step": 5066 - }, - { - "epoch": 2.487741935483871, - "grad_norm": 0.3918922658874846, - "learning_rate": 1.9629984787165844e-05, - "loss": 0.4034, - "step": 5067 - }, - { - "epoch": 2.488233486943164, - "grad_norm": 0.4032443888329688, - "learning_rate": 1.959336035443061e-05, - "loss": 0.4216, - "step": 5068 - }, - { - "epoch": 2.4887250384024577, - "grad_norm": 0.42438477756228404, - "learning_rate": 1.9556766408417738e-05, - "loss": 0.3937, - "step": 5069 - }, - { - "epoch": 2.4892165898617513, - "grad_norm": 0.39924274987500186, - "learning_rate": 1.9520202963002043e-05, - "loss": 0.3551, - "step": 5070 - }, - { - "epoch": 2.4897081413210445, - "grad_norm": 0.4039276954336843, - "learning_rate": 1.948367003204683e-05, - "loss": 0.3552, - "step": 5071 - }, - { - "epoch": 2.490199692780338, - "grad_norm": 0.41615304776373657, - "learning_rate": 1.9447167629403805e-05, - "loss": 0.4487, - "step": 5072 - }, - { - "epoch": 2.490691244239631, - "grad_norm": 0.40021005414883415, - "learning_rate": 1.9410695768913077e-05, - "loss": 0.3788, - "step": 5073 - }, - { - "epoch": 2.4911827956989248, - "grad_norm": 0.444686626726618, - "learning_rate": 1.9374254464403207e-05, - "loss": 0.4463, - "step": 5074 - }, - { - "epoch": 2.4916743471582183, - "grad_norm": 0.42130227798357583, - "learning_rate": 1.9337843729691217e-05, - "loss": 0.4238, - "step": 5075 - }, - { - "epoch": 2.4921658986175115, - "grad_norm": 0.3853644670730329, - "learning_rate": 1.930146357858238e-05, - "loss": 0.379, - "step": 5076 - }, - { - "epoch": 2.492657450076805, - "grad_norm": 0.42086831424076065, - "learning_rate": 1.926511402487057e-05, - "loss": 0.4184, - "step": 5077 - }, - { - "epoch": 2.493149001536098, - "grad_norm": 0.4214061439423063, - "learning_rate": 1.9228795082337926e-05, - "loss": 0.433, - "step": 5078 - }, - { - "epoch": 2.493640552995392, - "grad_norm": 0.3994475477371375, - "learning_rate": 1.9192506764755036e-05, - "loss": 0.4115, - "step": 5079 - }, - { - "epoch": 2.494132104454685, - "grad_norm": 0.42562824089908347, - "learning_rate": 1.9156249085880818e-05, - "loss": 0.4017, - "step": 5080 - }, - { - "epoch": 2.4946236559139785, - "grad_norm": 0.39344823805678997, - "learning_rate": 1.9120022059462706e-05, - "loss": 0.4285, - "step": 5081 - }, - { - "epoch": 2.495115207373272, - "grad_norm": 0.39159133762471077, - "learning_rate": 1.9083825699236323e-05, - "loss": 0.3667, - "step": 5082 - }, - { - "epoch": 2.495606758832565, - "grad_norm": 0.42521385079766616, - "learning_rate": 1.9047660018925828e-05, - "loss": 0.4225, - "step": 5083 - }, - { - "epoch": 2.496098310291859, - "grad_norm": 0.4159750219904371, - "learning_rate": 1.901152503224366e-05, - "loss": 0.4106, - "step": 5084 - }, - { - "epoch": 2.496589861751152, - "grad_norm": 0.38528449506553747, - "learning_rate": 1.897542075289066e-05, - "loss": 0.3769, - "step": 5085 - }, - { - "epoch": 2.496589861751152, - "eval_loss": 0.8140223026275635, - "eval_runtime": 6658.8517, - "eval_samples_per_second": 4.28, - "eval_steps_per_second": 2.14, - "step": 5085 - }, - { - "epoch": 2.4970814132104455, - "grad_norm": 0.4426154703832519, - "learning_rate": 1.893934719455599e-05, - "loss": 0.4107, - "step": 5086 - }, - { - "epoch": 2.4975729646697387, - "grad_norm": 0.39328617200473986, - "learning_rate": 1.8903304370917208e-05, - "loss": 0.422, - "step": 5087 - }, - { - "epoch": 2.4980645161290322, - "grad_norm": 0.39941817138053703, - "learning_rate": 1.886729229564016e-05, - "loss": 0.3807, - "step": 5088 - }, - { - "epoch": 2.498556067588326, - "grad_norm": 0.4038335550501196, - "learning_rate": 1.8831310982379103e-05, - "loss": 0.3992, - "step": 5089 - }, - { - "epoch": 2.499047619047619, - "grad_norm": 0.401618039340355, - "learning_rate": 1.8795360444776577e-05, - "loss": 0.4176, - "step": 5090 - }, - { - "epoch": 2.4995391705069125, - "grad_norm": 0.427198751051607, - "learning_rate": 1.875944069646346e-05, - "loss": 0.4183, - "step": 5091 - }, - { - "epoch": 2.5000307219662057, - "grad_norm": 0.40657213362173045, - "learning_rate": 1.872355175105902e-05, - "loss": 0.4765, - "step": 5092 - }, - { - "epoch": 2.5005222734254993, - "grad_norm": 0.40959209087807935, - "learning_rate": 1.8687693622170723e-05, - "loss": 0.4245, - "step": 5093 - }, - { - "epoch": 2.501013824884793, - "grad_norm": 0.4528650314771083, - "learning_rate": 1.8651866323394473e-05, - "loss": 0.4205, - "step": 5094 - }, - { - "epoch": 2.501505376344086, - "grad_norm": 0.45053998340422075, - "learning_rate": 1.861606986831441e-05, - "loss": 0.4499, - "step": 5095 - }, - { - "epoch": 2.5019969278033796, - "grad_norm": 0.3973370539956065, - "learning_rate": 1.8580304270503024e-05, - "loss": 0.3997, - "step": 5096 - }, - { - "epoch": 2.5024884792626727, - "grad_norm": 0.387607528425279, - "learning_rate": 1.8544569543521063e-05, - "loss": 0.3998, - "step": 5097 - }, - { - "epoch": 2.5029800307219663, - "grad_norm": 0.4430245555765779, - "learning_rate": 1.8508865700917598e-05, - "loss": 0.4339, - "step": 5098 - }, - { - "epoch": 2.50347158218126, - "grad_norm": 0.4334586784959611, - "learning_rate": 1.8473192756229985e-05, - "loss": 0.4558, - "step": 5099 - }, - { - "epoch": 2.503963133640553, - "grad_norm": 0.42461552764421445, - "learning_rate": 1.843755072298383e-05, - "loss": 0.3974, - "step": 5100 - }, - { - "epoch": 2.504454685099846, - "grad_norm": 0.42644282145434476, - "learning_rate": 1.8401939614693142e-05, - "loss": 0.4605, - "step": 5101 - }, - { - "epoch": 2.5049462365591397, - "grad_norm": 0.4014263088473555, - "learning_rate": 1.836635944485999e-05, - "loss": 0.3903, - "step": 5102 - }, - { - "epoch": 2.5054377880184333, - "grad_norm": 0.4595881033592045, - "learning_rate": 1.8330810226974928e-05, - "loss": 0.4364, - "step": 5103 - }, - { - "epoch": 2.5059293394777264, - "grad_norm": 0.46305780211307357, - "learning_rate": 1.829529197451666e-05, - "loss": 0.4295, - "step": 5104 - }, - { - "epoch": 2.50642089093702, - "grad_norm": 0.46629544746215856, - "learning_rate": 1.8259804700952166e-05, - "loss": 0.4377, - "step": 5105 - }, - { - "epoch": 2.506912442396313, - "grad_norm": 0.4095067468753335, - "learning_rate": 1.8224348419736693e-05, - "loss": 0.4142, - "step": 5106 - }, - { - "epoch": 2.5074039938556067, - "grad_norm": 0.4115643732643572, - "learning_rate": 1.8188923144313707e-05, - "loss": 0.4298, - "step": 5107 - }, - { - "epoch": 2.5078955453149003, - "grad_norm": 0.41417800237491703, - "learning_rate": 1.815352888811498e-05, - "loss": 0.4053, - "step": 5108 - }, - { - "epoch": 2.5083870967741935, - "grad_norm": 0.4420769583010539, - "learning_rate": 1.811816566456045e-05, - "loss": 0.4688, - "step": 5109 - }, - { - "epoch": 2.508878648233487, - "grad_norm": 0.4145580955031008, - "learning_rate": 1.8082833487058337e-05, - "loss": 0.4236, - "step": 5110 - }, - { - "epoch": 2.50937019969278, - "grad_norm": 0.4149205274435076, - "learning_rate": 1.804753236900505e-05, - "loss": 0.3725, - "step": 5111 - }, - { - "epoch": 2.5098617511520738, - "grad_norm": 0.4424062922484215, - "learning_rate": 1.8012262323785323e-05, - "loss": 0.4401, - "step": 5112 - }, - { - "epoch": 2.5103533026113674, - "grad_norm": 0.4014599813892684, - "learning_rate": 1.7977023364771937e-05, - "loss": 0.39, - "step": 5113 - }, - { - "epoch": 2.5108448540706605, - "grad_norm": 0.4065142469082297, - "learning_rate": 1.7941815505326042e-05, - "loss": 0.3848, - "step": 5114 - }, - { - "epoch": 2.511336405529954, - "grad_norm": 0.42958077209054774, - "learning_rate": 1.790663875879692e-05, - "loss": 0.4489, - "step": 5115 - }, - { - "epoch": 2.511827956989247, - "grad_norm": 0.4434811452856223, - "learning_rate": 1.787149313852209e-05, - "loss": 0.4297, - "step": 5116 - }, - { - "epoch": 2.512319508448541, - "grad_norm": 0.4195995036202588, - "learning_rate": 1.7836378657827224e-05, - "loss": 0.4092, - "step": 5117 - }, - { - "epoch": 2.5128110599078344, - "grad_norm": 0.4243217501066077, - "learning_rate": 1.7801295330026223e-05, - "loss": 0.4727, - "step": 5118 - }, - { - "epoch": 2.5133026113671275, - "grad_norm": 0.4300611395351345, - "learning_rate": 1.7766243168421182e-05, - "loss": 0.4136, - "step": 5119 - }, - { - "epoch": 2.5137941628264207, - "grad_norm": 0.3823968322916278, - "learning_rate": 1.7731222186302323e-05, - "loss": 0.3961, - "step": 5120 - }, - { - "epoch": 2.5142857142857142, - "grad_norm": 0.40991509197708587, - "learning_rate": 1.7696232396948177e-05, - "loss": 0.4273, - "step": 5121 - }, - { - "epoch": 2.514777265745008, - "grad_norm": 0.4091697241015664, - "learning_rate": 1.7661273813625256e-05, - "loss": 0.4159, - "step": 5122 - }, - { - "epoch": 2.515268817204301, - "grad_norm": 0.43010247842834093, - "learning_rate": 1.7626346449588416e-05, - "loss": 0.4428, - "step": 5123 - }, - { - "epoch": 2.5157603686635945, - "grad_norm": 0.3754503027673675, - "learning_rate": 1.7591450318080573e-05, - "loss": 0.3731, - "step": 5124 - }, - { - "epoch": 2.5162519201228877, - "grad_norm": 0.42428740891599254, - "learning_rate": 1.755658543233285e-05, - "loss": 0.4144, - "step": 5125 - }, - { - "epoch": 2.5167434715821813, - "grad_norm": 0.43077476712147833, - "learning_rate": 1.752175180556447e-05, - "loss": 0.4206, - "step": 5126 - }, - { - "epoch": 2.517235023041475, - "grad_norm": 0.38311260300187444, - "learning_rate": 1.7486949450982904e-05, - "loss": 0.4567, - "step": 5127 - }, - { - "epoch": 2.517726574500768, - "grad_norm": 0.3701679906246134, - "learning_rate": 1.7452178381783613e-05, - "loss": 0.3704, - "step": 5128 - }, - { - "epoch": 2.5182181259600616, - "grad_norm": 0.4388550280755766, - "learning_rate": 1.741743861115035e-05, - "loss": 0.4535, - "step": 5129 - }, - { - "epoch": 2.5187096774193547, - "grad_norm": 0.44016639905895155, - "learning_rate": 1.7382730152254922e-05, - "loss": 0.3892, - "step": 5130 - }, - { - "epoch": 2.5192012288786483, - "grad_norm": 0.40077274726571227, - "learning_rate": 1.734805301825726e-05, - "loss": 0.4011, - "step": 5131 - }, - { - "epoch": 2.519692780337942, - "grad_norm": 0.4216679186261871, - "learning_rate": 1.7313407222305456e-05, - "loss": 0.4013, - "step": 5132 - }, - { - "epoch": 2.520184331797235, - "grad_norm": 0.4281045163434053, - "learning_rate": 1.727879277753568e-05, - "loss": 0.4058, - "step": 5133 - }, - { - "epoch": 2.5206758832565286, - "grad_norm": 0.42294901793632517, - "learning_rate": 1.7244209697072233e-05, - "loss": 0.436, - "step": 5134 - }, - { - "epoch": 2.5211674347158217, - "grad_norm": 0.41768516071747835, - "learning_rate": 1.7209657994027527e-05, - "loss": 0.3979, - "step": 5135 - }, - { - "epoch": 2.5216589861751153, - "grad_norm": 0.42351779230794256, - "learning_rate": 1.7175137681502085e-05, - "loss": 0.3784, - "step": 5136 - }, - { - "epoch": 2.5221505376344084, - "grad_norm": 0.41401189736057803, - "learning_rate": 1.714064877258449e-05, - "loss": 0.3834, - "step": 5137 - }, - { - "epoch": 2.522642089093702, - "grad_norm": 0.4239662779891976, - "learning_rate": 1.71061912803515e-05, - "loss": 0.4137, - "step": 5138 - }, - { - "epoch": 2.523133640552995, - "grad_norm": 0.4493118796360464, - "learning_rate": 1.707176521786784e-05, - "loss": 0.4286, - "step": 5139 - }, - { - "epoch": 2.5236251920122887, - "grad_norm": 0.42753180575411415, - "learning_rate": 1.703737059818643e-05, - "loss": 0.4124, - "step": 5140 - }, - { - "epoch": 2.5241167434715823, - "grad_norm": 0.41350002774613603, - "learning_rate": 1.7003007434348218e-05, - "loss": 0.4259, - "step": 5141 - }, - { - "epoch": 2.5246082949308755, - "grad_norm": 0.4728153585992248, - "learning_rate": 1.6968675739382213e-05, - "loss": 0.4309, - "step": 5142 - }, - { - "epoch": 2.525099846390169, - "grad_norm": 0.4620370104264429, - "learning_rate": 1.6934375526305524e-05, - "loss": 0.473, - "step": 5143 - }, - { - "epoch": 2.525591397849462, - "grad_norm": 0.39439966311719304, - "learning_rate": 1.6900106808123307e-05, - "loss": 0.3884, - "step": 5144 - }, - { - "epoch": 2.5260829493087558, - "grad_norm": 0.42179306170473924, - "learning_rate": 1.6865869597828765e-05, - "loss": 0.4374, - "step": 5145 - }, - { - "epoch": 2.5265745007680493, - "grad_norm": 0.4164924671983075, - "learning_rate": 1.6831663908403173e-05, - "loss": 0.4553, - "step": 5146 - }, - { - "epoch": 2.5270660522273425, - "grad_norm": 0.4184462447707974, - "learning_rate": 1.6797489752815886e-05, - "loss": 0.4085, - "step": 5147 - }, - { - "epoch": 2.527557603686636, - "grad_norm": 0.43716333565598975, - "learning_rate": 1.6763347144024198e-05, - "loss": 0.4415, - "step": 5148 - }, - { - "epoch": 2.528049155145929, - "grad_norm": 0.420920991688011, - "learning_rate": 1.6729236094973577e-05, - "loss": 0.3847, - "step": 5149 - }, - { - "epoch": 2.528540706605223, - "grad_norm": 0.37705013432666884, - "learning_rate": 1.669515661859743e-05, - "loss": 0.3617, - "step": 5150 - }, - { - "epoch": 2.5290322580645164, - "grad_norm": 0.3941126515595328, - "learning_rate": 1.666110872781722e-05, - "loss": 0.4069, - "step": 5151 - }, - { - "epoch": 2.5295238095238095, - "grad_norm": 0.42429417080600657, - "learning_rate": 1.6627092435542448e-05, - "loss": 0.4088, - "step": 5152 - }, - { - "epoch": 2.5300153609831026, - "grad_norm": 0.4852191182017364, - "learning_rate": 1.65931077546706e-05, - "loss": 0.4312, - "step": 5153 - }, - { - "epoch": 2.5305069124423962, - "grad_norm": 0.4146656415487168, - "learning_rate": 1.655915469808722e-05, - "loss": 0.4109, - "step": 5154 - }, - { - "epoch": 2.53099846390169, - "grad_norm": 0.42159691766921686, - "learning_rate": 1.6525233278665795e-05, - "loss": 0.3944, - "step": 5155 - }, - { - "epoch": 2.531490015360983, - "grad_norm": 0.4270272139299303, - "learning_rate": 1.6491343509267955e-05, - "loss": 0.3809, - "step": 5156 - }, - { - "epoch": 2.5319815668202765, - "grad_norm": 0.41857493800827444, - "learning_rate": 1.645748540274311e-05, - "loss": 0.4207, - "step": 5157 - }, - { - "epoch": 2.5324731182795697, - "grad_norm": 0.4234857212342732, - "learning_rate": 1.6423658971928923e-05, - "loss": 0.4273, - "step": 5158 - }, - { - "epoch": 2.5329646697388633, - "grad_norm": 0.39772500065506156, - "learning_rate": 1.6389864229650797e-05, - "loss": 0.4083, - "step": 5159 - }, - { - "epoch": 2.533456221198157, - "grad_norm": 0.4009462548155493, - "learning_rate": 1.6356101188722305e-05, - "loss": 0.3938, - "step": 5160 - }, - { - "epoch": 2.53394777265745, - "grad_norm": 0.4381124873214228, - "learning_rate": 1.6322369861944918e-05, - "loss": 0.4384, - "step": 5161 - }, - { - "epoch": 2.5344393241167436, - "grad_norm": 0.4166788073611714, - "learning_rate": 1.6288670262108108e-05, - "loss": 0.416, - "step": 5162 - }, - { - "epoch": 2.5349308755760367, - "grad_norm": 0.3943217111666468, - "learning_rate": 1.6255002401989282e-05, - "loss": 0.4026, - "step": 5163 - }, - { - "epoch": 2.5354224270353303, - "grad_norm": 0.4152791886301715, - "learning_rate": 1.6221366294353857e-05, - "loss": 0.4231, - "step": 5164 - }, - { - "epoch": 2.535913978494624, - "grad_norm": 0.38759262098189234, - "learning_rate": 1.6187761951955184e-05, - "loss": 0.4072, - "step": 5165 - }, - { - "epoch": 2.536405529953917, - "grad_norm": 0.43096175102999695, - "learning_rate": 1.6154189387534546e-05, - "loss": 0.4025, - "step": 5166 - }, - { - "epoch": 2.5368970814132106, - "grad_norm": 0.3877248213476849, - "learning_rate": 1.6120648613821298e-05, - "loss": 0.4014, - "step": 5167 - }, - { - "epoch": 2.5373886328725037, - "grad_norm": 0.4108620546866996, - "learning_rate": 1.6087139643532533e-05, - "loss": 0.4094, - "step": 5168 - }, - { - "epoch": 2.5378801843317973, - "grad_norm": 0.425865320899337, - "learning_rate": 1.60536624893735e-05, - "loss": 0.443, - "step": 5169 - }, - { - "epoch": 2.538371735791091, - "grad_norm": 0.4155784774856912, - "learning_rate": 1.6020217164037244e-05, - "loss": 0.4039, - "step": 5170 - }, - { - "epoch": 2.538863287250384, - "grad_norm": 0.4376177289011271, - "learning_rate": 1.59868036802048e-05, - "loss": 0.4841, - "step": 5171 - }, - { - "epoch": 2.539354838709677, - "grad_norm": 0.45147907378585744, - "learning_rate": 1.5953422050545098e-05, - "loss": 0.4361, - "step": 5172 - }, - { - "epoch": 2.5398463901689707, - "grad_norm": 0.4453501546122892, - "learning_rate": 1.592007228771506e-05, - "loss": 0.4474, - "step": 5173 - }, - { - "epoch": 2.5403379416282643, - "grad_norm": 0.4429132250084318, - "learning_rate": 1.5886754404359393e-05, - "loss": 0.4276, - "step": 5174 - }, - { - "epoch": 2.5408294930875575, - "grad_norm": 0.4201883815292871, - "learning_rate": 1.5853468413110873e-05, - "loss": 0.3932, - "step": 5175 - }, - { - "epoch": 2.541321044546851, - "grad_norm": 0.42536375263496035, - "learning_rate": 1.5820214326590066e-05, - "loss": 0.4122, - "step": 5176 - }, - { - "epoch": 2.541812596006144, - "grad_norm": 0.4376619698641515, - "learning_rate": 1.5786992157405513e-05, - "loss": 0.4108, - "step": 5177 - }, - { - "epoch": 2.5423041474654378, - "grad_norm": 0.4311526090004511, - "learning_rate": 1.575380191815361e-05, - "loss": 0.4069, - "step": 5178 - }, - { - "epoch": 2.5427956989247313, - "grad_norm": 0.46529232878365767, - "learning_rate": 1.572064362141865e-05, - "loss": 0.4275, - "step": 5179 - }, - { - "epoch": 2.5432872503840245, - "grad_norm": 0.4234866701254169, - "learning_rate": 1.5687517279772846e-05, - "loss": 0.4068, - "step": 5180 - }, - { - "epoch": 2.543778801843318, - "grad_norm": 0.43433667062631454, - "learning_rate": 1.5654422905776235e-05, - "loss": 0.4171, - "step": 5181 - }, - { - "epoch": 2.544270353302611, - "grad_norm": 0.40308818593140394, - "learning_rate": 1.5621360511976857e-05, - "loss": 0.4259, - "step": 5182 - }, - { - "epoch": 2.544761904761905, - "grad_norm": 0.42058027825717587, - "learning_rate": 1.5588330110910443e-05, - "loss": 0.4171, - "step": 5183 - }, - { - "epoch": 2.5452534562211984, - "grad_norm": 0.41003745626655774, - "learning_rate": 1.5555331715100784e-05, - "loss": 0.4023, - "step": 5184 - }, - { - "epoch": 2.5457450076804915, - "grad_norm": 0.42199906371832907, - "learning_rate": 1.5522365337059363e-05, - "loss": 0.4218, - "step": 5185 - }, - { - "epoch": 2.546236559139785, - "grad_norm": 0.43297104336064485, - "learning_rate": 1.548943098928566e-05, - "loss": 0.4525, - "step": 5186 - }, - { - "epoch": 2.546728110599078, - "grad_norm": 0.4168197407504609, - "learning_rate": 1.5456528684266937e-05, - "loss": 0.4019, - "step": 5187 - }, - { - "epoch": 2.547219662058372, - "grad_norm": 0.418580175053328, - "learning_rate": 1.5423658434478338e-05, - "loss": 0.4249, - "step": 5188 - }, - { - "epoch": 2.5477112135176654, - "grad_norm": 0.4036913461879293, - "learning_rate": 1.539082025238283e-05, - "loss": 0.4255, - "step": 5189 - }, - { - "epoch": 2.5482027649769585, - "grad_norm": 0.43030777175460977, - "learning_rate": 1.535801415043123e-05, - "loss": 0.4126, - "step": 5190 - }, - { - "epoch": 2.5486943164362517, - "grad_norm": 0.42928372693046496, - "learning_rate": 1.5325240141062204e-05, - "loss": 0.3849, - "step": 5191 - }, - { - "epoch": 2.5491858678955452, - "grad_norm": 0.4127866494129654, - "learning_rate": 1.5292498236702213e-05, - "loss": 0.4488, - "step": 5192 - }, - { - "epoch": 2.549677419354839, - "grad_norm": 0.4273024908598793, - "learning_rate": 1.525978844976563e-05, - "loss": 0.4333, - "step": 5193 - }, - { - "epoch": 2.550168970814132, - "grad_norm": 0.40558111778023015, - "learning_rate": 1.522711079265452e-05, - "loss": 0.4198, - "step": 5194 - }, - { - "epoch": 2.5506605222734255, - "grad_norm": 0.40250482152085676, - "learning_rate": 1.5194465277758884e-05, - "loss": 0.4353, - "step": 5195 - }, - { - "epoch": 2.5511520737327187, - "grad_norm": 0.42983730915198437, - "learning_rate": 1.5161851917456494e-05, - "loss": 0.4107, - "step": 5196 - }, - { - "epoch": 2.5516436251920123, - "grad_norm": 0.4216749696468753, - "learning_rate": 1.5129270724112909e-05, - "loss": 0.4098, - "step": 5197 - }, - { - "epoch": 2.552135176651306, - "grad_norm": 0.4101392008164522, - "learning_rate": 1.5096721710081507e-05, - "loss": 0.4181, - "step": 5198 - }, - { - "epoch": 2.552626728110599, - "grad_norm": 0.4289937808734976, - "learning_rate": 1.5064204887703481e-05, - "loss": 0.4545, - "step": 5199 - }, - { - "epoch": 2.5531182795698926, - "grad_norm": 0.4081670093298884, - "learning_rate": 1.5031720269307792e-05, - "loss": 0.4087, - "step": 5200 - }, - { - "epoch": 2.5536098310291857, - "grad_norm": 0.43613123936449977, - "learning_rate": 1.4999267867211175e-05, - "loss": 0.4203, - "step": 5201 - }, - { - "epoch": 2.5541013824884793, - "grad_norm": 0.4234365538141309, - "learning_rate": 1.4966847693718279e-05, - "loss": 0.3969, - "step": 5202 - }, - { - "epoch": 2.554592933947773, - "grad_norm": 0.41756166466185746, - "learning_rate": 1.49344597611213e-05, - "loss": 0.4127, - "step": 5203 - }, - { - "epoch": 2.555084485407066, - "grad_norm": 0.4208935025456197, - "learning_rate": 1.4902104081700441e-05, - "loss": 0.3804, - "step": 5204 - }, - { - "epoch": 2.5555760368663596, - "grad_norm": 0.44268488906719217, - "learning_rate": 1.4869780667723531e-05, - "loss": 0.3979, - "step": 5205 - }, - { - "epoch": 2.5560675883256527, - "grad_norm": 0.4524267489306665, - "learning_rate": 1.4837489531446237e-05, - "loss": 0.4605, - "step": 5206 - }, - { - "epoch": 2.5565591397849463, - "grad_norm": 0.4329443386600355, - "learning_rate": 1.4805230685111937e-05, - "loss": 0.4415, - "step": 5207 - }, - { - "epoch": 2.55705069124424, - "grad_norm": 0.43016940710358653, - "learning_rate": 1.4773004140951807e-05, - "loss": 0.4188, - "step": 5208 - }, - { - "epoch": 2.557542242703533, - "grad_norm": 0.4352897381845784, - "learning_rate": 1.474080991118476e-05, - "loss": 0.4332, - "step": 5209 - }, - { - "epoch": 2.558033794162826, - "grad_norm": 0.3924524832111949, - "learning_rate": 1.4708648008017444e-05, - "loss": 0.4202, - "step": 5210 - }, - { - "epoch": 2.5585253456221198, - "grad_norm": 0.4198360475188516, - "learning_rate": 1.4676518443644282e-05, - "loss": 0.4123, - "step": 5211 - }, - { - "epoch": 2.5590168970814133, - "grad_norm": 0.4131811141251363, - "learning_rate": 1.4644421230247374e-05, - "loss": 0.4319, - "step": 5212 - }, - { - "epoch": 2.5595084485407065, - "grad_norm": 0.4197501650993057, - "learning_rate": 1.4612356379996672e-05, - "loss": 0.3766, - "step": 5213 - }, - { - "epoch": 2.56, - "grad_norm": 0.41959695002969705, - "learning_rate": 1.4580323905049686e-05, - "loss": 0.4737, - "step": 5214 - }, - { - "epoch": 2.560491551459293, - "grad_norm": 0.3871973737477092, - "learning_rate": 1.4548323817551824e-05, - "loss": 0.3772, - "step": 5215 - }, - { - "epoch": 2.5609831029185868, - "grad_norm": 0.3806914992210328, - "learning_rate": 1.451635612963611e-05, - "loss": 0.3624, - "step": 5216 - }, - { - "epoch": 2.5614746543778804, - "grad_norm": 0.4299269379033961, - "learning_rate": 1.4484420853423297e-05, - "loss": 0.4356, - "step": 5217 - }, - { - "epoch": 2.5619662058371735, - "grad_norm": 0.40142642244715, - "learning_rate": 1.4452518001021864e-05, - "loss": 0.376, - "step": 5218 - }, - { - "epoch": 2.562457757296467, - "grad_norm": 0.42096708275060635, - "learning_rate": 1.4420647584528036e-05, - "loss": 0.4344, - "step": 5219 - }, - { - "epoch": 2.56294930875576, - "grad_norm": 0.4935593284023261, - "learning_rate": 1.4388809616025622e-05, - "loss": 0.441, - "step": 5220 - }, - { - "epoch": 2.563440860215054, - "grad_norm": 0.4501725033871413, - "learning_rate": 1.4357004107586258e-05, - "loss": 0.3772, - "step": 5221 - }, - { - "epoch": 2.5639324116743474, - "grad_norm": 0.4230627057043371, - "learning_rate": 1.4325231071269218e-05, - "loss": 0.442, - "step": 5222 - }, - { - "epoch": 2.5644239631336405, - "grad_norm": 0.41904655321415124, - "learning_rate": 1.4293490519121434e-05, - "loss": 0.4264, - "step": 5223 - }, - { - "epoch": 2.564915514592934, - "grad_norm": 0.4198199995562333, - "learning_rate": 1.4261782463177587e-05, - "loss": 0.4176, - "step": 5224 - }, - { - "epoch": 2.5654070660522272, - "grad_norm": 0.43115357569969237, - "learning_rate": 1.4230106915459963e-05, - "loss": 0.4152, - "step": 5225 - }, - { - "epoch": 2.565898617511521, - "grad_norm": 0.41160266998534767, - "learning_rate": 1.4198463887978596e-05, - "loss": 0.3711, - "step": 5226 - }, - { - "epoch": 2.5663901689708144, - "grad_norm": 0.4158553892016337, - "learning_rate": 1.4166853392731105e-05, - "loss": 0.4141, - "step": 5227 - }, - { - "epoch": 2.5668817204301075, - "grad_norm": 0.3973635607235415, - "learning_rate": 1.41352754417029e-05, - "loss": 0.3913, - "step": 5228 - }, - { - "epoch": 2.5673732718894007, - "grad_norm": 0.41063752774565754, - "learning_rate": 1.4103730046866893e-05, - "loss": 0.4164, - "step": 5229 - }, - { - "epoch": 2.5678648233486943, - "grad_norm": 0.42980867328518335, - "learning_rate": 1.4072217220183793e-05, - "loss": 0.4241, - "step": 5230 - }, - { - "epoch": 2.568356374807988, - "grad_norm": 0.42278273610439054, - "learning_rate": 1.4040736973601887e-05, - "loss": 0.436, - "step": 5231 - }, - { - "epoch": 2.568847926267281, - "grad_norm": 0.4017290939967276, - "learning_rate": 1.4009289319057118e-05, - "loss": 0.4332, - "step": 5232 - }, - { - "epoch": 2.5693394777265746, - "grad_norm": 0.37897490577983095, - "learning_rate": 1.3977874268473069e-05, - "loss": 0.3911, - "step": 5233 - }, - { - "epoch": 2.5698310291858677, - "grad_norm": 0.437632664900981, - "learning_rate": 1.3946491833760988e-05, - "loss": 0.4721, - "step": 5234 - }, - { - "epoch": 2.5703225806451613, - "grad_norm": 0.4281669222895695, - "learning_rate": 1.3915142026819716e-05, - "loss": 0.4556, - "step": 5235 - }, - { - "epoch": 2.570814132104455, - "grad_norm": 0.47070890803857696, - "learning_rate": 1.3883824859535765e-05, - "loss": 0.4295, - "step": 5236 - }, - { - "epoch": 2.571305683563748, - "grad_norm": 0.43931489919317734, - "learning_rate": 1.3852540343783249e-05, - "loss": 0.4012, - "step": 5237 - }, - { - "epoch": 2.5717972350230416, - "grad_norm": 0.4031329712999231, - "learning_rate": 1.3821288491423867e-05, - "loss": 0.4152, - "step": 5238 - }, - { - "epoch": 2.5722887864823347, - "grad_norm": 0.4160293874214144, - "learning_rate": 1.3790069314307053e-05, - "loss": 0.4616, - "step": 5239 - }, - { - "epoch": 2.5727803379416283, - "grad_norm": 0.4516909753487584, - "learning_rate": 1.3758882824269659e-05, - "loss": 0.4716, - "step": 5240 - }, - { - "epoch": 2.573271889400922, - "grad_norm": 0.4286934490832245, - "learning_rate": 1.3727729033136349e-05, - "loss": 0.478, - "step": 5241 - }, - { - "epoch": 2.573763440860215, - "grad_norm": 0.4624019227545422, - "learning_rate": 1.3696607952719253e-05, - "loss": 0.4237, - "step": 5242 - }, - { - "epoch": 2.5742549923195086, - "grad_norm": 0.45014270805912443, - "learning_rate": 1.3665519594818155e-05, - "loss": 0.4216, - "step": 5243 - }, - { - "epoch": 2.5747465437788017, - "grad_norm": 0.4295199509350521, - "learning_rate": 1.3634463971220402e-05, - "loss": 0.3833, - "step": 5244 - }, - { - "epoch": 2.5752380952380953, - "grad_norm": 0.41309087112776827, - "learning_rate": 1.3603441093700941e-05, - "loss": 0.3903, - "step": 5245 - }, - { - "epoch": 2.575729646697389, - "grad_norm": 0.4132812626586564, - "learning_rate": 1.3572450974022321e-05, - "loss": 0.425, - "step": 5246 - }, - { - "epoch": 2.576221198156682, - "grad_norm": 0.37905119216522815, - "learning_rate": 1.354149362393462e-05, - "loss": 0.3938, - "step": 5247 - }, - { - "epoch": 2.576712749615975, - "grad_norm": 0.4044672010327601, - "learning_rate": 1.3510569055175603e-05, - "loss": 0.397, - "step": 5248 - }, - { - "epoch": 2.5772043010752688, - "grad_norm": 0.41093249446353486, - "learning_rate": 1.3479677279470448e-05, - "loss": 0.4425, - "step": 5249 - }, - { - "epoch": 2.5776958525345623, - "grad_norm": 0.400832755999386, - "learning_rate": 1.3448818308532029e-05, - "loss": 0.4071, - "step": 5250 - }, - { - "epoch": 2.5781874039938555, - "grad_norm": 0.4267841036431294, - "learning_rate": 1.3417992154060721e-05, - "loss": 0.4117, - "step": 5251 - }, - { - "epoch": 2.578678955453149, - "grad_norm": 0.41382688696643344, - "learning_rate": 1.3387198827744473e-05, - "loss": 0.3933, - "step": 5252 - }, - { - "epoch": 2.579170506912442, - "grad_norm": 0.45030195704791587, - "learning_rate": 1.335643834125876e-05, - "loss": 0.3901, - "step": 5253 - }, - { - "epoch": 2.579662058371736, - "grad_norm": 0.41456953369304156, - "learning_rate": 1.3325710706266692e-05, - "loss": 0.4414, - "step": 5254 - }, - { - "epoch": 2.5801536098310294, - "grad_norm": 0.4206699474990611, - "learning_rate": 1.3295015934418787e-05, - "loss": 0.412, - "step": 5255 - }, - { - "epoch": 2.5806451612903225, - "grad_norm": 0.4020092415941204, - "learning_rate": 1.326435403735321e-05, - "loss": 0.4212, - "step": 5256 - }, - { - "epoch": 2.581136712749616, - "grad_norm": 0.42552309596455123, - "learning_rate": 1.323372502669562e-05, - "loss": 0.4346, - "step": 5257 - }, - { - "epoch": 2.5816282642089092, - "grad_norm": 0.45233053570079085, - "learning_rate": 1.3203128914059194e-05, - "loss": 0.3929, - "step": 5258 - }, - { - "epoch": 2.582119815668203, - "grad_norm": 0.4038714180820211, - "learning_rate": 1.3172565711044727e-05, - "loss": 0.4142, - "step": 5259 - }, - { - "epoch": 2.5826113671274964, - "grad_norm": 0.4286245868920105, - "learning_rate": 1.3142035429240373e-05, - "loss": 0.4394, - "step": 5260 - }, - { - "epoch": 2.5831029185867895, - "grad_norm": 0.3947956652013941, - "learning_rate": 1.3111538080221952e-05, - "loss": 0.3856, - "step": 5261 - }, - { - "epoch": 2.583594470046083, - "grad_norm": 0.40575206356814003, - "learning_rate": 1.3081073675552735e-05, - "loss": 0.3895, - "step": 5262 - }, - { - "epoch": 2.5840860215053763, - "grad_norm": 0.3783440197370085, - "learning_rate": 1.3050642226783493e-05, - "loss": 0.3846, - "step": 5263 - }, - { - "epoch": 2.58457757296467, - "grad_norm": 0.4230362394713855, - "learning_rate": 1.3020243745452498e-05, - "loss": 0.4619, - "step": 5264 - }, - { - "epoch": 2.585069124423963, - "grad_norm": 0.4519557923289012, - "learning_rate": 1.2989878243085619e-05, - "loss": 0.411, - "step": 5265 - }, - { - "epoch": 2.5855606758832566, - "grad_norm": 0.40034258223343333, - "learning_rate": 1.2959545731196032e-05, - "loss": 0.3981, - "step": 5266 - }, - { - "epoch": 2.5860522273425497, - "grad_norm": 0.4065504050587561, - "learning_rate": 1.2929246221284597e-05, - "loss": 0.3863, - "step": 5267 - }, - { - "epoch": 2.5865437788018433, - "grad_norm": 0.42539197693503783, - "learning_rate": 1.2898979724839545e-05, - "loss": 0.4525, - "step": 5268 - }, - { - "epoch": 2.587035330261137, - "grad_norm": 0.4268125402751333, - "learning_rate": 1.2868746253336638e-05, - "loss": 0.4587, - "step": 5269 - }, - { - "epoch": 2.58752688172043, - "grad_norm": 0.4450892053251256, - "learning_rate": 1.283854581823909e-05, - "loss": 0.4734, - "step": 5270 - }, - { - "epoch": 2.5880184331797236, - "grad_norm": 0.4096266303695234, - "learning_rate": 1.2808378430997591e-05, - "loss": 0.4482, - "step": 5271 - }, - { - "epoch": 2.5885099846390167, - "grad_norm": 0.4073859428706417, - "learning_rate": 1.2778244103050341e-05, - "loss": 0.4046, - "step": 5272 - }, - { - "epoch": 2.5890015360983103, - "grad_norm": 0.3893176076411962, - "learning_rate": 1.2748142845822931e-05, - "loss": 0.4106, - "step": 5273 - }, - { - "epoch": 2.589493087557604, - "grad_norm": 0.40142306934620814, - "learning_rate": 1.271807467072852e-05, - "loss": 0.4285, - "step": 5274 - }, - { - "epoch": 2.589984639016897, - "grad_norm": 0.40061330517437627, - "learning_rate": 1.2688039589167578e-05, - "loss": 0.397, - "step": 5275 - }, - { - "epoch": 2.5904761904761906, - "grad_norm": 0.42711729902195467, - "learning_rate": 1.2658037612528184e-05, - "loss": 0.4175, - "step": 5276 - }, - { - "epoch": 2.5909677419354837, - "grad_norm": 0.4384587559454611, - "learning_rate": 1.2628068752185751e-05, - "loss": 0.4478, - "step": 5277 - }, - { - "epoch": 2.5914592933947773, - "grad_norm": 0.3995933598796077, - "learning_rate": 1.2598133019503189e-05, - "loss": 0.3969, - "step": 5278 - }, - { - "epoch": 2.591950844854071, - "grad_norm": 0.4150669053758655, - "learning_rate": 1.256823042583084e-05, - "loss": 0.4211, - "step": 5279 - }, - { - "epoch": 2.592442396313364, - "grad_norm": 0.3941671749373715, - "learning_rate": 1.2538360982506459e-05, - "loss": 0.3898, - "step": 5280 - }, - { - "epoch": 2.592933947772657, - "grad_norm": 0.4408742353833633, - "learning_rate": 1.2508524700855252e-05, - "loss": 0.4762, - "step": 5281 - }, - { - "epoch": 2.5934254992319508, - "grad_norm": 0.4049296429399314, - "learning_rate": 1.2478721592189846e-05, - "loss": 0.4181, - "step": 5282 - }, - { - "epoch": 2.5939170506912443, - "grad_norm": 0.4419985604059515, - "learning_rate": 1.2448951667810305e-05, - "loss": 0.398, - "step": 5283 - }, - { - "epoch": 2.5944086021505375, - "grad_norm": 0.4304218206479302, - "learning_rate": 1.2419214939004065e-05, - "loss": 0.4549, - "step": 5284 - }, - { - "epoch": 2.594900153609831, - "grad_norm": 0.4683803136024223, - "learning_rate": 1.2389511417046074e-05, - "loss": 0.4366, - "step": 5285 - }, - { - "epoch": 2.595391705069124, - "grad_norm": 0.44115516139270294, - "learning_rate": 1.2359841113198534e-05, - "loss": 0.3851, - "step": 5286 - }, - { - "epoch": 2.595883256528418, - "grad_norm": 0.41651016750665665, - "learning_rate": 1.2330204038711213e-05, - "loss": 0.4077, - "step": 5287 - }, - { - "epoch": 2.5963748079877114, - "grad_norm": 0.4222474066577131, - "learning_rate": 1.2300600204821178e-05, - "loss": 0.4108, - "step": 5288 - }, - { - "epoch": 2.5968663594470045, - "grad_norm": 0.43932138030503176, - "learning_rate": 1.2271029622752939e-05, - "loss": 0.4221, - "step": 5289 - }, - { - "epoch": 2.597357910906298, - "grad_norm": 0.41988399311680097, - "learning_rate": 1.2241492303718372e-05, - "loss": 0.4106, - "step": 5290 - }, - { - "epoch": 2.5978494623655912, - "grad_norm": 0.42322642269398886, - "learning_rate": 1.2211988258916752e-05, - "loss": 0.4235, - "step": 5291 - }, - { - "epoch": 2.598341013824885, - "grad_norm": 0.41203733168819306, - "learning_rate": 1.2182517499534728e-05, - "loss": 0.4133, - "step": 5292 - }, - { - "epoch": 2.5988325652841784, - "grad_norm": 0.3983076817788457, - "learning_rate": 1.2153080036746345e-05, - "loss": 0.4233, - "step": 5293 - }, - { - "epoch": 2.5993241167434715, - "grad_norm": 0.37847333226681684, - "learning_rate": 1.2123675881713048e-05, - "loss": 0.3893, - "step": 5294 - }, - { - "epoch": 2.599815668202765, - "grad_norm": 0.389824916268444, - "learning_rate": 1.2094305045583565e-05, - "loss": 0.4209, - "step": 5295 - }, - { - "epoch": 2.6003072196620582, - "grad_norm": 0.4017719347973111, - "learning_rate": 1.2064967539494087e-05, - "loss": 0.4043, - "step": 5296 - }, - { - "epoch": 2.600798771121352, - "grad_norm": 0.4113509117924611, - "learning_rate": 1.2035663374568118e-05, - "loss": 0.3814, - "step": 5297 - }, - { - "epoch": 2.6012903225806454, - "grad_norm": 0.4165745386087526, - "learning_rate": 1.2006392561916535e-05, - "loss": 0.4531, - "step": 5298 - }, - { - "epoch": 2.6017818740399385, - "grad_norm": 0.43949575300973437, - "learning_rate": 1.197715511263754e-05, - "loss": 0.4329, - "step": 5299 - }, - { - "epoch": 2.6022734254992317, - "grad_norm": 0.4431857617960204, - "learning_rate": 1.1947951037816762e-05, - "loss": 0.4386, - "step": 5300 - }, - { - "epoch": 2.6027649769585253, - "grad_norm": 0.423727275273249, - "learning_rate": 1.191878034852708e-05, - "loss": 0.4069, - "step": 5301 - }, - { - "epoch": 2.603256528417819, - "grad_norm": 0.432004334240044, - "learning_rate": 1.1889643055828758e-05, - "loss": 0.4249, - "step": 5302 - }, - { - "epoch": 2.603748079877112, - "grad_norm": 0.4083443107901837, - "learning_rate": 1.1860539170769436e-05, - "loss": 0.4193, - "step": 5303 - }, - { - "epoch": 2.6042396313364056, - "grad_norm": 0.4138588548351003, - "learning_rate": 1.1831468704383997e-05, - "loss": 0.3929, - "step": 5304 - }, - { - "epoch": 2.6047311827956987, - "grad_norm": 0.4070966211280334, - "learning_rate": 1.1802431667694768e-05, - "loss": 0.4134, - "step": 5305 - }, - { - "epoch": 2.6052227342549923, - "grad_norm": 0.4249990693964303, - "learning_rate": 1.1773428071711256e-05, - "loss": 0.4435, - "step": 5306 - }, - { - "epoch": 2.605714285714286, - "grad_norm": 0.4395434828072506, - "learning_rate": 1.1744457927430441e-05, - "loss": 0.4256, - "step": 5307 - }, - { - "epoch": 2.606205837173579, - "grad_norm": 0.4297304564073295, - "learning_rate": 1.1715521245836524e-05, - "loss": 0.4125, - "step": 5308 - }, - { - "epoch": 2.6066973886328726, - "grad_norm": 0.409718107651262, - "learning_rate": 1.1686618037901053e-05, - "loss": 0.4493, - "step": 5309 - }, - { - "epoch": 2.6071889400921657, - "grad_norm": 0.38519808583489984, - "learning_rate": 1.1657748314582851e-05, - "loss": 0.3788, - "step": 5310 - }, - { - "epoch": 2.6076804915514593, - "grad_norm": 0.419851656874679, - "learning_rate": 1.1628912086828115e-05, - "loss": 0.4297, - "step": 5311 - }, - { - "epoch": 2.608172043010753, - "grad_norm": 0.4565295533193233, - "learning_rate": 1.1600109365570234e-05, - "loss": 0.4016, - "step": 5312 - }, - { - "epoch": 2.608663594470046, - "grad_norm": 0.4031212214363553, - "learning_rate": 1.1571340161730016e-05, - "loss": 0.4145, - "step": 5313 - }, - { - "epoch": 2.6091551459293396, - "grad_norm": 0.39337275037809627, - "learning_rate": 1.1542604486215458e-05, - "loss": 0.4352, - "step": 5314 - }, - { - "epoch": 2.6096466973886328, - "grad_norm": 0.3978657181019612, - "learning_rate": 1.1513902349921913e-05, - "loss": 0.3963, - "step": 5315 - }, - { - "epoch": 2.6101382488479263, - "grad_norm": 0.42965276979044525, - "learning_rate": 1.1485233763731961e-05, - "loss": 0.4448, - "step": 5316 - }, - { - "epoch": 2.61062980030722, - "grad_norm": 0.4461801093207912, - "learning_rate": 1.1456598738515522e-05, - "loss": 0.4925, - "step": 5317 - }, - { - "epoch": 2.611121351766513, - "grad_norm": 0.4091718963844529, - "learning_rate": 1.1427997285129743e-05, - "loss": 0.4008, - "step": 5318 - }, - { - "epoch": 2.611612903225806, - "grad_norm": 0.41745303764794084, - "learning_rate": 1.1399429414419039e-05, - "loss": 0.3986, - "step": 5319 - }, - { - "epoch": 2.6121044546850998, - "grad_norm": 0.45964238945245806, - "learning_rate": 1.1370895137215176e-05, - "loss": 0.4585, - "step": 5320 - }, - { - "epoch": 2.6125960061443934, - "grad_norm": 0.3821186882089041, - "learning_rate": 1.1342394464337036e-05, - "loss": 0.4113, - "step": 5321 - }, - { - "epoch": 2.6130875576036865, - "grad_norm": 0.4301965137833599, - "learning_rate": 1.1313927406590908e-05, - "loss": 0.4159, - "step": 5322 - }, - { - "epoch": 2.61357910906298, - "grad_norm": 0.4150937380830955, - "learning_rate": 1.1285493974770245e-05, - "loss": 0.4138, - "step": 5323 - }, - { - "epoch": 2.614070660522273, - "grad_norm": 0.44047787380907416, - "learning_rate": 1.1257094179655769e-05, - "loss": 0.4505, - "step": 5324 - }, - { - "epoch": 2.614562211981567, - "grad_norm": 0.3941074837180245, - "learning_rate": 1.1228728032015468e-05, - "loss": 0.419, - "step": 5325 - }, - { - "epoch": 2.6150537634408604, - "grad_norm": 0.4267820860409458, - "learning_rate": 1.1200395542604547e-05, - "loss": 0.4607, - "step": 5326 - }, - { - "epoch": 2.6155453149001535, - "grad_norm": 0.445705231325303, - "learning_rate": 1.117209672216546e-05, - "loss": 0.4758, - "step": 5327 - }, - { - "epoch": 2.616036866359447, - "grad_norm": 0.4137045225724416, - "learning_rate": 1.1143831581427889e-05, - "loss": 0.4261, - "step": 5328 - }, - { - "epoch": 2.6165284178187402, - "grad_norm": 0.41750063648094804, - "learning_rate": 1.1115600131108817e-05, - "loss": 0.4103, - "step": 5329 - }, - { - "epoch": 2.617019969278034, - "grad_norm": 0.4165904590596343, - "learning_rate": 1.1087402381912293e-05, - "loss": 0.405, - "step": 5330 - }, - { - "epoch": 2.6175115207373274, - "grad_norm": 0.4058081989695218, - "learning_rate": 1.1059238344529765e-05, - "loss": 0.4402, - "step": 5331 - }, - { - "epoch": 2.6180030721966205, - "grad_norm": 0.4109268389232956, - "learning_rate": 1.1031108029639758e-05, - "loss": 0.4918, - "step": 5332 - }, - { - "epoch": 2.618494623655914, - "grad_norm": 0.40958818773959266, - "learning_rate": 1.1003011447908107e-05, - "loss": 0.4237, - "step": 5333 - }, - { - "epoch": 2.6189861751152073, - "grad_norm": 0.4325584243493363, - "learning_rate": 1.0974948609987823e-05, - "loss": 0.4012, - "step": 5334 - }, - { - "epoch": 2.619477726574501, - "grad_norm": 0.40835986383831824, - "learning_rate": 1.0946919526519118e-05, - "loss": 0.3711, - "step": 5335 - }, - { - "epoch": 2.6199692780337944, - "grad_norm": 0.4017252475489325, - "learning_rate": 1.0918924208129389e-05, - "loss": 0.3932, - "step": 5336 - }, - { - "epoch": 2.6204608294930876, - "grad_norm": 0.4439418722984641, - "learning_rate": 1.0890962665433268e-05, - "loss": 0.4567, - "step": 5337 - }, - { - "epoch": 2.6209523809523807, - "grad_norm": 0.3921230212544778, - "learning_rate": 1.0863034909032566e-05, - "loss": 0.4009, - "step": 5338 - }, - { - "epoch": 2.6214439324116743, - "grad_norm": 0.42325750347872054, - "learning_rate": 1.0835140949516253e-05, - "loss": 0.417, - "step": 5339 - }, - { - "epoch": 2.621935483870968, - "grad_norm": 0.389111937867253, - "learning_rate": 1.0807280797460572e-05, - "loss": 0.3746, - "step": 5340 - }, - { - "epoch": 2.622427035330261, - "grad_norm": 0.3978330318344825, - "learning_rate": 1.0779454463428817e-05, - "loss": 0.3961, - "step": 5341 - }, - { - "epoch": 2.6229185867895546, - "grad_norm": 0.42183186628517433, - "learning_rate": 1.0751661957971582e-05, - "loss": 0.3936, - "step": 5342 - }, - { - "epoch": 2.6234101382488477, - "grad_norm": 0.4302000139158192, - "learning_rate": 1.072390329162657e-05, - "loss": 0.4403, - "step": 5343 - }, - { - "epoch": 2.6239016897081413, - "grad_norm": 0.38113505252068064, - "learning_rate": 1.069617847491866e-05, - "loss": 0.4018, - "step": 5344 - }, - { - "epoch": 2.624393241167435, - "grad_norm": 0.41235578152989044, - "learning_rate": 1.0668487518359894e-05, - "loss": 0.4342, - "step": 5345 - }, - { - "epoch": 2.624884792626728, - "grad_norm": 0.428612445641925, - "learning_rate": 1.0640830432449534e-05, - "loss": 0.4016, - "step": 5346 - }, - { - "epoch": 2.6253763440860216, - "grad_norm": 0.39589738917147255, - "learning_rate": 1.0613207227673904e-05, - "loss": 0.4335, - "step": 5347 - }, - { - "epoch": 2.6258678955453147, - "grad_norm": 0.3918735791818909, - "learning_rate": 1.0585617914506529e-05, - "loss": 0.3989, - "step": 5348 - }, - { - "epoch": 2.6263594470046083, - "grad_norm": 0.4656303513460634, - "learning_rate": 1.0558062503408127e-05, - "loss": 0.436, - "step": 5349 - }, - { - "epoch": 2.626850998463902, - "grad_norm": 0.44756144086005695, - "learning_rate": 1.0530541004826455e-05, - "loss": 0.4687, - "step": 5350 - }, - { - "epoch": 2.627342549923195, - "grad_norm": 0.41186455865336674, - "learning_rate": 1.050305342919653e-05, - "loss": 0.3764, - "step": 5351 - }, - { - "epoch": 2.6278341013824886, - "grad_norm": 0.4305150431206504, - "learning_rate": 1.0475599786940438e-05, - "loss": 0.446, - "step": 5352 - }, - { - "epoch": 2.6283256528417818, - "grad_norm": 0.4141689143924113, - "learning_rate": 1.0448180088467407e-05, - "loss": 0.4131, - "step": 5353 - }, - { - "epoch": 2.6288172043010753, - "grad_norm": 0.43361065141307503, - "learning_rate": 1.0420794344173812e-05, - "loss": 0.4557, - "step": 5354 - }, - { - "epoch": 2.629308755760369, - "grad_norm": 0.4006286832518753, - "learning_rate": 1.0393442564443123e-05, - "loss": 0.4159, - "step": 5355 - }, - { - "epoch": 2.629800307219662, - "grad_norm": 0.40716025365309905, - "learning_rate": 1.0366124759645957e-05, - "loss": 0.4055, - "step": 5356 - }, - { - "epoch": 2.630291858678955, - "grad_norm": 0.4000233448184684, - "learning_rate": 1.0338840940140082e-05, - "loss": 0.4113, - "step": 5357 - }, - { - "epoch": 2.630783410138249, - "grad_norm": 0.42355353723854106, - "learning_rate": 1.031159111627028e-05, - "loss": 0.4623, - "step": 5358 - }, - { - "epoch": 2.6312749615975424, - "grad_norm": 0.46332201152106145, - "learning_rate": 1.0284375298368554e-05, - "loss": 0.4184, - "step": 5359 - }, - { - "epoch": 2.6317665130568355, - "grad_norm": 0.39945161000758245, - "learning_rate": 1.025719349675396e-05, - "loss": 0.3862, - "step": 5360 - }, - { - "epoch": 2.632258064516129, - "grad_norm": 0.4670259379687622, - "learning_rate": 1.0230045721732651e-05, - "loss": 0.3921, - "step": 5361 - }, - { - "epoch": 2.6327496159754222, - "grad_norm": 0.41491852944904284, - "learning_rate": 1.0202931983597896e-05, - "loss": 0.4386, - "step": 5362 - }, - { - "epoch": 2.633241167434716, - "grad_norm": 0.4458484761912659, - "learning_rate": 1.017585229263005e-05, - "loss": 0.3777, - "step": 5363 - }, - { - "epoch": 2.6337327188940094, - "grad_norm": 0.40792236037122076, - "learning_rate": 1.0148806659096555e-05, - "loss": 0.4108, - "step": 5364 - }, - { - "epoch": 2.6342242703533025, - "grad_norm": 0.4065104483513571, - "learning_rate": 1.012179509325194e-05, - "loss": 0.4498, - "step": 5365 - }, - { - "epoch": 2.634715821812596, - "grad_norm": 0.4323095333267805, - "learning_rate": 1.009481760533787e-05, - "loss": 0.4201, - "step": 5366 - }, - { - "epoch": 2.6352073732718893, - "grad_norm": 0.43581406930950106, - "learning_rate": 1.0067874205582971e-05, - "loss": 0.4406, - "step": 5367 - }, - { - "epoch": 2.635698924731183, - "grad_norm": 0.397598261902562, - "learning_rate": 1.0040964904203076e-05, - "loss": 0.4098, - "step": 5368 - }, - { - "epoch": 2.6361904761904764, - "grad_norm": 0.4264061380044999, - "learning_rate": 1.0014089711400998e-05, - "loss": 0.4197, - "step": 5369 - }, - { - "epoch": 2.6366820276497696, - "grad_norm": 0.4100340717425536, - "learning_rate": 9.987248637366664e-06, - "loss": 0.4339, - "step": 5370 - }, - { - "epoch": 2.637173579109063, - "grad_norm": 0.4169090206120225, - "learning_rate": 9.960441692277034e-06, - "loss": 0.4263, - "step": 5371 - }, - { - "epoch": 2.6376651305683563, - "grad_norm": 0.42544068109575156, - "learning_rate": 9.933668886296155e-06, - "loss": 0.4248, - "step": 5372 - }, - { - "epoch": 2.63815668202765, - "grad_norm": 0.4226205801571498, - "learning_rate": 9.906930229575118e-06, - "loss": 0.4481, - "step": 5373 - }, - { - "epoch": 2.6386482334869434, - "grad_norm": 0.41718955703675054, - "learning_rate": 9.880225732252035e-06, - "loss": 0.3969, - "step": 5374 - }, - { - "epoch": 2.6391397849462366, - "grad_norm": 0.39303734634685444, - "learning_rate": 9.853555404452164e-06, - "loss": 0.3754, - "step": 5375 - }, - { - "epoch": 2.6396313364055297, - "grad_norm": 0.44728570212843055, - "learning_rate": 9.82691925628766e-06, - "loss": 0.46, - "step": 5376 - }, - { - "epoch": 2.6401228878648233, - "grad_norm": 0.45493053165477176, - "learning_rate": 9.800317297857863e-06, - "loss": 0.4147, - "step": 5377 - }, - { - "epoch": 2.640614439324117, - "grad_norm": 0.41485918930308463, - "learning_rate": 9.773749539249055e-06, - "loss": 0.3874, - "step": 5378 - }, - { - "epoch": 2.64110599078341, - "grad_norm": 0.41486991053144257, - "learning_rate": 9.747215990534586e-06, - "loss": 0.4066, - "step": 5379 - }, - { - "epoch": 2.6415975422427036, - "grad_norm": 0.3776377283859583, - "learning_rate": 9.720716661774843e-06, - "loss": 0.3979, - "step": 5380 - }, - { - "epoch": 2.6420890937019967, - "grad_norm": 0.3989356508422083, - "learning_rate": 9.694251563017209e-06, - "loss": 0.3963, - "step": 5381 - }, - { - "epoch": 2.6425806451612903, - "grad_norm": 0.4190610328489551, - "learning_rate": 9.667820704296116e-06, - "loss": 0.4151, - "step": 5382 - }, - { - "epoch": 2.643072196620584, - "grad_norm": 0.4239017165612543, - "learning_rate": 9.641424095633e-06, - "loss": 0.4124, - "step": 5383 - }, - { - "epoch": 2.643563748079877, - "grad_norm": 0.3923533858174038, - "learning_rate": 9.615061747036314e-06, - "loss": 0.393, - "step": 5384 - }, - { - "epoch": 2.6440552995391706, - "grad_norm": 0.4289306006867652, - "learning_rate": 9.588733668501504e-06, - "loss": 0.4267, - "step": 5385 - }, - { - "epoch": 2.6445468509984638, - "grad_norm": 0.4260560989284525, - "learning_rate": 9.562439870011098e-06, - "loss": 0.4487, - "step": 5386 - }, - { - "epoch": 2.6450384024577573, - "grad_norm": 0.40920955468712666, - "learning_rate": 9.536180361534474e-06, - "loss": 0.3819, - "step": 5387 - }, - { - "epoch": 2.645529953917051, - "grad_norm": 0.4231466985521743, - "learning_rate": 9.509955153028193e-06, - "loss": 0.4047, - "step": 5388 - }, - { - "epoch": 2.646021505376344, - "grad_norm": 0.42796937170567456, - "learning_rate": 9.483764254435667e-06, - "loss": 0.4287, - "step": 5389 - }, - { - "epoch": 2.6465130568356376, - "grad_norm": 0.41928972491223726, - "learning_rate": 9.457607675687374e-06, - "loss": 0.4411, - "step": 5390 - }, - { - "epoch": 2.647004608294931, - "grad_norm": 0.40376676752035046, - "learning_rate": 9.431485426700736e-06, - "loss": 0.4051, - "step": 5391 - }, - { - "epoch": 2.6474961597542244, - "grad_norm": 0.4150879963716973, - "learning_rate": 9.405397517380233e-06, - "loss": 0.4147, - "step": 5392 - }, - { - "epoch": 2.6479877112135175, - "grad_norm": 0.42185871069050235, - "learning_rate": 9.379343957617226e-06, - "loss": 0.394, - "step": 5393 - }, - { - "epoch": 2.648479262672811, - "grad_norm": 0.4145208597484563, - "learning_rate": 9.353324757290082e-06, - "loss": 0.4214, - "step": 5394 - }, - { - "epoch": 2.6489708141321042, - "grad_norm": 0.4405632271679106, - "learning_rate": 9.327339926264222e-06, - "loss": 0.4362, - "step": 5395 - }, - { - "epoch": 2.649462365591398, - "grad_norm": 0.4092628443925841, - "learning_rate": 9.301389474391897e-06, - "loss": 0.4599, - "step": 5396 - }, - { - "epoch": 2.6499539170506914, - "grad_norm": 0.4864430244853022, - "learning_rate": 9.275473411512447e-06, - "loss": 0.5269, - "step": 5397 - }, - { - "epoch": 2.6504454685099845, - "grad_norm": 0.4299412011518461, - "learning_rate": 9.249591747452124e-06, - "loss": 0.3975, - "step": 5398 - }, - { - "epoch": 2.650937019969278, - "grad_norm": 0.3930617188361656, - "learning_rate": 9.223744492024112e-06, - "loss": 0.4113, - "step": 5399 - }, - { - "epoch": 2.6514285714285712, - "grad_norm": 0.40919081756109565, - "learning_rate": 9.197931655028558e-06, - "loss": 0.4134, - "step": 5400 - }, - { - "epoch": 2.651920122887865, - "grad_norm": 0.4307777252146958, - "learning_rate": 9.172153246252645e-06, - "loss": 0.4225, - "step": 5401 - }, - { - "epoch": 2.6524116743471584, - "grad_norm": 0.42298449468261196, - "learning_rate": 9.146409275470346e-06, - "loss": 0.4257, - "step": 5402 - }, - { - "epoch": 2.6529032258064515, - "grad_norm": 0.4318203258890334, - "learning_rate": 9.120699752442741e-06, - "loss": 0.3941, - "step": 5403 - }, - { - "epoch": 2.653394777265745, - "grad_norm": 0.3826337240679449, - "learning_rate": 9.095024686917687e-06, - "loss": 0.3645, - "step": 5404 - }, - { - "epoch": 2.6538863287250383, - "grad_norm": 0.39433634232551706, - "learning_rate": 9.069384088630117e-06, - "loss": 0.4512, - "step": 5405 - }, - { - "epoch": 2.654377880184332, - "grad_norm": 0.4121891187252731, - "learning_rate": 9.043777967301836e-06, - "loss": 0.45, - "step": 5406 - }, - { - "epoch": 2.6548694316436254, - "grad_norm": 0.4252387662041164, - "learning_rate": 9.018206332641554e-06, - "loss": 0.461, - "step": 5407 - }, - { - "epoch": 2.6553609831029186, - "grad_norm": 0.4111014762637659, - "learning_rate": 8.99266919434495e-06, - "loss": 0.3983, - "step": 5408 - }, - { - "epoch": 2.6558525345622117, - "grad_norm": 0.452111474133612, - "learning_rate": 8.967166562094587e-06, - "loss": 0.4309, - "step": 5409 - }, - { - "epoch": 2.6563440860215053, - "grad_norm": 0.4125007190544734, - "learning_rate": 8.941698445559965e-06, - "loss": 0.4159, - "step": 5410 - }, - { - "epoch": 2.656835637480799, - "grad_norm": 0.515996668294049, - "learning_rate": 8.916264854397483e-06, - "loss": 0.4522, - "step": 5411 - }, - { - "epoch": 2.657327188940092, - "grad_norm": 0.41386177213146924, - "learning_rate": 8.890865798250503e-06, - "loss": 0.4031, - "step": 5412 - }, - { - "epoch": 2.6578187403993856, - "grad_norm": 0.39125648600094315, - "learning_rate": 8.865501286749189e-06, - "loss": 0.396, - "step": 5413 - }, - { - "epoch": 2.6583102918586787, - "grad_norm": 0.4163082831106158, - "learning_rate": 8.840171329510705e-06, - "loss": 0.4085, - "step": 5414 - }, - { - "epoch": 2.6588018433179723, - "grad_norm": 0.42440505791526517, - "learning_rate": 8.814875936139078e-06, - "loss": 0.4416, - "step": 5415 - }, - { - "epoch": 2.659293394777266, - "grad_norm": 0.42560092976027736, - "learning_rate": 8.789615116225213e-06, - "loss": 0.4278, - "step": 5416 - }, - { - "epoch": 2.659784946236559, - "grad_norm": 0.6423512002747916, - "learning_rate": 8.76438887934693e-06, - "loss": 0.4591, - "step": 5417 - }, - { - "epoch": 2.6602764976958526, - "grad_norm": 0.4332157443761783, - "learning_rate": 8.739197235068918e-06, - "loss": 0.4607, - "step": 5418 - }, - { - "epoch": 2.6607680491551458, - "grad_norm": 0.409489123795808, - "learning_rate": 8.714040192942763e-06, - "loss": 0.3945, - "step": 5419 - }, - { - "epoch": 2.6612596006144393, - "grad_norm": 0.4303309799959123, - "learning_rate": 8.68891776250691e-06, - "loss": 0.4256, - "step": 5420 - }, - { - "epoch": 2.661751152073733, - "grad_norm": 0.4267798390985559, - "learning_rate": 8.663829953286762e-06, - "loss": 0.4233, - "step": 5421 - }, - { - "epoch": 2.662242703533026, - "grad_norm": 0.4810385904442036, - "learning_rate": 8.638776774794454e-06, - "loss": 0.4245, - "step": 5422 - }, - { - "epoch": 2.6627342549923196, - "grad_norm": 0.42308100332427523, - "learning_rate": 8.613758236529113e-06, - "loss": 0.4154, - "step": 5423 - }, - { - "epoch": 2.6632258064516128, - "grad_norm": 0.414959213266651, - "learning_rate": 8.58877434797668e-06, - "loss": 0.4528, - "step": 5424 - }, - { - "epoch": 2.6637173579109064, - "grad_norm": 0.37985402740042473, - "learning_rate": 8.563825118609969e-06, - "loss": 0.3577, - "step": 5425 - }, - { - "epoch": 2.6642089093702, - "grad_norm": 0.39331900486038013, - "learning_rate": 8.538910557888635e-06, - "loss": 0.3632, - "step": 5426 - }, - { - "epoch": 2.664700460829493, - "grad_norm": 0.408271435452002, - "learning_rate": 8.514030675259221e-06, - "loss": 0.4388, - "step": 5427 - }, - { - "epoch": 2.665192012288786, - "grad_norm": 0.40828619896231644, - "learning_rate": 8.489185480155082e-06, - "loss": 0.4219, - "step": 5428 - }, - { - "epoch": 2.66568356374808, - "grad_norm": 0.4338039540406892, - "learning_rate": 8.464374981996458e-06, - "loss": 0.4274, - "step": 5429 - }, - { - "epoch": 2.6661751152073734, - "grad_norm": 0.4766909380065768, - "learning_rate": 8.439599190190417e-06, - "loss": 0.4438, - "step": 5430 - }, - { - "epoch": 2.6666666666666665, - "grad_norm": 0.39862044241868255, - "learning_rate": 8.414858114130842e-06, - "loss": 0.4331, - "step": 5431 - }, - { - "epoch": 2.66715821812596, - "grad_norm": 0.3988996663069451, - "learning_rate": 8.390151763198528e-06, - "loss": 0.4306, - "step": 5432 - }, - { - "epoch": 2.6676497695852532, - "grad_norm": 0.4079597306392764, - "learning_rate": 8.365480146761006e-06, - "loss": 0.4177, - "step": 5433 - }, - { - "epoch": 2.668141321044547, - "grad_norm": 0.42331957670189707, - "learning_rate": 8.340843274172728e-06, - "loss": 0.4543, - "step": 5434 - }, - { - "epoch": 2.6686328725038404, - "grad_norm": 0.44553664600185905, - "learning_rate": 8.316241154774906e-06, - "loss": 0.4243, - "step": 5435 - }, - { - "epoch": 2.6691244239631335, - "grad_norm": 0.41428620785650344, - "learning_rate": 8.291673797895616e-06, - "loss": 0.4049, - "step": 5436 - }, - { - "epoch": 2.669615975422427, - "grad_norm": 0.41890171133499243, - "learning_rate": 8.267141212849704e-06, - "loss": 0.4415, - "step": 5437 - }, - { - "epoch": 2.6701075268817203, - "grad_norm": 0.4208486063219193, - "learning_rate": 8.242643408938922e-06, - "loss": 0.4313, - "step": 5438 - }, - { - "epoch": 2.670599078341014, - "grad_norm": 0.42531973876252605, - "learning_rate": 8.218180395451735e-06, - "loss": 0.4477, - "step": 5439 - }, - { - "epoch": 2.6710906298003074, - "grad_norm": 0.3930556158310125, - "learning_rate": 8.193752181663461e-06, - "loss": 0.4162, - "step": 5440 - }, - { - "epoch": 2.6715821812596006, - "grad_norm": 0.41512643601245525, - "learning_rate": 8.169358776836266e-06, - "loss": 0.3853, - "step": 5441 - }, - { - "epoch": 2.672073732718894, - "grad_norm": 0.4365926230250442, - "learning_rate": 8.145000190219e-06, - "loss": 0.4257, - "step": 5442 - }, - { - "epoch": 2.6725652841781873, - "grad_norm": 0.42705422252950365, - "learning_rate": 8.120676431047458e-06, - "loss": 0.4246, - "step": 5443 - }, - { - "epoch": 2.673056835637481, - "grad_norm": 0.41396914387957756, - "learning_rate": 8.096387508544123e-06, - "loss": 0.4348, - "step": 5444 - }, - { - "epoch": 2.6735483870967744, - "grad_norm": 0.410243392264974, - "learning_rate": 8.072133431918316e-06, - "loss": 0.4209, - "step": 5445 - }, - { - "epoch": 2.6740399385560676, - "grad_norm": 0.4049885626060416, - "learning_rate": 8.047914210366104e-06, - "loss": 0.3968, - "step": 5446 - }, - { - "epoch": 2.6745314900153607, - "grad_norm": 0.37332485096170237, - "learning_rate": 8.023729853070439e-06, - "loss": 0.3585, - "step": 5447 - }, - { - "epoch": 2.6750230414746543, - "grad_norm": 0.42614093446772877, - "learning_rate": 7.999580369200898e-06, - "loss": 0.3783, - "step": 5448 - }, - { - "epoch": 2.675514592933948, - "grad_norm": 0.4162593184506544, - "learning_rate": 7.975465767913981e-06, - "loss": 0.4034, - "step": 5449 - }, - { - "epoch": 2.676006144393241, - "grad_norm": 0.42459606567290314, - "learning_rate": 7.951386058352894e-06, - "loss": 0.4178, - "step": 5450 - }, - { - "epoch": 2.6764976958525346, - "grad_norm": 0.3980127332139539, - "learning_rate": 7.927341249647601e-06, - "loss": 0.3451, - "step": 5451 - }, - { - "epoch": 2.6769892473118277, - "grad_norm": 0.4256421613569688, - "learning_rate": 7.903331350914867e-06, - "loss": 0.441, - "step": 5452 - }, - { - "epoch": 2.6774807987711213, - "grad_norm": 0.44487912604201296, - "learning_rate": 7.879356371258218e-06, - "loss": 0.4597, - "step": 5453 - }, - { - "epoch": 2.677972350230415, - "grad_norm": 0.4075066995046214, - "learning_rate": 7.855416319767905e-06, - "loss": 0.3722, - "step": 5454 - }, - { - "epoch": 2.678463901689708, - "grad_norm": 0.4095127828141176, - "learning_rate": 7.83151120552098e-06, - "loss": 0.3888, - "step": 5455 - }, - { - "epoch": 2.6789554531490016, - "grad_norm": 0.3845272925313367, - "learning_rate": 7.807641037581226e-06, - "loss": 0.4136, - "step": 5456 - }, - { - "epoch": 2.6794470046082948, - "grad_norm": 0.4257334648775762, - "learning_rate": 7.783805824999157e-06, - "loss": 0.4207, - "step": 5457 - }, - { - "epoch": 2.6799385560675884, - "grad_norm": 0.40704957813240444, - "learning_rate": 7.760005576812112e-06, - "loss": 0.3943, - "step": 5458 - }, - { - "epoch": 2.680430107526882, - "grad_norm": 0.4303423979860556, - "learning_rate": 7.73624030204404e-06, - "loss": 0.4059, - "step": 5459 - }, - { - "epoch": 2.680921658986175, - "grad_norm": 0.4092923842280974, - "learning_rate": 7.712510009705764e-06, - "loss": 0.4251, - "step": 5460 - }, - { - "epoch": 2.6814132104454687, - "grad_norm": 0.44414524974282177, - "learning_rate": 7.688814708794766e-06, - "loss": 0.4186, - "step": 5461 - }, - { - "epoch": 2.681904761904762, - "grad_norm": 0.4565612987079781, - "learning_rate": 7.665154408295283e-06, - "loss": 0.4025, - "step": 5462 - }, - { - "epoch": 2.6823963133640554, - "grad_norm": 0.4064968763263214, - "learning_rate": 7.64152911717827e-06, - "loss": 0.4588, - "step": 5463 - }, - { - "epoch": 2.682887864823349, - "grad_norm": 0.4230784853496628, - "learning_rate": 7.617938844401429e-06, - "loss": 0.4135, - "step": 5464 - }, - { - "epoch": 2.683379416282642, - "grad_norm": 0.39190676123324375, - "learning_rate": 7.594383598909161e-06, - "loss": 0.3934, - "step": 5465 - }, - { - "epoch": 2.6838709677419352, - "grad_norm": 0.37307168866269835, - "learning_rate": 7.570863389632588e-06, - "loss": 0.3794, - "step": 5466 - }, - { - "epoch": 2.684362519201229, - "grad_norm": 0.41743804704716, - "learning_rate": 7.547378225489599e-06, - "loss": 0.4001, - "step": 5467 - }, - { - "epoch": 2.6848540706605224, - "grad_norm": 0.41777787964864926, - "learning_rate": 7.523928115384682e-06, - "loss": 0.4015, - "step": 5468 - }, - { - "epoch": 2.6853456221198155, - "grad_norm": 0.41587500762223234, - "learning_rate": 7.500513068209181e-06, - "loss": 0.421, - "step": 5469 - }, - { - "epoch": 2.685837173579109, - "grad_norm": 0.3878773263647131, - "learning_rate": 7.477133092841027e-06, - "loss": 0.3643, - "step": 5470 - }, - { - "epoch": 2.6863287250384023, - "grad_norm": 0.39004440967641163, - "learning_rate": 7.4537881981448954e-06, - "loss": 0.3995, - "step": 5471 - }, - { - "epoch": 2.686820276497696, - "grad_norm": 0.43869456975339577, - "learning_rate": 7.430478392972184e-06, - "loss": 0.4649, - "step": 5472 - }, - { - "epoch": 2.6873118279569894, - "grad_norm": 0.4115065209219869, - "learning_rate": 7.4072036861609464e-06, - "loss": 0.3871, - "step": 5473 - }, - { - "epoch": 2.6878033794162826, - "grad_norm": 0.4314534206231697, - "learning_rate": 7.3839640865359435e-06, - "loss": 0.423, - "step": 5474 - }, - { - "epoch": 2.688294930875576, - "grad_norm": 0.47178153428138386, - "learning_rate": 7.360759602908618e-06, - "loss": 0.3804, - "step": 5475 - }, - { - "epoch": 2.6887864823348693, - "grad_norm": 0.4235575725733525, - "learning_rate": 7.3375902440771635e-06, - "loss": 0.4365, - "step": 5476 - }, - { - "epoch": 2.689278033794163, - "grad_norm": 0.45252503817051837, - "learning_rate": 7.314456018826321e-06, - "loss": 0.4495, - "step": 5477 - }, - { - "epoch": 2.6897695852534564, - "grad_norm": 0.39021995944891386, - "learning_rate": 7.291356935927651e-06, - "loss": 0.3991, - "step": 5478 - }, - { - "epoch": 2.6902611367127496, - "grad_norm": 0.425949661897845, - "learning_rate": 7.268293004139271e-06, - "loss": 0.4086, - "step": 5479 - }, - { - "epoch": 2.690752688172043, - "grad_norm": 0.4401504805673719, - "learning_rate": 7.245264232206073e-06, - "loss": 0.4081, - "step": 5480 - }, - { - "epoch": 2.6912442396313363, - "grad_norm": 0.4464911473396545, - "learning_rate": 7.222270628859562e-06, - "loss": 0.4434, - "step": 5481 - }, - { - "epoch": 2.69173579109063, - "grad_norm": 0.4008652142682673, - "learning_rate": 7.199312202817909e-06, - "loss": 0.4039, - "step": 5482 - }, - { - "epoch": 2.6922273425499235, - "grad_norm": 0.46932450787482, - "learning_rate": 7.176388962785952e-06, - "loss": 0.4652, - "step": 5483 - }, - { - "epoch": 2.6927188940092166, - "grad_norm": 0.4209872429872233, - "learning_rate": 7.153500917455225e-06, - "loss": 0.3941, - "step": 5484 - }, - { - "epoch": 2.6932104454685097, - "grad_norm": 0.4638061944951321, - "learning_rate": 7.130648075503843e-06, - "loss": 0.4816, - "step": 5485 - }, - { - "epoch": 2.6937019969278033, - "grad_norm": 0.41302638234078337, - "learning_rate": 7.107830445596631e-06, - "loss": 0.4156, - "step": 5486 - }, - { - "epoch": 2.694193548387097, - "grad_norm": 0.40260901197167054, - "learning_rate": 7.085048036385078e-06, - "loss": 0.4305, - "step": 5487 - }, - { - "epoch": 2.69468509984639, - "grad_norm": 0.43152109742967315, - "learning_rate": 7.062300856507231e-06, - "loss": 0.4385, - "step": 5488 - }, - { - "epoch": 2.6951766513056836, - "grad_norm": 0.41329099828376487, - "learning_rate": 7.039588914587891e-06, - "loss": 0.4449, - "step": 5489 - }, - { - "epoch": 2.6956682027649768, - "grad_norm": 0.4279799231581329, - "learning_rate": 7.0169122192384144e-06, - "loss": 0.434, - "step": 5490 - }, - { - "epoch": 2.6961597542242703, - "grad_norm": 0.3929479731471267, - "learning_rate": 6.994270779056833e-06, - "loss": 0.3593, - "step": 5491 - }, - { - "epoch": 2.696651305683564, - "grad_norm": 0.41398419861491503, - "learning_rate": 6.971664602627792e-06, - "loss": 0.4526, - "step": 5492 - }, - { - "epoch": 2.697142857142857, - "grad_norm": 0.4497409827849861, - "learning_rate": 6.949093698522613e-06, - "loss": 0.5157, - "step": 5493 - }, - { - "epoch": 2.6976344086021506, - "grad_norm": 0.4097101827921944, - "learning_rate": 6.9265580752991495e-06, - "loss": 0.3983, - "step": 5494 - }, - { - "epoch": 2.698125960061444, - "grad_norm": 0.42514745491146183, - "learning_rate": 6.90405774150199e-06, - "loss": 0.4237, - "step": 5495 - }, - { - "epoch": 2.6986175115207374, - "grad_norm": 0.3824039272095052, - "learning_rate": 6.881592705662265e-06, - "loss": 0.3631, - "step": 5496 - }, - { - "epoch": 2.699109062980031, - "grad_norm": 0.42045176301648407, - "learning_rate": 6.8591629762977396e-06, - "loss": 0.4034, - "step": 5497 - }, - { - "epoch": 2.699600614439324, - "grad_norm": 0.4568161670097656, - "learning_rate": 6.836768561912798e-06, - "loss": 0.4174, - "step": 5498 - }, - { - "epoch": 2.7000921658986177, - "grad_norm": 0.4156900179818879, - "learning_rate": 6.8144094709984504e-06, - "loss": 0.4234, - "step": 5499 - }, - { - "epoch": 2.700583717357911, - "grad_norm": 0.39477858041860353, - "learning_rate": 6.792085712032281e-06, - "loss": 0.3605, - "step": 5500 - }, - { - "epoch": 2.7010752688172044, - "grad_norm": 0.46185864360589834, - "learning_rate": 6.769797293478486e-06, - "loss": 0.4075, - "step": 5501 - }, - { - "epoch": 2.701566820276498, - "grad_norm": 0.40614384264872005, - "learning_rate": 6.747544223787916e-06, - "loss": 0.4011, - "step": 5502 - }, - { - "epoch": 2.702058371735791, - "grad_norm": 0.4040622279379188, - "learning_rate": 6.7253265113979225e-06, - "loss": 0.4121, - "step": 5503 - }, - { - "epoch": 2.7025499231950842, - "grad_norm": 0.3932175902213402, - "learning_rate": 6.7031441647325335e-06, - "loss": 0.4, - "step": 5504 - }, - { - "epoch": 2.703041474654378, - "grad_norm": 0.4033607995851353, - "learning_rate": 6.680997192202299e-06, - "loss": 0.4296, - "step": 5505 - }, - { - "epoch": 2.7035330261136714, - "grad_norm": 0.40796652445487475, - "learning_rate": 6.658885602204446e-06, - "loss": 0.4381, - "step": 5506 - }, - { - "epoch": 2.7040245775729645, - "grad_norm": 0.39909841834260235, - "learning_rate": 6.63680940312269e-06, - "loss": 0.4056, - "step": 5507 - }, - { - "epoch": 2.704516129032258, - "grad_norm": 0.4209460608124384, - "learning_rate": 6.614768603327393e-06, - "loss": 0.3929, - "step": 5508 - }, - { - "epoch": 2.7050076804915513, - "grad_norm": 0.43719805515392945, - "learning_rate": 6.592763211175467e-06, - "loss": 0.4152, - "step": 5509 - }, - { - "epoch": 2.705499231950845, - "grad_norm": 0.3920812340842254, - "learning_rate": 6.570793235010408e-06, - "loss": 0.4159, - "step": 5510 - }, - { - "epoch": 2.7059907834101384, - "grad_norm": 0.44013974108592474, - "learning_rate": 6.548858683162284e-06, - "loss": 0.4124, - "step": 5511 - }, - { - "epoch": 2.7064823348694316, - "grad_norm": 0.4252420172153551, - "learning_rate": 6.52695956394771e-06, - "loss": 0.4456, - "step": 5512 - }, - { - "epoch": 2.706973886328725, - "grad_norm": 0.4064544627889784, - "learning_rate": 6.505095885669921e-06, - "loss": 0.4055, - "step": 5513 - }, - { - "epoch": 2.7074654377880183, - "grad_norm": 0.4136392213161951, - "learning_rate": 6.483267656618641e-06, - "loss": 0.3946, - "step": 5514 - }, - { - "epoch": 2.707956989247312, - "grad_norm": 0.42928931669236947, - "learning_rate": 6.4614748850702175e-06, - "loss": 0.404, - "step": 5515 - }, - { - "epoch": 2.7084485407066055, - "grad_norm": 0.42949999014574364, - "learning_rate": 6.439717579287518e-06, - "loss": 0.4155, - "step": 5516 - }, - { - "epoch": 2.7089400921658986, - "grad_norm": 0.4015711691051889, - "learning_rate": 6.417995747519967e-06, - "loss": 0.4021, - "step": 5517 - }, - { - "epoch": 2.709431643625192, - "grad_norm": 0.3892934049595953, - "learning_rate": 6.396309398003564e-06, - "loss": 0.3795, - "step": 5518 - }, - { - "epoch": 2.7099231950844853, - "grad_norm": 0.41761366564574076, - "learning_rate": 6.374658538960809e-06, - "loss": 0.4209, - "step": 5519 - }, - { - "epoch": 2.710414746543779, - "grad_norm": 0.39633714846566165, - "learning_rate": 6.3530431786007926e-06, - "loss": 0.4211, - "step": 5520 - }, - { - "epoch": 2.710906298003072, - "grad_norm": 0.432710141786852, - "learning_rate": 6.331463325119091e-06, - "loss": 0.4421, - "step": 5521 - }, - { - "epoch": 2.7113978494623656, - "grad_norm": 0.3789983734393442, - "learning_rate": 6.309918986697916e-06, - "loss": 0.3539, - "step": 5522 - }, - { - "epoch": 2.7118894009216588, - "grad_norm": 0.40750132326610417, - "learning_rate": 6.288410171505887e-06, - "loss": 0.4179, - "step": 5523 - }, - { - "epoch": 2.7123809523809523, - "grad_norm": 0.3977210793440475, - "learning_rate": 6.26693688769825e-06, - "loss": 0.3676, - "step": 5524 - }, - { - "epoch": 2.712872503840246, - "grad_norm": 0.4172629691498353, - "learning_rate": 6.245499143416733e-06, - "loss": 0.45, - "step": 5525 - }, - { - "epoch": 2.713364055299539, - "grad_norm": 0.45504849644046713, - "learning_rate": 6.224096946789615e-06, - "loss": 0.4375, - "step": 5526 - }, - { - "epoch": 2.7138556067588326, - "grad_norm": 0.43003664665807395, - "learning_rate": 6.20273030593167e-06, - "loss": 0.4195, - "step": 5527 - }, - { - "epoch": 2.7143471582181258, - "grad_norm": 0.4531115642820368, - "learning_rate": 6.181399228944218e-06, - "loss": 0.4194, - "step": 5528 - }, - { - "epoch": 2.7148387096774194, - "grad_norm": 0.4494584646762748, - "learning_rate": 6.160103723915067e-06, - "loss": 0.4388, - "step": 5529 - }, - { - "epoch": 2.715330261136713, - "grad_norm": 0.40059391291025565, - "learning_rate": 6.138843798918558e-06, - "loss": 0.3915, - "step": 5530 - }, - { - "epoch": 2.715821812596006, - "grad_norm": 0.4509052572054086, - "learning_rate": 6.117619462015534e-06, - "loss": 0.3797, - "step": 5531 - }, - { - "epoch": 2.7163133640552997, - "grad_norm": 0.4207411945120334, - "learning_rate": 6.096430721253343e-06, - "loss": 0.4015, - "step": 5532 - }, - { - "epoch": 2.716804915514593, - "grad_norm": 0.4090107518202127, - "learning_rate": 6.075277584665862e-06, - "loss": 0.4146, - "step": 5533 - }, - { - "epoch": 2.7172964669738864, - "grad_norm": 0.42453049869647747, - "learning_rate": 6.054160060273406e-06, - "loss": 0.4052, - "step": 5534 - }, - { - "epoch": 2.71778801843318, - "grad_norm": 0.392115007468133, - "learning_rate": 6.033078156082872e-06, - "loss": 0.3834, - "step": 5535 - }, - { - "epoch": 2.718279569892473, - "grad_norm": 0.4491975234066452, - "learning_rate": 6.012031880087576e-06, - "loss": 0.4261, - "step": 5536 - }, - { - "epoch": 2.7187711213517662, - "grad_norm": 0.46133803637202614, - "learning_rate": 5.991021240267369e-06, - "loss": 0.4404, - "step": 5537 - }, - { - "epoch": 2.71926267281106, - "grad_norm": 0.41266991788978646, - "learning_rate": 5.970046244588557e-06, - "loss": 0.4547, - "step": 5538 - }, - { - "epoch": 2.7197542242703534, - "grad_norm": 0.43379401767716325, - "learning_rate": 5.949106901004009e-06, - "loss": 0.4175, - "step": 5539 - }, - { - "epoch": 2.7202457757296465, - "grad_norm": 0.41715520539584056, - "learning_rate": 5.928203217452944e-06, - "loss": 0.4077, - "step": 5540 - }, - { - "epoch": 2.72073732718894, - "grad_norm": 0.39979302017139584, - "learning_rate": 5.907335201861175e-06, - "loss": 0.4012, - "step": 5541 - }, - { - "epoch": 2.7212288786482333, - "grad_norm": 0.4492443218638585, - "learning_rate": 5.886502862140952e-06, - "loss": 0.3901, - "step": 5542 - }, - { - "epoch": 2.721720430107527, - "grad_norm": 0.41330868165963774, - "learning_rate": 5.865706206191002e-06, - "loss": 0.4062, - "step": 5543 - }, - { - "epoch": 2.7222119815668204, - "grad_norm": 0.4061204852095859, - "learning_rate": 5.844945241896505e-06, - "loss": 0.408, - "step": 5544 - }, - { - "epoch": 2.7227035330261136, - "grad_norm": 0.4303086768341994, - "learning_rate": 5.824219977129119e-06, - "loss": 0.4494, - "step": 5545 - }, - { - "epoch": 2.723195084485407, - "grad_norm": 0.43234124947647756, - "learning_rate": 5.803530419746972e-06, - "loss": 0.4412, - "step": 5546 - }, - { - "epoch": 2.7236866359447003, - "grad_norm": 0.42856716154026187, - "learning_rate": 5.782876577594643e-06, - "loss": 0.4816, - "step": 5547 - }, - { - "epoch": 2.724178187403994, - "grad_norm": 0.43517964835085765, - "learning_rate": 5.762258458503223e-06, - "loss": 0.41, - "step": 5548 - }, - { - "epoch": 2.7246697388632874, - "grad_norm": 0.4087681873162632, - "learning_rate": 5.741676070290136e-06, - "loss": 0.4221, - "step": 5549 - }, - { - "epoch": 2.7251612903225806, - "grad_norm": 0.413728022603252, - "learning_rate": 5.721129420759386e-06, - "loss": 0.3996, - "step": 5550 - }, - { - "epoch": 2.725652841781874, - "grad_norm": 0.42567245625415584, - "learning_rate": 5.700618517701361e-06, - "loss": 0.4377, - "step": 5551 - }, - { - "epoch": 2.7261443932411673, - "grad_norm": 0.4027120128347094, - "learning_rate": 5.68014336889291e-06, - "loss": 0.4071, - "step": 5552 - }, - { - "epoch": 2.726635944700461, - "grad_norm": 0.4173600085987435, - "learning_rate": 5.6597039820973195e-06, - "loss": 0.4278, - "step": 5553 - }, - { - "epoch": 2.7271274961597545, - "grad_norm": 0.42602378606067504, - "learning_rate": 5.639300365064337e-06, - "loss": 0.4129, - "step": 5554 - }, - { - "epoch": 2.7276190476190476, - "grad_norm": 0.4063955012542778, - "learning_rate": 5.618932525530107e-06, - "loss": 0.3785, - "step": 5555 - }, - { - "epoch": 2.7281105990783407, - "grad_norm": 0.3895672747801335, - "learning_rate": 5.598600471217253e-06, - "loss": 0.4021, - "step": 5556 - }, - { - "epoch": 2.7286021505376343, - "grad_norm": 0.3788407920337871, - "learning_rate": 5.578304209834806e-06, - "loss": 0.3791, - "step": 5557 - }, - { - "epoch": 2.729093701996928, - "grad_norm": 0.3885984546093099, - "learning_rate": 5.558043749078213e-06, - "loss": 0.4157, - "step": 5558 - }, - { - "epoch": 2.729585253456221, - "grad_norm": 0.461999180680633, - "learning_rate": 5.537819096629415e-06, - "loss": 0.4222, - "step": 5559 - }, - { - "epoch": 2.7300768049155146, - "grad_norm": 0.4321143441787244, - "learning_rate": 5.517630260156659e-06, - "loss": 0.4631, - "step": 5560 - }, - { - "epoch": 2.7305683563748078, - "grad_norm": 0.42942568715414287, - "learning_rate": 5.497477247314731e-06, - "loss": 0.3754, - "step": 5561 - }, - { - "epoch": 2.7310599078341014, - "grad_norm": 0.41924713137026487, - "learning_rate": 5.477360065744764e-06, - "loss": 0.4149, - "step": 5562 - }, - { - "epoch": 2.731551459293395, - "grad_norm": 0.4211760013667313, - "learning_rate": 5.4572787230743325e-06, - "loss": 0.4064, - "step": 5563 - }, - { - "epoch": 2.732043010752688, - "grad_norm": 0.40925325423567027, - "learning_rate": 5.437233226917393e-06, - "loss": 0.4127, - "step": 5564 - }, - { - "epoch": 2.7325345622119817, - "grad_norm": 0.39017418155738515, - "learning_rate": 5.4172235848743536e-06, - "loss": 0.3871, - "step": 5565 - }, - { - "epoch": 2.733026113671275, - "grad_norm": 0.42387860572388514, - "learning_rate": 5.39724980453199e-06, - "loss": 0.4306, - "step": 5566 - }, - { - "epoch": 2.7335176651305684, - "grad_norm": 0.41859759911190514, - "learning_rate": 5.377311893463499e-06, - "loss": 0.4497, - "step": 5567 - }, - { - "epoch": 2.734009216589862, - "grad_norm": 0.44551197469589415, - "learning_rate": 5.3574098592284906e-06, - "loss": 0.4359, - "step": 5568 - }, - { - "epoch": 2.734500768049155, - "grad_norm": 0.4146158022437149, - "learning_rate": 5.337543709372928e-06, - "loss": 0.4125, - "step": 5569 - }, - { - "epoch": 2.7349923195084487, - "grad_norm": 0.4395996463882175, - "learning_rate": 5.317713451429218e-06, - "loss": 0.4374, - "step": 5570 - }, - { - "epoch": 2.735483870967742, - "grad_norm": 0.4063059656204468, - "learning_rate": 5.297919092916137e-06, - "loss": 0.3952, - "step": 5571 - }, - { - "epoch": 2.7359754224270354, - "grad_norm": 0.42422509347871734, - "learning_rate": 5.2781606413388475e-06, - "loss": 0.4637, - "step": 5572 - }, - { - "epoch": 2.736466973886329, - "grad_norm": 0.41608683387413403, - "learning_rate": 5.258438104188879e-06, - "loss": 0.4597, - "step": 5573 - }, - { - "epoch": 2.736958525345622, - "grad_norm": 0.3923512448829839, - "learning_rate": 5.2387514889442045e-06, - "loss": 0.3537, - "step": 5574 - }, - { - "epoch": 2.7374500768049153, - "grad_norm": 0.4105197544010745, - "learning_rate": 5.2191008030691105e-06, - "loss": 0.4044, - "step": 5575 - }, - { - "epoch": 2.737941628264209, - "grad_norm": 0.5221672938411795, - "learning_rate": 5.199486054014291e-06, - "loss": 0.4793, - "step": 5576 - }, - { - "epoch": 2.7384331797235024, - "grad_norm": 0.43335904506723294, - "learning_rate": 5.17990724921682e-06, - "loss": 0.3987, - "step": 5577 - }, - { - "epoch": 2.7389247311827956, - "grad_norm": 0.4197191888563136, - "learning_rate": 5.160364396100115e-06, - "loss": 0.4309, - "step": 5578 - }, - { - "epoch": 2.739416282642089, - "grad_norm": 0.4221371761334608, - "learning_rate": 5.140857502074015e-06, - "loss": 0.4478, - "step": 5579 - }, - { - "epoch": 2.7399078341013823, - "grad_norm": 0.43891218920831676, - "learning_rate": 5.121386574534648e-06, - "loss": 0.4146, - "step": 5580 - }, - { - "epoch": 2.740399385560676, - "grad_norm": 0.39607461571077496, - "learning_rate": 5.101951620864576e-06, - "loss": 0.3921, - "step": 5581 - }, - { - "epoch": 2.7408909370199694, - "grad_norm": 0.4088936138727943, - "learning_rate": 5.082552648432693e-06, - "loss": 0.3994, - "step": 5582 - }, - { - "epoch": 2.7413824884792626, - "grad_norm": 0.40441391191003495, - "learning_rate": 5.063189664594248e-06, - "loss": 0.4282, - "step": 5583 - }, - { - "epoch": 2.741874039938556, - "grad_norm": 0.4221894321675064, - "learning_rate": 5.043862676690825e-06, - "loss": 0.4678, - "step": 5584 - }, - { - "epoch": 2.7423655913978493, - "grad_norm": 0.40481345629281384, - "learning_rate": 5.0245716920504395e-06, - "loss": 0.3859, - "step": 5585 - }, - { - "epoch": 2.742857142857143, - "grad_norm": 0.4470732877744559, - "learning_rate": 5.005316717987329e-06, - "loss": 0.4092, - "step": 5586 - }, - { - "epoch": 2.7433486943164365, - "grad_norm": 0.3928363936142154, - "learning_rate": 4.986097761802189e-06, - "loss": 0.416, - "step": 5587 - }, - { - "epoch": 2.7438402457757296, - "grad_norm": 0.39688771386393373, - "learning_rate": 4.96691483078201e-06, - "loss": 0.3787, - "step": 5588 - }, - { - "epoch": 2.744331797235023, - "grad_norm": 0.4113568451176203, - "learning_rate": 4.9477679322001425e-06, - "loss": 0.4412, - "step": 5589 - }, - { - "epoch": 2.7448233486943163, - "grad_norm": 0.4056724191898062, - "learning_rate": 4.928657073316234e-06, - "loss": 0.4094, - "step": 5590 - }, - { - "epoch": 2.74531490015361, - "grad_norm": 0.44799921697791584, - "learning_rate": 4.909582261376322e-06, - "loss": 0.4247, - "step": 5591 - }, - { - "epoch": 2.7458064516129035, - "grad_norm": 0.40343199165739774, - "learning_rate": 4.890543503612733e-06, - "loss": 0.3976, - "step": 5592 - }, - { - "epoch": 2.7462980030721966, - "grad_norm": 0.40623013345428693, - "learning_rate": 4.8715408072441346e-06, - "loss": 0.4035, - "step": 5593 - }, - { - "epoch": 2.7467895545314898, - "grad_norm": 0.3702036442100213, - "learning_rate": 4.852574179475566e-06, - "loss": 0.3864, - "step": 5594 - }, - { - "epoch": 2.7472811059907833, - "grad_norm": 0.42621981328885633, - "learning_rate": 4.8336436274983075e-06, - "loss": 0.4485, - "step": 5595 - }, - { - "epoch": 2.747772657450077, - "grad_norm": 0.41055722258421873, - "learning_rate": 4.8147491584900395e-06, - "loss": 0.4082, - "step": 5596 - }, - { - "epoch": 2.74826420890937, - "grad_norm": 0.40601680743235696, - "learning_rate": 4.79589077961472e-06, - "loss": 0.4172, - "step": 5597 - }, - { - "epoch": 2.7487557603686636, - "grad_norm": 0.3956705174303359, - "learning_rate": 4.777068498022619e-06, - "loss": 0.4425, - "step": 5598 - }, - { - "epoch": 2.749247311827957, - "grad_norm": 0.4321025631895987, - "learning_rate": 4.758282320850338e-06, - "loss": 0.4387, - "step": 5599 - }, - { - "epoch": 2.7497388632872504, - "grad_norm": 0.4093227044262583, - "learning_rate": 4.739532255220791e-06, - "loss": 0.4014, - "step": 5600 - }, - { - "epoch": 2.750230414746544, - "grad_norm": 0.40095999040258756, - "learning_rate": 4.720818308243191e-06, - "loss": 0.3794, - "step": 5601 - }, - { - "epoch": 2.750721966205837, - "grad_norm": 0.4192009076581332, - "learning_rate": 4.70214048701304e-06, - "loss": 0.5041, - "step": 5602 - }, - { - "epoch": 2.7512135176651307, - "grad_norm": 0.4140192895860933, - "learning_rate": 4.683498798612185e-06, - "loss": 0.4127, - "step": 5603 - }, - { - "epoch": 2.751705069124424, - "grad_norm": 0.5398164419166952, - "learning_rate": 4.664893250108715e-06, - "loss": 0.4327, - "step": 5604 - }, - { - "epoch": 2.7521966205837174, - "grad_norm": 0.4107205382854118, - "learning_rate": 4.6463238485570995e-06, - "loss": 0.4145, - "step": 5605 - }, - { - "epoch": 2.752688172043011, - "grad_norm": 0.42792597729641585, - "learning_rate": 4.627790600998005e-06, - "loss": 0.437, - "step": 5606 - }, - { - "epoch": 2.753179723502304, - "grad_norm": 0.42765023402346036, - "learning_rate": 4.6092935144584685e-06, - "loss": 0.3702, - "step": 5607 - }, - { - "epoch": 2.7536712749615977, - "grad_norm": 0.4026404175611252, - "learning_rate": 4.590832595951777e-06, - "loss": 0.4171, - "step": 5608 - }, - { - "epoch": 2.754162826420891, - "grad_norm": 0.40278418056646736, - "learning_rate": 4.572407852477511e-06, - "loss": 0.419, - "step": 5609 - }, - { - "epoch": 2.7546543778801844, - "grad_norm": 0.4238053397903939, - "learning_rate": 4.554019291021538e-06, - "loss": 0.4679, - "step": 5610 - }, - { - "epoch": 2.755145929339478, - "grad_norm": 0.40736301230045824, - "learning_rate": 4.535666918556003e-06, - "loss": 0.3774, - "step": 5611 - }, - { - "epoch": 2.755637480798771, - "grad_norm": 0.4264247142085447, - "learning_rate": 4.517350742039339e-06, - "loss": 0.4313, - "step": 5612 - }, - { - "epoch": 2.7561290322580643, - "grad_norm": 0.39867246605189943, - "learning_rate": 4.499070768416225e-06, - "loss": 0.3814, - "step": 5613 - }, - { - "epoch": 2.756620583717358, - "grad_norm": 0.42287412396206325, - "learning_rate": 4.480827004617682e-06, - "loss": 0.4587, - "step": 5614 - }, - { - "epoch": 2.7571121351766514, - "grad_norm": 0.38744592745914636, - "learning_rate": 4.462619457560913e-06, - "loss": 0.4094, - "step": 5615 - }, - { - "epoch": 2.7576036866359446, - "grad_norm": 0.4438337583512388, - "learning_rate": 4.4444481341494595e-06, - "loss": 0.4245, - "step": 5616 - }, - { - "epoch": 2.758095238095238, - "grad_norm": 0.41462000850833064, - "learning_rate": 4.426313041273089e-06, - "loss": 0.4046, - "step": 5617 - }, - { - "epoch": 2.7585867895545313, - "grad_norm": 0.4128230230223947, - "learning_rate": 4.408214185807846e-06, - "loss": 0.4101, - "step": 5618 - }, - { - "epoch": 2.759078341013825, - "grad_norm": 0.43699350736254866, - "learning_rate": 4.390151574616031e-06, - "loss": 0.4778, - "step": 5619 - }, - { - "epoch": 2.7595698924731185, - "grad_norm": 0.4279502071987037, - "learning_rate": 4.37212521454623e-06, - "loss": 0.468, - "step": 5620 - }, - { - "epoch": 2.7600614439324116, - "grad_norm": 0.43204123008984363, - "learning_rate": 4.354135112433233e-06, - "loss": 0.395, - "step": 5621 - }, - { - "epoch": 2.760552995391705, - "grad_norm": 0.3914407994339302, - "learning_rate": 4.336181275098106e-06, - "loss": 0.3932, - "step": 5622 - }, - { - "epoch": 2.7610445468509983, - "grad_norm": 0.4130162880384426, - "learning_rate": 4.318263709348203e-06, - "loss": 0.4206, - "step": 5623 - }, - { - "epoch": 2.761536098310292, - "grad_norm": 0.39264128575213614, - "learning_rate": 4.300382421977034e-06, - "loss": 0.3882, - "step": 5624 - }, - { - "epoch": 2.7620276497695855, - "grad_norm": 0.428753777234044, - "learning_rate": 4.2825374197644764e-06, - "loss": 0.423, - "step": 5625 - }, - { - "epoch": 2.7625192012288786, - "grad_norm": 0.39258652168306096, - "learning_rate": 4.264728709476529e-06, - "loss": 0.3975, - "step": 5626 - }, - { - "epoch": 2.763010752688172, - "grad_norm": 0.4221328929868399, - "learning_rate": 4.246956297865512e-06, - "loss": 0.4395, - "step": 5627 - }, - { - "epoch": 2.7635023041474653, - "grad_norm": 0.38856552140373496, - "learning_rate": 4.229220191669947e-06, - "loss": 0.3835, - "step": 5628 - }, - { - "epoch": 2.763993855606759, - "grad_norm": 0.43274506749309355, - "learning_rate": 4.21152039761461e-06, - "loss": 0.3924, - "step": 5629 - }, - { - "epoch": 2.7644854070660525, - "grad_norm": 0.43860814003791937, - "learning_rate": 4.193856922410466e-06, - "loss": 0.4258, - "step": 5630 - }, - { - "epoch": 2.7649769585253456, - "grad_norm": 0.40890393389705854, - "learning_rate": 4.176229772754803e-06, - "loss": 0.4141, - "step": 5631 - }, - { - "epoch": 2.765468509984639, - "grad_norm": 0.38812692467616483, - "learning_rate": 4.158638955331007e-06, - "loss": 0.4021, - "step": 5632 - }, - { - "epoch": 2.7659600614439324, - "grad_norm": 0.4090721550067412, - "learning_rate": 4.141084476808799e-06, - "loss": 0.4157, - "step": 5633 - }, - { - "epoch": 2.766451612903226, - "grad_norm": 0.4541326062938184, - "learning_rate": 4.123566343844054e-06, - "loss": 0.4578, - "step": 5634 - }, - { - "epoch": 2.766943164362519, - "grad_norm": 0.39586760511797603, - "learning_rate": 4.106084563078916e-06, - "loss": 0.4015, - "step": 5635 - }, - { - "epoch": 2.7674347158218127, - "grad_norm": 0.43934476361263725, - "learning_rate": 4.088639141141692e-06, - "loss": 0.4286, - "step": 5636 - }, - { - "epoch": 2.767926267281106, - "grad_norm": 0.4277219137317346, - "learning_rate": 4.071230084646949e-06, - "loss": 0.4711, - "step": 5637 - }, - { - "epoch": 2.7684178187403994, - "grad_norm": 0.3924805623423362, - "learning_rate": 4.0538574001954485e-06, - "loss": 0.4207, - "step": 5638 - }, - { - "epoch": 2.768909370199693, - "grad_norm": 0.41177963644920046, - "learning_rate": 4.036521094374146e-06, - "loss": 0.4144, - "step": 5639 - }, - { - "epoch": 2.769400921658986, - "grad_norm": 0.4583722326715423, - "learning_rate": 4.01922117375626e-06, - "loss": 0.4573, - "step": 5640 - }, - { - "epoch": 2.7698924731182797, - "grad_norm": 0.4195845603490557, - "learning_rate": 4.001957644901122e-06, - "loss": 0.3999, - "step": 5641 - }, - { - "epoch": 2.770384024577573, - "grad_norm": 0.410935593144402, - "learning_rate": 3.98473051435434e-06, - "loss": 0.4129, - "step": 5642 - }, - { - "epoch": 2.7708755760368664, - "grad_norm": 0.37260710139029496, - "learning_rate": 3.9675397886477e-06, - "loss": 0.3788, - "step": 5643 - }, - { - "epoch": 2.77136712749616, - "grad_norm": 0.42630720479982703, - "learning_rate": 3.95038547429919e-06, - "loss": 0.4037, - "step": 5644 - }, - { - "epoch": 2.771858678955453, - "grad_norm": 0.401648370479641, - "learning_rate": 3.93326757781296e-06, - "loss": 0.3904, - "step": 5645 - }, - { - "epoch": 2.7723502304147467, - "grad_norm": 0.4067435753332641, - "learning_rate": 3.916186105679387e-06, - "loss": 0.4243, - "step": 5646 - }, - { - "epoch": 2.77284178187404, - "grad_norm": 0.4179595838433307, - "learning_rate": 3.8991410643750335e-06, - "loss": 0.4237, - "step": 5647 - }, - { - "epoch": 2.7733333333333334, - "grad_norm": 0.4186244279095402, - "learning_rate": 3.882132460362631e-06, - "loss": 0.3977, - "step": 5648 - }, - { - "epoch": 2.7738248847926266, - "grad_norm": 0.387260809699995, - "learning_rate": 3.865160300091131e-06, - "loss": 0.4247, - "step": 5649 - }, - { - "epoch": 2.77431643625192, - "grad_norm": 0.4401140091413137, - "learning_rate": 3.8482245899956085e-06, - "loss": 0.4292, - "step": 5650 - }, - { - "epoch": 2.7748079877112133, - "grad_norm": 0.3990434151324543, - "learning_rate": 3.8313253364973935e-06, - "loss": 0.4188, - "step": 5651 - }, - { - "epoch": 2.775299539170507, - "grad_norm": 0.39905339579254295, - "learning_rate": 3.814462546003894e-06, - "loss": 0.4141, - "step": 5652 - }, - { - "epoch": 2.7757910906298005, - "grad_norm": 0.41792089931492427, - "learning_rate": 3.797636224908807e-06, - "loss": 0.4011, - "step": 5653 - }, - { - "epoch": 2.7762826420890936, - "grad_norm": 0.4114884127456886, - "learning_rate": 3.780846379591929e-06, - "loss": 0.4159, - "step": 5654 - }, - { - "epoch": 2.776774193548387, - "grad_norm": 0.4385959643072983, - "learning_rate": 3.7640930164192345e-06, - "loss": 0.4119, - "step": 5655 - }, - { - "epoch": 2.7772657450076803, - "grad_norm": 0.45062375467929294, - "learning_rate": 3.7473761417428865e-06, - "loss": 0.4371, - "step": 5656 - }, - { - "epoch": 2.777757296466974, - "grad_norm": 0.4034404366841015, - "learning_rate": 3.730695761901193e-06, - "loss": 0.4112, - "step": 5657 - }, - { - "epoch": 2.7782488479262675, - "grad_norm": 0.3915928507522561, - "learning_rate": 3.714051883218628e-06, - "loss": 0.4269, - "step": 5658 - }, - { - "epoch": 2.7787403993855606, - "grad_norm": 0.41321988653670255, - "learning_rate": 3.697444512005832e-06, - "loss": 0.4226, - "step": 5659 - }, - { - "epoch": 2.779231950844854, - "grad_norm": 0.4199415240561289, - "learning_rate": 3.6808736545596356e-06, - "loss": 0.4381, - "step": 5660 - }, - { - "epoch": 2.7797235023041473, - "grad_norm": 0.4141725330701319, - "learning_rate": 3.664339317162935e-06, - "loss": 0.3977, - "step": 5661 - }, - { - "epoch": 2.780215053763441, - "grad_norm": 0.4235933545288212, - "learning_rate": 3.6478415060848812e-06, - "loss": 0.4248, - "step": 5662 - }, - { - "epoch": 2.7807066052227345, - "grad_norm": 0.42384951392101605, - "learning_rate": 3.631380227580716e-06, - "loss": 0.4235, - "step": 5663 - }, - { - "epoch": 2.7811981566820276, - "grad_norm": 0.40767940713545203, - "learning_rate": 3.6149554878918466e-06, - "loss": 0.4271, - "step": 5664 - }, - { - "epoch": 2.7816897081413208, - "grad_norm": 0.4423360687014389, - "learning_rate": 3.5985672932458136e-06, - "loss": 0.4234, - "step": 5665 - }, - { - "epoch": 2.7821812596006144, - "grad_norm": 0.40300178693165634, - "learning_rate": 3.582215649856335e-06, - "loss": 0.3895, - "step": 5666 - }, - { - "epoch": 2.782672811059908, - "grad_norm": 0.48277576185163745, - "learning_rate": 3.56590056392323e-06, - "loss": 0.405, - "step": 5667 - }, - { - "epoch": 2.783164362519201, - "grad_norm": 0.434678155230467, - "learning_rate": 3.549622041632461e-06, - "loss": 0.5053, - "step": 5668 - }, - { - "epoch": 2.7836559139784947, - "grad_norm": 0.4191663329057678, - "learning_rate": 3.5333800891561798e-06, - "loss": 0.4165, - "step": 5669 - }, - { - "epoch": 2.784147465437788, - "grad_norm": 0.4007109985950894, - "learning_rate": 3.5171747126525823e-06, - "loss": 0.3701, - "step": 5670 - }, - { - "epoch": 2.7846390168970814, - "grad_norm": 0.42808063830432996, - "learning_rate": 3.501005918266087e-06, - "loss": 0.4057, - "step": 5671 - }, - { - "epoch": 2.785130568356375, - "grad_norm": 0.41227216033941905, - "learning_rate": 3.4848737121271903e-06, - "loss": 0.4517, - "step": 5672 - }, - { - "epoch": 2.785622119815668, - "grad_norm": 0.4105400060865333, - "learning_rate": 3.468778100352532e-06, - "loss": 0.3813, - "step": 5673 - }, - { - "epoch": 2.7861136712749617, - "grad_norm": 0.39665087383048686, - "learning_rate": 3.4527190890448535e-06, - "loss": 0.3737, - "step": 5674 - }, - { - "epoch": 2.786605222734255, - "grad_norm": 0.4369044762828287, - "learning_rate": 3.4366966842930614e-06, - "loss": 0.4399, - "step": 5675 - }, - { - "epoch": 2.7870967741935484, - "grad_norm": 0.412136990828404, - "learning_rate": 3.4207108921721296e-06, - "loss": 0.3865, - "step": 5676 - }, - { - "epoch": 2.787588325652842, - "grad_norm": 0.4064981349120306, - "learning_rate": 3.4047617187432213e-06, - "loss": 0.4053, - "step": 5677 - }, - { - "epoch": 2.788079877112135, - "grad_norm": 0.426782122079462, - "learning_rate": 3.388849170053532e-06, - "loss": 0.4463, - "step": 5678 - }, - { - "epoch": 2.7885714285714287, - "grad_norm": 0.42049598883352435, - "learning_rate": 3.3729732521364355e-06, - "loss": 0.4622, - "step": 5679 - }, - { - "epoch": 2.789062980030722, - "grad_norm": 0.41414703388627117, - "learning_rate": 3.357133971011395e-06, - "loss": 0.3732, - "step": 5680 - }, - { - "epoch": 2.7895545314900154, - "grad_norm": 0.41599541935514955, - "learning_rate": 3.341331332683972e-06, - "loss": 0.4099, - "step": 5681 - }, - { - "epoch": 2.790046082949309, - "grad_norm": 0.44442572494419924, - "learning_rate": 3.3255653431458533e-06, - "loss": 0.3876, - "step": 5682 - }, - { - "epoch": 2.790537634408602, - "grad_norm": 0.4288570826018885, - "learning_rate": 3.3098360083748005e-06, - "loss": 0.4342, - "step": 5683 - }, - { - "epoch": 2.7910291858678953, - "grad_norm": 5.013046468247719, - "learning_rate": 3.2941433343347205e-06, - "loss": 0.4303, - "step": 5684 - }, - { - "epoch": 2.791520737327189, - "grad_norm": 0.38606143503997387, - "learning_rate": 3.2784873269755766e-06, - "loss": 0.3961, - "step": 5685 - }, - { - "epoch": 2.7920122887864824, - "grad_norm": 0.4076876178372838, - "learning_rate": 3.2628679922334872e-06, - "loss": 0.3995, - "step": 5686 - }, - { - "epoch": 2.7925038402457756, - "grad_norm": 0.3847657745305065, - "learning_rate": 3.2472853360305813e-06, - "loss": 0.3966, - "step": 5687 - }, - { - "epoch": 2.792995391705069, - "grad_norm": 0.40583672033690993, - "learning_rate": 3.231739364275155e-06, - "loss": 0.4108, - "step": 5688 - }, - { - "epoch": 2.7934869431643623, - "grad_norm": 0.4022656754911858, - "learning_rate": 3.21623008286156e-06, - "loss": 0.4315, - "step": 5689 - }, - { - "epoch": 2.793978494623656, - "grad_norm": 0.4084874921285853, - "learning_rate": 3.200757497670259e-06, - "loss": 0.4291, - "step": 5690 - }, - { - "epoch": 2.7944700460829495, - "grad_norm": 0.42821729610813675, - "learning_rate": 3.1853216145677824e-06, - "loss": 0.4211, - "step": 5691 - }, - { - "epoch": 2.7949615975422426, - "grad_norm": 0.4590091225565753, - "learning_rate": 3.169922439406736e-06, - "loss": 0.3995, - "step": 5692 - }, - { - "epoch": 2.795453149001536, - "grad_norm": 0.4174708604215899, - "learning_rate": 3.1545599780258393e-06, - "loss": 0.3952, - "step": 5693 - }, - { - "epoch": 2.7959447004608293, - "grad_norm": 0.42596384169116536, - "learning_rate": 3.139234236249844e-06, - "loss": 0.4209, - "step": 5694 - }, - { - "epoch": 2.796436251920123, - "grad_norm": 0.3928036464119011, - "learning_rate": 3.123945219889657e-06, - "loss": 0.3819, - "step": 5695 - }, - { - "epoch": 2.7969278033794165, - "grad_norm": 0.3969523013562946, - "learning_rate": 3.1086929347421635e-06, - "loss": 0.4205, - "step": 5696 - }, - { - "epoch": 2.7974193548387096, - "grad_norm": 0.40642537323319916, - "learning_rate": 3.0934773865904155e-06, - "loss": 0.4446, - "step": 5697 - }, - { - "epoch": 2.797910906298003, - "grad_norm": 0.4162400605584403, - "learning_rate": 3.0782985812034536e-06, - "loss": 0.3912, - "step": 5698 - }, - { - "epoch": 2.7984024577572963, - "grad_norm": 0.38699502084153975, - "learning_rate": 3.0631565243364525e-06, - "loss": 0.4146, - "step": 5699 - }, - { - "epoch": 2.79889400921659, - "grad_norm": 0.3850118916386613, - "learning_rate": 3.048051221730597e-06, - "loss": 0.4017, - "step": 5700 - }, - { - "epoch": 2.7993855606758835, - "grad_norm": 0.4031761455330946, - "learning_rate": 3.0329826791131945e-06, - "loss": 0.399, - "step": 5701 - }, - { - "epoch": 2.7998771121351766, - "grad_norm": 0.4332592857247782, - "learning_rate": 3.017950902197575e-06, - "loss": 0.4331, - "step": 5702 - }, - { - "epoch": 2.80036866359447, - "grad_norm": 0.4063561963438648, - "learning_rate": 3.002955896683124e-06, - "loss": 0.3997, - "step": 5703 - }, - { - "epoch": 2.8008602150537634, - "grad_norm": 0.3917812144849345, - "learning_rate": 2.9879976682553157e-06, - "loss": 0.4258, - "step": 5704 - }, - { - "epoch": 2.801351766513057, - "grad_norm": 0.4248388162898716, - "learning_rate": 2.973076222585647e-06, - "loss": 0.3986, - "step": 5705 - }, - { - "epoch": 2.80184331797235, - "grad_norm": 0.4275315357467191, - "learning_rate": 2.958191565331725e-06, - "loss": 0.4156, - "step": 5706 - }, - { - "epoch": 2.8023348694316437, - "grad_norm": 0.42512752944847276, - "learning_rate": 2.9433437021371136e-06, - "loss": 0.3989, - "step": 5707 - }, - { - "epoch": 2.802826420890937, - "grad_norm": 0.3935426785444233, - "learning_rate": 2.9285326386315203e-06, - "loss": 0.3935, - "step": 5708 - }, - { - "epoch": 2.8033179723502304, - "grad_norm": 0.37791572685851643, - "learning_rate": 2.9137583804306423e-06, - "loss": 0.4022, - "step": 5709 - }, - { - "epoch": 2.803809523809524, - "grad_norm": 1.7350169441101304, - "learning_rate": 2.899020933136254e-06, - "loss": 0.4593, - "step": 5710 - }, - { - "epoch": 2.804301075268817, - "grad_norm": 0.38052940572165195, - "learning_rate": 2.88432030233613e-06, - "loss": 0.3923, - "step": 5711 - }, - { - "epoch": 2.8047926267281107, - "grad_norm": 0.4248541707131468, - "learning_rate": 2.869656493604156e-06, - "loss": 0.427, - "step": 5712 - }, - { - "epoch": 2.805284178187404, - "grad_norm": 0.43210340363530475, - "learning_rate": 2.8550295125001845e-06, - "loss": 0.443, - "step": 5713 - }, - { - "epoch": 2.8057757296466974, - "grad_norm": 0.43254811666951304, - "learning_rate": 2.8404393645701243e-06, - "loss": 0.4192, - "step": 5714 - }, - { - "epoch": 2.806267281105991, - "grad_norm": 0.43105457684232945, - "learning_rate": 2.825886055345983e-06, - "loss": 0.4302, - "step": 5715 - }, - { - "epoch": 2.806758832565284, - "grad_norm": 0.4305166931578773, - "learning_rate": 2.8113695903456804e-06, - "loss": 0.4204, - "step": 5716 - }, - { - "epoch": 2.8072503840245777, - "grad_norm": 0.4666338078365619, - "learning_rate": 2.7968899750732692e-06, - "loss": 0.4434, - "step": 5717 - }, - { - "epoch": 2.807741935483871, - "grad_norm": 0.4144834502994846, - "learning_rate": 2.7824472150187907e-06, - "loss": 0.4289, - "step": 5718 - }, - { - "epoch": 2.8082334869431644, - "grad_norm": 0.4190753678398001, - "learning_rate": 2.7680413156583094e-06, - "loss": 0.4299, - "step": 5719 - }, - { - "epoch": 2.808725038402458, - "grad_norm": 0.411122997987756, - "learning_rate": 2.753672282453912e-06, - "loss": 0.3866, - "step": 5720 - }, - { - "epoch": 2.809216589861751, - "grad_norm": 0.40404884890729637, - "learning_rate": 2.7393401208537394e-06, - "loss": 0.3839, - "step": 5721 - }, - { - "epoch": 2.8097081413210443, - "grad_norm": 0.408315575972837, - "learning_rate": 2.7250448362919013e-06, - "loss": 0.3927, - "step": 5722 - }, - { - "epoch": 2.810199692780338, - "grad_norm": 0.3986494417059805, - "learning_rate": 2.7107864341885725e-06, - "loss": 0.3804, - "step": 5723 - }, - { - "epoch": 2.8106912442396315, - "grad_norm": 0.413264261548953, - "learning_rate": 2.6965649199499064e-06, - "loss": 0.4156, - "step": 5724 - }, - { - "epoch": 2.8111827956989246, - "grad_norm": 0.39473403982437766, - "learning_rate": 2.6823802989680903e-06, - "loss": 0.3787, - "step": 5725 - }, - { - "epoch": 2.811674347158218, - "grad_norm": 0.36852636823131574, - "learning_rate": 2.6682325766213323e-06, - "loss": 0.3661, - "step": 5726 - }, - { - "epoch": 2.8121658986175113, - "grad_norm": 0.4069134695206052, - "learning_rate": 2.65412175827382e-06, - "loss": 0.4346, - "step": 5727 - }, - { - "epoch": 2.812657450076805, - "grad_norm": 0.39604692783506845, - "learning_rate": 2.640047849275784e-06, - "loss": 0.4139, - "step": 5728 - }, - { - "epoch": 2.8131490015360985, - "grad_norm": 0.41716189376104984, - "learning_rate": 2.6260108549634234e-06, - "loss": 0.4283, - "step": 5729 - }, - { - "epoch": 2.8136405529953916, - "grad_norm": 0.39628912016162815, - "learning_rate": 2.612010780658969e-06, - "loss": 0.4006, - "step": 5730 - }, - { - "epoch": 2.814132104454685, - "grad_norm": 0.4063347417350446, - "learning_rate": 2.5980476316706303e-06, - "loss": 0.439, - "step": 5731 - }, - { - "epoch": 2.8146236559139783, - "grad_norm": 0.3868052037666175, - "learning_rate": 2.5841214132926728e-06, - "loss": 0.4166, - "step": 5732 - }, - { - "epoch": 2.815115207373272, - "grad_norm": 0.41930073936898876, - "learning_rate": 2.5702321308052504e-06, - "loss": 0.4005, - "step": 5733 - }, - { - "epoch": 2.8156067588325655, - "grad_norm": 0.3729774585601725, - "learning_rate": 2.5563797894746388e-06, - "loss": 0.3771, - "step": 5734 - }, - { - "epoch": 2.8160983102918586, - "grad_norm": 0.40110216338081484, - "learning_rate": 2.5425643945530153e-06, - "loss": 0.4124, - "step": 5735 - }, - { - "epoch": 2.8165898617511522, - "grad_norm": 0.44083808008431297, - "learning_rate": 2.5287859512785894e-06, - "loss": 0.4326, - "step": 5736 - }, - { - "epoch": 2.8170814132104454, - "grad_norm": 0.41176873783279533, - "learning_rate": 2.5150444648755487e-06, - "loss": 0.4083, - "step": 5737 - }, - { - "epoch": 2.817572964669739, - "grad_norm": 0.4323226659220806, - "learning_rate": 2.5013399405540706e-06, - "loss": 0.4449, - "step": 5738 - }, - { - "epoch": 2.8180645161290325, - "grad_norm": 0.4203322896243011, - "learning_rate": 2.4876723835103198e-06, - "loss": 0.4174, - "step": 5739 - }, - { - "epoch": 2.8185560675883257, - "grad_norm": 0.40702076832462786, - "learning_rate": 2.4740417989264408e-06, - "loss": 0.3962, - "step": 5740 - }, - { - "epoch": 2.819047619047619, - "grad_norm": 0.6976067687268606, - "learning_rate": 2.4604481919705767e-06, - "loss": 0.434, - "step": 5741 - }, - { - "epoch": 2.8195391705069124, - "grad_norm": 0.3947677501029888, - "learning_rate": 2.446891567796805e-06, - "loss": 0.4266, - "step": 5742 - }, - { - "epoch": 2.820030721966206, - "grad_norm": 0.4386189304416501, - "learning_rate": 2.433371931545236e-06, - "loss": 0.4462, - "step": 5743 - }, - { - "epoch": 2.820522273425499, - "grad_norm": 0.3683482292160676, - "learning_rate": 2.4198892883419256e-06, - "loss": 0.3626, - "step": 5744 - }, - { - "epoch": 2.8210138248847927, - "grad_norm": 0.4366764917625803, - "learning_rate": 2.4064436432989166e-06, - "loss": 0.4299, - "step": 5745 - }, - { - "epoch": 2.821505376344086, - "grad_norm": 0.4602580056734708, - "learning_rate": 2.39303500151421e-06, - "loss": 0.4079, - "step": 5746 - }, - { - "epoch": 2.8219969278033794, - "grad_norm": 0.41933360010206094, - "learning_rate": 2.3796633680717716e-06, - "loss": 0.4164, - "step": 5747 - }, - { - "epoch": 2.822488479262673, - "grad_norm": 0.40077504255731666, - "learning_rate": 2.3663287480415688e-06, - "loss": 0.4145, - "step": 5748 - }, - { - "epoch": 2.822980030721966, - "grad_norm": 0.43007781093624364, - "learning_rate": 2.3530311464794895e-06, - "loss": 0.4136, - "step": 5749 - }, - { - "epoch": 2.8234715821812597, - "grad_norm": 0.41898990373912737, - "learning_rate": 2.3397705684274353e-06, - "loss": 0.4609, - "step": 5750 - }, - { - "epoch": 2.823963133640553, - "grad_norm": 0.3971932682184444, - "learning_rate": 2.3265470189132165e-06, - "loss": 0.3969, - "step": 5751 - }, - { - "epoch": 2.8244546850998464, - "grad_norm": 0.41822934855331323, - "learning_rate": 2.3133605029506567e-06, - "loss": 0.3937, - "step": 5752 - }, - { - "epoch": 2.82494623655914, - "grad_norm": 0.41080572721083625, - "learning_rate": 2.3002110255394894e-06, - "loss": 0.4258, - "step": 5753 - }, - { - "epoch": 2.825437788018433, - "grad_norm": 0.37415712009570373, - "learning_rate": 2.2870985916654487e-06, - "loss": 0.3777, - "step": 5754 - }, - { - "epoch": 2.8259293394777267, - "grad_norm": 0.41756880594908885, - "learning_rate": 2.274023206300202e-06, - "loss": 0.4352, - "step": 5755 - }, - { - "epoch": 2.82642089093702, - "grad_norm": 0.45221324718717576, - "learning_rate": 2.2609848744013505e-06, - "loss": 0.4087, - "step": 5756 - }, - { - "epoch": 2.8269124423963135, - "grad_norm": 0.42645704169630394, - "learning_rate": 2.247983600912473e-06, - "loss": 0.4395, - "step": 5757 - }, - { - "epoch": 2.827403993855607, - "grad_norm": 0.3915655944202693, - "learning_rate": 2.2350193907631157e-06, - "loss": 0.3729, - "step": 5758 - }, - { - "epoch": 2.8278955453149, - "grad_norm": 0.39172795396693244, - "learning_rate": 2.2220922488687235e-06, - "loss": 0.4122, - "step": 5759 - }, - { - "epoch": 2.8283870967741933, - "grad_norm": 0.42898796785421117, - "learning_rate": 2.2092021801306983e-06, - "loss": 0.4358, - "step": 5760 - }, - { - "epoch": 2.828878648233487, - "grad_norm": 0.4170541030741761, - "learning_rate": 2.196349189436431e-06, - "loss": 0.4144, - "step": 5761 - }, - { - "epoch": 2.8293701996927805, - "grad_norm": 0.3871174685755417, - "learning_rate": 2.183533281659178e-06, - "loss": 0.403, - "step": 5762 - }, - { - "epoch": 2.8298617511520736, - "grad_norm": 0.43973428451434293, - "learning_rate": 2.17075446165822e-06, - "loss": 0.4252, - "step": 5763 - }, - { - "epoch": 2.830353302611367, - "grad_norm": 0.4360226716825237, - "learning_rate": 2.158012734278703e-06, - "loss": 0.419, - "step": 5764 - }, - { - "epoch": 2.8308448540706603, - "grad_norm": 0.40371625764307756, - "learning_rate": 2.145308104351762e-06, - "loss": 0.4155, - "step": 5765 - }, - { - "epoch": 2.831336405529954, - "grad_norm": 0.39080285430860423, - "learning_rate": 2.132640576694411e-06, - "loss": 0.3813, - "step": 5766 - }, - { - "epoch": 2.8318279569892475, - "grad_norm": 0.4175151123713847, - "learning_rate": 2.120010156109675e-06, - "loss": 0.4102, - "step": 5767 - }, - { - "epoch": 2.8323195084485406, - "grad_norm": 0.41894567973153435, - "learning_rate": 2.107416847386423e-06, - "loss": 0.4373, - "step": 5768 - }, - { - "epoch": 2.832811059907834, - "grad_norm": 0.4240410046404566, - "learning_rate": 2.094860655299513e-06, - "loss": 0.402, - "step": 5769 - }, - { - "epoch": 2.8333026113671274, - "grad_norm": 0.3986226573320233, - "learning_rate": 2.0823415846097037e-06, - "loss": 0.3772, - "step": 5770 - }, - { - "epoch": 2.833794162826421, - "grad_norm": 0.3849087000154648, - "learning_rate": 2.069859640063687e-06, - "loss": 0.3979, - "step": 5771 - }, - { - "epoch": 2.8342857142857145, - "grad_norm": 0.45059079762835375, - "learning_rate": 2.057414826394077e-06, - "loss": 0.4152, - "step": 5772 - }, - { - "epoch": 2.8347772657450077, - "grad_norm": 0.42056566150784974, - "learning_rate": 2.045007148319422e-06, - "loss": 0.4135, - "step": 5773 - }, - { - "epoch": 2.8352688172043012, - "grad_norm": 0.412838646332942, - "learning_rate": 2.0326366105441584e-06, - "loss": 0.4212, - "step": 5774 - }, - { - "epoch": 2.8357603686635944, - "grad_norm": 0.43651126081288794, - "learning_rate": 2.0203032177586566e-06, - "loss": 0.41, - "step": 5775 - }, - { - "epoch": 2.836251920122888, - "grad_norm": 0.41279042173374253, - "learning_rate": 2.0080069746392314e-06, - "loss": 0.4085, - "step": 5776 - }, - { - "epoch": 2.836743471582181, - "grad_norm": 0.4188024180669136, - "learning_rate": 1.995747885848054e-06, - "loss": 0.3796, - "step": 5777 - }, - { - "epoch": 2.8372350230414747, - "grad_norm": 0.40816271053360703, - "learning_rate": 1.983525956033272e-06, - "loss": 0.415, - "step": 5778 - }, - { - "epoch": 2.837726574500768, - "grad_norm": 0.4586830317603499, - "learning_rate": 1.9713411898288792e-06, - "loss": 0.4474, - "step": 5779 - }, - { - "epoch": 2.8382181259600614, - "grad_norm": 0.4147635867066052, - "learning_rate": 1.9591935918548464e-06, - "loss": 0.413, - "step": 5780 - }, - { - "epoch": 2.838709677419355, - "grad_norm": 0.4170261163736035, - "learning_rate": 1.9470831667170008e-06, - "loss": 0.4405, - "step": 5781 - }, - { - "epoch": 2.839201228878648, - "grad_norm": 0.40901257467283164, - "learning_rate": 1.9350099190071026e-06, - "loss": 0.433, - "step": 5782 - }, - { - "epoch": 2.8396927803379417, - "grad_norm": 0.4242527956777, - "learning_rate": 1.9229738533027897e-06, - "loss": 0.413, - "step": 5783 - }, - { - "epoch": 2.840184331797235, - "grad_norm": 0.4373874865628205, - "learning_rate": 1.9109749741676232e-06, - "loss": 0.4246, - "step": 5784 - }, - { - "epoch": 2.8406758832565284, - "grad_norm": 0.4231041511488596, - "learning_rate": 1.8990132861510635e-06, - "loss": 0.3885, - "step": 5785 - }, - { - "epoch": 2.841167434715822, - "grad_norm": 0.42181889900433195, - "learning_rate": 1.8870887937884606e-06, - "loss": 0.3774, - "step": 5786 - }, - { - "epoch": 2.841658986175115, - "grad_norm": 0.38920154502716, - "learning_rate": 1.875201501601087e-06, - "loss": 0.3991, - "step": 5787 - }, - { - "epoch": 2.8421505376344087, - "grad_norm": 0.46782773120654203, - "learning_rate": 1.8633514140960484e-06, - "loss": 0.4834, - "step": 5788 - }, - { - "epoch": 2.842642089093702, - "grad_norm": 0.386818692823787, - "learning_rate": 1.8515385357664283e-06, - "loss": 0.4026, - "step": 5789 - }, - { - "epoch": 2.8431336405529954, - "grad_norm": 0.40553074709774, - "learning_rate": 1.839762871091133e-06, - "loss": 0.3832, - "step": 5790 - }, - { - "epoch": 2.843625192012289, - "grad_norm": 0.41270964206202504, - "learning_rate": 1.828024424535002e-06, - "loss": 0.4305, - "step": 5791 - }, - { - "epoch": 2.844116743471582, - "grad_norm": 0.4465637385251171, - "learning_rate": 1.8163232005487418e-06, - "loss": 0.4342, - "step": 5792 - }, - { - "epoch": 2.8446082949308753, - "grad_norm": 0.4256923210296834, - "learning_rate": 1.804659203568937e-06, - "loss": 0.4174, - "step": 5793 - }, - { - "epoch": 2.845099846390169, - "grad_norm": 0.4432405410292091, - "learning_rate": 1.793032438018083e-06, - "loss": 0.4314, - "step": 5794 - }, - { - "epoch": 2.8455913978494625, - "grad_norm": 0.4304274915714804, - "learning_rate": 1.7814429083045424e-06, - "loss": 0.3916, - "step": 5795 - }, - { - "epoch": 2.8460829493087556, - "grad_norm": 0.3946844609662595, - "learning_rate": 1.7698906188225895e-06, - "loss": 0.4111, - "step": 5796 - }, - { - "epoch": 2.846574500768049, - "grad_norm": 0.42696319911981817, - "learning_rate": 1.758375573952309e-06, - "loss": 0.4272, - "step": 5797 - }, - { - "epoch": 2.8470660522273423, - "grad_norm": 0.4201958560260909, - "learning_rate": 1.7468977780597528e-06, - "loss": 0.3863, - "step": 5798 - }, - { - "epoch": 2.847557603686636, - "grad_norm": 0.3986373925787837, - "learning_rate": 1.7354572354967513e-06, - "loss": 0.3932, - "step": 5799 - }, - { - "epoch": 2.8480491551459295, - "grad_norm": 0.4004726172192835, - "learning_rate": 1.7240539506011234e-06, - "loss": 0.4057, - "step": 5800 - }, - { - "epoch": 2.8485407066052226, - "grad_norm": 0.3957133068653019, - "learning_rate": 1.7126879276964547e-06, - "loss": 0.4002, - "step": 5801 - }, - { - "epoch": 2.849032258064516, - "grad_norm": 0.44720652225597013, - "learning_rate": 1.7013591710922761e-06, - "loss": 0.4693, - "step": 5802 - }, - { - "epoch": 2.8495238095238093, - "grad_norm": 0.4045320606754897, - "learning_rate": 1.6900676850839515e-06, - "loss": 0.4166, - "step": 5803 - }, - { - "epoch": 2.850015360983103, - "grad_norm": 0.41359523976204987, - "learning_rate": 1.678813473952745e-06, - "loss": 0.3971, - "step": 5804 - }, - { - "epoch": 2.8505069124423965, - "grad_norm": 0.42316833962175493, - "learning_rate": 1.6675965419657325e-06, - "loss": 0.4342, - "step": 5805 - }, - { - "epoch": 2.8509984639016897, - "grad_norm": 0.4274162811885969, - "learning_rate": 1.6564168933759006e-06, - "loss": 0.4066, - "step": 5806 - }, - { - "epoch": 2.8514900153609832, - "grad_norm": 0.42895370674760874, - "learning_rate": 1.645274532422114e-06, - "loss": 0.4639, - "step": 5807 - }, - { - "epoch": 2.8519815668202764, - "grad_norm": 0.4082614372312397, - "learning_rate": 1.6341694633290495e-06, - "loss": 0.3878, - "step": 5808 - }, - { - "epoch": 2.85247311827957, - "grad_norm": 0.3925800970725663, - "learning_rate": 1.6231016903072715e-06, - "loss": 0.387, - "step": 5809 - }, - { - "epoch": 2.8529646697388635, - "grad_norm": 0.4224411583072324, - "learning_rate": 1.6120712175532128e-06, - "loss": 0.4214, - "step": 5810 - }, - { - "epoch": 2.8534562211981567, - "grad_norm": 0.4162720212427797, - "learning_rate": 1.6010780492491383e-06, - "loss": 0.4497, - "step": 5811 - }, - { - "epoch": 2.85394777265745, - "grad_norm": 0.4134220209944446, - "learning_rate": 1.5901221895631814e-06, - "loss": 0.4415, - "step": 5812 - }, - { - "epoch": 2.8544393241167434, - "grad_norm": 0.4178793963449324, - "learning_rate": 1.5792036426493517e-06, - "loss": 0.4488, - "step": 5813 - }, - { - "epoch": 2.854930875576037, - "grad_norm": 0.3958854792473097, - "learning_rate": 1.5683224126474604e-06, - "loss": 0.3668, - "step": 5814 - }, - { - "epoch": 2.85542242703533, - "grad_norm": 0.42444608032520237, - "learning_rate": 1.5574785036832297e-06, - "loss": 0.4643, - "step": 5815 - }, - { - "epoch": 2.8559139784946237, - "grad_norm": 0.40419884209556983, - "learning_rate": 1.546671919868181e-06, - "loss": 0.4401, - "step": 5816 - }, - { - "epoch": 2.856405529953917, - "grad_norm": 0.42815907611256276, - "learning_rate": 1.5359026652997044e-06, - "loss": 0.432, - "step": 5817 - }, - { - "epoch": 2.8568970814132104, - "grad_norm": 0.4094154752308921, - "learning_rate": 1.5251707440610552e-06, - "loss": 0.3749, - "step": 5818 - }, - { - "epoch": 2.857388632872504, - "grad_norm": 0.4217212400015568, - "learning_rate": 1.5144761602212899e-06, - "loss": 0.4236, - "step": 5819 - }, - { - "epoch": 2.857880184331797, - "grad_norm": 0.402599232840116, - "learning_rate": 1.5038189178353424e-06, - "loss": 0.3897, - "step": 5820 - }, - { - "epoch": 2.8583717357910907, - "grad_norm": 0.4471144273015755, - "learning_rate": 1.4931990209439805e-06, - "loss": 0.401, - "step": 5821 - }, - { - "epoch": 2.858863287250384, - "grad_norm": 0.42827422892425915, - "learning_rate": 1.4826164735738057e-06, - "loss": 0.4186, - "step": 5822 - }, - { - "epoch": 2.8593548387096774, - "grad_norm": 0.41381727659146794, - "learning_rate": 1.4720712797372638e-06, - "loss": 0.3949, - "step": 5823 - }, - { - "epoch": 2.859846390168971, - "grad_norm": 0.4076505586040814, - "learning_rate": 1.4615634434326453e-06, - "loss": 0.4389, - "step": 5824 - }, - { - "epoch": 2.860337941628264, - "grad_norm": 0.42843366544843803, - "learning_rate": 1.4510929686440412e-06, - "loss": 0.4608, - "step": 5825 - }, - { - "epoch": 2.8608294930875577, - "grad_norm": 0.42937302912340913, - "learning_rate": 1.4406598593414311e-06, - "loss": 0.3884, - "step": 5826 - }, - { - "epoch": 2.861321044546851, - "grad_norm": 0.39636500572588623, - "learning_rate": 1.4302641194805955e-06, - "loss": 0.3653, - "step": 5827 - }, - { - "epoch": 2.8618125960061445, - "grad_norm": 0.4058676567250964, - "learning_rate": 1.4199057530031367e-06, - "loss": 0.4351, - "step": 5828 - }, - { - "epoch": 2.862304147465438, - "grad_norm": 0.45160943605787573, - "learning_rate": 1.4095847638365133e-06, - "loss": 0.456, - "step": 5829 - }, - { - "epoch": 2.862795698924731, - "grad_norm": 0.40248423112533765, - "learning_rate": 1.399301155893995e-06, - "loss": 0.3999, - "step": 5830 - }, - { - "epoch": 2.8632872503840243, - "grad_norm": 0.45058274488231065, - "learning_rate": 1.3890549330746626e-06, - "loss": 0.4303, - "step": 5831 - }, - { - "epoch": 2.863778801843318, - "grad_norm": 0.417595846867743, - "learning_rate": 1.3788460992634644e-06, - "loss": 0.4307, - "step": 5832 - }, - { - "epoch": 2.8642703533026115, - "grad_norm": 0.42524426477909055, - "learning_rate": 1.3686746583311483e-06, - "loss": 0.3951, - "step": 5833 - }, - { - "epoch": 2.8647619047619046, - "grad_norm": 0.41566176406668565, - "learning_rate": 1.3585406141342517e-06, - "loss": 0.4337, - "step": 5834 - }, - { - "epoch": 2.865253456221198, - "grad_norm": 0.38436414496786353, - "learning_rate": 1.348443970515212e-06, - "loss": 0.3908, - "step": 5835 - }, - { - "epoch": 2.8657450076804913, - "grad_norm": 0.41940734233633625, - "learning_rate": 1.3383847313022224e-06, - "loss": 0.4346, - "step": 5836 - }, - { - "epoch": 2.866236559139785, - "grad_norm": 0.40964351098155755, - "learning_rate": 1.328362900309299e-06, - "loss": 0.4394, - "step": 5837 - }, - { - "epoch": 2.8667281105990785, - "grad_norm": 0.38558104252725756, - "learning_rate": 1.318378481336302e-06, - "loss": 0.4239, - "step": 5838 - }, - { - "epoch": 2.8672196620583716, - "grad_norm": 0.4397126380793387, - "learning_rate": 1.3084314781688923e-06, - "loss": 0.441, - "step": 5839 - }, - { - "epoch": 2.8677112135176652, - "grad_norm": 0.39452989709684694, - "learning_rate": 1.2985218945785304e-06, - "loss": 0.4113, - "step": 5840 - }, - { - "epoch": 2.8682027649769584, - "grad_norm": 0.4077174202666877, - "learning_rate": 1.2886497343225002e-06, - "loss": 0.3975, - "step": 5841 - }, - { - "epoch": 2.868694316436252, - "grad_norm": 0.40520758288034336, - "learning_rate": 1.2788150011439294e-06, - "loss": 0.3973, - "step": 5842 - }, - { - "epoch": 2.8691858678955455, - "grad_norm": 0.4158537857966892, - "learning_rate": 1.2690176987716907e-06, - "loss": 0.411, - "step": 5843 - }, - { - "epoch": 2.8696774193548387, - "grad_norm": 0.43070013333943247, - "learning_rate": 1.2592578309205017e-06, - "loss": 0.4, - "step": 5844 - }, - { - "epoch": 2.8701689708141322, - "grad_norm": 0.3975217116767363, - "learning_rate": 1.249535401290902e-06, - "loss": 0.4092, - "step": 5845 - }, - { - "epoch": 2.8706605222734254, - "grad_norm": 0.42259862112336966, - "learning_rate": 1.2398504135691991e-06, - "loss": 0.4187, - "step": 5846 - }, - { - "epoch": 2.871152073732719, - "grad_norm": 0.38713490890227364, - "learning_rate": 1.2302028714275215e-06, - "loss": 0.3954, - "step": 5847 - }, - { - "epoch": 2.8716436251920125, - "grad_norm": 0.38207755238597946, - "learning_rate": 1.2205927785238213e-06, - "loss": 0.3769, - "step": 5848 - }, - { - "epoch": 2.8721351766513057, - "grad_norm": 0.40466802391199014, - "learning_rate": 1.211020138501795e-06, - "loss": 0.3993, - "step": 5849 - }, - { - "epoch": 2.872626728110599, - "grad_norm": 0.3888509310388858, - "learning_rate": 1.2014849549910056e-06, - "loss": 0.3767, - "step": 5850 - }, - { - "epoch": 2.8731182795698924, - "grad_norm": 0.39918166555643847, - "learning_rate": 1.1919872316067726e-06, - "loss": 0.4239, - "step": 5851 - }, - { - "epoch": 2.873609831029186, - "grad_norm": 0.42365474077158444, - "learning_rate": 1.1825269719502041e-06, - "loss": 0.3996, - "step": 5852 - }, - { - "epoch": 2.874101382488479, - "grad_norm": 0.40383653043538387, - "learning_rate": 1.1731041796082531e-06, - "loss": 0.3815, - "step": 5853 - }, - { - "epoch": 2.8745929339477727, - "grad_norm": 0.3664272594363523, - "learning_rate": 1.1637188581536175e-06, - "loss": 0.3477, - "step": 5854 - }, - { - "epoch": 2.875084485407066, - "grad_norm": 0.4014211416926223, - "learning_rate": 1.1543710111448059e-06, - "loss": 0.3835, - "step": 5855 - }, - { - "epoch": 2.8755760368663594, - "grad_norm": 0.42697665438228166, - "learning_rate": 1.1450606421261167e-06, - "loss": 0.4205, - "step": 5856 - }, - { - "epoch": 2.876067588325653, - "grad_norm": 0.43225707718851697, - "learning_rate": 1.1357877546276485e-06, - "loss": 0.421, - "step": 5857 - }, - { - "epoch": 2.876559139784946, - "grad_norm": 0.38102777069228094, - "learning_rate": 1.1265523521652666e-06, - "loss": 0.3746, - "step": 5858 - }, - { - "epoch": 2.8770506912442397, - "grad_norm": 0.4062530572452384, - "learning_rate": 1.1173544382406476e-06, - "loss": 0.4073, - "step": 5859 - }, - { - "epoch": 2.877542242703533, - "grad_norm": 0.4367819112894127, - "learning_rate": 1.1081940163412352e-06, - "loss": 0.433, - "step": 5860 - }, - { - "epoch": 2.8780337941628265, - "grad_norm": 0.4337088517858264, - "learning_rate": 1.0990710899402733e-06, - "loss": 0.4007, - "step": 5861 - }, - { - "epoch": 2.87852534562212, - "grad_norm": 0.41737324630561373, - "learning_rate": 1.089985662496773e-06, - "loss": 0.4469, - "step": 5862 - }, - { - "epoch": 2.879016897081413, - "grad_norm": 0.3890062337328297, - "learning_rate": 1.080937737455534e-06, - "loss": 0.3611, - "step": 5863 - }, - { - "epoch": 2.8795084485407068, - "grad_norm": 0.42159932402698724, - "learning_rate": 1.0719273182471569e-06, - "loss": 0.4296, - "step": 5864 - }, - { - "epoch": 2.88, - "grad_norm": 0.40591650799539847, - "learning_rate": 1.0629544082879861e-06, - "loss": 0.391, - "step": 5865 - }, - { - "epoch": 2.8804915514592935, - "grad_norm": 0.4053640976693638, - "learning_rate": 1.0540190109801562e-06, - "loss": 0.4571, - "step": 5866 - }, - { - "epoch": 2.880983102918587, - "grad_norm": 0.38604230174811205, - "learning_rate": 1.0451211297116015e-06, - "loss": 0.3873, - "step": 5867 - }, - { - "epoch": 2.88147465437788, - "grad_norm": 0.4278692361086267, - "learning_rate": 1.036260767856001e-06, - "loss": 0.3741, - "step": 5868 - }, - { - "epoch": 2.8819662058371733, - "grad_norm": 0.4256563762435781, - "learning_rate": 1.0274379287728232e-06, - "loss": 0.3957, - "step": 5869 - }, - { - "epoch": 2.882457757296467, - "grad_norm": 0.46252690525039364, - "learning_rate": 1.0186526158073251e-06, - "loss": 0.4108, - "step": 5870 - }, - { - "epoch": 2.8829493087557605, - "grad_norm": 0.4047418843201301, - "learning_rate": 1.009904832290487e-06, - "loss": 0.3955, - "step": 5871 - }, - { - "epoch": 2.8834408602150536, - "grad_norm": 0.4077592414319878, - "learning_rate": 1.0011945815391e-06, - "loss": 0.3968, - "step": 5872 - }, - { - "epoch": 2.883932411674347, - "grad_norm": 0.3862666192750378, - "learning_rate": 9.925218668557334e-07, - "loss": 0.3736, - "step": 5873 - }, - { - "epoch": 2.8844239631336404, - "grad_norm": 0.4254112125731013, - "learning_rate": 9.838866915286903e-07, - "loss": 0.4322, - "step": 5874 - }, - { - "epoch": 2.884915514592934, - "grad_norm": 0.38933308848071546, - "learning_rate": 9.752890588320518e-07, - "loss": 0.3825, - "step": 5875 - }, - { - "epoch": 2.8854070660522275, - "grad_norm": 0.39411985329179455, - "learning_rate": 9.667289720256766e-07, - "loss": 0.3818, - "step": 5876 - }, - { - "epoch": 2.8858986175115207, - "grad_norm": 0.41074578261619943, - "learning_rate": 9.5820643435518e-07, - "loss": 0.4238, - "step": 5877 - }, - { - "epoch": 2.8863901689708142, - "grad_norm": 0.443401602818785, - "learning_rate": 9.497214490519213e-07, - "loss": 0.4315, - "step": 5878 - }, - { - "epoch": 2.8868817204301074, - "grad_norm": 0.40812302458216504, - "learning_rate": 9.412740193330827e-07, - "loss": 0.4324, - "step": 5879 - }, - { - "epoch": 2.887373271889401, - "grad_norm": 0.42011468714114697, - "learning_rate": 9.328641484015244e-07, - "loss": 0.4032, - "step": 5880 - }, - { - "epoch": 2.8878648233486945, - "grad_norm": 0.3953711918512807, - "learning_rate": 9.244918394459179e-07, - "loss": 0.4026, - "step": 5881 - }, - { - "epoch": 2.8883563748079877, - "grad_norm": 0.45792210525002497, - "learning_rate": 9.161570956406907e-07, - "loss": 0.4647, - "step": 5882 - }, - { - "epoch": 2.8888479262672813, - "grad_norm": 0.41823347848545295, - "learning_rate": 9.078599201460036e-07, - "loss": 0.4495, - "step": 5883 - }, - { - "epoch": 2.8893394777265744, - "grad_norm": 0.4296176804479501, - "learning_rate": 8.99600316107796e-07, - "loss": 0.4081, - "step": 5884 - }, - { - "epoch": 2.889831029185868, - "grad_norm": 0.4209430754008335, - "learning_rate": 8.913782866577403e-07, - "loss": 0.4202, - "step": 5885 - }, - { - "epoch": 2.8903225806451616, - "grad_norm": 0.4162519675064981, - "learning_rate": 8.831938349132984e-07, - "loss": 0.4464, - "step": 5886 - }, - { - "epoch": 2.8908141321044547, - "grad_norm": 0.4113365207521592, - "learning_rate": 8.750469639776326e-07, - "loss": 0.444, - "step": 5887 - }, - { - "epoch": 2.891305683563748, - "grad_norm": 0.41122673214944083, - "learning_rate": 8.669376769397053e-07, - "loss": 0.4466, - "step": 5888 - }, - { - "epoch": 2.8917972350230414, - "grad_norm": 0.3850543479460839, - "learning_rate": 8.588659768741903e-07, - "loss": 0.401, - "step": 5889 - }, - { - "epoch": 2.892288786482335, - "grad_norm": 0.38540758790413565, - "learning_rate": 8.508318668415505e-07, - "loss": 0.3831, - "step": 5890 - }, - { - "epoch": 2.892780337941628, - "grad_norm": 0.4245614822640265, - "learning_rate": 8.428353498879493e-07, - "loss": 0.4397, - "step": 5891 - }, - { - "epoch": 2.8932718894009217, - "grad_norm": 0.4498402247464881, - "learning_rate": 8.348764290453392e-07, - "loss": 0.4868, - "step": 5892 - }, - { - "epoch": 2.893763440860215, - "grad_norm": 0.4442629847525694, - "learning_rate": 8.269551073313841e-07, - "loss": 0.4432, - "step": 5893 - }, - { - "epoch": 2.8942549923195084, - "grad_norm": 0.38416652535860824, - "learning_rate": 8.19071387749526e-07, - "loss": 0.424, - "step": 5894 - }, - { - "epoch": 2.894746543778802, - "grad_norm": 0.417534663952261, - "learning_rate": 8.112252732888959e-07, - "loss": 0.3897, - "step": 5895 - }, - { - "epoch": 2.895238095238095, - "grad_norm": 0.41071717447054606, - "learning_rate": 8.034167669244475e-07, - "loss": 0.4453, - "step": 5896 - }, - { - "epoch": 2.8957296466973887, - "grad_norm": 3.342217382367793, - "learning_rate": 7.956458716167902e-07, - "loss": 0.5777, - "step": 5897 - }, - { - "epoch": 2.896221198156682, - "grad_norm": 0.4178805006042299, - "learning_rate": 7.87912590312323e-07, - "loss": 0.4048, - "step": 5898 - }, - { - "epoch": 2.8967127496159755, - "grad_norm": 0.40527456086358044, - "learning_rate": 7.80216925943189e-07, - "loss": 0.3881, - "step": 5899 - }, - { - "epoch": 2.897204301075269, - "grad_norm": 0.3943595674242165, - "learning_rate": 7.725588814272211e-07, - "loss": 0.4436, - "step": 5900 - }, - { - "epoch": 2.897695852534562, - "grad_norm": 0.4246094853771633, - "learning_rate": 7.64938459668052e-07, - "loss": 0.3692, - "step": 5901 - }, - { - "epoch": 2.8981874039938558, - "grad_norm": 0.37153255095226706, - "learning_rate": 7.573556635549928e-07, - "loss": 0.394, - "step": 5902 - }, - { - "epoch": 2.898678955453149, - "grad_norm": 0.4302436644540405, - "learning_rate": 7.498104959631103e-07, - "loss": 0.4505, - "step": 5903 - }, - { - "epoch": 2.8991705069124425, - "grad_norm": 0.420636643242112, - "learning_rate": 7.423029597532161e-07, - "loss": 0.4129, - "step": 5904 - }, - { - "epoch": 2.8996620583717356, - "grad_norm": 0.4212966683961716, - "learning_rate": 7.348330577718554e-07, - "loss": 0.3766, - "step": 5905 - }, - { - "epoch": 2.900153609831029, - "grad_norm": 0.4150062744422125, - "learning_rate": 7.274007928512627e-07, - "loss": 0.4442, - "step": 5906 - }, - { - "epoch": 2.9006451612903223, - "grad_norm": 0.426745939723344, - "learning_rate": 7.200061678094505e-07, - "loss": 0.4156, - "step": 5907 - }, - { - "epoch": 2.901136712749616, - "grad_norm": 0.404007805671517, - "learning_rate": 7.126491854501427e-07, - "loss": 0.3804, - "step": 5908 - }, - { - "epoch": 2.9016282642089095, - "grad_norm": 0.42348993273163427, - "learning_rate": 7.053298485627857e-07, - "loss": 0.4138, - "step": 5909 - }, - { - "epoch": 2.9021198156682027, - "grad_norm": 0.4360258952759854, - "learning_rate": 6.980481599225486e-07, - "loss": 0.4028, - "step": 5910 - }, - { - "epoch": 2.9026113671274962, - "grad_norm": 0.41588938401658465, - "learning_rate": 6.908041222903449e-07, - "loss": 0.3815, - "step": 5911 - }, - { - "epoch": 2.9031029185867894, - "grad_norm": 0.39590760874778863, - "learning_rate": 6.83597738412789e-07, - "loss": 0.3871, - "step": 5912 - }, - { - "epoch": 2.903594470046083, - "grad_norm": 0.4070223484307068, - "learning_rate": 6.764290110222394e-07, - "loss": 0.4295, - "step": 5913 - }, - { - "epoch": 2.9040860215053765, - "grad_norm": 0.4183721327239439, - "learning_rate": 6.692979428367663e-07, - "loss": 0.4251, - "step": 5914 - }, - { - "epoch": 2.9045775729646697, - "grad_norm": 0.38848263692664525, - "learning_rate": 6.622045365601515e-07, - "loss": 0.4233, - "step": 5915 - }, - { - "epoch": 2.9050691244239633, - "grad_norm": 0.4248028009965904, - "learning_rate": 6.551487948819212e-07, - "loss": 0.454, - "step": 5916 - }, - { - "epoch": 2.9055606758832564, - "grad_norm": 0.4265998555776826, - "learning_rate": 6.481307204773024e-07, - "loss": 0.4264, - "step": 5917 - }, - { - "epoch": 2.90605222734255, - "grad_norm": 0.4414698848708529, - "learning_rate": 6.411503160072441e-07, - "loss": 0.4647, - "step": 5918 - }, - { - "epoch": 2.9065437788018436, - "grad_norm": 0.42171120321986805, - "learning_rate": 6.34207584118418e-07, - "loss": 0.3938, - "step": 5919 - }, - { - "epoch": 2.9070353302611367, - "grad_norm": 0.42297836590279686, - "learning_rate": 6.273025274431965e-07, - "loss": 0.4059, - "step": 5920 - }, - { - "epoch": 2.90752688172043, - "grad_norm": 0.4198860005329027, - "learning_rate": 6.204351485997073e-07, - "loss": 0.4155, - "step": 5921 - }, - { - "epoch": 2.9080184331797234, - "grad_norm": 0.38911050250036117, - "learning_rate": 6.136054501917232e-07, - "loss": 0.3943, - "step": 5922 - }, - { - "epoch": 2.908509984639017, - "grad_norm": 0.3881482600331499, - "learning_rate": 6.068134348088061e-07, - "loss": 0.4115, - "step": 5923 - }, - { - "epoch": 2.90900153609831, - "grad_norm": 0.4156476478433157, - "learning_rate": 6.000591050261739e-07, - "loss": 0.3963, - "step": 5924 - }, - { - "epoch": 2.9094930875576037, - "grad_norm": 0.43041357985433026, - "learning_rate": 5.933424634047891e-07, - "loss": 0.4928, - "step": 5925 - }, - { - "epoch": 2.909984639016897, - "grad_norm": 0.4383580898047529, - "learning_rate": 5.866635124913034e-07, - "loss": 0.4156, - "step": 5926 - }, - { - "epoch": 2.9104761904761904, - "grad_norm": 0.430766224847513, - "learning_rate": 5.8002225481808e-07, - "loss": 0.4973, - "step": 5927 - }, - { - "epoch": 2.910967741935484, - "grad_norm": 0.5198957679999848, - "learning_rate": 5.734186929032159e-07, - "loss": 0.4376, - "step": 5928 - }, - { - "epoch": 2.911459293394777, - "grad_norm": 0.39222659680119215, - "learning_rate": 5.66852829250486e-07, - "loss": 0.378, - "step": 5929 - }, - { - "epoch": 2.9119508448540707, - "grad_norm": 0.3962717075758154, - "learning_rate": 5.603246663493766e-07, - "loss": 0.3825, - "step": 5930 - }, - { - "epoch": 2.912442396313364, - "grad_norm": 0.4420441291907064, - "learning_rate": 5.538342066750968e-07, - "loss": 0.443, - "step": 5931 - }, - { - "epoch": 2.9129339477726575, - "grad_norm": 0.4085987606406521, - "learning_rate": 5.473814526885335e-07, - "loss": 0.4153, - "step": 5932 - }, - { - "epoch": 2.913425499231951, - "grad_norm": 0.40317330597251816, - "learning_rate": 5.409664068362963e-07, - "loss": 0.4333, - "step": 5933 - }, - { - "epoch": 2.913917050691244, - "grad_norm": 0.4327706832914622, - "learning_rate": 5.345890715507173e-07, - "loss": 0.4125, - "step": 5934 - }, - { - "epoch": 2.9144086021505378, - "grad_norm": 0.41311355124205346, - "learning_rate": 5.282494492497736e-07, - "loss": 0.3668, - "step": 5935 - }, - { - "epoch": 2.914900153609831, - "grad_norm": 0.44293607894209375, - "learning_rate": 5.219475423371867e-07, - "loss": 0.4479, - "step": 5936 - }, - { - "epoch": 2.9153917050691245, - "grad_norm": 0.4232264828938185, - "learning_rate": 5.156833532023675e-07, - "loss": 0.3848, - "step": 5937 - }, - { - "epoch": 2.915883256528418, - "grad_norm": 0.431431143628101, - "learning_rate": 5.094568842204383e-07, - "loss": 0.428, - "step": 5938 - }, - { - "epoch": 2.916374807987711, - "grad_norm": 0.4435159455107381, - "learning_rate": 5.032681377521886e-07, - "loss": 0.4346, - "step": 5939 - }, - { - "epoch": 2.9168663594470043, - "grad_norm": 0.43443250079825363, - "learning_rate": 4.971171161441302e-07, - "loss": 0.4418, - "step": 5940 - }, - { - "epoch": 2.917357910906298, - "grad_norm": 0.40421011489480574, - "learning_rate": 4.910038217284752e-07, - "loss": 0.4368, - "step": 5941 - }, - { - "epoch": 2.9178494623655915, - "grad_norm": 0.42642056514858656, - "learning_rate": 4.849282568231028e-07, - "loss": 0.4321, - "step": 5942 - }, - { - "epoch": 2.9183410138248846, - "grad_norm": 0.39507647525376055, - "learning_rate": 4.788904237316149e-07, - "loss": 0.41, - "step": 5943 - }, - { - "epoch": 2.9188325652841782, - "grad_norm": 0.3858639562932033, - "learning_rate": 4.7289032474329143e-07, - "loss": 0.4189, - "step": 5944 - }, - { - "epoch": 2.9193241167434714, - "grad_norm": 0.47691969890438757, - "learning_rate": 4.669279621331235e-07, - "loss": 0.4101, - "step": 5945 - }, - { - "epoch": 2.919815668202765, - "grad_norm": 0.4105611443112569, - "learning_rate": 4.610033381617695e-07, - "loss": 0.4647, - "step": 5946 - }, - { - "epoch": 2.9203072196620585, - "grad_norm": 0.4063027536070045, - "learning_rate": 4.551164550755882e-07, - "loss": 0.3664, - "step": 5947 - }, - { - "epoch": 2.9207987711213517, - "grad_norm": 0.39798519458046383, - "learning_rate": 4.4926731510663843e-07, - "loss": 0.397, - "step": 5948 - }, - { - "epoch": 2.9212903225806452, - "grad_norm": 0.42032651916849023, - "learning_rate": 4.434559204726574e-07, - "loss": 0.4208, - "step": 5949 - }, - { - "epoch": 2.9217818740399384, - "grad_norm": 0.41567196570469567, - "learning_rate": 4.3768227337707135e-07, - "loss": 0.4274, - "step": 5950 - }, - { - "epoch": 2.922273425499232, - "grad_norm": 0.4027071416383512, - "learning_rate": 4.3194637600901813e-07, - "loss": 0.4226, - "step": 5951 - }, - { - "epoch": 2.9227649769585256, - "grad_norm": 0.4083293654665912, - "learning_rate": 4.2624823054328024e-07, - "loss": 0.4383, - "step": 5952 - }, - { - "epoch": 2.9232565284178187, - "grad_norm": 0.4202209247395239, - "learning_rate": 4.205878391403517e-07, - "loss": 0.4068, - "step": 5953 - }, - { - "epoch": 2.9237480798771123, - "grad_norm": 0.4144287902263344, - "learning_rate": 4.149652039464047e-07, - "loss": 0.3911, - "step": 5954 - }, - { - "epoch": 2.9242396313364054, - "grad_norm": 0.41504942741631884, - "learning_rate": 4.0938032709332275e-07, - "loss": 0.4082, - "step": 5955 - }, - { - "epoch": 2.924731182795699, - "grad_norm": 0.3846908548282004, - "learning_rate": 4.038332106986231e-07, - "loss": 0.3927, - "step": 5956 - }, - { - "epoch": 2.9252227342549926, - "grad_norm": 0.41705628166588243, - "learning_rate": 3.983238568655345e-07, - "loss": 0.3615, - "step": 5957 - }, - { - "epoch": 2.9257142857142857, - "grad_norm": 0.4507590009565246, - "learning_rate": 3.92852267682986e-07, - "loss": 0.4831, - "step": 5958 - }, - { - "epoch": 2.926205837173579, - "grad_norm": 0.39985549841289475, - "learning_rate": 3.8741844522555136e-07, - "loss": 0.4306, - "step": 5959 - }, - { - "epoch": 2.9266973886328724, - "grad_norm": 0.4201925462347482, - "learning_rate": 3.820223915535048e-07, - "loss": 0.4048, - "step": 5960 - }, - { - "epoch": 2.927188940092166, - "grad_norm": 0.40623733028686093, - "learning_rate": 3.7666410871279867e-07, - "loss": 0.4137, - "step": 5961 - }, - { - "epoch": 2.927680491551459, - "grad_norm": 0.4298677565113412, - "learning_rate": 3.713435987350522e-07, - "loss": 0.4336, - "step": 5962 - }, - { - "epoch": 2.9281720430107527, - "grad_norm": 0.451860189302632, - "learning_rate": 3.6606086363759615e-07, - "loss": 0.4045, - "step": 5963 - }, - { - "epoch": 2.928663594470046, - "grad_norm": 0.42494963642857425, - "learning_rate": 3.608159054233951e-07, - "loss": 0.4167, - "step": 5964 - }, - { - "epoch": 2.9291551459293395, - "grad_norm": 0.40904406489952366, - "learning_rate": 3.556087260811136e-07, - "loss": 0.4094, - "step": 5965 - }, - { - "epoch": 2.929646697388633, - "grad_norm": 0.42327856786153, - "learning_rate": 3.504393275850948e-07, - "loss": 0.4143, - "step": 5966 - }, - { - "epoch": 2.930138248847926, - "grad_norm": 0.3941127188442293, - "learning_rate": 3.453077118953374e-07, - "loss": 0.3891, - "step": 5967 - }, - { - "epoch": 2.9306298003072198, - "grad_norm": 0.41281696074150137, - "learning_rate": 3.402138809575517e-07, - "loss": 0.3806, - "step": 5968 - }, - { - "epoch": 2.931121351766513, - "grad_norm": 0.4092918330993273, - "learning_rate": 3.3515783670307057e-07, - "loss": 0.3871, - "step": 5969 - }, - { - "epoch": 2.9316129032258065, - "grad_norm": 0.40678432857497393, - "learning_rate": 3.301395810489494e-07, - "loss": 0.3949, - "step": 5970 - }, - { - "epoch": 2.9321044546851, - "grad_norm": 0.433874021223381, - "learning_rate": 3.251591158978884e-07, - "loss": 0.4499, - "step": 5971 - }, - { - "epoch": 2.932596006144393, - "grad_norm": 0.3880236694309334, - "learning_rate": 3.202164431382659e-07, - "loss": 0.4031, - "step": 5972 - }, - { - "epoch": 2.933087557603687, - "grad_norm": 0.4140154485434843, - "learning_rate": 3.1531156464411624e-07, - "loss": 0.4029, - "step": 5973 - }, - { - "epoch": 2.93357910906298, - "grad_norm": 0.4203633666443706, - "learning_rate": 3.10444482275174e-07, - "loss": 0.3991, - "step": 5974 - }, - { - "epoch": 2.9340706605222735, - "grad_norm": 0.423022842267573, - "learning_rate": 3.056151978768185e-07, - "loss": 0.4091, - "step": 5975 - }, - { - "epoch": 2.934562211981567, - "grad_norm": 0.4319111693033177, - "learning_rate": 3.0082371328010727e-07, - "loss": 0.4636, - "step": 5976 - }, - { - "epoch": 2.93505376344086, - "grad_norm": 0.3996701173390591, - "learning_rate": 2.9607003030176494e-07, - "loss": 0.3551, - "step": 5977 - }, - { - "epoch": 2.9355453149001534, - "grad_norm": 0.4021917342472955, - "learning_rate": 2.91354150744183e-07, - "loss": 0.4354, - "step": 5978 - }, - { - "epoch": 2.936036866359447, - "grad_norm": 0.4070519322847705, - "learning_rate": 2.8667607639542016e-07, - "loss": 0.3898, - "step": 5979 - }, - { - "epoch": 2.9365284178187405, - "grad_norm": 0.4052114101319615, - "learning_rate": 2.820358090291908e-07, - "loss": 0.424, - "step": 5980 - }, - { - "epoch": 2.9370199692780337, - "grad_norm": 0.4231356716503333, - "learning_rate": 2.774333504048876e-07, - "loss": 0.4334, - "step": 5981 - }, - { - "epoch": 2.9375115207373272, - "grad_norm": 0.4210842344560463, - "learning_rate": 2.7286870226758135e-07, - "loss": 0.4146, - "step": 5982 - }, - { - "epoch": 2.9380030721966204, - "grad_norm": 0.38704619471286666, - "learning_rate": 2.6834186634796534e-07, - "loss": 0.3902, - "step": 5983 - }, - { - "epoch": 2.938494623655914, - "grad_norm": 0.3988897628911073, - "learning_rate": 2.638528443624333e-07, - "loss": 0.3953, - "step": 5984 - }, - { - "epoch": 2.9389861751152075, - "grad_norm": 0.4200837658240136, - "learning_rate": 2.5940163801301267e-07, - "loss": 0.3957, - "step": 5985 - }, - { - "epoch": 2.9394777265745007, - "grad_norm": 0.4228404926309639, - "learning_rate": 2.5498824898744224e-07, - "loss": 0.4198, - "step": 5986 - }, - { - "epoch": 2.9399692780337943, - "grad_norm": 0.38672086814750295, - "learning_rate": 2.5061267895905016e-07, - "loss": 0.4091, - "step": 5987 - }, - { - "epoch": 2.9404608294930874, - "grad_norm": 0.41196772689289507, - "learning_rate": 2.4627492958688714e-07, - "loss": 0.4327, - "step": 5988 - }, - { - "epoch": 2.940952380952381, - "grad_norm": 0.38198369126642734, - "learning_rate": 2.4197500251563753e-07, - "loss": 0.3585, - "step": 5989 - }, - { - "epoch": 2.9414439324116746, - "grad_norm": 0.416107428976138, - "learning_rate": 2.3771289937563056e-07, - "loss": 0.4041, - "step": 5990 - }, - { - "epoch": 2.9419354838709677, - "grad_norm": 0.4014966002475941, - "learning_rate": 2.3348862178289575e-07, - "loss": 0.4166, - "step": 5991 - }, - { - "epoch": 2.9424270353302613, - "grad_norm": 0.42158686121084765, - "learning_rate": 2.2930217133907418e-07, - "loss": 0.4571, - "step": 5992 - }, - { - "epoch": 2.9429185867895544, - "grad_norm": 0.5447030154486988, - "learning_rate": 2.2515354963150715e-07, - "loss": 0.4161, - "step": 5993 - }, - { - "epoch": 2.943410138248848, - "grad_norm": 0.4003026766203279, - "learning_rate": 2.2104275823315868e-07, - "loss": 0.4411, - "step": 5994 - }, - { - "epoch": 2.9439016897081416, - "grad_norm": 0.3894061583770945, - "learning_rate": 2.1696979870267087e-07, - "loss": 0.3544, - "step": 5995 - }, - { - "epoch": 2.9443932411674347, - "grad_norm": 0.41164779323823, - "learning_rate": 2.1293467258433065e-07, - "loss": 0.4686, - "step": 5996 - }, - { - "epoch": 2.944884792626728, - "grad_norm": 0.4119417860239911, - "learning_rate": 2.0893738140808083e-07, - "loss": 0.4151, - "step": 5997 - }, - { - "epoch": 2.9453763440860214, - "grad_norm": 0.42082802046114, - "learning_rate": 2.0497792668953132e-07, - "loss": 0.4193, - "step": 5998 - }, - { - "epoch": 2.945867895545315, - "grad_norm": 0.4200033745830691, - "learning_rate": 2.0105630992992564e-07, - "loss": 0.4361, - "step": 5999 - }, - { - "epoch": 2.946359447004608, - "grad_norm": 0.43196118605709805, - "learning_rate": 1.9717253261617442e-07, - "loss": 0.4441, - "step": 6000 - }, - { - "epoch": 2.9468509984639017, - "grad_norm": 0.4144578066395236, - "learning_rate": 1.9332659622083304e-07, - "loss": 0.4449, - "step": 6001 - }, - { - "epoch": 2.947342549923195, - "grad_norm": 0.3873889749151009, - "learning_rate": 1.8951850220213508e-07, - "loss": 0.4288, - "step": 6002 - }, - { - "epoch": 2.9478341013824885, - "grad_norm": 0.4010679148855431, - "learning_rate": 1.8574825200391445e-07, - "loss": 0.3936, - "step": 6003 - }, - { - "epoch": 2.948325652841782, - "grad_norm": 0.40243074836209924, - "learning_rate": 1.820158470557165e-07, - "loss": 0.3806, - "step": 6004 - }, - { - "epoch": 2.948817204301075, - "grad_norm": 0.4274781332589534, - "learning_rate": 1.7832128877268705e-07, - "loss": 0.3942, - "step": 6005 - }, - { - "epoch": 2.9493087557603688, - "grad_norm": 0.4069318726121919, - "learning_rate": 1.7466457855565e-07, - "loss": 0.4424, - "step": 6006 - }, - { - "epoch": 2.949800307219662, - "grad_norm": 0.40870341173156804, - "learning_rate": 1.7104571779107402e-07, - "loss": 0.3824, - "step": 6007 - }, - { - "epoch": 2.9502918586789555, - "grad_norm": 0.40053018901382154, - "learning_rate": 1.674647078510727e-07, - "loss": 0.4182, - "step": 6008 - }, - { - "epoch": 2.950783410138249, - "grad_norm": 0.44085495296066624, - "learning_rate": 1.6392155009340437e-07, - "loss": 0.4126, - "step": 6009 - }, - { - "epoch": 2.951274961597542, - "grad_norm": 0.42778884084759744, - "learning_rate": 1.604162458614944e-07, - "loss": 0.4384, - "step": 6010 - }, - { - "epoch": 2.951766513056836, - "grad_norm": 0.3860862006471472, - "learning_rate": 1.5694879648439076e-07, - "loss": 0.435, - "step": 6011 - }, - { - "epoch": 2.952258064516129, - "grad_norm": 0.44385517994740425, - "learning_rate": 1.5351920327680847e-07, - "loss": 0.4027, - "step": 6012 - }, - { - "epoch": 2.9527496159754225, - "grad_norm": 0.39830272358746444, - "learning_rate": 1.5012746753909624e-07, - "loss": 0.4052, - "step": 6013 - }, - { - "epoch": 2.953241167434716, - "grad_norm": 0.4267178954516768, - "learning_rate": 1.467735905572476e-07, - "loss": 0.4187, - "step": 6014 - }, - { - "epoch": 2.9537327188940092, - "grad_norm": 0.4107082385394562, - "learning_rate": 1.434575736029231e-07, - "loss": 0.4084, - "step": 6015 - }, - { - "epoch": 2.9542242703533024, - "grad_norm": 0.4402235094289675, - "learning_rate": 1.4017941793340593e-07, - "loss": 0.4449, - "step": 6016 - }, - { - "epoch": 2.954715821812596, - "grad_norm": 0.3957201828010452, - "learning_rate": 1.3693912479162408e-07, - "loss": 0.3873, - "step": 6017 - }, - { - "epoch": 2.9552073732718895, - "grad_norm": 0.418258384332858, - "learning_rate": 1.337366954061725e-07, - "loss": 0.4369, - "step": 6018 - }, - { - "epoch": 2.9556989247311827, - "grad_norm": 0.4099216051892321, - "learning_rate": 1.3057213099125776e-07, - "loss": 0.4164, - "step": 6019 - }, - { - "epoch": 2.9561904761904763, - "grad_norm": 0.3698579870804078, - "learning_rate": 1.2744543274675334e-07, - "loss": 0.363, - "step": 6020 - }, - { - "epoch": 2.9566820276497694, - "grad_norm": 0.44962052139173175, - "learning_rate": 1.2435660185816655e-07, - "loss": 0.4557, - "step": 6021 - }, - { - "epoch": 2.957173579109063, - "grad_norm": 0.39983886649349315, - "learning_rate": 1.213056394966494e-07, - "loss": 0.4032, - "step": 6022 - }, - { - "epoch": 2.9576651305683566, - "grad_norm": 0.39411613942516277, - "learning_rate": 1.1829254681898772e-07, - "loss": 0.407, - "step": 6023 - }, - { - "epoch": 2.9581566820276497, - "grad_norm": 0.38850085462320016, - "learning_rate": 1.1531732496763425e-07, - "loss": 0.359, - "step": 6024 - }, - { - "epoch": 2.9586482334869433, - "grad_norm": 0.40159884802082935, - "learning_rate": 1.1237997507064224e-07, - "loss": 0.426, - "step": 6025 - }, - { - "epoch": 2.9591397849462364, - "grad_norm": 0.40125688005210397, - "learning_rate": 1.09480498241743e-07, - "loss": 0.4098, - "step": 6026 - }, - { - "epoch": 2.95963133640553, - "grad_norm": 0.3808112577083854, - "learning_rate": 1.0661889558029048e-07, - "loss": 0.3759, - "step": 6027 - }, - { - "epoch": 2.9601228878648236, - "grad_norm": 0.4266575266481944, - "learning_rate": 1.0379516817128343e-07, - "loss": 0.3861, - "step": 6028 - }, - { - "epoch": 2.9606144393241167, - "grad_norm": 0.45668055392825835, - "learning_rate": 1.0100931708534323e-07, - "loss": 0.4616, - "step": 6029 - }, - { - "epoch": 2.9611059907834103, - "grad_norm": 0.4639244945697901, - "learning_rate": 9.826134337875826e-08, - "loss": 0.4125, - "step": 6030 - }, - { - "epoch": 2.9615975422427034, - "grad_norm": 0.4031595099385033, - "learning_rate": 9.555124809343952e-08, - "loss": 0.4026, - "step": 6031 - }, - { - "epoch": 2.962089093701997, - "grad_norm": 0.4209953740906405, - "learning_rate": 9.287903225693173e-08, - "loss": 0.4086, - "step": 6032 - }, - { - "epoch": 2.96258064516129, - "grad_norm": 0.4203806817396384, - "learning_rate": 9.024469688242443e-08, - "loss": 0.4447, - "step": 6033 - }, - { - "epoch": 2.9630721966205837, - "grad_norm": 0.4130465246202411, - "learning_rate": 8.764824296875196e-08, - "loss": 0.445, - "step": 6034 - }, - { - "epoch": 2.963563748079877, - "grad_norm": 0.38443477373932644, - "learning_rate": 8.508967150037128e-08, - "loss": 0.4041, - "step": 6035 - }, - { - "epoch": 2.9640552995391705, - "grad_norm": 0.42372931505908396, - "learning_rate": 8.256898344737307e-08, - "loss": 0.4387, - "step": 6036 - }, - { - "epoch": 2.964546850998464, - "grad_norm": 0.4080582091433443, - "learning_rate": 8.008617976551502e-08, - "loss": 0.3915, - "step": 6037 - }, - { - "epoch": 2.965038402457757, - "grad_norm": 0.39739507320924133, - "learning_rate": 7.764126139615524e-08, - "loss": 0.3999, - "step": 6038 - }, - { - "epoch": 2.9655299539170508, - "grad_norm": 0.41723383285383575, - "learning_rate": 7.523422926629664e-08, - "loss": 0.4235, - "step": 6039 - }, - { - "epoch": 2.966021505376344, - "grad_norm": 0.44034358447187427, - "learning_rate": 7.286508428858696e-08, - "loss": 0.4395, - "step": 6040 - }, - { - "epoch": 2.9665130568356375, - "grad_norm": 0.42902757844756756, - "learning_rate": 7.053382736130764e-08, - "loss": 0.4085, - "step": 6041 - }, - { - "epoch": 2.967004608294931, - "grad_norm": 0.43774027421218564, - "learning_rate": 6.824045936836276e-08, - "loss": 0.3751, - "step": 6042 - }, - { - "epoch": 2.967496159754224, - "grad_norm": 0.40683340468552215, - "learning_rate": 6.598498117931228e-08, - "loss": 0.4142, - "step": 6043 - }, - { - "epoch": 2.967987711213518, - "grad_norm": 0.40725696543062384, - "learning_rate": 6.376739364932772e-08, - "loss": 0.4354, - "step": 6044 - }, - { - "epoch": 2.968479262672811, - "grad_norm": 0.43894079285299115, - "learning_rate": 6.158769761921423e-08, - "loss": 0.4263, - "step": 6045 - }, - { - "epoch": 2.9689708141321045, - "grad_norm": 0.44513675567143146, - "learning_rate": 5.944589391542188e-08, - "loss": 0.4409, - "step": 6046 - }, - { - "epoch": 2.969462365591398, - "grad_norm": 0.42649597478472673, - "learning_rate": 5.734198335004548e-08, - "loss": 0.4355, - "step": 6047 - }, - { - "epoch": 2.9699539170506912, - "grad_norm": 0.4088928304993863, - "learning_rate": 5.527596672078028e-08, - "loss": 0.4255, - "step": 6048 - }, - { - "epoch": 2.9704454685099844, - "grad_norm": 0.4444738183882466, - "learning_rate": 5.324784481096634e-08, - "loss": 0.3721, - "step": 6049 - }, - { - "epoch": 2.970937019969278, - "grad_norm": 0.4126968027163256, - "learning_rate": 5.125761838959964e-08, - "loss": 0.3688, - "step": 6050 - }, - { - "epoch": 2.9714285714285715, - "grad_norm": 0.4003310634673174, - "learning_rate": 4.930528821126545e-08, - "loss": 0.3939, - "step": 6051 - }, - { - "epoch": 2.9719201228878647, - "grad_norm": 0.41670499712303666, - "learning_rate": 4.73908550162272e-08, - "loss": 0.4265, - "step": 6052 - }, - { - "epoch": 2.9724116743471583, - "grad_norm": 0.41468367323871685, - "learning_rate": 4.551431953033758e-08, - "loss": 0.3913, - "step": 6053 - }, - { - "epoch": 2.9729032258064514, - "grad_norm": 0.3954875606446369, - "learning_rate": 4.367568246510523e-08, - "loss": 0.3993, - "step": 6054 - }, - { - "epoch": 2.973394777265745, - "grad_norm": 0.42037941517522137, - "learning_rate": 4.1874944517661385e-08, - "loss": 0.411, - "step": 6055 - }, - { - "epoch": 2.9738863287250386, - "grad_norm": 0.40689557517667146, - "learning_rate": 4.01121063707599e-08, - "loss": 0.4276, - "step": 6056 - }, - { - "epoch": 2.9743778801843317, - "grad_norm": 0.43832708700955153, - "learning_rate": 3.8387168692799456e-08, - "loss": 0.3919, - "step": 6057 - }, - { - "epoch": 2.9748694316436253, - "grad_norm": 0.4170563697685453, - "learning_rate": 3.6700132137812426e-08, - "loss": 0.4068, - "step": 6058 - }, - { - "epoch": 2.9753609831029184, - "grad_norm": 0.440455237868665, - "learning_rate": 3.50509973454316e-08, - "loss": 0.4298, - "step": 6059 - }, - { - "epoch": 2.975852534562212, - "grad_norm": 0.3762155148789076, - "learning_rate": 3.3439764940934594e-08, - "loss": 0.3898, - "step": 6060 - }, - { - "epoch": 2.9763440860215056, - "grad_norm": 0.39882875673227924, - "learning_rate": 3.186643553525492e-08, - "loss": 0.4033, - "step": 6061 - }, - { - "epoch": 2.9768356374807987, - "grad_norm": 0.40712218639627423, - "learning_rate": 3.033100972491543e-08, - "loss": 0.4052, - "step": 6062 - }, - { - "epoch": 2.9773271889400923, - "grad_norm": 0.46334739447452034, - "learning_rate": 2.883348809208375e-08, - "loss": 0.4313, - "step": 6063 - }, - { - "epoch": 2.9778187403993854, - "grad_norm": 0.41473301999742473, - "learning_rate": 2.7373871204561252e-08, - "loss": 0.4343, - "step": 6064 - }, - { - "epoch": 2.978310291858679, - "grad_norm": 0.45174724302909136, - "learning_rate": 2.5952159615760806e-08, - "loss": 0.4046, - "step": 6065 - }, - { - "epoch": 2.9788018433179726, - "grad_norm": 0.4497066917671094, - "learning_rate": 2.4568353864751204e-08, - "loss": 0.4291, - "step": 6066 - }, - { - "epoch": 2.9792933947772657, - "grad_norm": 0.4499684444059429, - "learning_rate": 2.3222454476190535e-08, - "loss": 0.3975, - "step": 6067 - }, - { - "epoch": 2.979784946236559, - "grad_norm": 0.4279091247595014, - "learning_rate": 2.191446196040392e-08, - "loss": 0.3934, - "step": 6068 - }, - { - "epoch": 2.9802764976958525, - "grad_norm": 0.4543201244976313, - "learning_rate": 2.064437681331688e-08, - "loss": 0.3991, - "step": 6069 - }, - { - "epoch": 2.980768049155146, - "grad_norm": 0.40319902320709167, - "learning_rate": 1.941219951648865e-08, - "loss": 0.4245, - "step": 6070 - }, - { - "epoch": 2.981259600614439, - "grad_norm": 0.4040314613900974, - "learning_rate": 1.8217930537112184e-08, - "loss": 0.4243, - "step": 6071 - }, - { - "epoch": 2.9817511520737328, - "grad_norm": 0.41014280331005337, - "learning_rate": 1.706157032800304e-08, - "loss": 0.4117, - "step": 6072 - }, - { - "epoch": 2.982242703533026, - "grad_norm": 0.4397016377924984, - "learning_rate": 1.59431193275994e-08, - "loss": 0.4313, - "step": 6073 - }, - { - "epoch": 2.9827342549923195, - "grad_norm": 0.4048797114510302, - "learning_rate": 1.4862577959973145e-08, - "loss": 0.4263, - "step": 6074 - }, - { - "epoch": 2.983225806451613, - "grad_norm": 0.38905967221178384, - "learning_rate": 1.3819946634818782e-08, - "loss": 0.3735, - "step": 6075 - }, - { - "epoch": 2.983717357910906, - "grad_norm": 0.37764869902734094, - "learning_rate": 1.281522574745342e-08, - "loss": 0.4038, - "step": 6076 - }, - { - "epoch": 2.9842089093702, - "grad_norm": 0.4026355877938672, - "learning_rate": 1.1848415678827885e-08, - "loss": 0.4531, - "step": 6077 - }, - { - "epoch": 2.984700460829493, - "grad_norm": 0.40417802514297885, - "learning_rate": 1.0919516795515617e-08, - "loss": 0.444, - "step": 6078 - }, - { - "epoch": 2.9851920122887865, - "grad_norm": 0.41842658128623017, - "learning_rate": 1.0028529449701563e-08, - "loss": 0.4232, - "step": 6079 - }, - { - "epoch": 2.98568356374808, - "grad_norm": 0.38651846873424983, - "learning_rate": 9.17545397922659e-09, - "loss": 0.3678, - "step": 6080 - }, - { - "epoch": 2.986175115207373, - "grad_norm": 0.37837707655414704, - "learning_rate": 8.360290707543073e-09, - "loss": 0.3956, - "step": 6081 - }, - { - "epoch": 2.986666666666667, - "grad_norm": 0.4215845213690753, - "learning_rate": 7.583039943703795e-09, - "loss": 0.4249, - "step": 6082 - }, - { - "epoch": 2.98715821812596, - "grad_norm": 0.39909141904110923, - "learning_rate": 6.843701982428563e-09, - "loss": 0.4096, - "step": 6083 - }, - { - "epoch": 2.9876497695852535, - "grad_norm": 0.43292174211698903, - "learning_rate": 6.142277104026484e-09, - "loss": 0.4054, - "step": 6084 - }, - { - "epoch": 2.988141321044547, - "grad_norm": 0.40029677964018207, - "learning_rate": 5.478765574462586e-09, - "loss": 0.4008, - "step": 6085 - }, - { - "epoch": 2.9886328725038402, - "grad_norm": 0.4294975110718298, - "learning_rate": 4.853167645302303e-09, - "loss": 0.3856, - "step": 6086 - }, - { - "epoch": 2.9891244239631334, - "grad_norm": 0.4286046977545209, - "learning_rate": 4.265483553755889e-09, - "loss": 0.4107, - "step": 6087 - }, - { - "epoch": 2.989615975422427, - "grad_norm": 0.40104635914997994, - "learning_rate": 3.715713522622899e-09, - "loss": 0.3473, - "step": 6088 - }, - { - "epoch": 2.9901075268817205, - "grad_norm": 0.38453583888031445, - "learning_rate": 3.2038577603810127e-09, - "loss": 0.3976, - "step": 6089 - }, - { - "epoch": 2.9905990783410137, - "grad_norm": 0.45510364577481827, - "learning_rate": 2.729916461097215e-09, - "loss": 0.4103, - "step": 6090 - }, - { - "epoch": 2.9910906298003073, - "grad_norm": 0.4027303305051323, - "learning_rate": 2.2938898044611023e-09, - "loss": 0.4263, - "step": 6091 - }, - { - "epoch": 2.9915821812596004, - "grad_norm": 0.40878866001483, - "learning_rate": 1.8957779557959855e-09, - "loss": 0.3921, - "step": 6092 - }, - { - "epoch": 2.992073732718894, - "grad_norm": 0.42846769694886017, - "learning_rate": 1.5355810660477864e-09, - "loss": 0.4352, - "step": 6093 - }, - { - "epoch": 2.9925652841781876, - "grad_norm": 0.39993957432953003, - "learning_rate": 1.2132992717961422e-09, - "loss": 0.4217, - "step": 6094 - }, - { - "epoch": 2.9930568356374807, - "grad_norm": 0.3936801466298494, - "learning_rate": 9.289326952321986e-10, - "loss": 0.3839, - "step": 6095 - }, - { - "epoch": 2.9935483870967743, - "grad_norm": 0.41596733354476106, - "learning_rate": 6.824814441808159e-10, - "loss": 0.4171, - "step": 6096 - }, - { - "epoch": 2.9940399385560674, - "grad_norm": 0.410800646796449, - "learning_rate": 4.739456120672614e-10, - "loss": 0.4112, - "step": 6097 - }, - { - "epoch": 2.994531490015361, - "grad_norm": 0.4170747581370785, - "learning_rate": 3.033252779838236e-10, - "loss": 0.4316, - "step": 6098 - }, - { - "epoch": 2.9950230414746546, - "grad_norm": 0.4304517925812922, - "learning_rate": 1.706205066009936e-10, - "loss": 0.4562, - "step": 6099 - }, - { - "epoch": 2.9955145929339477, - "grad_norm": 0.40260588675218906, - "learning_rate": 7.583134824518112e-11, - "loss": 0.4133, - "step": 6100 - }, - { - "epoch": 2.9960061443932413, - "grad_norm": 0.38341260419568185, - "learning_rate": 1.8957838854305465e-11, - "loss": 0.4098, - "step": 6101 - }, - { - "epoch": 2.9964976958525344, - "grad_norm": 0.39987369746027823, - "learning_rate": 0.0, - "loss": 0.4278, - "step": 6102 - }, - { - "epoch": 2.9964976958525344, - "eval_loss": 0.8067362904548645, - "eval_runtime": 6662.6718, - "eval_samples_per_second": 4.277, - "eval_steps_per_second": 2.139, - "step": 6102 - } - ], - "logging_steps": 1, - "max_steps": 6102, - "num_input_tokens_seen": 0, - "num_train_epochs": 3, - "save_steps": 2034, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 840273273815040.0, - "train_batch_size": 1, - "trial_name": null, - "trial_params": null -}