{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.971537001897533,
  "eval_steps": 500,
  "global_step": 655,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007590132827324478,
      "grad_norm": 5.824297945739171,
      "learning_rate": 1.2121212121212122e-06,
      "loss": 0.9218,
      "step": 1
    },
    {
      "epoch": 0.015180265654648957,
      "grad_norm": 5.892263786026164,
      "learning_rate": 2.4242424242424244e-06,
      "loss": 0.9293,
      "step": 2
    },
    {
      "epoch": 0.022770398481973434,
      "grad_norm": 5.8114634272581736,
      "learning_rate": 3.6363636363636366e-06,
      "loss": 0.9311,
      "step": 3
    },
    {
      "epoch": 0.030360531309297913,
      "grad_norm": 5.296649545877873,
      "learning_rate": 4.848484848484849e-06,
      "loss": 0.9125,
      "step": 4
    },
    {
      "epoch": 0.03795066413662239,
      "grad_norm": 3.7809427076070765,
      "learning_rate": 6.060606060606061e-06,
      "loss": 0.8611,
      "step": 5
    },
    {
      "epoch": 0.04554079696394687,
      "grad_norm": 2.1730858437477893,
      "learning_rate": 7.272727272727273e-06,
      "loss": 0.8592,
      "step": 6
    },
    {
      "epoch": 0.05313092979127135,
      "grad_norm": 4.234904575535682,
      "learning_rate": 8.484848484848486e-06,
      "loss": 0.8607,
      "step": 7
    },
    {
      "epoch": 0.06072106261859583,
      "grad_norm": 4.53949743914793,
      "learning_rate": 9.696969696969698e-06,
      "loss": 0.8652,
      "step": 8
    },
    {
      "epoch": 0.0683111954459203,
      "grad_norm": 4.029238499849355,
      "learning_rate": 1.0909090909090909e-05,
      "loss": 0.8177,
      "step": 9
    },
    {
      "epoch": 0.07590132827324478,
      "grad_norm": 3.9784894633891312,
      "learning_rate": 1.2121212121212122e-05,
      "loss": 0.8151,
      "step": 10
    },
    {
      "epoch": 0.08349146110056926,
      "grad_norm": 2.6969063855035493,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 0.7815,
      "step": 11
    },
    {
      "epoch": 0.09108159392789374,
      "grad_norm": 1.6447992531334745,
      "learning_rate": 1.4545454545454546e-05,
      "loss": 0.7618,
      "step": 12
    },
    {
      "epoch": 0.09867172675521822,
      "grad_norm": 1.5209576084591174,
      "learning_rate": 1.575757575757576e-05,
      "loss": 0.7486,
      "step": 13
    },
    {
      "epoch": 0.1062618595825427,
      "grad_norm": 1.2740153307577036,
      "learning_rate": 1.6969696969696972e-05,
      "loss": 0.7247,
      "step": 14
    },
    {
      "epoch": 0.11385199240986717,
      "grad_norm": 0.9320600982322024,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.711,
      "step": 15
    },
    {
      "epoch": 0.12144212523719165,
      "grad_norm": 1.0702991390933831,
      "learning_rate": 1.9393939393939395e-05,
      "loss": 0.7028,
      "step": 16
    },
    {
      "epoch": 0.12903225806451613,
      "grad_norm": 0.9459102023256077,
      "learning_rate": 2.0606060606060608e-05,
      "loss": 0.6918,
      "step": 17
    },
    {
      "epoch": 0.1366223908918406,
      "grad_norm": 0.8949299902760269,
      "learning_rate": 2.1818181818181818e-05,
      "loss": 0.6782,
      "step": 18
    },
    {
      "epoch": 0.1442125237191651,
      "grad_norm": 0.876771721092771,
      "learning_rate": 2.3030303030303034e-05,
      "loss": 0.6772,
      "step": 19
    },
    {
      "epoch": 0.15180265654648956,
      "grad_norm": 1.0086928620416316,
      "learning_rate": 2.4242424242424244e-05,
      "loss": 0.6644,
      "step": 20
    },
    {
      "epoch": 0.15939278937381404,
      "grad_norm": 0.9465491775161774,
      "learning_rate": 2.5454545454545457e-05,
      "loss": 0.6628,
      "step": 21
    },
    {
      "epoch": 0.16698292220113853,
      "grad_norm": 0.8899418440526895,
      "learning_rate": 2.6666666666666667e-05,
      "loss": 0.6579,
      "step": 22
    },
    {
      "epoch": 0.174573055028463,
      "grad_norm": 0.9194240562361043,
      "learning_rate": 2.7878787878787883e-05,
      "loss": 0.6488,
      "step": 23
    },
    {
      "epoch": 0.18216318785578747,
      "grad_norm": 1.140269189956398,
      "learning_rate": 2.9090909090909093e-05,
      "loss": 0.6545,
      "step": 24
    },
    {
      "epoch": 0.18975332068311196,
      "grad_norm": 1.2817416105473125,
      "learning_rate": 3.0303030303030306e-05,
      "loss": 0.6505,
      "step": 25
    },
    {
      "epoch": 0.19734345351043645,
      "grad_norm": 0.6815058078206016,
      "learning_rate": 3.151515151515152e-05,
      "loss": 0.6317,
      "step": 26
    },
    {
      "epoch": 0.2049335863377609,
      "grad_norm": 1.1874255778058744,
      "learning_rate": 3.272727272727273e-05,
      "loss": 0.6432,
      "step": 27
    },
    {
      "epoch": 0.2125237191650854,
      "grad_norm": 0.9363859174853021,
      "learning_rate": 3.3939393939393945e-05,
      "loss": 0.6312,
      "step": 28
    },
    {
      "epoch": 0.22011385199240988,
      "grad_norm": 0.8935811457744806,
      "learning_rate": 3.515151515151515e-05,
      "loss": 0.6285,
      "step": 29
    },
    {
      "epoch": 0.22770398481973433,
      "grad_norm": 1.2762386300886945,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.6275,
      "step": 30
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 0.9210641452165423,
      "learning_rate": 3.7575757575757584e-05,
      "loss": 0.6264,
      "step": 31
    },
    {
      "epoch": 0.2428842504743833,
      "grad_norm": 1.2440524082474191,
      "learning_rate": 3.878787878787879e-05,
      "loss": 0.6144,
      "step": 32
    },
    {
      "epoch": 0.2504743833017078,
      "grad_norm": 1.3065985154977695,
      "learning_rate": 4e-05,
      "loss": 0.6141,
      "step": 33
    },
    {
      "epoch": 0.25806451612903225,
      "grad_norm": 0.8172081904989663,
      "learning_rate": 4.1212121212121216e-05,
      "loss": 0.6092,
      "step": 34
    },
    {
      "epoch": 0.2656546489563567,
      "grad_norm": 1.167931775101708,
      "learning_rate": 4.242424242424242e-05,
      "loss": 0.6134,
      "step": 35
    },
    {
      "epoch": 0.2732447817836812,
      "grad_norm": 1.681922162159049,
      "learning_rate": 4.3636363636363636e-05,
      "loss": 0.6164,
      "step": 36
    },
    {
      "epoch": 0.2808349146110057,
      "grad_norm": 1.1257832927645395,
      "learning_rate": 4.484848484848485e-05,
      "loss": 0.6011,
      "step": 37
    },
    {
      "epoch": 0.2884250474383302,
      "grad_norm": 1.790614581178023,
      "learning_rate": 4.606060606060607e-05,
      "loss": 0.6094,
      "step": 38
    },
    {
      "epoch": 0.29601518026565465,
      "grad_norm": 1.127806608018945,
      "learning_rate": 4.727272727272728e-05,
      "loss": 0.6011,
      "step": 39
    },
    {
      "epoch": 0.3036053130929791,
      "grad_norm": 2.093380998039163,
      "learning_rate": 4.848484848484849e-05,
      "loss": 0.615,
      "step": 40
    },
    {
      "epoch": 0.3111954459203036,
      "grad_norm": 0.9384468154974619,
      "learning_rate": 4.96969696969697e-05,
      "loss": 0.5974,
      "step": 41
    },
    {
      "epoch": 0.3187855787476281,
      "grad_norm": 2.4981151616674686,
      "learning_rate": 5.0909090909090914e-05,
      "loss": 0.6002,
      "step": 42
    },
    {
      "epoch": 0.32637571157495254,
      "grad_norm": 1.6534765579286679,
      "learning_rate": 5.212121212121213e-05,
      "loss": 0.6062,
      "step": 43
    },
    {
      "epoch": 0.33396584440227706,
      "grad_norm": 2.4370056170762453,
      "learning_rate": 5.333333333333333e-05,
      "loss": 0.6068,
      "step": 44
    },
    {
      "epoch": 0.3415559772296015,
      "grad_norm": 2.183022857065241,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.5993,
      "step": 45
    },
    {
      "epoch": 0.349146110056926,
      "grad_norm": 1.8264041908559345,
      "learning_rate": 5.5757575757575766e-05,
      "loss": 0.5967,
      "step": 46
    },
    {
      "epoch": 0.3567362428842505,
      "grad_norm": 1.9185286131763832,
      "learning_rate": 5.696969696969698e-05,
      "loss": 0.6048,
      "step": 47
    },
    {
      "epoch": 0.36432637571157495,
      "grad_norm": 1.5433175224735158,
      "learning_rate": 5.8181818181818185e-05,
      "loss": 0.5991,
      "step": 48
    },
    {
      "epoch": 0.3719165085388994,
      "grad_norm": 1.6301636930901502,
      "learning_rate": 5.93939393939394e-05,
      "loss": 0.5973,
      "step": 49
    },
    {
      "epoch": 0.3795066413662239,
      "grad_norm": 1.6154604740395921,
      "learning_rate": 6.060606060606061e-05,
      "loss": 0.5839,
      "step": 50
    },
    {
      "epoch": 0.3870967741935484,
      "grad_norm": 1.5375798706049526,
      "learning_rate": 6.181818181818182e-05,
      "loss": 0.6014,
      "step": 51
    },
    {
      "epoch": 0.3946869070208729,
      "grad_norm": 1.8926585193561105,
      "learning_rate": 6.303030303030304e-05,
      "loss": 0.5903,
      "step": 52
    },
    {
      "epoch": 0.40227703984819735,
      "grad_norm": 0.9591201704735197,
      "learning_rate": 6.424242424242424e-05,
      "loss": 0.5787,
      "step": 53
    },
    {
      "epoch": 0.4098671726755218,
      "grad_norm": 2.3504740289658144,
      "learning_rate": 6.545454545454546e-05,
      "loss": 0.5836,
      "step": 54
    },
    {
      "epoch": 0.4174573055028463,
      "grad_norm": 1.9833219660676837,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.6021,
      "step": 55
    },
    {
      "epoch": 0.4250474383301708,
      "grad_norm": 1.979773818430796,
      "learning_rate": 6.787878787878789e-05,
      "loss": 0.5745,
      "step": 56
    },
    {
      "epoch": 0.43263757115749524,
      "grad_norm": 1.6918535634940701,
      "learning_rate": 6.90909090909091e-05,
      "loss": 0.5802,
      "step": 57
    },
    {
      "epoch": 0.44022770398481975,
      "grad_norm": 1.896304739161675,
      "learning_rate": 7.03030303030303e-05,
      "loss": 0.5967,
      "step": 58
    },
    {
      "epoch": 0.4478178368121442,
      "grad_norm": 1.7127150877307569,
      "learning_rate": 7.151515151515152e-05,
      "loss": 0.5873,
      "step": 59
    },
    {
      "epoch": 0.45540796963946867,
      "grad_norm": 1.7288522680268184,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.5822,
      "step": 60
    },
    {
      "epoch": 0.4629981024667932,
      "grad_norm": 2.4113594863743777,
      "learning_rate": 7.393939393939395e-05,
      "loss": 0.5892,
      "step": 61
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 1.5610600634117355,
      "learning_rate": 7.515151515151517e-05,
      "loss": 0.5888,
      "step": 62
    },
    {
      "epoch": 0.4781783681214421,
      "grad_norm": 1.554510024355238,
      "learning_rate": 7.636363636363637e-05,
      "loss": 0.5748,
      "step": 63
    },
    {
      "epoch": 0.4857685009487666,
      "grad_norm": 1.4238723235068915,
      "learning_rate": 7.757575757575758e-05,
      "loss": 0.5752,
      "step": 64
    },
    {
      "epoch": 0.49335863377609107,
      "grad_norm": 3.2737964188798,
      "learning_rate": 7.87878787878788e-05,
      "loss": 0.5991,
      "step": 65
    },
    {
      "epoch": 0.5009487666034156,
      "grad_norm": 1.3673718679696243,
      "learning_rate": 8e-05,
      "loss": 0.587,
      "step": 66
    },
    {
      "epoch": 0.50853889943074,
      "grad_norm": 3.10214817390346,
      "learning_rate": 7.999943101853146e-05,
      "loss": 0.5968,
      "step": 67
    },
    {
      "epoch": 0.5161290322580645,
      "grad_norm": 2.4426856945858635,
      "learning_rate": 7.999772409031277e-05,
      "loss": 0.6063,
      "step": 68
    },
    {
      "epoch": 0.523719165085389,
      "grad_norm": 2.384951983454804,
      "learning_rate": 7.999487926390452e-05,
      "loss": 0.5968,
      "step": 69
    },
    {
      "epoch": 0.5313092979127134,
      "grad_norm": 2.470269943289222,
      "learning_rate": 7.999089662023934e-05,
      "loss": 0.5976,
      "step": 70
    },
    {
      "epoch": 0.538899430740038,
      "grad_norm": 2.0615837527679926,
      "learning_rate": 7.99857762726198e-05,
      "loss": 0.5892,
      "step": 71
    },
    {
      "epoch": 0.5464895635673624,
      "grad_norm": 1.4595469442640645,
      "learning_rate": 7.997951836671498e-05,
      "loss": 0.5763,
      "step": 72
    },
    {
      "epoch": 0.5540796963946869,
      "grad_norm": 1.6686147644039993,
      "learning_rate": 7.997212308055656e-05,
      "loss": 0.5885,
      "step": 73
    },
    {
      "epoch": 0.5616698292220114,
      "grad_norm": 1.1588798823385231,
      "learning_rate": 7.996359062453354e-05,
      "loss": 0.5816,
      "step": 74
    },
    {
      "epoch": 0.5692599620493358,
      "grad_norm": 2.139844499195118,
      "learning_rate": 7.995392124138642e-05,
      "loss": 0.5815,
      "step": 75
    },
    {
      "epoch": 0.5768500948766604,
      "grad_norm": 1.6540433397238854,
      "learning_rate": 7.994311520620017e-05,
      "loss": 0.5782,
      "step": 76
    },
    {
      "epoch": 0.5844402277039848,
      "grad_norm": 1.04883299144272,
      "learning_rate": 7.993117282639648e-05,
      "loss": 0.5782,
      "step": 77
    },
    {
      "epoch": 0.5920303605313093,
      "grad_norm": 2.724444333560736,
      "learning_rate": 7.9918094441725e-05,
      "loss": 0.5861,
      "step": 78
    },
    {
      "epoch": 0.5996204933586338,
      "grad_norm": 1.8249890665939426,
      "learning_rate": 7.990388042425367e-05,
      "loss": 0.58,
      "step": 79
    },
    {
      "epoch": 0.6072106261859582,
      "grad_norm": 2.602399399727078,
      "learning_rate": 7.988853117835806e-05,
      "loss": 0.5814,
      "step": 80
    },
    {
      "epoch": 0.6148007590132827,
      "grad_norm": 1.5944678851416663,
      "learning_rate": 7.987204714071006e-05,
      "loss": 0.5826,
      "step": 81
    },
    {
      "epoch": 0.6223908918406073,
      "grad_norm": 2.2610913780974546,
      "learning_rate": 7.985442878026524e-05,
      "loss": 0.5754,
      "step": 82
    },
    {
      "epoch": 0.6299810246679317,
      "grad_norm": 1.7537341638428399,
      "learning_rate": 7.983567659824962e-05,
      "loss": 0.5845,
      "step": 83
    },
    {
      "epoch": 0.6375711574952562,
      "grad_norm": 1.8121108815331453,
      "learning_rate": 7.981579112814541e-05,
      "loss": 0.585,
      "step": 84
    },
    {
      "epoch": 0.6451612903225806,
      "grad_norm": 1.467756636378608,
      "learning_rate": 7.97947729356758e-05,
      "loss": 0.5777,
      "step": 85
    },
    {
      "epoch": 0.6527514231499051,
      "grad_norm": 1.5365204832241453,
      "learning_rate": 7.977262261878892e-05,
      "loss": 0.5763,
      "step": 86
    },
    {
      "epoch": 0.6603415559772297,
      "grad_norm": 1.4259830475580915,
      "learning_rate": 7.974934080764075e-05,
      "loss": 0.5662,
      "step": 87
    },
    {
      "epoch": 0.6679316888045541,
      "grad_norm": 1.290860497369316,
      "learning_rate": 7.972492816457723e-05,
      "loss": 0.5627,
      "step": 88
    },
    {
      "epoch": 0.6755218216318786,
      "grad_norm": 1.1578178204522984,
      "learning_rate": 7.969938538411543e-05,
      "loss": 0.5611,
      "step": 89
    },
    {
      "epoch": 0.683111954459203,
      "grad_norm": 1.8928883460003019,
      "learning_rate": 7.967271319292382e-05,
      "loss": 0.5715,
      "step": 90
    },
    {
      "epoch": 0.6907020872865275,
      "grad_norm": 1.5577040910573858,
      "learning_rate": 7.96449123498015e-05,
      "loss": 0.5712,
      "step": 91
    },
    {
      "epoch": 0.698292220113852,
      "grad_norm": 1.064793253865779,
      "learning_rate": 7.96159836456567e-05,
      "loss": 0.5675,
      "step": 92
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 2.0170128081260406,
      "learning_rate": 7.958592790348425e-05,
      "loss": 0.5755,
      "step": 93
    },
    {
      "epoch": 0.713472485768501,
      "grad_norm": 1.3379111611740009,
      "learning_rate": 7.955474597834217e-05,
      "loss": 0.5604,
      "step": 94
    },
    {
      "epoch": 0.7210626185958254,
      "grad_norm": 1.4656800007307322,
      "learning_rate": 7.952243875732735e-05,
      "loss": 0.5655,
      "step": 95
    },
    {
      "epoch": 0.7286527514231499,
      "grad_norm": 1.2799455534799504,
      "learning_rate": 7.948900715955025e-05,
      "loss": 0.5629,
      "step": 96
    },
    {
      "epoch": 0.7362428842504743,
      "grad_norm": 1.6331551992017197,
      "learning_rate": 7.94544521361089e-05,
      "loss": 0.5589,
      "step": 97
    },
    {
      "epoch": 0.7438330170777988,
      "grad_norm": 1.8686747850955692,
      "learning_rate": 7.941877467006168e-05,
      "loss": 0.5644,
      "step": 98
    },
    {
      "epoch": 0.7514231499051234,
      "grad_norm": 1.1116521915214885,
      "learning_rate": 7.938197577639942e-05,
      "loss": 0.5559,
      "step": 99
    },
    {
      "epoch": 0.7590132827324478,
      "grad_norm": 1.5062245938638401,
      "learning_rate": 7.934405650201658e-05,
      "loss": 0.5723,
      "step": 100
    },
    {
      "epoch": 0.7666034155597723,
      "grad_norm": 1.1108744133424633,
      "learning_rate": 7.930501792568138e-05,
      "loss": 0.5545,
      "step": 101
    },
    {
      "epoch": 0.7741935483870968,
      "grad_norm": 1.5427714103721983,
      "learning_rate": 7.926486115800511e-05,
      "loss": 0.556,
      "step": 102
    },
    {
      "epoch": 0.7817836812144212,
      "grad_norm": 1.764775365031586,
      "learning_rate": 7.922358734141064e-05,
      "loss": 0.5596,
      "step": 103
    },
    {
      "epoch": 0.7893738140417458,
      "grad_norm": 1.2296630078252206,
      "learning_rate": 7.918119765009979e-05,
      "loss": 0.5598,
      "step": 104
    },
    {
      "epoch": 0.7969639468690702,
      "grad_norm": 1.2833682166627998,
      "learning_rate": 7.913769329002e-05,
      "loss": 0.5489,
      "step": 105
    },
    {
      "epoch": 0.8045540796963947,
      "grad_norm": 1.1872477219429831,
      "learning_rate": 7.909307549883002e-05,
      "loss": 0.5646,
      "step": 106
    },
    {
      "epoch": 0.8121442125237192,
      "grad_norm": 1.820761375614486,
      "learning_rate": 7.904734554586464e-05,
      "loss": 0.5556,
      "step": 107
    },
    {
      "epoch": 0.8197343453510436,
      "grad_norm": 1.1423898687342118,
      "learning_rate": 7.900050473209868e-05,
      "loss": 0.5483,
      "step": 108
    },
    {
      "epoch": 0.8273244781783681,
      "grad_norm": 1.476252579811037,
      "learning_rate": 7.895255439010987e-05,
      "loss": 0.5479,
      "step": 109
    },
    {
      "epoch": 0.8349146110056926,
      "grad_norm": 1.3278512325760372,
      "learning_rate": 7.890349588404102e-05,
      "loss": 0.5499,
      "step": 110
    },
    {
      "epoch": 0.8425047438330171,
      "grad_norm": 0.8671841713875902,
      "learning_rate": 7.885333060956117e-05,
      "loss": 0.5571,
      "step": 111
    },
    {
      "epoch": 0.8500948766603416,
      "grad_norm": 1.0738508848999515,
      "learning_rate": 7.88020599938259e-05,
      "loss": 0.5449,
      "step": 112
    },
    {
      "epoch": 0.857685009487666,
      "grad_norm": 1.7715748163298473,
      "learning_rate": 7.87496854954367e-05,
      "loss": 0.5491,
      "step": 113
    },
    {
      "epoch": 0.8652751423149905,
      "grad_norm": 1.0525784243440264,
      "learning_rate": 7.869620860439956e-05,
      "loss": 0.543,
      "step": 114
    },
    {
      "epoch": 0.872865275142315,
      "grad_norm": 2.0621859992760427,
      "learning_rate": 7.864163084208245e-05,
      "loss": 0.5622,
      "step": 115
    },
    {
      "epoch": 0.8804554079696395,
      "grad_norm": 1.363047653212627,
      "learning_rate": 7.858595376117214e-05,
      "loss": 0.5515,
      "step": 116
    },
    {
      "epoch": 0.888045540796964,
      "grad_norm": 1.7242002751506365,
      "learning_rate": 7.852917894563e-05,
      "loss": 0.5599,
      "step": 117
    },
    {
      "epoch": 0.8956356736242884,
      "grad_norm": 1.4061990696892013,
      "learning_rate": 7.847130801064694e-05,
      "loss": 0.5605,
      "step": 118
    },
    {
      "epoch": 0.9032258064516129,
      "grad_norm": 1.7767323380908933,
      "learning_rate": 7.84123426025974e-05,
      "loss": 0.5494,
      "step": 119
    },
    {
      "epoch": 0.9108159392789373,
      "grad_norm": 1.1684328222434068,
      "learning_rate": 7.835228439899264e-05,
      "loss": 0.546,
      "step": 120
    },
    {
      "epoch": 0.9184060721062619,
      "grad_norm": 1.9834381552810127,
      "learning_rate": 7.829113510843288e-05,
      "loss": 0.5551,
      "step": 121
    },
    {
      "epoch": 0.9259962049335864,
      "grad_norm": 1.4942107378630478,
      "learning_rate": 7.82288964705588e-05,
      "loss": 0.5454,
      "step": 122
    },
    {
      "epoch": 0.9335863377609108,
      "grad_norm": 1.631303090634789,
      "learning_rate": 7.816557025600196e-05,
      "loss": 0.5403,
      "step": 123
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 1.2779932620673164,
      "learning_rate": 7.81011582663345e-05,
      "loss": 0.5551,
      "step": 124
    },
    {
      "epoch": 0.9487666034155597,
      "grad_norm": 0.826316123440516,
      "learning_rate": 7.803566233401784e-05,
      "loss": 0.5468,
      "step": 125
    },
    {
      "epoch": 0.9563567362428842,
      "grad_norm": 1.5355038345605292,
      "learning_rate": 7.796908432235056e-05,
      "loss": 0.5588,
      "step": 126
    },
    {
      "epoch": 0.9639468690702088,
      "grad_norm": 1.6053485472330935,
      "learning_rate": 7.79014261254154e-05,
      "loss": 0.5457,
      "step": 127
    },
    {
      "epoch": 0.9715370018975332,
      "grad_norm": 0.8709812572017568,
      "learning_rate": 7.783268966802539e-05,
      "loss": 0.5482,
      "step": 128
    },
    {
      "epoch": 0.9791271347248577,
      "grad_norm": 1.0328203561237506,
      "learning_rate": 7.776287690566906e-05,
      "loss": 0.5516,
      "step": 129
    },
    {
      "epoch": 0.9867172675521821,
      "grad_norm": 1.421726756731164,
      "learning_rate": 7.769198982445478e-05,
      "loss": 0.5644,
      "step": 130
    },
    {
      "epoch": 0.9943074003795066,
      "grad_norm": 0.9699818427155015,
      "learning_rate": 7.762003044105435e-05,
      "loss": 0.5333,
      "step": 131
    },
    {
      "epoch": 1.0018975332068312,
      "grad_norm": 2.203324310322431,
      "learning_rate": 7.754700080264554e-05,
      "loss": 0.6801,
      "step": 132
    },
    {
      "epoch": 1.0094876660341556,
      "grad_norm": 1.2850623970507653,
      "learning_rate": 7.747290298685392e-05,
      "loss": 0.5231,
      "step": 133
    },
    {
      "epoch": 1.01707779886148,
      "grad_norm": 1.0733692629279126,
      "learning_rate": 7.739773910169366e-05,
      "loss": 0.526,
      "step": 134
    },
    {
      "epoch": 1.0246679316888045,
      "grad_norm": 1.3517159638201317,
      "learning_rate": 7.732151128550767e-05,
      "loss": 0.5374,
      "step": 135
    },
    {
      "epoch": 1.032258064516129,
      "grad_norm": 0.9043349347274219,
      "learning_rate": 7.724422170690668e-05,
      "loss": 0.5316,
      "step": 136
    },
    {
      "epoch": 1.0398481973434535,
      "grad_norm": 1.2575116166876772,
      "learning_rate": 7.716587256470759e-05,
      "loss": 0.5264,
      "step": 137
    },
    {
      "epoch": 1.047438330170778,
      "grad_norm": 1.151643956702767,
      "learning_rate": 7.708646608787091e-05,
      "loss": 0.5236,
      "step": 138
    },
    {
      "epoch": 1.0550284629981024,
      "grad_norm": 1.1533411140892482,
      "learning_rate": 7.700600453543731e-05,
      "loss": 0.5327,
      "step": 139
    },
    {
      "epoch": 1.0626185958254268,
      "grad_norm": 1.5703445128955635,
      "learning_rate": 7.692449019646341e-05,
      "loss": 0.5189,
      "step": 140
    },
    {
      "epoch": 1.0702087286527515,
      "grad_norm": 1.503708861643817,
      "learning_rate": 7.684192538995664e-05,
      "loss": 0.5208,
      "step": 141
    },
    {
      "epoch": 1.077798861480076,
      "grad_norm": 0.6891325431467323,
      "learning_rate": 7.675831246480923e-05,
      "loss": 0.5176,
      "step": 142
    },
    {
      "epoch": 1.0853889943074004,
      "grad_norm": 1.862959746082954,
      "learning_rate": 7.667365379973142e-05,
      "loss": 0.519,
      "step": 143
    },
    {
      "epoch": 1.092979127134725,
      "grad_norm": 0.9255777898780981,
      "learning_rate": 7.658795180318381e-05,
      "loss": 0.5306,
      "step": 144
    },
    {
      "epoch": 1.1005692599620494,
      "grad_norm": 1.2860696781263434,
      "learning_rate": 7.650120891330878e-05,
      "loss": 0.5231,
      "step": 145
    },
    {
      "epoch": 1.1081593927893738,
      "grad_norm": 0.9866085500546973,
      "learning_rate": 7.641342759786116e-05,
      "loss": 0.5134,
      "step": 146
    },
    {
      "epoch": 1.1157495256166983,
      "grad_norm": 1.6012070200344108,
      "learning_rate": 7.632461035413805e-05,
      "loss": 0.5225,
      "step": 147
    },
    {
      "epoch": 1.1233396584440227,
      "grad_norm": 1.0880689445644633,
      "learning_rate": 7.623475970890775e-05,
      "loss": 0.52,
      "step": 148
    },
    {
      "epoch": 1.1309297912713472,
      "grad_norm": 1.0388918530802034,
      "learning_rate": 7.614387821833786e-05,
      "loss": 0.5234,
      "step": 149
    },
    {
      "epoch": 1.1385199240986716,
      "grad_norm": 1.3834068969901858,
      "learning_rate": 7.605196846792256e-05,
      "loss": 0.52,
      "step": 150
    },
    {
      "epoch": 1.146110056925996,
      "grad_norm": 1.0808645625405662,
      "learning_rate": 7.59590330724091e-05,
      "loss": 0.5199,
      "step": 151
    },
    {
      "epoch": 1.1537001897533208,
      "grad_norm": 0.8748353485698048,
      "learning_rate": 7.586507467572339e-05,
      "loss": 0.5054,
      "step": 152
    },
    {
      "epoch": 1.1612903225806452,
      "grad_norm": 0.9809721493446659,
      "learning_rate": 7.577009595089472e-05,
      "loss": 0.5156,
      "step": 153
    },
    {
      "epoch": 1.1688804554079697,
      "grad_norm": 1.385065545391808,
      "learning_rate": 7.567409959997984e-05,
      "loss": 0.5125,
      "step": 154
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 1.1835810733031538,
      "learning_rate": 7.557708835398595e-05,
      "loss": 0.5089,
      "step": 155
    },
    {
      "epoch": 1.1840607210626186,
      "grad_norm": 1.0550638017889524,
      "learning_rate": 7.547906497279315e-05,
      "loss": 0.5085,
      "step": 156
    },
    {
      "epoch": 1.191650853889943,
      "grad_norm": 1.0668629873488273,
      "learning_rate": 7.538003224507579e-05,
      "loss": 0.5151,
      "step": 157
    },
    {
      "epoch": 1.1992409867172675,
      "grad_norm": 1.2773079106743754,
      "learning_rate": 7.52799929882232e-05,
      "loss": 0.5217,
      "step": 158
    },
    {
      "epoch": 1.206831119544592,
      "grad_norm": 1.0653233150213854,
      "learning_rate": 7.517895004825956e-05,
      "loss": 0.5142,
      "step": 159
    },
    {
      "epoch": 1.2144212523719164,
      "grad_norm": 1.1811879803660237,
      "learning_rate": 7.507690629976291e-05,
      "loss": 0.516,
      "step": 160
    },
    {
      "epoch": 1.222011385199241,
      "grad_norm": 0.9358140704136899,
      "learning_rate": 7.497386464578329e-05,
      "loss": 0.5116,
      "step": 161
    },
    {
      "epoch": 1.2296015180265654,
      "grad_norm": 1.236267972600389,
      "learning_rate": 7.486982801776032e-05,
      "loss": 0.5176,
      "step": 162
    },
    {
      "epoch": 1.23719165085389,
      "grad_norm": 1.1810121004464773,
      "learning_rate": 7.476479937543967e-05,
      "loss": 0.5208,
      "step": 163
    },
    {
      "epoch": 1.2447817836812145,
      "grad_norm": 1.0715306401128548,
      "learning_rate": 7.465878170678887e-05,
      "loss": 0.5149,
      "step": 164
    },
    {
      "epoch": 1.252371916508539,
      "grad_norm": 1.4554615426026292,
      "learning_rate": 7.455177802791237e-05,
      "loss": 0.5176,
      "step": 165
    },
    {
      "epoch": 1.2599620493358634,
      "grad_norm": 0.8300456250776146,
      "learning_rate": 7.444379138296572e-05,
      "loss": 0.5111,
      "step": 166
    },
    {
      "epoch": 1.2675521821631879,
      "grad_norm": 0.8301260998594161,
      "learning_rate": 7.433482484406887e-05,
      "loss": 0.5149,
      "step": 167
    },
    {
      "epoch": 1.2751423149905123,
      "grad_norm": 1.036861982897111,
      "learning_rate": 7.42248815112189e-05,
      "loss": 0.5074,
      "step": 168
    },
    {
      "epoch": 1.2827324478178368,
      "grad_norm": 1.1061999056879284,
      "learning_rate": 7.411396451220177e-05,
      "loss": 0.5014,
      "step": 169
    },
    {
      "epoch": 1.2903225806451613,
      "grad_norm": 1.3047827647572592,
      "learning_rate": 7.400207700250333e-05,
      "loss": 0.5144,
      "step": 170
    },
    {
      "epoch": 1.2979127134724857,
      "grad_norm": 0.7526970536905354,
      "learning_rate": 7.388922216521953e-05,
      "loss": 0.5132,
      "step": 171
    },
    {
      "epoch": 1.3055028462998102,
      "grad_norm": 0.7452267427677111,
      "learning_rate": 7.377540321096595e-05,
      "loss": 0.5022,
      "step": 172
    },
    {
      "epoch": 1.3130929791271346,
      "grad_norm": 1.0513789114160723,
      "learning_rate": 7.366062337778637e-05,
      "loss": 0.5039,
      "step": 173
    },
    {
      "epoch": 1.3206831119544593,
      "grad_norm": 1.3299701167289224,
      "learning_rate": 7.354488593106068e-05,
      "loss": 0.5039,
      "step": 174
    },
    {
      "epoch": 1.3282732447817835,
      "grad_norm": 0.9881183562854784,
      "learning_rate": 7.342819416341202e-05,
      "loss": 0.5161,
      "step": 175
    },
    {
      "epoch": 1.3358633776091082,
      "grad_norm": 1.3838355156124555,
      "learning_rate": 7.331055139461305e-05,
      "loss": 0.5128,
      "step": 176
    },
    {
      "epoch": 1.3434535104364327,
      "grad_norm": 0.706807050794008,
      "learning_rate": 7.319196097149153e-05,
      "loss": 0.4995,
      "step": 177
    },
    {
      "epoch": 1.3510436432637571,
      "grad_norm": 1.2072275318255294,
      "learning_rate": 7.307242626783514e-05,
      "loss": 0.5117,
      "step": 178
    },
    {
      "epoch": 1.3586337760910816,
      "grad_norm": 0.8736304969731823,
      "learning_rate": 7.295195068429539e-05,
      "loss": 0.5093,
      "step": 179
    },
    {
      "epoch": 1.366223908918406,
      "grad_norm": 1.118370322707032,
      "learning_rate": 7.283053764829106e-05,
      "loss": 0.513,
      "step": 180
    },
    {
      "epoch": 1.3738140417457305,
      "grad_norm": 1.2165754217336513,
      "learning_rate": 7.270819061391049e-05,
      "loss": 0.5061,
      "step": 181
    },
    {
      "epoch": 1.381404174573055,
      "grad_norm": 1.0662810244952639,
      "learning_rate": 7.258491306181346e-05,
      "loss": 0.5074,
      "step": 182
    },
    {
      "epoch": 1.3889943074003794,
      "grad_norm": 1.550093405647991,
      "learning_rate": 7.24607084991321e-05,
      "loss": 0.5169,
      "step": 183
    },
    {
      "epoch": 1.396584440227704,
      "grad_norm": 0.7232302048062569,
      "learning_rate": 7.233558045937113e-05,
      "loss": 0.5187,
      "step": 184
    },
    {
      "epoch": 1.4041745730550286,
      "grad_norm": 1.3301692157689138,
      "learning_rate": 7.220953250230733e-05,
      "loss": 0.5101,
      "step": 185
    },
    {
      "epoch": 1.4117647058823528,
      "grad_norm": 0.9469277615633731,
      "learning_rate": 7.208256821388831e-05,
      "loss": 0.5115,
      "step": 186
    },
    {
      "epoch": 1.4193548387096775,
      "grad_norm": 1.461657389888908,
      "learning_rate": 7.195469120613041e-05,
      "loss": 0.518,
      "step": 187
    },
    {
      "epoch": 1.426944971537002,
      "grad_norm": 0.7145042956694666,
      "learning_rate": 7.182590511701604e-05,
      "loss": 0.5002,
      "step": 188
    },
    {
      "epoch": 1.4345351043643264,
      "grad_norm": 0.9602590784255072,
      "learning_rate": 7.169621361039009e-05,
      "loss": 0.4932,
      "step": 189
    },
    {
      "epoch": 1.4421252371916509,
      "grad_norm": 0.9348247562699835,
      "learning_rate": 7.156562037585576e-05,
      "loss": 0.5045,
      "step": 190
    },
    {
      "epoch": 1.4497153700189753,
      "grad_norm": 1.5691729872812523,
      "learning_rate": 7.143412912866954e-05,
      "loss": 0.5146,
      "step": 191
    },
    {
      "epoch": 1.4573055028462998,
      "grad_norm": 0.7191513604989822,
      "learning_rate": 7.130174360963562e-05,
      "loss": 0.5031,
      "step": 192
    },
    {
      "epoch": 1.4648956356736242,
      "grad_norm": 1.6999162113253339,
      "learning_rate": 7.116846758499933e-05,
      "loss": 0.5103,
      "step": 193
    },
    {
      "epoch": 1.4724857685009487,
      "grad_norm": 1.0965769424195349,
      "learning_rate": 7.103430484634009e-05,
      "loss": 0.5101,
      "step": 194
    },
    {
      "epoch": 1.4800759013282732,
      "grad_norm": 1.042633463565035,
      "learning_rate": 7.089925921046348e-05,
      "loss": 0.5133,
      "step": 195
    },
    {
      "epoch": 1.4876660341555978,
      "grad_norm": 1.5277163845081705,
      "learning_rate": 7.076333451929275e-05,
      "loss": 0.5166,
      "step": 196
    },
    {
      "epoch": 1.495256166982922,
      "grad_norm": 0.7588665368653583,
      "learning_rate": 7.062653463975938e-05,
      "loss": 0.5028,
      "step": 197
    },
    {
      "epoch": 1.5028462998102468,
      "grad_norm": 1.4802097799655463,
      "learning_rate": 7.048886346369321e-05,
      "loss": 0.5173,
      "step": 198
    },
    {
      "epoch": 1.510436432637571,
      "grad_norm": 0.8989137638919413,
      "learning_rate": 7.035032490771165e-05,
      "loss": 0.5058,
      "step": 199
    },
    {
      "epoch": 1.5180265654648957,
      "grad_norm": 1.3727603969798114,
      "learning_rate": 7.021092291310821e-05,
      "loss": 0.5196,
      "step": 200
    },
    {
      "epoch": 1.5256166982922201,
      "grad_norm": 0.95363755185113,
      "learning_rate": 7.007066144574052e-05,
      "loss": 0.5205,
      "step": 201
    },
    {
      "epoch": 1.5332068311195446,
      "grad_norm": 1.1663040985006814,
      "learning_rate": 6.992954449591731e-05,
      "loss": 0.5093,
      "step": 202
    },
    {
      "epoch": 1.540796963946869,
      "grad_norm": 0.7636048619329266,
      "learning_rate": 6.978757607828509e-05,
      "loss": 0.506,
      "step": 203
    },
    {
      "epoch": 1.5483870967741935,
      "grad_norm": 1.1069490833534057,
      "learning_rate": 6.964476023171378e-05,
      "loss": 0.516,
      "step": 204
    },
    {
      "epoch": 1.5559772296015182,
      "grad_norm": 0.6735693040775705,
      "learning_rate": 6.95011010191819e-05,
      "loss": 0.507,
      "step": 205
    },
    {
      "epoch": 1.5635673624288424,
      "grad_norm": 0.7757347897129492,
      "learning_rate": 6.935660252766092e-05,
      "loss": 0.5181,
      "step": 206
    },
    {
      "epoch": 1.571157495256167,
      "grad_norm": 0.7414965427945387,
      "learning_rate": 6.921126886799903e-05,
      "loss": 0.5074,
      "step": 207
    },
    {
      "epoch": 1.5787476280834913,
      "grad_norm": 0.8131364204912126,
      "learning_rate": 6.906510417480422e-05,
      "loss": 0.5153,
      "step": 208
    },
    {
      "epoch": 1.586337760910816,
      "grad_norm": 0.8512550944337758,
      "learning_rate": 6.891811260632653e-05,
      "loss": 0.5054,
      "step": 209
    },
    {
      "epoch": 1.5939278937381403,
      "grad_norm": 0.7855183043381698,
      "learning_rate": 6.877029834433992e-05,
      "loss": 0.5047,
      "step": 210
    },
    {
      "epoch": 1.601518026565465,
      "grad_norm": 0.8992512717445637,
      "learning_rate": 6.862166559402318e-05,
      "loss": 0.5025,
      "step": 211
    },
    {
      "epoch": 1.6091081593927894,
      "grad_norm": 0.9210792646776457,
      "learning_rate": 6.847221858384032e-05,
      "loss": 0.4974,
      "step": 212
    },
    {
      "epoch": 1.6166982922201139,
      "grad_norm": 0.9424266330757026,
      "learning_rate": 6.832196156542033e-05,
      "loss": 0.5062,
      "step": 213
    },
    {
      "epoch": 1.6242884250474383,
      "grad_norm": 1.0966101994750281,
      "learning_rate": 6.817089881343613e-05,
      "loss": 0.5054,
      "step": 214
    },
    {
      "epoch": 1.6318785578747628,
      "grad_norm": 1.009163727768516,
      "learning_rate": 6.801903462548308e-05,
      "loss": 0.5034,
      "step": 215
    },
    {
      "epoch": 1.6394686907020875,
      "grad_norm": 0.9725332248811417,
      "learning_rate": 6.786637332195659e-05,
      "loss": 0.5115,
      "step": 216
    },
    {
      "epoch": 1.6470588235294117,
      "grad_norm": 1.0170207658600694,
      "learning_rate": 6.771291924592929e-05,
      "loss": 0.5066,
      "step": 217
    },
    {
      "epoch": 1.6546489563567364,
      "grad_norm": 0.9422861500618195,
      "learning_rate": 6.755867676302747e-05,
      "loss": 0.504,
      "step": 218
    },
    {
      "epoch": 1.6622390891840606,
      "grad_norm": 0.9158879164554034,
      "learning_rate": 6.740365026130684e-05,
      "loss": 0.5032,
      "step": 219
    },
    {
      "epoch": 1.6698292220113853,
      "grad_norm": 0.7780361297463692,
      "learning_rate": 6.724784415112774e-05,
      "loss": 0.4888,
      "step": 220
    },
    {
      "epoch": 1.6774193548387095,
      "grad_norm": 0.5692137929082299,
      "learning_rate": 6.709126286502965e-05,
      "loss": 0.5022,
      "step": 221
    },
    {
      "epoch": 1.6850094876660342,
      "grad_norm": 0.5004905918093622,
      "learning_rate": 6.693391085760506e-05,
      "loss": 0.4995,
      "step": 222
    },
    {
      "epoch": 1.6925996204933587,
      "grad_norm": 0.5848868016251021,
      "learning_rate": 6.677579260537277e-05,
      "loss": 0.5055,
      "step": 223
    },
    {
      "epoch": 1.7001897533206831,
      "grad_norm": 0.734294502408837,
      "learning_rate": 6.661691260665057e-05,
      "loss": 0.5008,
      "step": 224
    },
    {
      "epoch": 1.7077798861480076,
      "grad_norm": 0.9781085041990851,
      "learning_rate": 6.64572753814272e-05,
      "loss": 0.5082,
      "step": 225
    },
    {
      "epoch": 1.715370018975332,
      "grad_norm": 1.1839289754443743,
      "learning_rate": 6.629688547123381e-05,
      "loss": 0.4966,
      "step": 226
    },
    {
      "epoch": 1.7229601518026565,
      "grad_norm": 0.6203375151514526,
      "learning_rate": 6.613574743901472e-05,
      "loss": 0.4976,
      "step": 227
    },
    {
      "epoch": 1.730550284629981,
      "grad_norm": 0.37377037948651215,
      "learning_rate": 6.597386586899766e-05,
      "loss": 0.4907,
      "step": 228
    },
    {
      "epoch": 1.7381404174573056,
      "grad_norm": 0.5842003288831636,
      "learning_rate": 6.58112453665633e-05,
      "loss": 0.5,
      "step": 229
    },
    {
      "epoch": 1.7457305502846299,
      "grad_norm": 1.1216009200042196,
      "learning_rate": 6.564789055811422e-05,
      "loss": 0.5118,
      "step": 230
    },
    {
      "epoch": 1.7533206831119545,
      "grad_norm": 1.1503531175618553,
      "learning_rate": 6.54838060909434e-05,
      "loss": 0.4856,
      "step": 231
    },
    {
      "epoch": 1.7609108159392788,
      "grad_norm": 0.5953762660752571,
      "learning_rate": 6.531899663310187e-05,
      "loss": 0.4933,
      "step": 232
    },
    {
      "epoch": 1.7685009487666035,
      "grad_norm": 0.4946507234843489,
      "learning_rate": 6.515346687326602e-05,
      "loss": 0.488,
      "step": 233
    },
    {
      "epoch": 1.776091081593928,
      "grad_norm": 0.6911888286239702,
      "learning_rate": 6.498722152060411e-05,
      "loss": 0.5024,
      "step": 234
    },
    {
      "epoch": 1.7836812144212524,
      "grad_norm": 0.9524817833599729,
      "learning_rate": 6.482026530464244e-05,
      "loss": 0.497,
      "step": 235
    },
    {
      "epoch": 1.7912713472485768,
      "grad_norm": 1.056867827538452,
      "learning_rate": 6.465260297513059e-05,
      "loss": 0.5001,
      "step": 236
    },
    {
      "epoch": 1.7988614800759013,
      "grad_norm": 0.9341896474591933,
      "learning_rate": 6.448423930190653e-05,
      "loss": 0.5056,
      "step": 237
    },
    {
      "epoch": 1.8064516129032258,
      "grad_norm": 0.7998775078188581,
      "learning_rate": 6.431517907476073e-05,
      "loss": 0.4965,
      "step": 238
    },
    {
      "epoch": 1.8140417457305502,
      "grad_norm": 0.6024227793682277,
      "learning_rate": 6.414542710330004e-05,
      "loss": 0.4918,
      "step": 239
    },
    {
      "epoch": 1.821631878557875,
      "grad_norm": 0.5054296948703985,
      "learning_rate": 6.397498821681073e-05,
      "loss": 0.4987,
      "step": 240
    },
    {
      "epoch": 1.8292220113851991,
      "grad_norm": 0.4915898095283207,
      "learning_rate": 6.380386726412122e-05,
      "loss": 0.489,
      "step": 241
    },
    {
      "epoch": 1.8368121442125238,
      "grad_norm": 0.5191126165622191,
      "learning_rate": 6.363206911346405e-05,
      "loss": 0.5062,
      "step": 242
    },
    {
      "epoch": 1.844402277039848,
      "grad_norm": 0.591888201694542,
      "learning_rate": 6.345959865233742e-05,
      "loss": 0.4928,
      "step": 243
    },
    {
      "epoch": 1.8519924098671727,
      "grad_norm": 0.6103884601516754,
      "learning_rate": 6.328646078736614e-05,
      "loss": 0.4983,
      "step": 244
    },
    {
      "epoch": 1.8595825426944972,
      "grad_norm": 0.5676870354041681,
      "learning_rate": 6.311266044416205e-05,
      "loss": 0.493,
      "step": 245
    },
    {
      "epoch": 1.8671726755218216,
      "grad_norm": 0.5025577878236349,
      "learning_rate": 6.293820256718388e-05,
      "loss": 0.4936,
      "step": 246
    },
    {
      "epoch": 1.874762808349146,
      "grad_norm": 0.5343665402941907,
      "learning_rate": 6.276309211959657e-05,
      "loss": 0.4976,
      "step": 247
    },
    {
      "epoch": 1.8823529411764706,
      "grad_norm": 0.684168766812062,
      "learning_rate": 6.25873340831301e-05,
      "loss": 0.4986,
      "step": 248
    },
    {
      "epoch": 1.889943074003795,
      "grad_norm": 0.971664414920718,
      "learning_rate": 6.241093345793777e-05,
      "loss": 0.4923,
      "step": 249
    },
    {
      "epoch": 1.8975332068311195,
      "grad_norm": 1.3291099108661037,
      "learning_rate": 6.22338952624539e-05,
      "loss": 0.5085,
      "step": 250
    },
    {
      "epoch": 1.9051233396584442,
      "grad_norm": 0.5887944838607679,
      "learning_rate": 6.205622453325113e-05,
      "loss": 0.4901,
      "step": 251
    },
    {
      "epoch": 1.9127134724857684,
      "grad_norm": 0.5766670451808246,
      "learning_rate": 6.18779263248971e-05,
      "loss": 0.4923,
      "step": 252
    },
    {
      "epoch": 1.920303605313093,
      "grad_norm": 1.1307550162308162,
      "learning_rate": 6.169900570981057e-05,
      "loss": 0.4991,
      "step": 253
    },
    {
      "epoch": 1.9278937381404173,
      "grad_norm": 1.138869550845278,
      "learning_rate": 6.151946777811729e-05,
      "loss": 0.4998,
      "step": 254
    },
    {
      "epoch": 1.935483870967742,
      "grad_norm": 0.6269758422232977,
      "learning_rate": 6.133931763750509e-05,
      "loss": 0.4933,
      "step": 255
    },
    {
      "epoch": 1.9430740037950665,
      "grad_norm": 0.7710149723845751,
      "learning_rate": 6.11585604130785e-05,
      "loss": 0.4944,
      "step": 256
    },
    {
      "epoch": 1.950664136622391,
      "grad_norm": 0.9641556034924468,
      "learning_rate": 6.097720124721311e-05,
      "loss": 0.4915,
      "step": 257
    },
    {
      "epoch": 1.9582542694497154,
      "grad_norm": 0.8101487252514183,
      "learning_rate": 6.079524529940911e-05,
      "loss": 0.4788,
      "step": 258
    },
    {
      "epoch": 1.9658444022770398,
      "grad_norm": 0.6731500817613972,
      "learning_rate": 6.0612697746144664e-05,
      "loss": 0.4887,
      "step": 259
    },
    {
      "epoch": 1.9734345351043643,
      "grad_norm": 0.66266631987093,
      "learning_rate": 6.0429563780728476e-05,
      "loss": 0.4888,
      "step": 260
    },
    {
      "epoch": 1.9810246679316887,
      "grad_norm": 0.5402551506844365,
      "learning_rate": 6.02458486131522e-05,
      "loss": 0.4831,
      "step": 261
    },
    {
      "epoch": 1.9886148007590134,
      "grad_norm": 0.6879216139275022,
      "learning_rate": 6.006155746994212e-05,
      "loss": 0.491,
      "step": 262
    },
    {
      "epoch": 1.9962049335863377,
      "grad_norm": 0.9539606050998473,
      "learning_rate": 5.98766955940105e-05,
      "loss": 0.5341,
      "step": 263
    },
    {
      "epoch": 2.0037950664136623,
      "grad_norm": 1.2929340536370602,
      "learning_rate": 5.969126824450643e-05,
      "loss": 0.5524,
      "step": 264
    },
    {
      "epoch": 2.0113851992409866,
      "grad_norm": 0.6792026166979978,
      "learning_rate": 5.9505280696666174e-05,
      "loss": 0.4671,
      "step": 265
    },
    {
      "epoch": 2.0189753320683113,
      "grad_norm": 0.6570978500488273,
      "learning_rate": 5.931873824166316e-05,
      "loss": 0.458,
      "step": 266
    },
    {
      "epoch": 2.0265654648956355,
      "grad_norm": 0.8625246084442377,
      "learning_rate": 5.913164618645738e-05,
      "loss": 0.4646,
      "step": 267
    },
    {
      "epoch": 2.03415559772296,
      "grad_norm": 0.8463370840972069,
      "learning_rate": 5.894400985364444e-05,
      "loss": 0.4503,
      "step": 268
    },
    {
      "epoch": 2.041745730550285,
      "grad_norm": 0.5846678229118594,
      "learning_rate": 5.875583458130417e-05,
      "loss": 0.452,
      "step": 269
    },
    {
      "epoch": 2.049335863377609,
      "grad_norm": 0.48959366327046705,
      "learning_rate": 5.856712572284868e-05,
      "loss": 0.4608,
      "step": 270
    },
    {
      "epoch": 2.0569259962049338,
      "grad_norm": 0.5808495151777524,
      "learning_rate": 5.8377888646870154e-05,
      "loss": 0.4572,
      "step": 271
    },
    {
      "epoch": 2.064516129032258,
      "grad_norm": 0.5154615059210003,
      "learning_rate": 5.818812873698809e-05,
      "loss": 0.4555,
      "step": 272
    },
    {
      "epoch": 2.0721062618595827,
      "grad_norm": 0.5247505353737575,
      "learning_rate": 5.799785139169606e-05,
      "loss": 0.4493,
      "step": 273
    },
    {
      "epoch": 2.079696394686907,
      "grad_norm": 0.6700114330504865,
      "learning_rate": 5.7807062024208256e-05,
      "loss": 0.4593,
      "step": 274
    },
    {
      "epoch": 2.0872865275142316,
      "grad_norm": 0.6564087028803952,
      "learning_rate": 5.761576606230538e-05,
      "loss": 0.4543,
      "step": 275
    },
    {
      "epoch": 2.094876660341556,
      "grad_norm": 0.6170822532663903,
      "learning_rate": 5.742396894818031e-05,
      "loss": 0.4585,
      "step": 276
    },
    {
      "epoch": 2.1024667931688805,
      "grad_norm": 0.5359408843960233,
      "learning_rate": 5.723167613828324e-05,
      "loss": 0.4571,
      "step": 277
    },
    {
      "epoch": 2.1100569259962048,
      "grad_norm": 0.42551634695058566,
      "learning_rate": 5.7038893103166425e-05,
      "loss": 0.4553,
      "step": 278
    },
    {
      "epoch": 2.1176470588235294,
      "grad_norm": 0.25776313987894806,
      "learning_rate": 5.684562532732859e-05,
      "loss": 0.4467,
      "step": 279
    },
    {
      "epoch": 2.1252371916508537,
      "grad_norm": 0.27351669144074725,
      "learning_rate": 5.665187830905888e-05,
      "loss": 0.4415,
      "step": 280
    },
    {
      "epoch": 2.1328273244781784,
      "grad_norm": 0.41764999814129333,
      "learning_rate": 5.645765756028045e-05,
      "loss": 0.459,
      "step": 281
    },
    {
      "epoch": 2.140417457305503,
      "grad_norm": 0.4715282529881882,
      "learning_rate": 5.626296860639364e-05,
      "loss": 0.4535,
      "step": 282
    },
    {
      "epoch": 2.1480075901328273,
      "grad_norm": 0.45181614089506017,
      "learning_rate": 5.606781698611879e-05,
      "loss": 0.4557,
      "step": 283
    },
    {
      "epoch": 2.155597722960152,
      "grad_norm": 0.3928688694632629,
      "learning_rate": 5.587220825133867e-05,
      "loss": 0.4529,
      "step": 284
    },
    {
      "epoch": 2.163187855787476,
      "grad_norm": 0.3422352007203858,
      "learning_rate": 5.567614796694056e-05,
      "loss": 0.4478,
      "step": 285
    },
    {
      "epoch": 2.170777988614801,
      "grad_norm": 0.3858181479438661,
      "learning_rate": 5.5479641710657867e-05,
      "loss": 0.461,
      "step": 286
    },
    {
      "epoch": 2.178368121442125,
      "grad_norm": 0.4901941376432685,
      "learning_rate": 5.528269507291152e-05,
      "loss": 0.4533,
      "step": 287
    },
    {
      "epoch": 2.18595825426945,
      "grad_norm": 0.6077838701042644,
      "learning_rate": 5.5085313656650856e-05,
      "loss": 0.4565,
      "step": 288
    },
    {
      "epoch": 2.193548387096774,
      "grad_norm": 0.6334250948792183,
      "learning_rate": 5.48875030771943e-05,
      "loss": 0.4526,
      "step": 289
    },
    {
      "epoch": 2.2011385199240987,
      "grad_norm": 0.5394180746780861,
      "learning_rate": 5.468926896206955e-05,
      "loss": 0.4474,
      "step": 290
    },
    {
      "epoch": 2.2087286527514234,
      "grad_norm": 0.3688187782872463,
      "learning_rate": 5.4490616950853484e-05,
      "loss": 0.4486,
      "step": 291
    },
    {
      "epoch": 2.2163187855787476,
      "grad_norm": 0.28612624363569344,
      "learning_rate": 5.4291552695011786e-05,
      "loss": 0.4473,
      "step": 292
    },
    {
      "epoch": 2.2239089184060723,
      "grad_norm": 0.3786323162444375,
      "learning_rate": 5.409208185773806e-05,
      "loss": 0.4537,
      "step": 293
    },
    {
      "epoch": 2.2314990512333965,
      "grad_norm": 0.45998197157742643,
      "learning_rate": 5.389221011379281e-05,
      "loss": 0.445,
      "step": 294
    },
    {
      "epoch": 2.239089184060721,
      "grad_norm": 0.4227537195267863,
      "learning_rate": 5.3691943149341976e-05,
      "loss": 0.4524,
      "step": 295
    },
    {
      "epoch": 2.2466793168880455,
      "grad_norm": 0.3375900744876679,
      "learning_rate": 5.3491286661795104e-05,
      "loss": 0.4543,
      "step": 296
    },
    {
      "epoch": 2.25426944971537,
      "grad_norm": 0.3936283250723083,
      "learning_rate": 5.3290246359643365e-05,
      "loss": 0.4549,
      "step": 297
    },
    {
      "epoch": 2.2618595825426944,
      "grad_norm": 0.4158907529340202,
      "learning_rate": 5.3088827962297055e-05,
      "loss": 0.4615,
      "step": 298
    },
    {
      "epoch": 2.269449715370019,
      "grad_norm": 0.3573969971167834,
      "learning_rate": 5.288703719992296e-05,
      "loss": 0.4627,
      "step": 299
    },
    {
      "epoch": 2.2770398481973433,
      "grad_norm": 0.29339247941077856,
      "learning_rate": 5.2684879813281324e-05,
      "loss": 0.4527,
      "step": 300
    },
    {
      "epoch": 2.284629981024668,
      "grad_norm": 0.3811958753473836,
      "learning_rate": 5.248236155356244e-05,
      "loss": 0.4511,
      "step": 301
    },
    {
      "epoch": 2.292220113851992,
      "grad_norm": 0.3947350372974727,
      "learning_rate": 5.227948818222317e-05,
      "loss": 0.4551,
      "step": 302
    },
    {
      "epoch": 2.299810246679317,
      "grad_norm": 0.2959934006358651,
      "learning_rate": 5.207626547082294e-05,
      "loss": 0.451,
      "step": 303
    },
    {
      "epoch": 2.3074003795066416,
      "grad_norm": 0.3009410854470416,
      "learning_rate": 5.1872699200859606e-05,
      "loss": 0.4504,
      "step": 304
    },
    {
      "epoch": 2.314990512333966,
      "grad_norm": 0.38647651793826754,
      "learning_rate": 5.1668795163604924e-05,
      "loss": 0.4575,
      "step": 305
    },
    {
      "epoch": 2.3225806451612905,
      "grad_norm": 0.34305172614808316,
      "learning_rate": 5.1464559159939814e-05,
      "loss": 0.4513,
      "step": 306
    },
    {
      "epoch": 2.3301707779886147,
      "grad_norm": 0.3120007036591175,
      "learning_rate": 5.125999700018934e-05,
      "loss": 0.4601,
      "step": 307
    },
    {
      "epoch": 2.3377609108159394,
      "grad_norm": 0.31088228173825794,
      "learning_rate": 5.105511450395742e-05,
      "loss": 0.4605,
      "step": 308
    },
    {
      "epoch": 2.3453510436432636,
      "grad_norm": 0.24185319509946887,
      "learning_rate": 5.084991749996121e-05,
      "loss": 0.4544,
      "step": 309
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 0.3141319871949889,
      "learning_rate": 5.064441182586538e-05,
      "loss": 0.4477,
      "step": 310
    },
    {
      "epoch": 2.3605313092979125,
      "grad_norm": 0.3437798737764119,
      "learning_rate": 5.0438603328115915e-05,
      "loss": 0.438,
      "step": 311
    },
    {
      "epoch": 2.3681214421252372,
      "grad_norm": 0.3413170865670166,
      "learning_rate": 5.023249786177388e-05,
      "loss": 0.4496,
      "step": 312
    },
    {
      "epoch": 2.375711574952562,
      "grad_norm": 0.32816099400302223,
      "learning_rate": 5.002610129034883e-05,
      "loss": 0.4457,
      "step": 313
    },
    {
      "epoch": 2.383301707779886,
      "grad_norm": 0.23652738280230934,
      "learning_rate": 4.981941948563197e-05,
      "loss": 0.4518,
      "step": 314
    },
    {
      "epoch": 2.3908918406072104,
      "grad_norm": 0.3332470802079381,
      "learning_rate": 4.961245832752916e-05,
      "loss": 0.4553,
      "step": 315
    },
    {
      "epoch": 2.398481973434535,
      "grad_norm": 0.30703993672772734,
      "learning_rate": 4.940522370389355e-05,
      "loss": 0.4511,
      "step": 316
    },
    {
      "epoch": 2.4060721062618597,
      "grad_norm": 0.3458797214503799,
      "learning_rate": 4.919772151035819e-05,
      "loss": 0.4483,
      "step": 317
    },
    {
      "epoch": 2.413662239089184,
      "grad_norm": 0.33817212823710935,
      "learning_rate": 4.898995765016822e-05,
      "loss": 0.4602,
      "step": 318
    },
    {
      "epoch": 2.4212523719165087,
      "grad_norm": 0.28768592124027254,
      "learning_rate": 4.878193803401294e-05,
      "loss": 0.441,
      "step": 319
    },
    {
      "epoch": 2.428842504743833,
      "grad_norm": 0.24625871004420682,
      "learning_rate": 4.85736685798577e-05,
      "loss": 0.4447,
      "step": 320
    },
    {
      "epoch": 2.4364326375711576,
      "grad_norm": 0.3114815791554252,
      "learning_rate": 4.836515521277548e-05,
      "loss": 0.4506,
      "step": 321
    },
    {
      "epoch": 2.444022770398482,
      "grad_norm": 0.43608825596037326,
      "learning_rate": 4.8156403864778376e-05,
      "loss": 0.4559,
      "step": 322
    },
    {
      "epoch": 2.4516129032258065,
      "grad_norm": 0.3872177355726424,
      "learning_rate": 4.7947420474648826e-05,
      "loss": 0.4596,
      "step": 323
    },
    {
      "epoch": 2.4592030360531307,
      "grad_norm": 0.2265303368613466,
      "learning_rate": 4.773821098777061e-05,
      "loss": 0.4529,
      "step": 324
    },
    {
      "epoch": 2.4667931688804554,
      "grad_norm": 0.26489937931522084,
      "learning_rate": 4.7528781355959836e-05,
      "loss": 0.4462,
      "step": 325
    },
    {
      "epoch": 2.47438330170778,
      "grad_norm": 0.32008600117514796,
      "learning_rate": 4.731913753729543e-05,
      "loss": 0.4489,
      "step": 326
    },
    {
      "epoch": 2.4819734345351043,
      "grad_norm": 0.30655482675440676,
      "learning_rate": 4.710928549594979e-05,
      "loss": 0.4542,
      "step": 327
    },
    {
      "epoch": 2.489563567362429,
      "grad_norm": 0.24961472010620386,
      "learning_rate": 4.689923120201907e-05,
      "loss": 0.455,
      "step": 328
    },
    {
      "epoch": 2.4971537001897532,
      "grad_norm": 0.3196073862864069,
      "learning_rate": 4.668898063135327e-05,
      "loss": 0.4401,
      "step": 329
    },
    {
      "epoch": 2.504743833017078,
      "grad_norm": 0.277810170883558,
      "learning_rate": 4.647853976538635e-05,
      "loss": 0.4429,
      "step": 330
    },
    {
      "epoch": 2.512333965844402,
      "grad_norm": 0.2770203193332356,
      "learning_rate": 4.626791459096592e-05,
      "loss": 0.4509,
      "step": 331
    },
    {
      "epoch": 2.519924098671727,
      "grad_norm": 0.26941306970885837,
      "learning_rate": 4.605711110018307e-05,
      "loss": 0.4485,
      "step": 332
    },
    {
      "epoch": 2.527514231499051,
      "grad_norm": 0.2128205627033176,
      "learning_rate": 4.584613529020177e-05,
      "loss": 0.4567,
      "step": 333
    },
    {
      "epoch": 2.5351043643263758,
      "grad_norm": 0.2612809484941453,
      "learning_rate": 4.563499316308832e-05,
      "loss": 0.4454,
      "step": 334
    },
    {
      "epoch": 2.5426944971537004,
      "grad_norm": 0.2611991188114079,
      "learning_rate": 4.542369072564062e-05,
      "loss": 0.4527,
      "step": 335
    },
    {
      "epoch": 2.5502846299810247,
      "grad_norm": 0.21775843029252434,
      "learning_rate": 4.5212233989217217e-05,
      "loss": 0.4533,
      "step": 336
    },
    {
      "epoch": 2.557874762808349,
      "grad_norm": 0.24689100702507727,
      "learning_rate": 4.500062896956632e-05,
      "loss": 0.4564,
      "step": 337
    },
    {
      "epoch": 2.5654648956356736,
      "grad_norm": 0.26478079829629153,
      "learning_rate": 4.47888816866547e-05,
      "loss": 0.4529,
      "step": 338
    },
    {
      "epoch": 2.5730550284629983,
      "grad_norm": 0.27076572953883926,
      "learning_rate": 4.457699816449632e-05,
      "loss": 0.443,
      "step": 339
    },
    {
      "epoch": 2.5806451612903225,
      "grad_norm": 0.2578704602776011,
      "learning_rate": 4.436498443098108e-05,
      "loss": 0.4474,
      "step": 340
    },
    {
      "epoch": 2.588235294117647,
      "grad_norm": 0.22049010549186773,
      "learning_rate": 4.4152846517703265e-05,
      "loss": 0.45,
      "step": 341
    },
    {
      "epoch": 2.5958254269449714,
      "grad_norm": 0.24125071259305053,
      "learning_rate": 4.394059045978994e-05,
      "loss": 0.4481,
      "step": 342
    },
    {
      "epoch": 2.603415559772296,
      "grad_norm": 0.226901700766956,
      "learning_rate": 4.372822229572927e-05,
      "loss": 0.4457,
      "step": 343
    },
    {
      "epoch": 2.6110056925996203,
      "grad_norm": 0.2538357888941769,
      "learning_rate": 4.3515748067198734e-05,
      "loss": 0.4467,
      "step": 344
    },
    {
      "epoch": 2.618595825426945,
      "grad_norm": 0.24051209192684073,
      "learning_rate": 4.33031738188933e-05,
      "loss": 0.4612,
      "step": 345
    },
    {
      "epoch": 2.6261859582542693,
      "grad_norm": 0.1851624882291598,
      "learning_rate": 4.309050559835335e-05,
      "loss": 0.4447,
      "step": 346
    },
    {
      "epoch": 2.633776091081594,
      "grad_norm": 0.23729717589403226,
      "learning_rate": 4.287774945579268e-05,
      "loss": 0.4546,
      "step": 347
    },
    {
      "epoch": 2.6413662239089186,
      "grad_norm": 0.2414632155732589,
      "learning_rate": 4.266491144392646e-05,
      "loss": 0.4547,
      "step": 348
    },
    {
      "epoch": 2.648956356736243,
      "grad_norm": 0.1961262640553002,
      "learning_rate": 4.245199761779889e-05,
      "loss": 0.4528,
      "step": 349
    },
    {
      "epoch": 2.656546489563567,
      "grad_norm": 0.2519131470563294,
      "learning_rate": 4.223901403461104e-05,
      "loss": 0.4468,
      "step": 350
    },
    {
      "epoch": 2.6641366223908918,
      "grad_norm": 0.2871531404330494,
      "learning_rate": 4.202596675354851e-05,
      "loss": 0.4524,
      "step": 351
    },
    {
      "epoch": 2.6717267552182165,
      "grad_norm": 0.2328323960543026,
      "learning_rate": 4.1812861835609055e-05,
      "loss": 0.4477,
      "step": 352
    },
    {
      "epoch": 2.6793168880455407,
      "grad_norm": 0.34379368220276085,
      "learning_rate": 4.1599705343430126e-05,
      "loss": 0.4473,
      "step": 353
    },
    {
      "epoch": 2.6869070208728654,
      "grad_norm": 0.3077170581200678,
      "learning_rate": 4.138650334111641e-05,
      "loss": 0.4482,
      "step": 354
    },
    {
      "epoch": 2.6944971537001896,
      "grad_norm": 0.27170499447729773,
      "learning_rate": 4.117326189406733e-05,
      "loss": 0.4456,
      "step": 355
    },
    {
      "epoch": 2.7020872865275143,
      "grad_norm": 0.23574025133073204,
      "learning_rate": 4.095998706880449e-05,
      "loss": 0.441,
      "step": 356
    },
    {
      "epoch": 2.709677419354839,
      "grad_norm": 0.24705781444445205,
      "learning_rate": 4.0746684932799035e-05,
      "loss": 0.4546,
      "step": 357
    },
    {
      "epoch": 2.717267552182163,
      "grad_norm": 0.27539198214734206,
      "learning_rate": 4.05333615542991e-05,
      "loss": 0.4531,
      "step": 358
    },
    {
      "epoch": 2.7248576850094874,
      "grad_norm": 0.23526848668490832,
      "learning_rate": 4.032002300215715e-05,
      "loss": 0.4453,
      "step": 359
    },
    {
      "epoch": 2.732447817836812,
      "grad_norm": 0.23006904681882934,
      "learning_rate": 4.01066753456573e-05,
      "loss": 0.4498,
      "step": 360
    },
    {
      "epoch": 2.740037950664137,
      "grad_norm": 0.2370274892416526,
      "learning_rate": 3.989332465434272e-05,
      "loss": 0.4453,
      "step": 361
    },
    {
      "epoch": 2.747628083491461,
      "grad_norm": 0.21309601710918213,
      "learning_rate": 3.9679976997842875e-05,
      "loss": 0.4477,
      "step": 362
    },
    {
      "epoch": 2.7552182163187857,
      "grad_norm": 0.24539024717561866,
      "learning_rate": 3.946663844570091e-05,
      "loss": 0.4476,
      "step": 363
    },
    {
      "epoch": 2.76280834914611,
      "grad_norm": 0.18994827411578624,
      "learning_rate": 3.925331506720097e-05,
      "loss": 0.4464,
      "step": 364
    },
    {
      "epoch": 2.7703984819734346,
      "grad_norm": 0.23585418564376148,
      "learning_rate": 3.9040012931195515e-05,
      "loss": 0.4518,
      "step": 365
    },
    {
      "epoch": 2.777988614800759,
      "grad_norm": 0.2781600210524234,
      "learning_rate": 3.8826738105932674e-05,
      "loss": 0.4446,
      "step": 366
    },
    {
      "epoch": 2.7855787476280836,
      "grad_norm": 0.22272463213128638,
      "learning_rate": 3.8613496658883593e-05,
      "loss": 0.4593,
      "step": 367
    },
    {
      "epoch": 2.793168880455408,
      "grad_norm": 0.21131624600480586,
      "learning_rate": 3.8400294656569894e-05,
      "loss": 0.4553,
      "step": 368
    },
    {
      "epoch": 2.8007590132827325,
      "grad_norm": 0.31839013252012877,
      "learning_rate": 3.818713816439096e-05,
      "loss": 0.4548,
      "step": 369
    },
    {
      "epoch": 2.808349146110057,
      "grad_norm": 0.28626008068744174,
      "learning_rate": 3.7974033246451496e-05,
      "loss": 0.4454,
      "step": 370
    },
    {
      "epoch": 2.8159392789373814,
      "grad_norm": 0.16805045978558228,
      "learning_rate": 3.7760985965388975e-05,
| "loss": 0.4533, |
| "step": 371 |
| }, |
| { |
| "epoch": 2.8235294117647056, |
| "grad_norm": 0.27576140446733527, |
| "learning_rate": 3.7548002382201126e-05, |
| "loss": 0.4528, |
| "step": 372 |
| }, |
| { |
| "epoch": 2.8311195445920303, |
| "grad_norm": 0.2587367405759775, |
| "learning_rate": 3.7335088556073554e-05, |
| "loss": 0.4525, |
| "step": 373 |
| }, |
| { |
| "epoch": 2.838709677419355, |
| "grad_norm": 0.2177015252391427, |
| "learning_rate": 3.712225054420732e-05, |
| "loss": 0.4466, |
| "step": 374 |
| }, |
| { |
| "epoch": 2.846299810246679, |
| "grad_norm": 0.22016300401684588, |
| "learning_rate": 3.690949440164667e-05, |
| "loss": 0.4507, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.853889943074004, |
| "grad_norm": 0.2168768585860928, |
| "learning_rate": 3.669682618110671e-05, |
| "loss": 0.4537, |
| "step": 376 |
| }, |
| { |
| "epoch": 2.861480075901328, |
| "grad_norm": 0.23069857787158976, |
| "learning_rate": 3.648425193280128e-05, |
| "loss": 0.4514, |
| "step": 377 |
| }, |
| { |
| "epoch": 2.869070208728653, |
| "grad_norm": 0.2308590735052973, |
| "learning_rate": 3.627177770427075e-05, |
| "loss": 0.4517, |
| "step": 378 |
| }, |
| { |
| "epoch": 2.8766603415559775, |
| "grad_norm": 0.16332486719450007, |
| "learning_rate": 3.6059409540210075e-05, |
| "loss": 0.4437, |
| "step": 379 |
| }, |
| { |
| "epoch": 2.8842504743833017, |
| "grad_norm": 0.2372916479550056, |
| "learning_rate": 3.5847153482296734e-05, |
| "loss": 0.4516, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.891840607210626, |
| "grad_norm": 0.25887011649794794, |
| "learning_rate": 3.563501556901892e-05, |
| "loss": 0.4484, |
| "step": 381 |
| }, |
| { |
| "epoch": 2.8994307400379506, |
| "grad_norm": 0.19423313672626585, |
| "learning_rate": 3.5423001835503696e-05, |
| "loss": 0.4489, |
| "step": 382 |
| }, |
| { |
| "epoch": 2.9070208728652753, |
| "grad_norm": 0.2299380518686083, |
| "learning_rate": 3.521111831334532e-05, |
| "loss": 0.4458, |
| "step": 383 |
| }, |
| { |
| "epoch": 2.9146110056925996, |
| "grad_norm": 0.19304361317692467, |
| "learning_rate": 3.4999371030433694e-05, |
| "loss": 0.4527, |
| "step": 384 |
| }, |
| { |
| "epoch": 2.9222011385199242, |
| "grad_norm": 0.19969286229808306, |
| "learning_rate": 3.47877660107828e-05, |
| "loss": 0.4417, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.9297912713472485, |
| "grad_norm": 0.26755657071262773, |
| "learning_rate": 3.4576309274359394e-05, |
| "loss": 0.4611, |
| "step": 386 |
| }, |
| { |
| "epoch": 2.937381404174573, |
| "grad_norm": 0.20374725518267028, |
| "learning_rate": 3.436500683691168e-05, |
| "loss": 0.4582, |
| "step": 387 |
| }, |
| { |
| "epoch": 2.9449715370018974, |
| "grad_norm": 0.2258822384811313, |
| "learning_rate": 3.4153864709798234e-05, |
| "loss": 0.4475, |
| "step": 388 |
| }, |
| { |
| "epoch": 2.952561669829222, |
| "grad_norm": 0.18638500489261803, |
| "learning_rate": 3.394288889981695e-05, |
| "loss": 0.445, |
| "step": 389 |
| }, |
| { |
| "epoch": 2.9601518026565463, |
| "grad_norm": 0.2080901768578384, |
| "learning_rate": 3.373208540903409e-05, |
| "loss": 0.4515, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.967741935483871, |
| "grad_norm": 0.22482754779566425, |
| "learning_rate": 3.3521460234613664e-05, |
| "loss": 0.4476, |
| "step": 391 |
| }, |
| { |
| "epoch": 2.9753320683111957, |
| "grad_norm": 0.21178017233679877, |
| "learning_rate": 3.331101936864674e-05, |
| "loss": 0.4503, |
| "step": 392 |
| }, |
| { |
| "epoch": 2.98292220113852, |
| "grad_norm": 0.1947806663328843, |
| "learning_rate": 3.310076879798095e-05, |
| "loss": 0.4415, |
| "step": 393 |
| }, |
| { |
| "epoch": 2.990512333965844, |
| "grad_norm": 0.18579209774749325, |
| "learning_rate": 3.2890714504050216e-05, |
| "loss": 0.446, |
| "step": 394 |
| }, |
| { |
| "epoch": 2.998102466793169, |
| "grad_norm": 0.23138108723134526, |
| "learning_rate": 3.268086246270458e-05, |
| "loss": 0.5364, |
| "step": 395 |
| }, |
| { |
| "epoch": 3.0056925996204935, |
| "grad_norm": 0.26045279058003457, |
| "learning_rate": 3.2471218644040184e-05, |
| "loss": 0.4487, |
| "step": 396 |
| }, |
| { |
| "epoch": 3.0132827324478177, |
| "grad_norm": 0.21695656813890685, |
| "learning_rate": 3.2261789012229394e-05, |
| "loss": 0.4084, |
| "step": 397 |
| }, |
| { |
| "epoch": 3.0208728652751424, |
| "grad_norm": 0.23426334273283442, |
| "learning_rate": 3.205257952535119e-05, |
| "loss": 0.4079, |
| "step": 398 |
| }, |
| { |
| "epoch": 3.0284629981024667, |
| "grad_norm": 0.26850763056052446, |
| "learning_rate": 3.184359613522163e-05, |
| "loss": 0.4223, |
| "step": 399 |
| }, |
| { |
| "epoch": 3.0360531309297913, |
| "grad_norm": 0.2683676862064319, |
| "learning_rate": 3.1634844787224525e-05, |
| "loss": 0.4182, |
| "step": 400 |
| }, |
| { |
| "epoch": 3.0436432637571156, |
| "grad_norm": 0.26248839521550266, |
| "learning_rate": 3.1426331420142314e-05, |
| "loss": 0.4171, |
| "step": 401 |
| }, |
| { |
| "epoch": 3.0512333965844403, |
| "grad_norm": 0.24770151611304786, |
| "learning_rate": 3.121806196598706e-05, |
| "loss": 0.4023, |
| "step": 402 |
| }, |
| { |
| "epoch": 3.0588235294117645, |
| "grad_norm": 0.2613619938203551, |
| "learning_rate": 3.10100423498318e-05, |
| "loss": 0.4095, |
| "step": 403 |
| }, |
| { |
| "epoch": 3.066413662239089, |
| "grad_norm": 0.2563470656186417, |
| "learning_rate": 3.0802278489641816e-05, |
| "loss": 0.4101, |
| "step": 404 |
| }, |
| { |
| "epoch": 3.074003795066414, |
| "grad_norm": 0.22055572471052182, |
| "learning_rate": 3.0594776296106464e-05, |
| "loss": 0.4105, |
| "step": 405 |
| }, |
| { |
| "epoch": 3.081593927893738, |
| "grad_norm": 0.22294658539566273, |
| "learning_rate": 3.0387541672470857e-05, |
| "loss": 0.4038, |
| "step": 406 |
| }, |
| { |
| "epoch": 3.0891840607210628, |
| "grad_norm": 0.24713851379128027, |
| "learning_rate": 3.0180580514368037e-05, |
| "loss": 0.406, |
| "step": 407 |
| }, |
| { |
| "epoch": 3.096774193548387, |
| "grad_norm": 0.20233125052294548, |
| "learning_rate": 2.997389870965118e-05, |
| "loss": 0.4067, |
| "step": 408 |
| }, |
| { |
| "epoch": 3.1043643263757117, |
| "grad_norm": 0.2268353462063368, |
| "learning_rate": 2.976750213822613e-05, |
| "loss": 0.4069, |
| "step": 409 |
| }, |
| { |
| "epoch": 3.111954459203036, |
| "grad_norm": 0.22067595331379308, |
| "learning_rate": 2.9561396671884105e-05, |
| "loss": 0.414, |
| "step": 410 |
| }, |
| { |
| "epoch": 3.1195445920303606, |
| "grad_norm": 0.19168123310877352, |
| "learning_rate": 2.9355588174134627e-05, |
| "loss": 0.4052, |
| "step": 411 |
| }, |
| { |
| "epoch": 3.127134724857685, |
| "grad_norm": 0.21765986129210937, |
| "learning_rate": 2.9150082500038794e-05, |
| "loss": 0.4084, |
| "step": 412 |
| }, |
| { |
| "epoch": 3.1347248576850095, |
| "grad_norm": 0.17465524431494953, |
| "learning_rate": 2.8944885496042593e-05, |
| "loss": 0.4039, |
| "step": 413 |
| }, |
| { |
| "epoch": 3.1423149905123338, |
| "grad_norm": 0.16136937176528135, |
| "learning_rate": 2.874000299981067e-05, |
| "loss": 0.4077, |
| "step": 414 |
| }, |
| { |
| "epoch": 3.1499051233396584, |
| "grad_norm": 0.1842039752946862, |
| "learning_rate": 2.8535440840060196e-05, |
| "loss": 0.4114, |
| "step": 415 |
| }, |
| { |
| "epoch": 3.157495256166983, |
| "grad_norm": 0.16275539565392577, |
| "learning_rate": 2.83312048363951e-05, |
| "loss": 0.4122, |
| "step": 416 |
| }, |
| { |
| "epoch": 3.1650853889943074, |
| "grad_norm": 0.17613818747948046, |
| "learning_rate": 2.812730079914041e-05, |
| "loss": 0.4078, |
| "step": 417 |
| }, |
| { |
| "epoch": 3.172675521821632, |
| "grad_norm": 0.1802212697189579, |
| "learning_rate": 2.7923734529177076e-05, |
| "loss": 0.4105, |
| "step": 418 |
| }, |
| { |
| "epoch": 3.1802656546489563, |
| "grad_norm": 0.14800969306703707, |
| "learning_rate": 2.772051181777684e-05, |
| "loss": 0.4153, |
| "step": 419 |
| }, |
| { |
| "epoch": 3.187855787476281, |
| "grad_norm": 0.17753235380567503, |
| "learning_rate": 2.7517638446437574e-05, |
| "loss": 0.4184, |
| "step": 420 |
| }, |
| { |
| "epoch": 3.195445920303605, |
| "grad_norm": 0.16819549000064993, |
| "learning_rate": 2.7315120186718686e-05, |
| "loss": 0.4065, |
| "step": 421 |
| }, |
| { |
| "epoch": 3.20303605313093, |
| "grad_norm": 0.18462973385672243, |
| "learning_rate": 2.7112962800077034e-05, |
| "loss": 0.4076, |
| "step": 422 |
| }, |
| { |
| "epoch": 3.210626185958254, |
| "grad_norm": 0.16361853377388974, |
| "learning_rate": 2.6911172037702962e-05, |
| "loss": 0.4095, |
| "step": 423 |
| }, |
| { |
| "epoch": 3.218216318785579, |
| "grad_norm": 0.18569025030207767, |
| "learning_rate": 2.6709753640356652e-05, |
| "loss": 0.4099, |
| "step": 424 |
| }, |
| { |
| "epoch": 3.225806451612903, |
| "grad_norm": 0.1703754043113873, |
| "learning_rate": 2.650871333820491e-05, |
| "loss": 0.411, |
| "step": 425 |
| }, |
| { |
| "epoch": 3.2333965844402277, |
| "grad_norm": 0.16840742937643677, |
| "learning_rate": 2.6308056850658038e-05, |
| "loss": 0.4114, |
| "step": 426 |
| }, |
| { |
| "epoch": 3.2409867172675524, |
| "grad_norm": 0.15285852823906035, |
| "learning_rate": 2.6107789886207195e-05, |
| "loss": 0.4064, |
| "step": 427 |
| }, |
| { |
| "epoch": 3.2485768500948766, |
| "grad_norm": 0.1713250127448791, |
| "learning_rate": 2.5907918142261944e-05, |
| "loss": 0.4167, |
| "step": 428 |
| }, |
| { |
| "epoch": 3.2561669829222013, |
| "grad_norm": 0.1734237035758403, |
| "learning_rate": 2.5708447304988227e-05, |
| "loss": 0.4053, |
| "step": 429 |
| }, |
| { |
| "epoch": 3.2637571157495255, |
| "grad_norm": 0.17043360973692148, |
| "learning_rate": 2.5509383049146532e-05, |
| "loss": 0.4037, |
| "step": 430 |
| }, |
| { |
| "epoch": 3.27134724857685, |
| "grad_norm": 0.16479095536499094, |
| "learning_rate": 2.5310731037930474e-05, |
| "loss": 0.4071, |
| "step": 431 |
| }, |
| { |
| "epoch": 3.2789373814041745, |
| "grad_norm": 0.17168397871604932, |
| "learning_rate": 2.5112496922805712e-05, |
| "loss": 0.4141, |
| "step": 432 |
| }, |
| { |
| "epoch": 3.286527514231499, |
| "grad_norm": 0.15615068257123285, |
| "learning_rate": 2.4914686343349158e-05, |
| "loss": 0.4051, |
| "step": 433 |
| }, |
| { |
| "epoch": 3.2941176470588234, |
| "grad_norm": 0.16211177369118324, |
| "learning_rate": 2.4717304927088493e-05, |
| "loss": 0.4091, |
| "step": 434 |
| }, |
| { |
| "epoch": 3.301707779886148, |
| "grad_norm": 0.17500505329691834, |
| "learning_rate": 2.4520358289342143e-05, |
| "loss": 0.4157, |
| "step": 435 |
| }, |
| { |
| "epoch": 3.3092979127134727, |
| "grad_norm": 0.16178628883403032, |
| "learning_rate": 2.4323852033059447e-05, |
| "loss": 0.4108, |
| "step": 436 |
| }, |
| { |
| "epoch": 3.316888045540797, |
| "grad_norm": 0.15428648205322795, |
| "learning_rate": 2.412779174866134e-05, |
| "loss": 0.4133, |
| "step": 437 |
| }, |
| { |
| "epoch": 3.324478178368121, |
| "grad_norm": 0.1648054065245008, |
| "learning_rate": 2.393218301388123e-05, |
| "loss": 0.4083, |
| "step": 438 |
| }, |
| { |
| "epoch": 3.332068311195446, |
| "grad_norm": 0.14344944433387424, |
| "learning_rate": 2.3737031393606376e-05, |
| "loss": 0.4115, |
| "step": 439 |
| }, |
| { |
| "epoch": 3.3396584440227706, |
| "grad_norm": 0.16677658490660136, |
| "learning_rate": 2.3542342439719565e-05, |
| "loss": 0.4101, |
| "step": 440 |
| }, |
| { |
| "epoch": 3.347248576850095, |
| "grad_norm": 0.16428323673049788, |
| "learning_rate": 2.3348121690941125e-05, |
| "loss": 0.4033, |
| "step": 441 |
| }, |
| { |
| "epoch": 3.3548387096774195, |
| "grad_norm": 0.14998088085310035, |
| "learning_rate": 2.3154374672671417e-05, |
| "loss": 0.4116, |
| "step": 442 |
| }, |
| { |
| "epoch": 3.3624288425047437, |
| "grad_norm": 0.1646349150252062, |
| "learning_rate": 2.2961106896833588e-05, |
| "loss": 0.4053, |
| "step": 443 |
| }, |
| { |
| "epoch": 3.3700189753320684, |
| "grad_norm": 0.1463235531505828, |
| "learning_rate": 2.2768323861716778e-05, |
| "loss": 0.4045, |
| "step": 444 |
| }, |
| { |
| "epoch": 3.3776091081593926, |
| "grad_norm": 0.18600338855222787, |
| "learning_rate": 2.2576031051819704e-05, |
| "loss": 0.4145, |
| "step": 445 |
| }, |
| { |
| "epoch": 3.3851992409867173, |
| "grad_norm": 0.15182364989779723, |
| "learning_rate": 2.2384233937694626e-05, |
| "loss": 0.412, |
| "step": 446 |
| }, |
| { |
| "epoch": 3.3927893738140416, |
| "grad_norm": 0.1891499854862187, |
| "learning_rate": 2.2192937975791757e-05, |
| "loss": 0.4039, |
| "step": 447 |
| }, |
| { |
| "epoch": 3.4003795066413662, |
| "grad_norm": 0.1524505414622732, |
| "learning_rate": 2.2002148608303947e-05, |
| "loss": 0.4059, |
| "step": 448 |
| }, |
| { |
| "epoch": 3.407969639468691, |
| "grad_norm": 0.1570451938127235, |
| "learning_rate": 2.1811871263011924e-05, |
| "loss": 0.4063, |
| "step": 449 |
| }, |
| { |
| "epoch": 3.415559772296015, |
| "grad_norm": 0.14780715290004184, |
| "learning_rate": 2.1622111353129832e-05, |
| "loss": 0.4137, |
| "step": 450 |
| }, |
| { |
| "epoch": 3.42314990512334, |
| "grad_norm": 0.154720916330876, |
| "learning_rate": 2.1432874277151337e-05, |
| "loss": 0.4072, |
| "step": 451 |
| }, |
| { |
| "epoch": 3.430740037950664, |
| "grad_norm": 0.14741003990298276, |
| "learning_rate": 2.124416541869586e-05, |
| "loss": 0.4106, |
| "step": 452 |
| }, |
| { |
| "epoch": 3.4383301707779887, |
| "grad_norm": 0.13756307467876858, |
| "learning_rate": 2.1055990146355566e-05, |
| "loss": 0.4176, |
| "step": 453 |
| }, |
| { |
| "epoch": 3.445920303605313, |
| "grad_norm": 0.1478071810974749, |
| "learning_rate": 2.0868353813542633e-05, |
| "loss": 0.4068, |
| "step": 454 |
| }, |
| { |
| "epoch": 3.4535104364326377, |
| "grad_norm": 0.14471344451828674, |
| "learning_rate": 2.068126175833685e-05, |
| "loss": 0.4118, |
| "step": 455 |
| }, |
| { |
| "epoch": 3.461100569259962, |
| "grad_norm": 0.1488219736052461, |
| "learning_rate": 2.0494719303333836e-05, |
| "loss": 0.412, |
| "step": 456 |
| }, |
| { |
| "epoch": 3.4686907020872866, |
| "grad_norm": 0.7113380874758816, |
| "learning_rate": 2.0308731755493577e-05, |
| "loss": 0.4155, |
| "step": 457 |
| }, |
| { |
| "epoch": 3.476280834914611, |
| "grad_norm": 0.13854452311181958, |
| "learning_rate": 2.012330440598952e-05, |
| "loss": 0.4058, |
| "step": 458 |
| }, |
| { |
| "epoch": 3.4838709677419355, |
| "grad_norm": 0.17245669157249005, |
| "learning_rate": 1.9938442530057904e-05, |
| "loss": 0.4158, |
| "step": 459 |
| }, |
| { |
| "epoch": 3.4914611005692597, |
| "grad_norm": 0.13334621900310906, |
| "learning_rate": 1.975415138684781e-05, |
| "loss": 0.4064, |
| "step": 460 |
| }, |
| { |
| "epoch": 3.4990512333965844, |
| "grad_norm": 0.1704714868106143, |
| "learning_rate": 1.9570436219271534e-05, |
| "loss": 0.4053, |
| "step": 461 |
| }, |
| { |
| "epoch": 3.506641366223909, |
| "grad_norm": 0.13841108978778513, |
| "learning_rate": 1.9387302253855353e-05, |
| "loss": 0.4084, |
| "step": 462 |
| }, |
| { |
| "epoch": 3.5142314990512333, |
| "grad_norm": 0.14973879725428832, |
| "learning_rate": 1.9204754700590878e-05, |
| "loss": 0.412, |
| "step": 463 |
| }, |
| { |
| "epoch": 3.521821631878558, |
| "grad_norm": 0.14960927710961097, |
| "learning_rate": 1.9022798752786896e-05, |
| "loss": 0.4118, |
| "step": 464 |
| }, |
| { |
| "epoch": 3.5294117647058822, |
| "grad_norm": 0.14735649088289546, |
| "learning_rate": 1.8841439586921515e-05, |
| "loss": 0.4066, |
| "step": 465 |
| }, |
| { |
| "epoch": 3.537001897533207, |
| "grad_norm": 0.15821300680395564, |
| "learning_rate": 1.8660682362494926e-05, |
| "loss": 0.416, |
| "step": 466 |
| }, |
| { |
| "epoch": 3.544592030360531, |
| "grad_norm": 0.14559017597813168, |
| "learning_rate": 1.848053222188271e-05, |
| "loss": 0.4095, |
| "step": 467 |
| }, |
| { |
| "epoch": 3.552182163187856, |
| "grad_norm": 0.15314476683793304, |
| "learning_rate": 1.8300994290189452e-05, |
| "loss": 0.4094, |
| "step": 468 |
| }, |
| { |
| "epoch": 3.55977229601518, |
| "grad_norm": 0.169795874861623, |
| "learning_rate": 1.8122073675102935e-05, |
| "loss": 0.418, |
| "step": 469 |
| }, |
| { |
| "epoch": 3.5673624288425048, |
| "grad_norm": 0.1443798018236273, |
| "learning_rate": 1.7943775466748867e-05, |
| "loss": 0.4086, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.5749525616698294, |
| "grad_norm": 0.16667220421287227, |
| "learning_rate": 1.7766104737546102e-05, |
| "loss": 0.4079, |
| "step": 471 |
| }, |
| { |
| "epoch": 3.5825426944971537, |
| "grad_norm": 0.1346641081419542, |
| "learning_rate": 1.7589066542062253e-05, |
| "loss": 0.4076, |
| "step": 472 |
| }, |
| { |
| "epoch": 3.590132827324478, |
| "grad_norm": 0.15846447495361166, |
| "learning_rate": 1.741266591686991e-05, |
| "loss": 0.4059, |
| "step": 473 |
| }, |
| { |
| "epoch": 3.5977229601518026, |
| "grad_norm": 0.1366720287265696, |
| "learning_rate": 1.7236907880403447e-05, |
| "loss": 0.4078, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.6053130929791273, |
| "grad_norm": 0.16252820939929033, |
| "learning_rate": 1.7061797432816138e-05, |
| "loss": 0.4073, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.6129032258064515, |
| "grad_norm": 0.1494339978487333, |
| "learning_rate": 1.6887339555837948e-05, |
| "loss": 0.4081, |
| "step": 476 |
| }, |
| { |
| "epoch": 3.620493358633776, |
| "grad_norm": 0.14531376606247468, |
| "learning_rate": 1.671353921263386e-05, |
| "loss": 0.4072, |
| "step": 477 |
| }, |
| { |
| "epoch": 3.6280834914611004, |
| "grad_norm": 0.1388375063144433, |
| "learning_rate": 1.654040134766259e-05, |
| "loss": 0.4075, |
| "step": 478 |
| }, |
| { |
| "epoch": 3.635673624288425, |
| "grad_norm": 0.13505123860416815, |
| "learning_rate": 1.6367930886535957e-05, |
| "loss": 0.4145, |
| "step": 479 |
| }, |
| { |
| "epoch": 3.64326375711575, |
| "grad_norm": 0.13194605801302411, |
| "learning_rate": 1.619613273587879e-05, |
| "loss": 0.4177, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.650853889943074, |
| "grad_norm": 0.14166452387033596, |
| "learning_rate": 1.602501178318928e-05, |
| "loss": 0.4161, |
| "step": 481 |
| }, |
| { |
| "epoch": 3.6584440227703983, |
| "grad_norm": 0.13583541432935878, |
| "learning_rate": 1.5854572896699977e-05, |
| "loss": 0.4105, |
| "step": 482 |
| }, |
| { |
| "epoch": 3.666034155597723, |
| "grad_norm": 0.15005698486131297, |
| "learning_rate": 1.5684820925239273e-05, |
| "loss": 0.398, |
| "step": 483 |
| }, |
| { |
| "epoch": 3.6736242884250476, |
| "grad_norm": 0.13590326204495468, |
| "learning_rate": 1.5515760698093485e-05, |
| "loss": 0.408, |
| "step": 484 |
| }, |
| { |
| "epoch": 3.681214421252372, |
| "grad_norm": 0.14280292743110926, |
| "learning_rate": 1.5347397024869423e-05, |
| "loss": 0.4102, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.6888045540796965, |
| "grad_norm": 0.14960004062226398, |
| "learning_rate": 1.5179734695357584e-05, |
| "loss": 0.4048, |
| "step": 486 |
| }, |
| { |
| "epoch": 3.6963946869070208, |
| "grad_norm": 0.14308871822409527, |
| "learning_rate": 1.5012778479395892e-05, |
| "loss": 0.41, |
| "step": 487 |
| }, |
| { |
| "epoch": 3.7039848197343455, |
| "grad_norm": 0.17743355555341997, |
| "learning_rate": 1.4846533126733999e-05, |
| "loss": 0.4066, |
| "step": 488 |
| }, |
| { |
| "epoch": 3.7115749525616697, |
| "grad_norm": 0.13804122945298328, |
| "learning_rate": 1.4681003366898132e-05, |
| "loss": 0.4108, |
| "step": 489 |
| }, |
| { |
| "epoch": 3.7191650853889944, |
| "grad_norm": 0.13970618164447296, |
| "learning_rate": 1.4516193909056609e-05, |
| "loss": 0.4029, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.7267552182163186, |
| "grad_norm": 0.16177493370388654, |
| "learning_rate": 1.4352109441885786e-05, |
| "loss": 0.4083, |
| "step": 491 |
| }, |
| { |
| "epoch": 3.7343453510436433, |
| "grad_norm": 0.12082905813373115, |
| "learning_rate": 1.4188754633436718e-05, |
| "loss": 0.4013, |
| "step": 492 |
| }, |
| { |
| "epoch": 3.741935483870968, |
| "grad_norm": 0.14795325276605165, |
| "learning_rate": 1.4026134131002347e-05, |
| "loss": 0.4101, |
| "step": 493 |
| }, |
| { |
| "epoch": 3.749525616698292, |
| "grad_norm": 0.14127927316169223, |
| "learning_rate": 1.3864252560985283e-05, |
| "loss": 0.414, |
| "step": 494 |
| }, |
| { |
| "epoch": 3.7571157495256164, |
| "grad_norm": 0.13865730984205682, |
| "learning_rate": 1.3703114528766203e-05, |
| "loss": 0.4029, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.764705882352941, |
| "grad_norm": 0.14552488808913097, |
| "learning_rate": 1.35427246185728e-05, |
| "loss": 0.4073, |
| "step": 496 |
| }, |
| { |
| "epoch": 3.772296015180266, |
| "grad_norm": 0.1357309007526371, |
| "learning_rate": 1.3383087393349436e-05, |
| "loss": 0.4091, |
| "step": 497 |
| }, |
| { |
| "epoch": 3.77988614800759, |
| "grad_norm": 0.1324930748382244, |
| "learning_rate": 1.3224207394627241e-05, |
| "loss": 0.4122, |
| "step": 498 |
| }, |
| { |
| "epoch": 3.7874762808349147, |
| "grad_norm": 0.1362271179146437, |
| "learning_rate": 1.306608914239496e-05, |
| "loss": 0.4041, |
| "step": 499 |
| }, |
| { |
| "epoch": 3.795066413662239, |
| "grad_norm": 0.12409927061704383, |
| "learning_rate": 1.2908737134970367e-05, |
| "loss": 0.4056, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.8026565464895636, |
| "grad_norm": 0.13343164571477334, |
| "learning_rate": 1.2752155848872266e-05, |
| "loss": 0.4096, |
| "step": 501 |
| }, |
| { |
| "epoch": 3.8102466793168883, |
| "grad_norm": 0.11787497363701406, |
| "learning_rate": 1.2596349738693162e-05, |
| "loss": 0.3975, |
| "step": 502 |
| }, |
| { |
| "epoch": 3.8178368121442126, |
| "grad_norm": 0.12436712059884664, |
| "learning_rate": 1.2441323236972536e-05, |
| "loss": 0.4103, |
| "step": 503 |
| }, |
| { |
| "epoch": 3.825426944971537, |
| "grad_norm": 0.1217813385845422, |
| "learning_rate": 1.2287080754070719e-05, |
| "loss": 0.407, |
| "step": 504 |
| }, |
| { |
| "epoch": 3.8330170777988615, |
| "grad_norm": 0.11341001168506011, |
| "learning_rate": 1.2133626678043426e-05, |
| "loss": 0.4113, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.840607210626186, |
| "grad_norm": 0.10943902346608907, |
| "learning_rate": 1.1980965374516922e-05, |
| "loss": 0.4042, |
| "step": 506 |
| }, |
| { |
| "epoch": 3.8481973434535104, |
| "grad_norm": 0.13586692787728996, |
| "learning_rate": 1.1829101186563876e-05, |
| "loss": 0.4149, |
| "step": 507 |
| }, |
| { |
| "epoch": 3.855787476280835, |
| "grad_norm": 0.11690469403831676, |
| "learning_rate": 1.167803843457969e-05, |
| "loss": 0.4174, |
| "step": 508 |
| }, |
| { |
| "epoch": 3.8633776091081593, |
| "grad_norm": 0.11141063470580559, |
| "learning_rate": 1.1527781416159684e-05, |
| "loss": 0.4064, |
| "step": 509 |
| }, |
| { |
| "epoch": 3.870967741935484, |
| "grad_norm": 0.14761043009465893, |
| "learning_rate": 1.1378334405976829e-05, |
| "loss": 0.4095, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.878557874762808, |
| "grad_norm": 0.12802936815914415, |
| "learning_rate": 1.122970165566009e-05, |
| "loss": 0.4126, |
| "step": 511 |
| }, |
| { |
| "epoch": 3.886148007590133, |
| "grad_norm": 0.11715478782287307, |
| "learning_rate": 1.1081887393673481e-05, |
| "loss": 0.4039, |
| "step": 512 |
| }, |
| { |
| "epoch": 3.893738140417457, |
| "grad_norm": 0.12113439474595457, |
| "learning_rate": 1.0934895825195807e-05, |
| "loss": 0.4039, |
| "step": 513 |
| }, |
| { |
| "epoch": 3.901328273244782, |
| "grad_norm": 0.12045212518695263, |
| "learning_rate": 1.0788731132000985e-05, |
| "loss": 0.4157, |
| "step": 514 |
| }, |
| { |
| "epoch": 3.9089184060721065, |
| "grad_norm": 0.11325751581490795, |
| "learning_rate": 1.0643397472339103e-05, |
| "loss": 0.4058, |
| "step": 515 |
| }, |
| { |
| "epoch": 3.9165085388994307, |
| "grad_norm": 0.11669592480574363, |
| "learning_rate": 1.0498898980818115e-05, |
| "loss": 0.4082, |
| "step": 516 |
| }, |
| { |
| "epoch": 3.924098671726755, |
| "grad_norm": 0.11461527608695707, |
| "learning_rate": 1.035523976828623e-05, |
| "loss": 0.419, |
| "step": 517 |
| }, |
| { |
| "epoch": 3.9316888045540797, |
| "grad_norm": 0.11809514587490393, |
| "learning_rate": 1.0212423921714923e-05, |
| "loss": 0.4158, |
| "step": 518 |
| }, |
| { |
| "epoch": 3.9392789373814043, |
| "grad_norm": 0.11115980306830088, |
| "learning_rate": 1.0070455504082695e-05, |
| "loss": 0.4095, |
| "step": 519 |
| }, |
| { |
| "epoch": 3.9468690702087286, |
| "grad_norm": 0.114832121716292, |
| "learning_rate": 9.92933855425951e-06, |
| "loss": 0.4154, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.9544592030360532, |
| "grad_norm": 0.11130484634692327, |
| "learning_rate": 9.789077086891802e-06, |
| "loss": 0.4137, |
| "step": 521 |
| }, |
| { |
| "epoch": 3.9620493358633775, |
| "grad_norm": 0.10695561553946319, |
| "learning_rate": 9.649675092288366e-06, |
| "loss": 0.4006, |
| "step": 522 |
| }, |
| { |
| "epoch": 3.969639468690702, |
| "grad_norm": 0.12052592151934423, |
| "learning_rate": 9.511136536306793e-06, |
| "loss": 0.4082, |
| "step": 523 |
| }, |
| { |
| "epoch": 3.9772296015180264, |
| "grad_norm": 0.10913274770762135, |
| "learning_rate": 9.373465360240627e-06, |
| "loss": 0.4134, |
| "step": 524 |
| }, |
| { |
| "epoch": 3.984819734345351, |
| "grad_norm": 0.11482531714881333, |
| "learning_rate": 9.236665480707266e-06, |
| "loss": 0.405, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.9924098671726753, |
| "grad_norm": 0.11550707751512918, |
| "learning_rate": 9.100740789536515e-06, |
| "loss": 0.4061, |
| "step": 526 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.14100318690739433, |
| "learning_rate": 8.96569515365993e-06, |
| "loss": 0.5074, |
| "step": 527 |
| }, |
| { |
| "epoch": 4.007590132827325, |
| "grad_norm": 0.19514222390213506, |
| "learning_rate": 8.831532415000685e-06, |
| "loss": 0.3785, |
| "step": 528 |
| }, |
| { |
| "epoch": 4.015180265654649, |
| "grad_norm": 0.14576099759098526, |
| "learning_rate": 8.698256390364386e-06, |
| "loss": 0.373, |
| "step": 529 |
| }, |
| { |
| "epoch": 4.022770398481973, |
| "grad_norm": 0.13042501540336135, |
| "learning_rate": 8.565870871330463e-06, |
| "loss": 0.3799, |
| "step": 530 |
| }, |
| { |
| "epoch": 4.030360531309298, |
| "grad_norm": 0.1565052782903353, |
| "learning_rate": 8.434379624144261e-06, |
| "loss": 0.3881, |
| "step": 531 |
| }, |
| { |
| "epoch": 4.0379506641366225, |
| "grad_norm": 0.16603797889226304, |
| "learning_rate": 8.303786389609914e-06, |
| "loss": 0.386, |
| "step": 532 |
| }, |
| { |
| "epoch": 4.045540796963947, |
| "grad_norm": 0.16546797622934456, |
| "learning_rate": 8.17409488298396e-06, |
| "loss": 0.3847, |
| "step": 533 |
| }, |
| { |
| "epoch": 4.053130929791271, |
| "grad_norm": 0.14996763626729143, |
| "learning_rate": 8.0453087938696e-06, |
| "loss": 0.3816, |
| "step": 534 |
| }, |
| { |
| "epoch": 4.060721062618596, |
| "grad_norm": 0.14469013701568176, |
| "learning_rate": 7.917431786111698e-06, |
| "loss": 0.3814, |
| "step": 535 |
| }, |
| { |
| "epoch": 4.06831119544592, |
| "grad_norm": 0.14926356991103681, |
| "learning_rate": 7.790467497692678e-06, |
| "loss": 0.3779, |
| "step": 536 |
| }, |
| { |
| "epoch": 4.075901328273245, |
| "grad_norm": 0.1655187733066776, |
| "learning_rate": 7.664419540628886e-06, |
| "loss": 0.3884, |
| "step": 537 |
| }, |
| { |
| "epoch": 4.08349146110057, |
| "grad_norm": 0.16471106758966367, |
| "learning_rate": 7.539291500867918e-06, |
| "loss": 0.3823, |
| "step": 538 |
| }, |
| { |
| "epoch": 4.0910815939278935, |
| "grad_norm": 0.146447604920726, |
| "learning_rate": 7.415086938186542e-06, |
| "loss": 0.392, |
| "step": 539 |
| }, |
| { |
| "epoch": 4.098671726755218, |
| "grad_norm": 0.14674866169218687, |
| "learning_rate": 7.291809386089515e-06, |
| "loss": 0.3807, |
| "step": 540 |
| }, |
| { |
| "epoch": 4.106261859582543, |
| "grad_norm": 0.1654058535020922, |
| "learning_rate": 7.169462351708958e-06, |
| "loss": 0.3852, |
| "step": 541 |
| }, |
| { |
| "epoch": 4.1138519924098675, |
| "grad_norm": 0.1447542059683869, |
| "learning_rate": 7.048049315704611e-06, |
| "loss": 0.3831, |
| "step": 542 |
| }, |
| { |
| "epoch": 4.121442125237191, |
| "grad_norm": 0.1261716394650113, |
| "learning_rate": 6.927573732164879e-06, |
| "loss": 0.3831, |
| "step": 543 |
| }, |
| { |
| "epoch": 4.129032258064516, |
| "grad_norm": 0.15942710883242464, |
| "learning_rate": 6.808039028508475e-06, |
| "loss": 0.3835, |
| "step": 544 |
| }, |
| { |
| "epoch": 4.136622390891841, |
| "grad_norm": 0.13949828863354824, |
| "learning_rate": 6.6894486053869525e-06, |
| "loss": 0.3811, |
| "step": 545 |
| }, |
| { |
| "epoch": 4.144212523719165, |
| "grad_norm": 0.1298354723268211, |
| "learning_rate": 6.571805836587981e-06, |
| "loss": 0.3771, |
| "step": 546 |
| }, |
| { |
| "epoch": 4.151802656546489, |
| "grad_norm": 0.11985046208878425, |
| "learning_rate": 6.455114068939323e-06, |
| "loss": 0.3865, |
| "step": 547 |
| }, |
| { |
| "epoch": 4.159392789373814, |
| "grad_norm": 0.1444577266660429, |
| "learning_rate": 6.3393766222136445e-06, |
| "loss": 0.3826, |
| "step": 548 |
| }, |
| { |
| "epoch": 4.1669829222011385, |
| "grad_norm": 0.13012757211149162, |
| "learning_rate": 6.224596789034061e-06, |
| "loss": 0.3809, |
| "step": 549 |
| }, |
| { |
| "epoch": 4.174573055028463, |
| "grad_norm": 0.11192946871125323, |
| "learning_rate": 6.1107778347804814e-06, |
| "loss": 0.3826, |
| "step": 550 |
| }, |
| { |
| "epoch": 4.182163187855788, |
| "grad_norm": 0.11903117545520656, |
| "learning_rate": 5.99792299749669e-06, |
| "loss": 0.3768, |
| "step": 551 |
| }, |
| { |
| "epoch": 4.189753320683112, |
| "grad_norm": 0.12263935198172611, |
| "learning_rate": 5.886035487798229e-06, |
| "loss": 0.3807, |
| "step": 552 |
| }, |
| { |
| "epoch": 4.197343453510436, |
| "grad_norm": 0.11730507620931735, |
| "learning_rate": 5.775118488781099e-06, |
| "loss": 0.3822, |
| "step": 553 |
| }, |
| { |
| "epoch": 4.204933586337761, |
| "grad_norm": 0.11587381154598554, |
| "learning_rate": 5.665175155931133e-06, |
| "loss": 0.3827, |
| "step": 554 |
| }, |
| { |
| "epoch": 4.212523719165086, |
| "grad_norm": 0.1134319753947347, |
| "learning_rate": 5.556208617034289e-06, |
| "loss": 0.3766, |
| "step": 555 |
| }, |
| { |
| "epoch": 4.2201138519924095, |
| "grad_norm": 0.1050634846149617, |
| "learning_rate": 5.448221972087631e-06, |
| "loss": 0.3792, |
| "step": 556 |
| }, |
| { |
| "epoch": 4.227703984819734, |
| "grad_norm": 0.10861492093915034, |
| "learning_rate": 5.341218293211143e-06, |
| "loss": 0.3857, |
| "step": 557 |
| }, |
| { |
| "epoch": 4.235294117647059, |
| "grad_norm": 0.10851653304619344, |
| "learning_rate": 5.235200624560341e-06, |
| "loss": 0.3795, |
| "step": 558 |
| }, |
| { |
| "epoch": 4.242884250474384, |
| "grad_norm": 0.10296399140659968, |
| "learning_rate": 5.130171982239685e-06, |
| "loss": 0.3846, |
| "step": 559 |
| }, |
| { |
| "epoch": 4.250474383301707, |
| "grad_norm": 0.10426146312115164, |
| "learning_rate": 5.026135354216717e-06, |
| "loss": 0.383, |
| "step": 560 |
| }, |
| { |
| "epoch": 4.258064516129032, |
| "grad_norm": 0.10232852820462886, |
| "learning_rate": 4.923093700237109e-06, |
| "loss": 0.3868, |
| "step": 561 |
| }, |
| { |
| "epoch": 4.265654648956357, |
| "grad_norm": 0.10102795561292162, |
| "learning_rate": 4.821049951740442e-06, |
| "loss": 0.3781, |
| "step": 562 |
| }, |
| { |
| "epoch": 4.273244781783681, |
| "grad_norm": 0.10261093996817437, |
| "learning_rate": 4.720007011776808e-06, |
| "loss": 0.3802, |
| "step": 563 |
| }, |
| { |
| "epoch": 4.280834914611006, |
| "grad_norm": 0.09919927002366413, |
| "learning_rate": 4.6199677549242285e-06, |
| "loss": 0.3837, |
| "step": 564 |
| }, |
| { |
| "epoch": 4.28842504743833, |
| "grad_norm": 0.10836098588585662, |
| "learning_rate": 4.520935027206857e-06, |
| "loss": 0.3869, |
| "step": 565 |
| }, |
| { |
| "epoch": 4.2960151802656545, |
| "grad_norm": 0.10526098051103763, |
| "learning_rate": 4.4229116460140495e-06, |
| "loss": 0.377, |
| "step": 566 |
| }, |
| { |
| "epoch": 4.303605313092979, |
| "grad_norm": 0.09914351892145006, |
| "learning_rate": 4.325900400020176e-06, |
| "loss": 0.3786, |
| "step": 567 |
| }, |
| { |
| "epoch": 4.311195445920304, |
| "grad_norm": 0.10303085522084827, |
| "learning_rate": 4.229904049105287e-06, |
| "loss": 0.3799, |
| "step": 568 |
| }, |
| { |
| "epoch": 4.318785578747628, |
| "grad_norm": 0.09870659666807487, |
| "learning_rate": 4.1349253242766265e-06, |
| "loss": 0.3723, |
| "step": 569 |
| }, |
| { |
| "epoch": 4.326375711574952, |
| "grad_norm": 0.09910026166356478, |
| "learning_rate": 4.040966927590901e-06, |
| "loss": 0.3839, |
| "step": 570 |
| }, |
| { |
| "epoch": 4.333965844402277, |
| "grad_norm": 0.10308676223772309, |
| "learning_rate": 3.9480315320774524e-06, |
| "loss": 0.3819, |
| "step": 571 |
| }, |
| { |
| "epoch": 4.341555977229602, |
| "grad_norm": 0.11412486094736646, |
| "learning_rate": 3.856121781662148e-06, |
| "loss": 0.3886, |
| "step": 572 |
| }, |
| { |
| "epoch": 4.349146110056926, |
| "grad_norm": 0.09763532306888358, |
| "learning_rate": 3.7652402910922513e-06, |
| "loss": 0.3798, |
| "step": 573 |
| }, |
| { |
| "epoch": 4.35673624288425, |
| "grad_norm": 0.09695937184022163, |
| "learning_rate": 3.675389645861951e-06, |
| "loss": 0.3855, |
| "step": 574 |
| }, |
| { |
| "epoch": 4.364326375711575, |
| "grad_norm": 0.10172178456609236, |
| "learning_rate": 3.5865724021388437e-06, |
| "loss": 0.3893, |
| "step": 575 |
| }, |
| { |
| "epoch": 4.3719165085389, |
| "grad_norm": 0.10300828152047768, |
| "learning_rate": 3.4987910866912402e-06, |
| "loss": 0.3873, |
| "step": 576 |
| }, |
| { |
| "epoch": 4.379506641366224, |
| "grad_norm": 0.09672665094607047, |
| "learning_rate": 3.4120481968162022e-06, |
| "loss": 0.3875, |
| "step": 577 |
| }, |
| { |
| "epoch": 4.387096774193548, |
| "grad_norm": 0.09743576074956742, |
| "learning_rate": 3.32634620026858e-06, |
| "loss": 0.3792, |
| "step": 578 |
| }, |
| { |
| "epoch": 4.394686907020873, |
| "grad_norm": 0.09625146137130541, |
| "learning_rate": 3.241687535190776e-06, |
| "loss": 0.3867, |
| "step": 579 |
| }, |
| { |
| "epoch": 4.402277039848197, |
| "grad_norm": 0.0945950664728936, |
| "learning_rate": 3.1580746100433646e-06, |
| "loss": 0.3824, |
| "step": 580 |
| }, |
| { |
| "epoch": 4.409867172675522, |
| "grad_norm": 0.0980784024646366, |
| "learning_rate": 3.0755098035365917e-06, |
| "loss": 0.3839, |
| "step": 581 |
| }, |
| { |
| "epoch": 4.417457305502847, |
| "grad_norm": 0.10142930578038363, |
| "learning_rate": 2.9939954645626934e-06, |
| "loss": 0.3831, |
| "step": 582 |
| }, |
| { |
| "epoch": 4.425047438330171, |
| "grad_norm": 0.1024718070994225, |
| "learning_rate": 2.913533912129105e-06, |
| "loss": 0.3838, |
| "step": 583 |
| }, |
| { |
| "epoch": 4.432637571157495, |
| "grad_norm": 0.09460467739691439, |
| "learning_rate": 2.8341274352924197e-06, |
| "loss": 0.3856, |
| "step": 584 |
| }, |
| { |
| "epoch": 4.44022770398482, |
| "grad_norm": 0.09628762445931284, |
| "learning_rate": 2.7557782930933298e-06, |
| "loss": 0.3813, |
| "step": 585 |
| }, |
| { |
| "epoch": 4.447817836812145, |
| "grad_norm": 0.09194772578210779, |
| "learning_rate": 2.6784887144923445e-06, |
| "loss": 0.3817, |
| "step": 586 |
| }, |
| { |
| "epoch": 4.455407969639468, |
| "grad_norm": 0.08931743384621908, |
| "learning_rate": 2.6022608983063522e-06, |
| "loss": 0.3788, |
| "step": 587 |
| }, |
| { |
| "epoch": 4.462998102466793, |
| "grad_norm": 0.09150589167868585, |
| "learning_rate": 2.5270970131460937e-06, |
| "loss": 0.3866, |
| "step": 588 |
| }, |
| { |
| "epoch": 4.470588235294118, |
| "grad_norm": 0.09812252230877141, |
| "learning_rate": 2.4529991973544664e-06, |
| "loss": 0.3903, |
| "step": 589 |
| }, |
| { |
| "epoch": 4.478178368121442, |
| "grad_norm": 0.09412962970253295, |
| "learning_rate": 2.3799695589456695e-06, |
| "loss": 0.3812, |
| "step": 590 |
| }, |
| { |
| "epoch": 4.485768500948766, |
| "grad_norm": 0.09424880620907046, |
| "learning_rate": 2.308010175545232e-06, |
| "loss": 0.3838, |
| "step": 591 |
| }, |
| { |
| "epoch": 4.493358633776091, |
| "grad_norm": 0.09122643056285845, |
| "learning_rate": 2.2371230943309598e-06, |
| "loss": 0.3896, |
| "step": 592 |
| }, |
| { |
| "epoch": 4.500948766603416, |
| "grad_norm": 0.09096615886328271, |
| "learning_rate": 2.1673103319746146e-06, |
| "loss": 0.3785, |
| "step": 593 |
| }, |
| { |
| "epoch": 4.50853889943074, |
| "grad_norm": 0.0997668049582305, |
| "learning_rate": 2.0985738745846086e-06, |
| "loss": 0.3873, |
| "step": 594 |
| }, |
| { |
| "epoch": 4.516129032258064, |
| "grad_norm": 0.09598005147735333, |
| "learning_rate": 2.0309156776494497e-06, |
| "loss": 0.3755, |
| "step": 595 |
| }, |
| { |
| "epoch": 4.523719165085389, |
| "grad_norm": 0.09405869487813164, |
| "learning_rate": 1.964337665982172e-06, |
| "loss": 0.3923, |
| "step": 596 |
| }, |
| { |
| "epoch": 4.531309297912713, |
| "grad_norm": 0.0916297167969929, |
| "learning_rate": 1.898841733665515e-06, |
| "loss": 0.3836, |
| "step": 597 |
| }, |
| { |
| "epoch": 4.538899430740038, |
| "grad_norm": 0.09844277128999404, |
| "learning_rate": 1.8344297439980475e-06, |
| "loss": 0.3814, |
| "step": 598 |
| }, |
| { |
| "epoch": 4.546489563567363, |
| "grad_norm": 0.09263986116298609, |
| "learning_rate": 1.7711035294412094e-06, |
| "loss": 0.3874, |
| "step": 599 |
| }, |
| { |
| "epoch": 4.554079696394687, |
| "grad_norm": 0.09337538395396143, |
| "learning_rate": 1.7088648915671236e-06, |
| "loss": 0.3819, |
| "step": 600 |
| }, |
| { |
| "epoch": 4.561669829222011, |
| "grad_norm": 0.09644194634203436, |
| "learning_rate": 1.6477156010073693e-06, |
| "loss": 0.3859, |
| "step": 601 |
| }, |
| { |
| "epoch": 4.569259962049336, |
| "grad_norm": 0.09104976943289746, |
| "learning_rate": 1.5876573974026043e-06, |
| "loss": 0.3859, |
| "step": 602 |
| }, |
| { |
| "epoch": 4.576850094876661, |
| "grad_norm": 0.1036240608596365, |
| "learning_rate": 1.5286919893530727e-06, |
| "loss": 0.378, |
| "step": 603 |
| }, |
| { |
| "epoch": 4.584440227703984, |
| "grad_norm": 0.10319393509093422, |
| "learning_rate": 1.4708210543700019e-06, |
| "loss": 0.3821, |
| "step": 604 |
| }, |
| { |
| "epoch": 4.592030360531309, |
| "grad_norm": 0.09419955371127982, |
| "learning_rate": 1.4140462388278641e-06, |
| "loss": 0.382, |
| "step": 605 |
| }, |
| { |
| "epoch": 4.599620493358634, |
| "grad_norm": 0.09325440545672938, |
| "learning_rate": 1.3583691579175563e-06, |
| "loss": 0.3796, |
| "step": 606 |
| }, |
| { |
| "epoch": 4.6072106261859584, |
| "grad_norm": 0.0948062866747191, |
| "learning_rate": 1.3037913956004444e-06, |
| "loss": 0.3802, |
| "step": 607 |
| }, |
| { |
| "epoch": 4.614800759013283, |
| "grad_norm": 0.08880917402719704, |
| "learning_rate": 1.2503145045632903e-06, |
| "loss": 0.3837, |
| "step": 608 |
| }, |
| { |
| "epoch": 4.622390891840607, |
| "grad_norm": 0.10097582002452218, |
| "learning_rate": 1.1979400061741075e-06, |
| "loss": 0.3771, |
| "step": 609 |
| }, |
| { |
| "epoch": 4.629981024667932, |
| "grad_norm": 0.09101806397046262, |
| "learning_rate": 1.146669390438837e-06, |
| "loss": 0.3806, |
| "step": 610 |
| }, |
| { |
| "epoch": 4.637571157495256, |
| "grad_norm": 0.08881115056650345, |
| "learning_rate": 1.0965041159589806e-06, |
| "loss": 0.3891, |
| "step": 611 |
| }, |
| { |
| "epoch": 4.645161290322581, |
| "grad_norm": 0.09427363876334546, |
| "learning_rate": 1.047445609890132e-06, |
| "loss": 0.3889, |
| "step": 612 |
| }, |
| { |
| "epoch": 4.652751423149905, |
| "grad_norm": 0.09134492739832982, |
| "learning_rate": 9.994952679013292e-07, |
| "loss": 0.3805, |
| "step": 613 |
| }, |
| { |
| "epoch": 4.660341555977229, |
| "grad_norm": 0.08733457555820553, |
| "learning_rate": 9.526544541353622e-07, |
| "loss": 0.3721, |
| "step": 614 |
| }, |
| { |
| "epoch": 4.667931688804554, |
| "grad_norm": 0.09375394173236, |
| "learning_rate": 9.069245011699901e-07, |
| "loss": 0.3809, |
| "step": 615 |
| }, |
| { |
| "epoch": 4.675521821631879, |
| "grad_norm": 0.09910629089900622, |
| "learning_rate": 8.623067099800076e-07, |
| "loss": 0.3781, |
| "step": 616 |
| }, |
| { |
| "epoch": 4.6831119544592035, |
| "grad_norm": 0.08948539795632664, |
| "learning_rate": 8.188023499002206e-07, |
| "loss": 0.3852, |
| "step": 617 |
| }, |
| { |
| "epoch": 4.690702087286527, |
| "grad_norm": 0.09627861222029245, |
| "learning_rate": 7.764126585893694e-07, |
| "loss": 0.3781, |
| "step": 618 |
| }, |
| { |
| "epoch": 4.698292220113852, |
| "grad_norm": 0.08783688983476867, |
| "learning_rate": 7.351388419948979e-07, |
| "loss": 0.3837, |
| "step": 619 |
| }, |
| { |
| "epoch": 4.705882352941177, |
| "grad_norm": 0.08787642986660921, |
| "learning_rate": 6.949820743186353e-07, |
| "loss": 0.3932, |
| "step": 620 |
| }, |
| { |
| "epoch": 4.713472485768501, |
| "grad_norm": 0.08788042105203867, |
| "learning_rate": 6.559434979834223e-07, |
| "loss": 0.3821, |
| "step": 621 |
| }, |
| { |
| "epoch": 4.721062618595825, |
| "grad_norm": 0.08638210961076086, |
| "learning_rate": 6.180242236005818e-07, |
| "loss": 0.385, |
| "step": 622 |
| }, |
| { |
| "epoch": 4.72865275142315, |
| "grad_norm": 0.08376769284093517, |
| "learning_rate": 5.812253299383308e-07, |
| "loss": 0.3764, |
| "step": 623 |
| }, |
| { |
| "epoch": 4.7362428842504745, |
| "grad_norm": 0.08758279977677409, |
| "learning_rate": 5.455478638911071e-07, |
| "loss": 0.3784, |
| "step": 624 |
| }, |
| { |
| "epoch": 4.743833017077799, |
| "grad_norm": 0.08749897373635039, |
| "learning_rate": 5.109928404497532e-07, |
| "loss": 0.3864, |
| "step": 625 |
| }, |
| { |
| "epoch": 4.751423149905124, |
| "grad_norm": 0.08812251812037451, |
| "learning_rate": 4.775612426726684e-07, |
| "loss": 0.3832, |
| "step": 626 |
| }, |
| { |
| "epoch": 4.759013282732448, |
| "grad_norm": 0.08938212786163188, |
| "learning_rate": 4.452540216578349e-07, |
| "loss": 0.3778, |
| "step": 627 |
| }, |
| { |
| "epoch": 4.766603415559772, |
| "grad_norm": 0.09686068566031868, |
| "learning_rate": 4.140720965157519e-07, |
| "loss": 0.3882, |
| "step": 628 |
| }, |
| { |
| "epoch": 4.774193548387097, |
| "grad_norm": 0.0845764980027318, |
| "learning_rate": 3.840163543433084e-07, |
| "loss": 0.3778, |
| "step": 629 |
| }, |
| { |
| "epoch": 4.781783681214421, |
| "grad_norm": 0.08252440850107318, |
| "learning_rate": 3.550876501985112e-07, |
| "loss": 0.3758, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.7893738140417454, |
| "grad_norm": 0.08576998372456936, |
| "learning_rate": 3.272868070761881e-07, |
| "loss": 0.3857, |
| "step": 631 |
| }, |
| { |
| "epoch": 4.79696394686907, |
| "grad_norm": 0.08653227658708267, |
| "learning_rate": 3.006146158845713e-07, |
| "loss": 0.387, |
| "step": 632 |
| }, |
| { |
| "epoch": 4.804554079696395, |
| "grad_norm": 0.0868336765504615, |
| "learning_rate": 2.750718354227822e-07, |
| "loss": 0.3918, |
| "step": 633 |
| }, |
| { |
| "epoch": 4.8121442125237195, |
| "grad_norm": 0.08524574086673203, |
| "learning_rate": 2.506591923592572e-07, |
| "loss": 0.3879, |
| "step": 634 |
| }, |
| { |
| "epoch": 4.819734345351043, |
| "grad_norm": 0.0854074630262595, |
| "learning_rate": 2.273773812110802e-07, |
| "loss": 0.3822, |
| "step": 635 |
| }, |
| { |
| "epoch": 4.827324478178368, |
| "grad_norm": 0.08782569347415704, |
| "learning_rate": 2.0522706432419382e-07, |
| "loss": 0.389, |
| "step": 636 |
| }, |
| { |
| "epoch": 4.834914611005693, |
| "grad_norm": 0.08728478399615792, |
| "learning_rate": 1.842088718546009e-07, |
| "loss": 0.3792, |
| "step": 637 |
| }, |
| { |
| "epoch": 4.842504743833017, |
| "grad_norm": 0.08691066239057142, |
| "learning_rate": 1.6432340175039253e-07, |
| "loss": 0.3784, |
| "step": 638 |
| }, |
| { |
| "epoch": 4.850094876660341, |
| "grad_norm": 0.09126251564727415, |
| "learning_rate": 1.4557121973477472e-07, |
| "loss": 0.3861, |
| "step": 639 |
| }, |
| { |
| "epoch": 4.857685009487666, |
| "grad_norm": 0.08741494339348937, |
| "learning_rate": 1.2795285928994372e-07, |
| "loss": 0.3864, |
| "step": 640 |
| }, |
| { |
| "epoch": 4.8652751423149905, |
| "grad_norm": 0.0876181788276896, |
| "learning_rate": 1.1146882164193795e-07, |
| "loss": 0.3847, |
| "step": 641 |
| }, |
| { |
| "epoch": 4.872865275142315, |
| "grad_norm": 0.08754414570978997, |
| "learning_rate": 9.611957574634734e-08, |
| "loss": 0.3804, |
| "step": 642 |
| }, |
| { |
| "epoch": 4.88045540796964, |
| "grad_norm": 0.08306030029527668, |
| "learning_rate": 8.190555827499947e-08, |
| "loss": 0.3815, |
| "step": 643 |
| }, |
| { |
| "epoch": 4.888045540796964, |
| "grad_norm": 0.09068276050158985, |
| "learning_rate": 6.882717360352065e-08, |
| "loss": 0.3833, |
| "step": 644 |
| }, |
| { |
| "epoch": 4.895635673624288, |
| "grad_norm": 0.08477830736645849, |
| "learning_rate": 5.688479379984291e-08, |
| "loss": 0.3865, |
| "step": 645 |
| }, |
| { |
| "epoch": 4.903225806451613, |
| "grad_norm": 0.08365266522816545, |
| "learning_rate": 4.607875861359024e-08, |
| "loss": 0.3733, |
| "step": 646 |
| }, |
| { |
| "epoch": 4.910815939278938, |
| "grad_norm": 0.08509382362942203, |
| "learning_rate": 3.640937546646406e-08, |
| "loss": 0.3801, |
| "step": 647 |
| }, |
| { |
| "epoch": 4.9184060721062615, |
| "grad_norm": 0.08832189058708398, |
| "learning_rate": 2.787691944345472e-08, |
| "loss": 0.382, |
| "step": 648 |
| }, |
| { |
| "epoch": 4.925996204933586, |
| "grad_norm": 0.08727763140855628, |
| "learning_rate": 2.0481633285025505e-08, |
| "loss": 0.3799, |
| "step": 649 |
| }, |
| { |
| "epoch": 4.933586337760911, |
| "grad_norm": 0.08685568551948444, |
| "learning_rate": 1.4223727380215935e-08, |
| "loss": 0.3812, |
| "step": 650 |
| }, |
| { |
| "epoch": 4.9411764705882355, |
| "grad_norm": 0.08827489813293968, |
| "learning_rate": 9.103379760655451e-09, |
| "loss": 0.3897, |
| "step": 651 |
| }, |
| { |
| "epoch": 4.94876660341556, |
| "grad_norm": 0.08969524756207933, |
| "learning_rate": 5.120736095483026e-09, |
| "loss": 0.3803, |
| "step": 652 |
| }, |
| { |
| "epoch": 4.956356736242884, |
| "grad_norm": 0.0849566492970198, |
| "learning_rate": 2.2759096872260187e-09, |
| "loss": 0.3774, |
| "step": 653 |
| }, |
| { |
| "epoch": 4.963946869070209, |
| "grad_norm": 0.08816588035901379, |
| "learning_rate": 5.689814685538863e-10, |
| "loss": 0.3843, |
| "step": 654 |
| }, |
| { |
| "epoch": 4.971537001897533, |
| "grad_norm": 0.09034351763433966, |
| "learning_rate": 0.0, |
| "loss": 0.3759, |
| "step": 655 |
| }, |
| { |
| "epoch": 4.971537001897533, |
| "step": 655, |
| "total_flos": 1.573215812367627e+19, |
| "train_loss": 0.4743207697649948, |
| "train_runtime": 64212.961, |
| "train_samples_per_second": 5.249, |
| "train_steps_per_second": 0.01 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 655, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.573215812367627e+19, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
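Since this is machine-readable Trainer state rather than prose, a short sketch may help readers consume it. The snippet below is a minimal example, assuming the log is saved locally as `trainer_state.json` (the filename is an assumption, not part of the log); it loads the state, pulls the per-step entries out of `log_history`, and cross-checks the final summary fields against them:

```python
# Minimal sketch: parse a Hugging Face trainer_state.json like the one above.
# Assumption: the JSON is saved locally as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Per-step entries carry a "loss" key; the trailing run-summary entry does not.
steps = [e for e in state["log_history"] if "loss" in e]
print(f"logged steps : {steps[0]['step']}..{steps[-1]['step']}")
print(f"final loss   : {steps[-1]['loss']:.4f}")

# The last log_history entry is the run summary; its throughput field
# should match steps / runtime (655 / 64212.961 s ≈ 0.0102 steps/s,
# reported as 0.01 after rounding).
summary = state["log_history"][-1]
print(f"steps/second : {summary['step'] / summary['train_runtime']:.4f}")
print(f"reported     : {summary['train_steps_per_second']}")
```

The same pattern extends to plotting `loss` or `learning_rate` against `step`, which makes the learning-rate decay to zero and the per-epoch loss jumps in the entries above easy to see at a glance.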