diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8579 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.986666666666667, + "eval_steps": 10000, + "global_step": 140000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.99862042482382e-05, + "loss": 1.1884, + "step": 100 + }, + { + "epoch": 0.0, + "learning_rate": 1.9971981823741495e-05, + "loss": 1.0269, + "step": 200 + }, + { + "epoch": 0.01, + "learning_rate": 1.995775939924479e-05, + "loss": 0.9881, + "step": 300 + }, + { + "epoch": 0.01, + "learning_rate": 1.994353697474809e-05, + "loss": 0.9348, + "step": 400 + }, + { + "epoch": 0.01, + "learning_rate": 1.9929314550251384e-05, + "loss": 0.9291, + "step": 500 + }, + { + "epoch": 0.01, + "learning_rate": 1.991509212575468e-05, + "loss": 0.8964, + "step": 600 + }, + { + "epoch": 0.01, + "learning_rate": 1.9900869701257974e-05, + "loss": 0.8579, + "step": 700 + }, + { + "epoch": 0.02, + "learning_rate": 1.9886647276761272e-05, + "loss": 0.8678, + "step": 800 + }, + { + "epoch": 0.02, + "learning_rate": 1.9872424852264567e-05, + "loss": 0.8506, + "step": 900 + }, + { + "epoch": 0.02, + "learning_rate": 1.9858202427767862e-05, + "loss": 0.8268, + "step": 1000 + }, + { + "epoch": 0.02, + "learning_rate": 1.984398000327116e-05, + "loss": 0.8056, + "step": 1100 + }, + { + "epoch": 0.03, + "learning_rate": 1.9829757578774455e-05, + "loss": 0.7876, + "step": 1200 + }, + { + "epoch": 0.03, + "learning_rate": 1.981553515427775e-05, + "loss": 0.7956, + "step": 1300 + }, + { + "epoch": 0.03, + "learning_rate": 1.980131272978105e-05, + "loss": 0.7904, + "step": 1400 + }, + { + "epoch": 0.03, + "learning_rate": 1.9787090305284344e-05, + "loss": 0.7707, + "step": 1500 + }, + { + "epoch": 0.03, + "learning_rate": 1.977286788078764e-05, + "loss": 0.7652, + "step": 1600 + }, + { + "epoch": 0.04, + "learning_rate": 1.9758645456290934e-05, + "loss": 0.7798, + "step": 1700 + }, + { + "epoch": 0.04, + "learning_rate": 1.9744423031794232e-05, + "loss": 0.7454, + "step": 1800 + }, + { + "epoch": 0.04, + "learning_rate": 1.9730200607297527e-05, + "loss": 0.7506, + "step": 1900 + }, + { + "epoch": 0.04, + "learning_rate": 1.9715978182800822e-05, + "loss": 0.727, + "step": 2000 + }, + { + "epoch": 0.04, + "learning_rate": 1.970175575830412e-05, + "loss": 0.714, + "step": 2100 + }, + { + "epoch": 0.05, + "learning_rate": 1.9687533333807415e-05, + "loss": 0.7132, + "step": 2200 + }, + { + "epoch": 0.05, + "learning_rate": 1.9673310909310714e-05, + "loss": 0.7088, + "step": 2300 + }, + { + "epoch": 0.05, + "learning_rate": 1.965908848481401e-05, + "loss": 0.7266, + "step": 2400 + }, + { + "epoch": 0.05, + "learning_rate": 1.9644866060317303e-05, + "loss": 0.7031, + "step": 2500 + }, + { + "epoch": 0.06, + "learning_rate": 1.9630643635820602e-05, + "loss": 0.6948, + "step": 2600 + }, + { + "epoch": 0.06, + "learning_rate": 1.9616421211323893e-05, + "loss": 0.7048, + "step": 2700 + }, + { + "epoch": 0.06, + "learning_rate": 1.9602198786827192e-05, + "loss": 0.699, + "step": 2800 + }, + { + "epoch": 0.06, + "learning_rate": 1.9587976362330487e-05, + "loss": 0.679, + "step": 2900 + }, + { + "epoch": 0.06, + "learning_rate": 1.9573753937833785e-05, + "loss": 0.6847, + "step": 3000 + }, + { + "epoch": 0.07, + "learning_rate": 1.955953151333708e-05, + "loss": 0.6781, + "step": 3100 + }, + { + "epoch": 0.07, + "learning_rate": 1.9545309088840375e-05, + "loss": 0.6803, + "step": 3200 + }, + { + "epoch": 0.07, + "learning_rate": 1.9531086664343673e-05, + "loss": 0.6594, + "step": 3300 + }, + { + "epoch": 0.07, + "learning_rate": 1.951686423984697e-05, + "loss": 0.6729, + "step": 3400 + }, + { + "epoch": 0.07, + "learning_rate": 1.9502641815350263e-05, + "loss": 0.664, + "step": 3500 + }, + { + "epoch": 0.08, + "learning_rate": 1.9488419390853562e-05, + "loss": 0.6611, + "step": 3600 + }, + { + "epoch": 0.08, + "learning_rate": 1.9474196966356857e-05, + "loss": 0.6497, + "step": 3700 + }, + { + "epoch": 0.08, + "learning_rate": 1.945997454186015e-05, + "loss": 0.65, + "step": 3800 + }, + { + "epoch": 0.08, + "learning_rate": 1.9445752117363447e-05, + "loss": 0.647, + "step": 3900 + }, + { + "epoch": 0.09, + "learning_rate": 1.9431529692866745e-05, + "loss": 0.6528, + "step": 4000 + }, + { + "epoch": 0.09, + "learning_rate": 1.941730726837004e-05, + "loss": 0.6432, + "step": 4100 + }, + { + "epoch": 0.09, + "learning_rate": 1.9403084843873335e-05, + "loss": 0.6291, + "step": 4200 + }, + { + "epoch": 0.09, + "learning_rate": 1.9388862419376633e-05, + "loss": 0.6388, + "step": 4300 + }, + { + "epoch": 0.09, + "learning_rate": 1.937463999487993e-05, + "loss": 0.6261, + "step": 4400 + }, + { + "epoch": 0.1, + "learning_rate": 1.9360417570383227e-05, + "loss": 0.642, + "step": 4500 + }, + { + "epoch": 0.1, + "learning_rate": 1.934619514588652e-05, + "loss": 0.6399, + "step": 4600 + }, + { + "epoch": 0.1, + "learning_rate": 1.9331972721389817e-05, + "loss": 0.6213, + "step": 4700 + }, + { + "epoch": 0.1, + "learning_rate": 1.931775029689311e-05, + "loss": 0.6163, + "step": 4800 + }, + { + "epoch": 0.1, + "learning_rate": 1.9303527872396407e-05, + "loss": 0.6167, + "step": 4900 + }, + { + "epoch": 0.11, + "learning_rate": 1.9289305447899705e-05, + "loss": 0.6127, + "step": 5000 + }, + { + "epoch": 0.11, + "eval_loss": 0.6764795780181885, + "eval_runtime": 34.8863, + "eval_samples_per_second": 143.323, + "eval_steps_per_second": 1.147, + "step": 5000 + }, + { + "epoch": 0.11, + "learning_rate": 1.9275083023403e-05, + "loss": 0.5963, + "step": 5100 + }, + { + "epoch": 0.11, + "learning_rate": 1.9260860598906298e-05, + "loss": 0.6034, + "step": 5200 + }, + { + "epoch": 0.11, + "learning_rate": 1.9246638174409593e-05, + "loss": 0.5944, + "step": 5300 + }, + { + "epoch": 0.12, + "learning_rate": 1.9232415749912888e-05, + "loss": 0.5951, + "step": 5400 + }, + { + "epoch": 0.12, + "learning_rate": 1.9218193325416187e-05, + "loss": 0.6055, + "step": 5500 + }, + { + "epoch": 0.12, + "learning_rate": 1.920397090091948e-05, + "loss": 0.6052, + "step": 5600 + }, + { + "epoch": 0.12, + "learning_rate": 1.9189748476422777e-05, + "loss": 0.5907, + "step": 5700 + }, + { + "epoch": 0.12, + "learning_rate": 1.9175526051926075e-05, + "loss": 0.5933, + "step": 5800 + }, + { + "epoch": 0.13, + "learning_rate": 1.916130362742937e-05, + "loss": 0.6002, + "step": 5900 + }, + { + "epoch": 0.13, + "learning_rate": 1.9147081202932665e-05, + "loss": 0.5985, + "step": 6000 + }, + { + "epoch": 0.13, + "learning_rate": 1.913285877843596e-05, + "loss": 0.5761, + "step": 6100 + }, + { + "epoch": 0.13, + "learning_rate": 1.9118636353939258e-05, + "loss": 0.5749, + "step": 6200 + }, + { + "epoch": 0.13, + "learning_rate": 1.9104413929442553e-05, + "loss": 0.5855, + "step": 6300 + }, + { + "epoch": 0.14, + "learning_rate": 1.9090191504945848e-05, + "loss": 0.5724, + "step": 6400 + }, + { + "epoch": 0.14, + "learning_rate": 1.9075969080449146e-05, + "loss": 0.5856, + "step": 6500 + }, + { + "epoch": 0.14, + "learning_rate": 1.906174665595244e-05, + "loss": 0.5843, + "step": 6600 + }, + { + "epoch": 0.14, + "learning_rate": 1.904752423145574e-05, + "loss": 0.5624, + "step": 6700 + }, + { + "epoch": 0.15, + "learning_rate": 1.9033301806959035e-05, + "loss": 0.5784, + "step": 6800 + }, + { + "epoch": 0.15, + "learning_rate": 1.901907938246233e-05, + "loss": 0.5589, + "step": 6900 + }, + { + "epoch": 0.15, + "learning_rate": 1.9004856957965625e-05, + "loss": 0.5732, + "step": 7000 + }, + { + "epoch": 0.15, + "learning_rate": 1.899063453346892e-05, + "loss": 0.5529, + "step": 7100 + }, + { + "epoch": 0.15, + "learning_rate": 1.8976412108972218e-05, + "loss": 0.5648, + "step": 7200 + }, + { + "epoch": 0.16, + "learning_rate": 1.8962189684475513e-05, + "loss": 0.5535, + "step": 7300 + }, + { + "epoch": 0.16, + "learning_rate": 1.894796725997881e-05, + "loss": 0.5527, + "step": 7400 + }, + { + "epoch": 0.16, + "learning_rate": 1.8933744835482106e-05, + "loss": 0.5478, + "step": 7500 + }, + { + "epoch": 0.16, + "learning_rate": 1.89195224109854e-05, + "loss": 0.5419, + "step": 7600 + }, + { + "epoch": 0.16, + "learning_rate": 1.89052999864887e-05, + "loss": 0.5596, + "step": 7700 + }, + { + "epoch": 0.17, + "learning_rate": 1.8891077561991995e-05, + "loss": 0.5444, + "step": 7800 + }, + { + "epoch": 0.17, + "learning_rate": 1.887685513749529e-05, + "loss": 0.5526, + "step": 7900 + }, + { + "epoch": 0.17, + "learning_rate": 1.8862632712998585e-05, + "loss": 0.5343, + "step": 8000 + }, + { + "epoch": 0.17, + "learning_rate": 1.8848410288501883e-05, + "loss": 0.5327, + "step": 8100 + }, + { + "epoch": 0.17, + "learning_rate": 1.8834187864005178e-05, + "loss": 0.5289, + "step": 8200 + }, + { + "epoch": 0.18, + "learning_rate": 1.8819965439508473e-05, + "loss": 0.5467, + "step": 8300 + }, + { + "epoch": 0.18, + "learning_rate": 1.880574301501177e-05, + "loss": 0.5357, + "step": 8400 + }, + { + "epoch": 0.18, + "learning_rate": 1.8791520590515066e-05, + "loss": 0.5317, + "step": 8500 + }, + { + "epoch": 0.18, + "learning_rate": 1.877729816601836e-05, + "loss": 0.5326, + "step": 8600 + }, + { + "epoch": 0.19, + "learning_rate": 1.876307574152166e-05, + "loss": 0.5335, + "step": 8700 + }, + { + "epoch": 0.19, + "learning_rate": 1.8748853317024955e-05, + "loss": 0.5364, + "step": 8800 + }, + { + "epoch": 0.19, + "learning_rate": 1.8734630892528253e-05, + "loss": 0.5295, + "step": 8900 + }, + { + "epoch": 0.19, + "learning_rate": 1.8720408468031548e-05, + "loss": 0.53, + "step": 9000 + }, + { + "epoch": 0.19, + "learning_rate": 1.8706186043534843e-05, + "loss": 0.5325, + "step": 9100 + }, + { + "epoch": 0.2, + "learning_rate": 1.8691963619038138e-05, + "loss": 0.5196, + "step": 9200 + }, + { + "epoch": 0.2, + "learning_rate": 1.8677741194541433e-05, + "loss": 0.5206, + "step": 9300 + }, + { + "epoch": 0.2, + "learning_rate": 1.866351877004473e-05, + "loss": 0.5231, + "step": 9400 + }, + { + "epoch": 0.2, + "learning_rate": 1.8649296345548026e-05, + "loss": 0.5134, + "step": 9500 + }, + { + "epoch": 0.2, + "learning_rate": 1.8635073921051325e-05, + "loss": 0.5113, + "step": 9600 + }, + { + "epoch": 0.21, + "learning_rate": 1.862085149655462e-05, + "loss": 0.5147, + "step": 9700 + }, + { + "epoch": 0.21, + "learning_rate": 1.8606629072057914e-05, + "loss": 0.5255, + "step": 9800 + }, + { + "epoch": 0.21, + "learning_rate": 1.8592406647561213e-05, + "loss": 0.5106, + "step": 9900 + }, + { + "epoch": 0.21, + "learning_rate": 1.8578184223064508e-05, + "loss": 0.5083, + "step": 10000 + }, + { + "epoch": 0.21, + "eval_loss": 0.5610331296920776, + "eval_runtime": 34.9828, + "eval_samples_per_second": 142.927, + "eval_steps_per_second": 1.143, + "step": 10000 + }, + { + "epoch": 0.22, + "learning_rate": 1.8563961798567803e-05, + "loss": 0.5158, + "step": 10100 + }, + { + "epoch": 0.22, + "learning_rate": 1.8549739374071098e-05, + "loss": 0.5055, + "step": 10200 + }, + { + "epoch": 0.22, + "learning_rate": 1.8535516949574396e-05, + "loss": 0.5006, + "step": 10300 + }, + { + "epoch": 0.22, + "learning_rate": 1.852129452507769e-05, + "loss": 0.507, + "step": 10400 + }, + { + "epoch": 0.22, + "learning_rate": 1.8507072100580986e-05, + "loss": 0.4954, + "step": 10500 + }, + { + "epoch": 0.23, + "learning_rate": 1.8492849676084284e-05, + "loss": 0.5027, + "step": 10600 + }, + { + "epoch": 0.23, + "learning_rate": 1.847862725158758e-05, + "loss": 0.4875, + "step": 10700 + }, + { + "epoch": 0.23, + "learning_rate": 1.8464404827090874e-05, + "loss": 0.4911, + "step": 10800 + }, + { + "epoch": 0.23, + "learning_rate": 1.8450182402594173e-05, + "loss": 0.483, + "step": 10900 + }, + { + "epoch": 0.23, + "learning_rate": 1.8435959978097468e-05, + "loss": 0.4891, + "step": 11000 + }, + { + "epoch": 0.24, + "learning_rate": 1.8421737553600766e-05, + "loss": 0.4911, + "step": 11100 + }, + { + "epoch": 0.24, + "learning_rate": 1.8407515129104058e-05, + "loss": 0.4742, + "step": 11200 + }, + { + "epoch": 0.24, + "learning_rate": 1.8393292704607356e-05, + "loss": 0.4959, + "step": 11300 + }, + { + "epoch": 0.24, + "learning_rate": 1.837907028011065e-05, + "loss": 0.4959, + "step": 11400 + }, + { + "epoch": 0.25, + "learning_rate": 1.8364847855613946e-05, + "loss": 0.4798, + "step": 11500 + }, + { + "epoch": 0.25, + "learning_rate": 1.8350625431117244e-05, + "loss": 0.4927, + "step": 11600 + }, + { + "epoch": 0.25, + "learning_rate": 1.833640300662054e-05, + "loss": 0.4824, + "step": 11700 + }, + { + "epoch": 0.25, + "learning_rate": 1.8322180582123838e-05, + "loss": 0.4811, + "step": 11800 + }, + { + "epoch": 0.25, + "learning_rate": 1.8307958157627133e-05, + "loss": 0.4781, + "step": 11900 + }, + { + "epoch": 0.26, + "learning_rate": 1.8293735733130428e-05, + "loss": 0.4705, + "step": 12000 + }, + { + "epoch": 0.26, + "learning_rate": 1.8279513308633726e-05, + "loss": 0.4633, + "step": 12100 + }, + { + "epoch": 0.26, + "learning_rate": 1.826529088413702e-05, + "loss": 0.4809, + "step": 12200 + }, + { + "epoch": 0.26, + "learning_rate": 1.8251068459640316e-05, + "loss": 0.4801, + "step": 12300 + }, + { + "epoch": 0.26, + "learning_rate": 1.823684603514361e-05, + "loss": 0.4813, + "step": 12400 + }, + { + "epoch": 0.27, + "learning_rate": 1.822262361064691e-05, + "loss": 0.4682, + "step": 12500 + }, + { + "epoch": 0.27, + "learning_rate": 1.8208401186150204e-05, + "loss": 0.4745, + "step": 12600 + }, + { + "epoch": 0.27, + "learning_rate": 1.81941787616535e-05, + "loss": 0.4631, + "step": 12700 + }, + { + "epoch": 0.27, + "learning_rate": 1.8179956337156798e-05, + "loss": 0.4733, + "step": 12800 + }, + { + "epoch": 0.28, + "learning_rate": 1.8165733912660093e-05, + "loss": 0.4687, + "step": 12900 + }, + { + "epoch": 0.28, + "learning_rate": 1.8151511488163388e-05, + "loss": 0.4685, + "step": 13000 + }, + { + "epoch": 0.28, + "learning_rate": 1.8137289063666686e-05, + "loss": 0.4638, + "step": 13100 + }, + { + "epoch": 0.28, + "learning_rate": 1.812306663916998e-05, + "loss": 0.4482, + "step": 13200 + }, + { + "epoch": 0.28, + "learning_rate": 1.810884421467328e-05, + "loss": 0.4452, + "step": 13300 + }, + { + "epoch": 0.29, + "learning_rate": 1.809462179017657e-05, + "loss": 0.4553, + "step": 13400 + }, + { + "epoch": 0.29, + "learning_rate": 1.808039936567987e-05, + "loss": 0.4498, + "step": 13500 + }, + { + "epoch": 0.29, + "learning_rate": 1.8066176941183164e-05, + "loss": 0.4468, + "step": 13600 + }, + { + "epoch": 0.29, + "learning_rate": 1.805195451668646e-05, + "loss": 0.4466, + "step": 13700 + }, + { + "epoch": 0.29, + "learning_rate": 1.8037732092189757e-05, + "loss": 0.4465, + "step": 13800 + }, + { + "epoch": 0.3, + "learning_rate": 1.8023509667693052e-05, + "loss": 0.4426, + "step": 13900 + }, + { + "epoch": 0.3, + "learning_rate": 1.800928724319635e-05, + "loss": 0.4466, + "step": 14000 + }, + { + "epoch": 0.3, + "learning_rate": 1.7995064818699646e-05, + "loss": 0.4444, + "step": 14100 + }, + { + "epoch": 0.3, + "learning_rate": 1.798084239420294e-05, + "loss": 0.4445, + "step": 14200 + }, + { + "epoch": 0.31, + "learning_rate": 1.796661996970624e-05, + "loss": 0.4395, + "step": 14300 + }, + { + "epoch": 0.31, + "learning_rate": 1.795239754520953e-05, + "loss": 0.4425, + "step": 14400 + }, + { + "epoch": 0.31, + "learning_rate": 1.793817512071283e-05, + "loss": 0.4334, + "step": 14500 + }, + { + "epoch": 0.31, + "learning_rate": 1.7923952696216124e-05, + "loss": 0.4411, + "step": 14600 + }, + { + "epoch": 0.31, + "learning_rate": 1.7909730271719422e-05, + "loss": 0.4481, + "step": 14700 + }, + { + "epoch": 0.32, + "learning_rate": 1.7895507847222717e-05, + "loss": 0.431, + "step": 14800 + }, + { + "epoch": 0.32, + "learning_rate": 1.7881285422726012e-05, + "loss": 0.451, + "step": 14900 + }, + { + "epoch": 0.32, + "learning_rate": 1.786706299822931e-05, + "loss": 0.4406, + "step": 15000 + }, + { + "epoch": 0.32, + "eval_loss": 0.47009751200675964, + "eval_runtime": 35.0436, + "eval_samples_per_second": 142.68, + "eval_steps_per_second": 1.141, + "step": 15000 + }, + { + "epoch": 0.32, + "learning_rate": 1.7852840573732606e-05, + "loss": 0.4358, + "step": 15100 + }, + { + "epoch": 0.32, + "learning_rate": 1.78386181492359e-05, + "loss": 0.4333, + "step": 15200 + }, + { + "epoch": 0.33, + "learning_rate": 1.78243957247392e-05, + "loss": 0.4349, + "step": 15300 + }, + { + "epoch": 0.33, + "learning_rate": 1.7810173300242494e-05, + "loss": 0.4317, + "step": 15400 + }, + { + "epoch": 0.33, + "learning_rate": 1.779595087574579e-05, + "loss": 0.4248, + "step": 15500 + }, + { + "epoch": 0.33, + "learning_rate": 1.7781728451249084e-05, + "loss": 0.433, + "step": 15600 + }, + { + "epoch": 0.33, + "learning_rate": 1.7767506026752382e-05, + "loss": 0.4275, + "step": 15700 + }, + { + "epoch": 0.34, + "learning_rate": 1.7753283602255677e-05, + "loss": 0.4299, + "step": 15800 + }, + { + "epoch": 0.34, + "learning_rate": 1.7739061177758972e-05, + "loss": 0.4294, + "step": 15900 + }, + { + "epoch": 0.34, + "learning_rate": 1.772483875326227e-05, + "loss": 0.416, + "step": 16000 + }, + { + "epoch": 0.34, + "learning_rate": 1.7710616328765566e-05, + "loss": 0.4146, + "step": 16100 + }, + { + "epoch": 0.35, + "learning_rate": 1.7696393904268864e-05, + "loss": 0.4212, + "step": 16200 + }, + { + "epoch": 0.35, + "learning_rate": 1.768217147977216e-05, + "loss": 0.4201, + "step": 16300 + }, + { + "epoch": 0.35, + "learning_rate": 1.7667949055275454e-05, + "loss": 0.4147, + "step": 16400 + }, + { + "epoch": 0.35, + "learning_rate": 1.7653726630778752e-05, + "loss": 0.421, + "step": 16500 + }, + { + "epoch": 0.35, + "learning_rate": 1.7639504206282044e-05, + "loss": 0.4089, + "step": 16600 + }, + { + "epoch": 0.36, + "learning_rate": 1.7625281781785342e-05, + "loss": 0.4118, + "step": 16700 + }, + { + "epoch": 0.36, + "learning_rate": 1.7611059357288637e-05, + "loss": 0.4112, + "step": 16800 + }, + { + "epoch": 0.36, + "learning_rate": 1.7596836932791936e-05, + "loss": 0.408, + "step": 16900 + }, + { + "epoch": 0.36, + "learning_rate": 1.758261450829523e-05, + "loss": 0.4174, + "step": 17000 + }, + { + "epoch": 0.36, + "learning_rate": 1.7568392083798525e-05, + "loss": 0.4072, + "step": 17100 + }, + { + "epoch": 0.37, + "learning_rate": 1.7554169659301824e-05, + "loss": 0.4136, + "step": 17200 + }, + { + "epoch": 0.37, + "learning_rate": 1.753994723480512e-05, + "loss": 0.395, + "step": 17300 + }, + { + "epoch": 0.37, + "learning_rate": 1.7525724810308414e-05, + "loss": 0.4179, + "step": 17400 + }, + { + "epoch": 0.37, + "learning_rate": 1.7511502385811712e-05, + "loss": 0.4104, + "step": 17500 + }, + { + "epoch": 0.38, + "learning_rate": 1.7497279961315007e-05, + "loss": 0.404, + "step": 17600 + }, + { + "epoch": 0.38, + "learning_rate": 1.7483057536818302e-05, + "loss": 0.4011, + "step": 17700 + }, + { + "epoch": 0.38, + "learning_rate": 1.7468835112321597e-05, + "loss": 0.3986, + "step": 17800 + }, + { + "epoch": 0.38, + "learning_rate": 1.7454612687824895e-05, + "loss": 0.399, + "step": 17900 + }, + { + "epoch": 0.38, + "learning_rate": 1.744039026332819e-05, + "loss": 0.3954, + "step": 18000 + }, + { + "epoch": 0.39, + "learning_rate": 1.7426167838831485e-05, + "loss": 0.4122, + "step": 18100 + }, + { + "epoch": 0.39, + "learning_rate": 1.7411945414334784e-05, + "loss": 0.3826, + "step": 18200 + }, + { + "epoch": 0.39, + "learning_rate": 1.739772298983808e-05, + "loss": 0.3874, + "step": 18300 + }, + { + "epoch": 0.39, + "learning_rate": 1.7383500565341377e-05, + "loss": 0.3815, + "step": 18400 + }, + { + "epoch": 0.39, + "learning_rate": 1.7369278140844672e-05, + "loss": 0.3863, + "step": 18500 + }, + { + "epoch": 0.4, + "learning_rate": 1.7355055716347967e-05, + "loss": 0.3827, + "step": 18600 + }, + { + "epoch": 0.4, + "learning_rate": 1.7340833291851262e-05, + "loss": 0.3944, + "step": 18700 + }, + { + "epoch": 0.4, + "learning_rate": 1.7326610867354557e-05, + "loss": 0.392, + "step": 18800 + }, + { + "epoch": 0.4, + "learning_rate": 1.7312388442857855e-05, + "loss": 0.3948, + "step": 18900 + }, + { + "epoch": 0.41, + "learning_rate": 1.729816601836115e-05, + "loss": 0.382, + "step": 19000 + }, + { + "epoch": 0.41, + "learning_rate": 1.728394359386445e-05, + "loss": 0.3803, + "step": 19100 + }, + { + "epoch": 0.41, + "learning_rate": 1.7269721169367744e-05, + "loss": 0.3795, + "step": 19200 + }, + { + "epoch": 0.41, + "learning_rate": 1.725549874487104e-05, + "loss": 0.383, + "step": 19300 + }, + { + "epoch": 0.41, + "learning_rate": 1.7241276320374337e-05, + "loss": 0.3865, + "step": 19400 + }, + { + "epoch": 0.42, + "learning_rate": 1.7227053895877632e-05, + "loss": 0.3755, + "step": 19500 + }, + { + "epoch": 0.42, + "learning_rate": 1.7212831471380927e-05, + "loss": 0.3783, + "step": 19600 + }, + { + "epoch": 0.42, + "learning_rate": 1.7198609046884225e-05, + "loss": 0.3822, + "step": 19700 + }, + { + "epoch": 0.42, + "learning_rate": 1.718438662238752e-05, + "loss": 0.375, + "step": 19800 + }, + { + "epoch": 0.42, + "learning_rate": 1.7170164197890815e-05, + "loss": 0.3823, + "step": 19900 + }, + { + "epoch": 0.43, + "learning_rate": 1.715594177339411e-05, + "loss": 0.3847, + "step": 20000 + }, + { + "epoch": 0.43, + "eval_loss": 0.4110700190067291, + "eval_runtime": 35.0314, + "eval_samples_per_second": 142.729, + "eval_steps_per_second": 1.142, + "step": 20000 + }, + { + "epoch": 0.43, + "learning_rate": 1.714171934889741e-05, + "loss": 0.3818, + "step": 20100 + }, + { + "epoch": 0.43, + "learning_rate": 1.7127496924400704e-05, + "loss": 0.3802, + "step": 20200 + }, + { + "epoch": 0.43, + "learning_rate": 1.7113274499904e-05, + "loss": 0.3726, + "step": 20300 + }, + { + "epoch": 0.44, + "learning_rate": 1.7099052075407297e-05, + "loss": 0.3822, + "step": 20400 + }, + { + "epoch": 0.44, + "learning_rate": 1.7084829650910592e-05, + "loss": 0.363, + "step": 20500 + }, + { + "epoch": 0.44, + "learning_rate": 1.707060722641389e-05, + "loss": 0.3604, + "step": 20600 + }, + { + "epoch": 0.44, + "learning_rate": 1.7056384801917185e-05, + "loss": 0.3665, + "step": 20700 + }, + { + "epoch": 0.44, + "learning_rate": 1.704216237742048e-05, + "loss": 0.3575, + "step": 20800 + }, + { + "epoch": 0.45, + "learning_rate": 1.7027939952923775e-05, + "loss": 0.3655, + "step": 20900 + }, + { + "epoch": 0.45, + "learning_rate": 1.701371752842707e-05, + "loss": 0.3718, + "step": 21000 + }, + { + "epoch": 0.45, + "learning_rate": 1.699949510393037e-05, + "loss": 0.3667, + "step": 21100 + }, + { + "epoch": 0.45, + "learning_rate": 1.6985272679433663e-05, + "loss": 0.3511, + "step": 21200 + }, + { + "epoch": 0.45, + "learning_rate": 1.6971050254936962e-05, + "loss": 0.3644, + "step": 21300 + }, + { + "epoch": 0.46, + "learning_rate": 1.6956827830440257e-05, + "loss": 0.3638, + "step": 21400 + }, + { + "epoch": 0.46, + "learning_rate": 1.6942605405943552e-05, + "loss": 0.3637, + "step": 21500 + }, + { + "epoch": 0.46, + "learning_rate": 1.692838298144685e-05, + "loss": 0.3625, + "step": 21600 + }, + { + "epoch": 0.46, + "learning_rate": 1.6914160556950145e-05, + "loss": 0.3501, + "step": 21700 + }, + { + "epoch": 0.47, + "learning_rate": 1.689993813245344e-05, + "loss": 0.3604, + "step": 21800 + }, + { + "epoch": 0.47, + "learning_rate": 1.6885715707956735e-05, + "loss": 0.3615, + "step": 21900 + }, + { + "epoch": 0.47, + "learning_rate": 1.6871493283460033e-05, + "loss": 0.3558, + "step": 22000 + }, + { + "epoch": 0.47, + "learning_rate": 1.685727085896333e-05, + "loss": 0.3644, + "step": 22100 + }, + { + "epoch": 0.47, + "learning_rate": 1.6843048434466623e-05, + "loss": 0.3564, + "step": 22200 + }, + { + "epoch": 0.48, + "learning_rate": 1.682882600996992e-05, + "loss": 0.35, + "step": 22300 + }, + { + "epoch": 0.48, + "learning_rate": 1.6814603585473217e-05, + "loss": 0.3559, + "step": 22400 + }, + { + "epoch": 0.48, + "learning_rate": 1.680038116097651e-05, + "loss": 0.3585, + "step": 22500 + }, + { + "epoch": 0.48, + "learning_rate": 1.678615873647981e-05, + "loss": 0.3544, + "step": 22600 + }, + { + "epoch": 0.48, + "learning_rate": 1.6771936311983105e-05, + "loss": 0.3459, + "step": 22700 + }, + { + "epoch": 0.49, + "learning_rate": 1.6757713887486403e-05, + "loss": 0.3521, + "step": 22800 + }, + { + "epoch": 0.49, + "learning_rate": 1.6743491462989698e-05, + "loss": 0.3431, + "step": 22900 + }, + { + "epoch": 0.49, + "learning_rate": 1.6729269038492993e-05, + "loss": 0.337, + "step": 23000 + }, + { + "epoch": 0.49, + "learning_rate": 1.6715046613996288e-05, + "loss": 0.3376, + "step": 23100 + }, + { + "epoch": 0.49, + "learning_rate": 1.6700824189499583e-05, + "loss": 0.3345, + "step": 23200 + }, + { + "epoch": 0.5, + "learning_rate": 1.668660176500288e-05, + "loss": 0.3327, + "step": 23300 + }, + { + "epoch": 0.5, + "learning_rate": 1.6672379340506177e-05, + "loss": 0.3431, + "step": 23400 + }, + { + "epoch": 0.5, + "learning_rate": 1.6658156916009475e-05, + "loss": 0.3375, + "step": 23500 + }, + { + "epoch": 0.5, + "learning_rate": 1.664393449151277e-05, + "loss": 0.3369, + "step": 23600 + }, + { + "epoch": 0.51, + "learning_rate": 1.6629712067016065e-05, + "loss": 0.3397, + "step": 23700 + }, + { + "epoch": 0.51, + "learning_rate": 1.6615489642519363e-05, + "loss": 0.338, + "step": 23800 + }, + { + "epoch": 0.51, + "learning_rate": 1.6601267218022658e-05, + "loss": 0.3311, + "step": 23900 + }, + { + "epoch": 0.51, + "learning_rate": 1.6587044793525953e-05, + "loss": 0.3369, + "step": 24000 + }, + { + "epoch": 0.51, + "learning_rate": 1.6572822369029248e-05, + "loss": 0.3351, + "step": 24100 + }, + { + "epoch": 0.52, + "learning_rate": 1.6558599944532547e-05, + "loss": 0.3428, + "step": 24200 + }, + { + "epoch": 0.52, + "learning_rate": 1.654437752003584e-05, + "loss": 0.3324, + "step": 24300 + }, + { + "epoch": 0.52, + "learning_rate": 1.6530155095539136e-05, + "loss": 0.3273, + "step": 24400 + }, + { + "epoch": 0.52, + "learning_rate": 1.6515932671042435e-05, + "loss": 0.3262, + "step": 24500 + }, + { + "epoch": 0.52, + "learning_rate": 1.650171024654573e-05, + "loss": 0.3343, + "step": 24600 + }, + { + "epoch": 0.53, + "learning_rate": 1.6487487822049025e-05, + "loss": 0.3269, + "step": 24700 + }, + { + "epoch": 0.53, + "learning_rate": 1.6473265397552323e-05, + "loss": 0.336, + "step": 24800 + }, + { + "epoch": 0.53, + "learning_rate": 1.6459042973055618e-05, + "loss": 0.3316, + "step": 24900 + }, + { + "epoch": 0.53, + "learning_rate": 1.6444820548558916e-05, + "loss": 0.3257, + "step": 25000 + }, + { + "epoch": 0.53, + "eval_loss": 0.36412838101387024, + "eval_runtime": 34.9354, + "eval_samples_per_second": 143.121, + "eval_steps_per_second": 1.145, + "step": 25000 + }, + { + "epoch": 0.54, + "learning_rate": 1.6430598124062208e-05, + "loss": 0.3235, + "step": 25100 + }, + { + "epoch": 0.54, + "learning_rate": 1.6416375699565506e-05, + "loss": 0.3294, + "step": 25200 + }, + { + "epoch": 0.54, + "learning_rate": 1.64021532750688e-05, + "loss": 0.3268, + "step": 25300 + }, + { + "epoch": 0.54, + "learning_rate": 1.6387930850572096e-05, + "loss": 0.3162, + "step": 25400 + }, + { + "epoch": 0.54, + "learning_rate": 1.6373708426075395e-05, + "loss": 0.3199, + "step": 25500 + }, + { + "epoch": 0.55, + "learning_rate": 1.635948600157869e-05, + "loss": 0.3168, + "step": 25600 + }, + { + "epoch": 0.55, + "learning_rate": 1.6345263577081988e-05, + "loss": 0.3209, + "step": 25700 + }, + { + "epoch": 0.55, + "learning_rate": 1.6331041152585283e-05, + "loss": 0.3097, + "step": 25800 + }, + { + "epoch": 0.55, + "learning_rate": 1.6316818728088578e-05, + "loss": 0.3137, + "step": 25900 + }, + { + "epoch": 0.55, + "learning_rate": 1.6302596303591876e-05, + "loss": 0.3095, + "step": 26000 + }, + { + "epoch": 0.56, + "learning_rate": 1.628837387909517e-05, + "loss": 0.3205, + "step": 26100 + }, + { + "epoch": 0.56, + "learning_rate": 1.6274151454598466e-05, + "loss": 0.3141, + "step": 26200 + }, + { + "epoch": 0.56, + "learning_rate": 1.625992903010176e-05, + "loss": 0.3177, + "step": 26300 + }, + { + "epoch": 0.56, + "learning_rate": 1.624570660560506e-05, + "loss": 0.3198, + "step": 26400 + }, + { + "epoch": 0.57, + "learning_rate": 1.6231484181108355e-05, + "loss": 0.3187, + "step": 26500 + }, + { + "epoch": 0.57, + "learning_rate": 1.621726175661165e-05, + "loss": 0.3208, + "step": 26600 + }, + { + "epoch": 0.57, + "learning_rate": 1.6203039332114948e-05, + "loss": 0.3152, + "step": 26700 + }, + { + "epoch": 0.57, + "learning_rate": 1.6188816907618243e-05, + "loss": 0.3194, + "step": 26800 + }, + { + "epoch": 0.57, + "learning_rate": 1.6174594483121538e-05, + "loss": 0.3199, + "step": 26900 + }, + { + "epoch": 0.58, + "learning_rate": 1.6160372058624836e-05, + "loss": 0.3181, + "step": 27000 + }, + { + "epoch": 0.58, + "learning_rate": 1.614614963412813e-05, + "loss": 0.3185, + "step": 27100 + }, + { + "epoch": 0.58, + "learning_rate": 1.6131927209631426e-05, + "loss": 0.3178, + "step": 27200 + }, + { + "epoch": 0.58, + "learning_rate": 1.611770478513472e-05, + "loss": 0.3056, + "step": 27300 + }, + { + "epoch": 0.58, + "learning_rate": 1.610348236063802e-05, + "loss": 0.2985, + "step": 27400 + }, + { + "epoch": 0.59, + "learning_rate": 1.6089259936141315e-05, + "loss": 0.3099, + "step": 27500 + }, + { + "epoch": 0.59, + "learning_rate": 1.607503751164461e-05, + "loss": 0.3004, + "step": 27600 + }, + { + "epoch": 0.59, + "learning_rate": 1.6060815087147908e-05, + "loss": 0.2997, + "step": 27700 + }, + { + "epoch": 0.59, + "learning_rate": 1.6046592662651203e-05, + "loss": 0.3124, + "step": 27800 + }, + { + "epoch": 0.6, + "learning_rate": 1.60323702381545e-05, + "loss": 0.3008, + "step": 27900 + }, + { + "epoch": 0.6, + "learning_rate": 1.6018147813657796e-05, + "loss": 0.3086, + "step": 28000 + }, + { + "epoch": 0.6, + "learning_rate": 1.600392538916109e-05, + "loss": 0.3028, + "step": 28100 + }, + { + "epoch": 0.6, + "learning_rate": 1.598970296466439e-05, + "loss": 0.3035, + "step": 28200 + }, + { + "epoch": 0.6, + "learning_rate": 1.597548054016768e-05, + "loss": 0.2954, + "step": 28300 + }, + { + "epoch": 0.61, + "learning_rate": 1.596125811567098e-05, + "loss": 0.2794, + "step": 28400 + }, + { + "epoch": 0.61, + "learning_rate": 1.5947035691174274e-05, + "loss": 0.3009, + "step": 28500 + }, + { + "epoch": 0.61, + "learning_rate": 1.5932813266677573e-05, + "loss": 0.2945, + "step": 28600 + }, + { + "epoch": 0.61, + "learning_rate": 1.5918590842180868e-05, + "loss": 0.3038, + "step": 28700 + }, + { + "epoch": 0.61, + "learning_rate": 1.5904368417684163e-05, + "loss": 0.2877, + "step": 28800 + }, + { + "epoch": 0.62, + "learning_rate": 1.589014599318746e-05, + "loss": 0.2928, + "step": 28900 + }, + { + "epoch": 0.62, + "learning_rate": 1.5875923568690756e-05, + "loss": 0.298, + "step": 29000 + }, + { + "epoch": 0.62, + "learning_rate": 1.586170114419405e-05, + "loss": 0.2985, + "step": 29100 + }, + { + "epoch": 0.62, + "learning_rate": 1.584747871969735e-05, + "loss": 0.2868, + "step": 29200 + }, + { + "epoch": 0.63, + "learning_rate": 1.5833256295200644e-05, + "loss": 0.2935, + "step": 29300 + }, + { + "epoch": 0.63, + "learning_rate": 1.581903387070394e-05, + "loss": 0.2924, + "step": 29400 + }, + { + "epoch": 0.63, + "learning_rate": 1.5804811446207234e-05, + "loss": 0.2911, + "step": 29500 + }, + { + "epoch": 0.63, + "learning_rate": 1.5790589021710533e-05, + "loss": 0.2947, + "step": 29600 + }, + { + "epoch": 0.63, + "learning_rate": 1.5776366597213828e-05, + "loss": 0.2872, + "step": 29700 + }, + { + "epoch": 0.64, + "learning_rate": 1.5762144172717123e-05, + "loss": 0.2782, + "step": 29800 + }, + { + "epoch": 0.64, + "learning_rate": 1.574792174822042e-05, + "loss": 0.2877, + "step": 29900 + }, + { + "epoch": 0.64, + "learning_rate": 1.5733699323723716e-05, + "loss": 0.2874, + "step": 30000 + }, + { + "epoch": 0.64, + "eval_loss": 0.3142920732498169, + "eval_runtime": 34.9962, + "eval_samples_per_second": 142.872, + "eval_steps_per_second": 1.143, + "step": 30000 + }, + { + "epoch": 0.64, + "learning_rate": 1.5719476899227014e-05, + "loss": 0.2884, + "step": 30100 + }, + { + "epoch": 0.64, + "learning_rate": 1.570525447473031e-05, + "loss": 0.2756, + "step": 30200 + }, + { + "epoch": 0.65, + "learning_rate": 1.5691032050233604e-05, + "loss": 0.2847, + "step": 30300 + }, + { + "epoch": 0.65, + "learning_rate": 1.56768096257369e-05, + "loss": 0.2742, + "step": 30400 + }, + { + "epoch": 0.65, + "learning_rate": 1.5662587201240194e-05, + "loss": 0.2818, + "step": 30500 + }, + { + "epoch": 0.65, + "learning_rate": 1.5648364776743493e-05, + "loss": 0.2804, + "step": 30600 + }, + { + "epoch": 0.65, + "learning_rate": 1.5634142352246788e-05, + "loss": 0.2729, + "step": 30700 + }, + { + "epoch": 0.66, + "learning_rate": 1.5619919927750086e-05, + "loss": 0.2771, + "step": 30800 + }, + { + "epoch": 0.66, + "learning_rate": 1.560569750325338e-05, + "loss": 0.2863, + "step": 30900 + }, + { + "epoch": 0.66, + "learning_rate": 1.5591475078756676e-05, + "loss": 0.2791, + "step": 31000 + }, + { + "epoch": 0.66, + "learning_rate": 1.5577252654259974e-05, + "loss": 0.28, + "step": 31100 + }, + { + "epoch": 0.67, + "learning_rate": 1.556303022976327e-05, + "loss": 0.2742, + "step": 31200 + }, + { + "epoch": 0.67, + "learning_rate": 1.5548807805266564e-05, + "loss": 0.264, + "step": 31300 + }, + { + "epoch": 0.67, + "learning_rate": 1.5534585380769863e-05, + "loss": 0.2747, + "step": 31400 + }, + { + "epoch": 0.67, + "learning_rate": 1.5520362956273158e-05, + "loss": 0.2704, + "step": 31500 + }, + { + "epoch": 0.67, + "learning_rate": 1.5506140531776452e-05, + "loss": 0.2846, + "step": 31600 + }, + { + "epoch": 0.68, + "learning_rate": 1.5491918107279747e-05, + "loss": 0.2753, + "step": 31700 + }, + { + "epoch": 0.68, + "learning_rate": 1.5477695682783046e-05, + "loss": 0.2654, + "step": 31800 + }, + { + "epoch": 0.68, + "learning_rate": 1.546347325828634e-05, + "loss": 0.2666, + "step": 31900 + }, + { + "epoch": 0.68, + "learning_rate": 1.5449250833789636e-05, + "loss": 0.2661, + "step": 32000 + }, + { + "epoch": 0.68, + "learning_rate": 1.5435028409292934e-05, + "loss": 0.2738, + "step": 32100 + }, + { + "epoch": 0.69, + "learning_rate": 1.542080598479623e-05, + "loss": 0.2766, + "step": 32200 + }, + { + "epoch": 0.69, + "learning_rate": 1.5406583560299527e-05, + "loss": 0.2744, + "step": 32300 + }, + { + "epoch": 0.69, + "learning_rate": 1.5392361135802822e-05, + "loss": 0.2701, + "step": 32400 + }, + { + "epoch": 0.69, + "learning_rate": 1.5378138711306117e-05, + "loss": 0.2772, + "step": 32500 + }, + { + "epoch": 0.7, + "learning_rate": 1.5363916286809412e-05, + "loss": 0.2788, + "step": 32600 + }, + { + "epoch": 0.7, + "learning_rate": 1.5349693862312707e-05, + "loss": 0.2747, + "step": 32700 + }, + { + "epoch": 0.7, + "learning_rate": 1.5335471437816006e-05, + "loss": 0.2543, + "step": 32800 + }, + { + "epoch": 0.7, + "learning_rate": 1.53212490133193e-05, + "loss": 0.2654, + "step": 32900 + }, + { + "epoch": 0.7, + "learning_rate": 1.53070265888226e-05, + "loss": 0.2593, + "step": 33000 + }, + { + "epoch": 0.71, + "learning_rate": 1.5292804164325894e-05, + "loss": 0.2631, + "step": 33100 + }, + { + "epoch": 0.71, + "learning_rate": 1.527858173982919e-05, + "loss": 0.2654, + "step": 33200 + }, + { + "epoch": 0.71, + "learning_rate": 1.5264359315332487e-05, + "loss": 0.2684, + "step": 33300 + }, + { + "epoch": 0.71, + "learning_rate": 1.5250136890835782e-05, + "loss": 0.2575, + "step": 33400 + }, + { + "epoch": 0.71, + "learning_rate": 1.5235914466339079e-05, + "loss": 0.2711, + "step": 33500 + }, + { + "epoch": 0.72, + "learning_rate": 1.5221692041842372e-05, + "loss": 0.2589, + "step": 33600 + }, + { + "epoch": 0.72, + "learning_rate": 1.5207469617345669e-05, + "loss": 0.2659, + "step": 33700 + }, + { + "epoch": 0.72, + "learning_rate": 1.5193247192848966e-05, + "loss": 0.2513, + "step": 33800 + }, + { + "epoch": 0.72, + "learning_rate": 1.5179024768352262e-05, + "loss": 0.2681, + "step": 33900 + }, + { + "epoch": 0.73, + "learning_rate": 1.5164802343855557e-05, + "loss": 0.2528, + "step": 34000 + }, + { + "epoch": 0.73, + "learning_rate": 1.5150579919358854e-05, + "loss": 0.251, + "step": 34100 + }, + { + "epoch": 0.73, + "learning_rate": 1.513635749486215e-05, + "loss": 0.2487, + "step": 34200 + }, + { + "epoch": 0.73, + "learning_rate": 1.5122135070365447e-05, + "loss": 0.2498, + "step": 34300 + }, + { + "epoch": 0.73, + "learning_rate": 1.5107912645868742e-05, + "loss": 0.2506, + "step": 34400 + }, + { + "epoch": 0.74, + "learning_rate": 1.5093690221372039e-05, + "loss": 0.2549, + "step": 34500 + }, + { + "epoch": 0.74, + "learning_rate": 1.5079467796875336e-05, + "loss": 0.2482, + "step": 34600 + }, + { + "epoch": 0.74, + "learning_rate": 1.5065245372378629e-05, + "loss": 0.256, + "step": 34700 + }, + { + "epoch": 0.74, + "learning_rate": 1.5051022947881926e-05, + "loss": 0.2503, + "step": 34800 + }, + { + "epoch": 0.74, + "learning_rate": 1.5036800523385222e-05, + "loss": 0.2569, + "step": 34900 + }, + { + "epoch": 0.75, + "learning_rate": 1.5022578098888519e-05, + "loss": 0.2558, + "step": 35000 + }, + { + "epoch": 0.75, + "eval_loss": 0.2726137340068817, + "eval_runtime": 35.0016, + "eval_samples_per_second": 142.85, + "eval_steps_per_second": 1.143, + "step": 35000 + }, + { + "epoch": 0.75, + "learning_rate": 1.5008355674391814e-05, + "loss": 0.244, + "step": 35100 + }, + { + "epoch": 0.75, + "learning_rate": 1.499413324989511e-05, + "loss": 0.2551, + "step": 35200 + }, + { + "epoch": 0.75, + "learning_rate": 1.4979910825398407e-05, + "loss": 0.2396, + "step": 35300 + }, + { + "epoch": 0.76, + "learning_rate": 1.4965688400901704e-05, + "loss": 0.2579, + "step": 35400 + }, + { + "epoch": 0.76, + "learning_rate": 1.4951465976404999e-05, + "loss": 0.2444, + "step": 35500 + }, + { + "epoch": 0.76, + "learning_rate": 1.4937243551908295e-05, + "loss": 0.239, + "step": 35600 + }, + { + "epoch": 0.76, + "learning_rate": 1.4923021127411592e-05, + "loss": 0.2381, + "step": 35700 + }, + { + "epoch": 0.76, + "learning_rate": 1.4908798702914885e-05, + "loss": 0.2578, + "step": 35800 + }, + { + "epoch": 0.77, + "learning_rate": 1.4894576278418182e-05, + "loss": 0.2467, + "step": 35900 + }, + { + "epoch": 0.77, + "learning_rate": 1.4880353853921479e-05, + "loss": 0.238, + "step": 36000 + }, + { + "epoch": 0.77, + "learning_rate": 1.4866131429424775e-05, + "loss": 0.2441, + "step": 36100 + }, + { + "epoch": 0.77, + "learning_rate": 1.485190900492807e-05, + "loss": 0.2445, + "step": 36200 + }, + { + "epoch": 0.77, + "learning_rate": 1.4837686580431367e-05, + "loss": 0.2464, + "step": 36300 + }, + { + "epoch": 0.78, + "learning_rate": 1.4823464155934664e-05, + "loss": 0.2414, + "step": 36400 + }, + { + "epoch": 0.78, + "learning_rate": 1.480924173143796e-05, + "loss": 0.2462, + "step": 36500 + }, + { + "epoch": 0.78, + "learning_rate": 1.4795019306941255e-05, + "loss": 0.2411, + "step": 36600 + }, + { + "epoch": 0.78, + "learning_rate": 1.4780796882444552e-05, + "loss": 0.2378, + "step": 36700 + }, + { + "epoch": 0.79, + "learning_rate": 1.4766574457947847e-05, + "loss": 0.2298, + "step": 36800 + }, + { + "epoch": 0.79, + "learning_rate": 1.4752352033451142e-05, + "loss": 0.2382, + "step": 36900 + }, + { + "epoch": 0.79, + "learning_rate": 1.4738129608954439e-05, + "loss": 0.235, + "step": 37000 + }, + { + "epoch": 0.79, + "learning_rate": 1.4723907184457735e-05, + "loss": 0.2321, + "step": 37100 + }, + { + "epoch": 0.79, + "learning_rate": 1.4709684759961032e-05, + "loss": 0.229, + "step": 37200 + }, + { + "epoch": 0.8, + "learning_rate": 1.4695462335464327e-05, + "loss": 0.2411, + "step": 37300 + }, + { + "epoch": 0.8, + "learning_rate": 1.4681239910967624e-05, + "loss": 0.2269, + "step": 37400 + }, + { + "epoch": 0.8, + "learning_rate": 1.466701748647092e-05, + "loss": 0.2357, + "step": 37500 + }, + { + "epoch": 0.8, + "learning_rate": 1.4652795061974217e-05, + "loss": 0.2324, + "step": 37600 + }, + { + "epoch": 0.8, + "learning_rate": 1.4638572637477514e-05, + "loss": 0.2306, + "step": 37700 + }, + { + "epoch": 0.81, + "learning_rate": 1.4624350212980809e-05, + "loss": 0.2382, + "step": 37800 + }, + { + "epoch": 0.81, + "learning_rate": 1.4610127788484104e-05, + "loss": 0.2418, + "step": 37900 + }, + { + "epoch": 0.81, + "learning_rate": 1.4595905363987399e-05, + "loss": 0.2416, + "step": 38000 + }, + { + "epoch": 0.81, + "learning_rate": 1.4581682939490695e-05, + "loss": 0.2287, + "step": 38100 + }, + { + "epoch": 0.81, + "learning_rate": 1.4567460514993992e-05, + "loss": 0.2429, + "step": 38200 + }, + { + "epoch": 0.82, + "learning_rate": 1.4553238090497289e-05, + "loss": 0.2481, + "step": 38300 + }, + { + "epoch": 0.82, + "learning_rate": 1.4539015666000584e-05, + "loss": 0.2217, + "step": 38400 + }, + { + "epoch": 0.82, + "learning_rate": 1.452479324150388e-05, + "loss": 0.229, + "step": 38500 + }, + { + "epoch": 0.82, + "learning_rate": 1.4510570817007177e-05, + "loss": 0.2338, + "step": 38600 + }, + { + "epoch": 0.83, + "learning_rate": 1.4496348392510474e-05, + "loss": 0.2241, + "step": 38700 + }, + { + "epoch": 0.83, + "learning_rate": 1.448212596801377e-05, + "loss": 0.2144, + "step": 38800 + }, + { + "epoch": 0.83, + "learning_rate": 1.4467903543517065e-05, + "loss": 0.2331, + "step": 38900 + }, + { + "epoch": 0.83, + "learning_rate": 1.445368111902036e-05, + "loss": 0.2293, + "step": 39000 + }, + { + "epoch": 0.83, + "learning_rate": 1.4439458694523655e-05, + "loss": 0.2323, + "step": 39100 + }, + { + "epoch": 0.84, + "learning_rate": 1.4425236270026952e-05, + "loss": 0.218, + "step": 39200 + }, + { + "epoch": 0.84, + "learning_rate": 1.4411013845530248e-05, + "loss": 0.2286, + "step": 39300 + }, + { + "epoch": 0.84, + "learning_rate": 1.4396791421033545e-05, + "loss": 0.2325, + "step": 39400 + }, + { + "epoch": 0.84, + "learning_rate": 1.438256899653684e-05, + "loss": 0.2188, + "step": 39500 + }, + { + "epoch": 0.84, + "learning_rate": 1.4368346572040137e-05, + "loss": 0.2322, + "step": 39600 + }, + { + "epoch": 0.85, + "learning_rate": 1.4354124147543433e-05, + "loss": 0.2219, + "step": 39700 + }, + { + "epoch": 0.85, + "learning_rate": 1.433990172304673e-05, + "loss": 0.2218, + "step": 39800 + }, + { + "epoch": 0.85, + "learning_rate": 1.4325679298550027e-05, + "loss": 0.2253, + "step": 39900 + }, + { + "epoch": 0.85, + "learning_rate": 1.431145687405332e-05, + "loss": 0.2188, + "step": 40000 + }, + { + "epoch": 0.85, + "eval_loss": 0.24339932203292847, + "eval_runtime": 35.0062, + "eval_samples_per_second": 142.832, + "eval_steps_per_second": 1.143, + "step": 40000 + }, + { + "epoch": 0.86, + "learning_rate": 1.4297234449556617e-05, + "loss": 0.2144, + "step": 40100 + }, + { + "epoch": 0.86, + "learning_rate": 1.4283012025059912e-05, + "loss": 0.2282, + "step": 40200 + }, + { + "epoch": 0.86, + "learning_rate": 1.4268789600563208e-05, + "loss": 0.2155, + "step": 40300 + }, + { + "epoch": 0.86, + "learning_rate": 1.4254567176066505e-05, + "loss": 0.2153, + "step": 40400 + }, + { + "epoch": 0.86, + "learning_rate": 1.4240344751569802e-05, + "loss": 0.225, + "step": 40500 + }, + { + "epoch": 0.87, + "learning_rate": 1.4226122327073097e-05, + "loss": 0.2132, + "step": 40600 + }, + { + "epoch": 0.87, + "learning_rate": 1.4211899902576393e-05, + "loss": 0.2241, + "step": 40700 + }, + { + "epoch": 0.87, + "learning_rate": 1.419767747807969e-05, + "loss": 0.2194, + "step": 40800 + }, + { + "epoch": 0.87, + "learning_rate": 1.4183455053582987e-05, + "loss": 0.2102, + "step": 40900 + }, + { + "epoch": 0.87, + "learning_rate": 1.4169232629086283e-05, + "loss": 0.2138, + "step": 41000 + }, + { + "epoch": 0.88, + "learning_rate": 1.4155010204589577e-05, + "loss": 0.2193, + "step": 41100 + }, + { + "epoch": 0.88, + "learning_rate": 1.4140787780092873e-05, + "loss": 0.2218, + "step": 41200 + }, + { + "epoch": 0.88, + "learning_rate": 1.4126565355596168e-05, + "loss": 0.2193, + "step": 41300 + }, + { + "epoch": 0.88, + "learning_rate": 1.4112342931099465e-05, + "loss": 0.2218, + "step": 41400 + }, + { + "epoch": 0.89, + "learning_rate": 1.4098120506602762e-05, + "loss": 0.2172, + "step": 41500 + }, + { + "epoch": 0.89, + "learning_rate": 1.4083898082106058e-05, + "loss": 0.2143, + "step": 41600 + }, + { + "epoch": 0.89, + "learning_rate": 1.4069675657609353e-05, + "loss": 0.2181, + "step": 41700 + }, + { + "epoch": 0.89, + "learning_rate": 1.405545323311265e-05, + "loss": 0.2078, + "step": 41800 + }, + { + "epoch": 0.89, + "learning_rate": 1.4041230808615947e-05, + "loss": 0.2102, + "step": 41900 + }, + { + "epoch": 0.9, + "learning_rate": 1.4027008384119243e-05, + "loss": 0.2017, + "step": 42000 + }, + { + "epoch": 0.9, + "learning_rate": 1.401278595962254e-05, + "loss": 0.209, + "step": 42100 + }, + { + "epoch": 0.9, + "learning_rate": 1.3998563535125833e-05, + "loss": 0.2065, + "step": 42200 + }, + { + "epoch": 0.9, + "learning_rate": 1.398434111062913e-05, + "loss": 0.2024, + "step": 42300 + }, + { + "epoch": 0.9, + "learning_rate": 1.3970118686132425e-05, + "loss": 0.1976, + "step": 42400 + }, + { + "epoch": 0.91, + "learning_rate": 1.3955896261635721e-05, + "loss": 0.2041, + "step": 42500 + }, + { + "epoch": 0.91, + "learning_rate": 1.3941673837139018e-05, + "loss": 0.1998, + "step": 42600 + }, + { + "epoch": 0.91, + "learning_rate": 1.3927451412642315e-05, + "loss": 0.2026, + "step": 42700 + }, + { + "epoch": 0.91, + "learning_rate": 1.391322898814561e-05, + "loss": 0.2031, + "step": 42800 + }, + { + "epoch": 0.92, + "learning_rate": 1.3899006563648906e-05, + "loss": 0.2043, + "step": 42900 + }, + { + "epoch": 0.92, + "learning_rate": 1.3884784139152203e-05, + "loss": 0.2098, + "step": 43000 + }, + { + "epoch": 0.92, + "learning_rate": 1.38705617146555e-05, + "loss": 0.206, + "step": 43100 + }, + { + "epoch": 0.92, + "learning_rate": 1.3856339290158793e-05, + "loss": 0.1995, + "step": 43200 + }, + { + "epoch": 0.92, + "learning_rate": 1.384211686566209e-05, + "loss": 0.2029, + "step": 43300 + }, + { + "epoch": 0.93, + "learning_rate": 1.3827894441165386e-05, + "loss": 0.2054, + "step": 43400 + }, + { + "epoch": 0.93, + "learning_rate": 1.3813672016668681e-05, + "loss": 0.2027, + "step": 43500 + }, + { + "epoch": 0.93, + "learning_rate": 1.3799449592171978e-05, + "loss": 0.196, + "step": 43600 + }, + { + "epoch": 0.93, + "learning_rate": 1.3785227167675275e-05, + "loss": 0.1984, + "step": 43700 + }, + { + "epoch": 0.93, + "learning_rate": 1.3771004743178571e-05, + "loss": 0.2033, + "step": 43800 + }, + { + "epoch": 0.94, + "learning_rate": 1.3756782318681866e-05, + "loss": 0.2019, + "step": 43900 + }, + { + "epoch": 0.94, + "learning_rate": 1.3742559894185163e-05, + "loss": 0.1965, + "step": 44000 + }, + { + "epoch": 0.94, + "learning_rate": 1.372833746968846e-05, + "loss": 0.1878, + "step": 44100 + }, + { + "epoch": 0.94, + "learning_rate": 1.3714115045191756e-05, + "loss": 0.2072, + "step": 44200 + }, + { + "epoch": 0.95, + "learning_rate": 1.369989262069505e-05, + "loss": 0.204, + "step": 44300 + }, + { + "epoch": 0.95, + "learning_rate": 1.3685670196198346e-05, + "loss": 0.1969, + "step": 44400 + }, + { + "epoch": 0.95, + "learning_rate": 1.3671447771701643e-05, + "loss": 0.1908, + "step": 44500 + }, + { + "epoch": 0.95, + "learning_rate": 1.3657225347204938e-05, + "loss": 0.1937, + "step": 44600 + }, + { + "epoch": 0.95, + "learning_rate": 1.3643002922708235e-05, + "loss": 0.1987, + "step": 44700 + }, + { + "epoch": 0.96, + "learning_rate": 1.3628780498211531e-05, + "loss": 0.1921, + "step": 44800 + }, + { + "epoch": 0.96, + "learning_rate": 1.3614558073714828e-05, + "loss": 0.1967, + "step": 44900 + }, + { + "epoch": 0.96, + "learning_rate": 1.3600335649218125e-05, + "loss": 0.1963, + "step": 45000 + }, + { + "epoch": 0.96, + "eval_loss": 0.21258682012557983, + "eval_runtime": 34.9419, + "eval_samples_per_second": 143.095, + "eval_steps_per_second": 1.145, + "step": 45000 + }, + { + "epoch": 0.96, + "learning_rate": 1.358611322472142e-05, + "loss": 0.1905, + "step": 45100 + }, + { + "epoch": 0.96, + "learning_rate": 1.3571890800224716e-05, + "loss": 0.1997, + "step": 45200 + }, + { + "epoch": 0.97, + "learning_rate": 1.3557668375728013e-05, + "loss": 0.192, + "step": 45300 + }, + { + "epoch": 0.97, + "learning_rate": 1.3543445951231306e-05, + "loss": 0.1973, + "step": 45400 + }, + { + "epoch": 0.97, + "learning_rate": 1.3529223526734603e-05, + "loss": 0.1883, + "step": 45500 + }, + { + "epoch": 0.97, + "learning_rate": 1.35150011022379e-05, + "loss": 0.2007, + "step": 45600 + }, + { + "epoch": 0.97, + "learning_rate": 1.3500778677741195e-05, + "loss": 0.1926, + "step": 45700 + }, + { + "epoch": 0.98, + "learning_rate": 1.3486556253244491e-05, + "loss": 0.194, + "step": 45800 + }, + { + "epoch": 0.98, + "learning_rate": 1.3472333828747788e-05, + "loss": 0.1946, + "step": 45900 + }, + { + "epoch": 0.98, + "learning_rate": 1.3458111404251085e-05, + "loss": 0.186, + "step": 46000 + }, + { + "epoch": 0.98, + "learning_rate": 1.3443888979754381e-05, + "loss": 0.1922, + "step": 46100 + }, + { + "epoch": 0.99, + "learning_rate": 1.3429666555257676e-05, + "loss": 0.1912, + "step": 46200 + }, + { + "epoch": 0.99, + "learning_rate": 1.3415444130760973e-05, + "loss": 0.1919, + "step": 46300 + }, + { + "epoch": 0.99, + "learning_rate": 1.3401221706264266e-05, + "loss": 0.1975, + "step": 46400 + }, + { + "epoch": 0.99, + "learning_rate": 1.3386999281767563e-05, + "loss": 0.1902, + "step": 46500 + }, + { + "epoch": 0.99, + "learning_rate": 1.337277685727086e-05, + "loss": 0.1858, + "step": 46600 + }, + { + "epoch": 1.0, + "learning_rate": 1.3358554432774156e-05, + "loss": 0.1969, + "step": 46700 + }, + { + "epoch": 1.0, + "learning_rate": 1.3344332008277451e-05, + "loss": 0.1741, + "step": 46800 + }, + { + "epoch": 1.0, + "learning_rate": 1.3330109583780748e-05, + "loss": 0.1724, + "step": 46900 + }, + { + "epoch": 1.0, + "learning_rate": 1.3315887159284044e-05, + "loss": 0.1066, + "step": 47000 + }, + { + "epoch": 1.0, + "learning_rate": 1.3301664734787341e-05, + "loss": 0.1048, + "step": 47100 + }, + { + "epoch": 1.01, + "learning_rate": 1.3287442310290638e-05, + "loss": 0.1075, + "step": 47200 + }, + { + "epoch": 1.01, + "learning_rate": 1.3273219885793933e-05, + "loss": 0.1024, + "step": 47300 + }, + { + "epoch": 1.01, + "learning_rate": 1.325899746129723e-05, + "loss": 0.1054, + "step": 47400 + }, + { + "epoch": 1.01, + "learning_rate": 1.3244775036800523e-05, + "loss": 0.1056, + "step": 47500 + }, + { + "epoch": 1.02, + "learning_rate": 1.323055261230382e-05, + "loss": 0.1004, + "step": 47600 + }, + { + "epoch": 1.02, + "learning_rate": 1.3216330187807116e-05, + "loss": 0.1074, + "step": 47700 + }, + { + "epoch": 1.02, + "learning_rate": 1.3202107763310413e-05, + "loss": 0.1028, + "step": 47800 + }, + { + "epoch": 1.02, + "learning_rate": 1.3187885338813708e-05, + "loss": 0.1092, + "step": 47900 + }, + { + "epoch": 1.02, + "learning_rate": 1.3173662914317004e-05, + "loss": 0.1033, + "step": 48000 + }, + { + "epoch": 1.03, + "learning_rate": 1.3159440489820301e-05, + "loss": 0.1025, + "step": 48100 + }, + { + "epoch": 1.03, + "learning_rate": 1.3145218065323598e-05, + "loss": 0.1017, + "step": 48200 + }, + { + "epoch": 1.03, + "learning_rate": 1.3130995640826894e-05, + "loss": 0.1091, + "step": 48300 + }, + { + "epoch": 1.03, + "learning_rate": 1.311677321633019e-05, + "loss": 0.1054, + "step": 48400 + }, + { + "epoch": 1.03, + "learning_rate": 1.3102550791833486e-05, + "loss": 0.1033, + "step": 48500 + }, + { + "epoch": 1.04, + "learning_rate": 1.308832836733678e-05, + "loss": 0.1082, + "step": 48600 + }, + { + "epoch": 1.04, + "learning_rate": 1.3074105942840076e-05, + "loss": 0.1045, + "step": 48700 + }, + { + "epoch": 1.04, + "learning_rate": 1.3059883518343373e-05, + "loss": 0.1083, + "step": 48800 + }, + { + "epoch": 1.04, + "learning_rate": 1.304566109384667e-05, + "loss": 0.1067, + "step": 48900 + }, + { + "epoch": 1.05, + "learning_rate": 1.3031438669349964e-05, + "loss": 0.1047, + "step": 49000 + }, + { + "epoch": 1.05, + "learning_rate": 1.3017216244853261e-05, + "loss": 0.1038, + "step": 49100 + }, + { + "epoch": 1.05, + "learning_rate": 1.3002993820356558e-05, + "loss": 0.1016, + "step": 49200 + }, + { + "epoch": 1.05, + "learning_rate": 1.2988771395859854e-05, + "loss": 0.1045, + "step": 49300 + }, + { + "epoch": 1.05, + "learning_rate": 1.2974548971363151e-05, + "loss": 0.1081, + "step": 49400 + }, + { + "epoch": 1.06, + "learning_rate": 1.2960326546866446e-05, + "loss": 0.111, + "step": 49500 + }, + { + "epoch": 1.06, + "learning_rate": 1.294610412236974e-05, + "loss": 0.1044, + "step": 49600 + }, + { + "epoch": 1.06, + "learning_rate": 1.2931881697873036e-05, + "loss": 0.1084, + "step": 49700 + }, + { + "epoch": 1.06, + "learning_rate": 1.2917659273376332e-05, + "loss": 0.106, + "step": 49800 + }, + { + "epoch": 1.06, + "learning_rate": 1.2903436848879629e-05, + "loss": 0.1094, + "step": 49900 + }, + { + "epoch": 1.07, + "learning_rate": 1.2889214424382926e-05, + "loss": 0.1154, + "step": 50000 + }, + { + "epoch": 1.07, + "eval_loss": 0.205108642578125, + "eval_runtime": 34.2834, + "eval_samples_per_second": 145.843, + "eval_steps_per_second": 1.167, + "step": 50000 + }, + { + "epoch": 1.07, + "learning_rate": 1.287499199988622e-05, + "loss": 0.0996, + "step": 50100 + }, + { + "epoch": 1.07, + "learning_rate": 1.2860769575389517e-05, + "loss": 0.1065, + "step": 50200 + }, + { + "epoch": 1.07, + "learning_rate": 1.2846547150892814e-05, + "loss": 0.1057, + "step": 50300 + }, + { + "epoch": 1.08, + "learning_rate": 1.283232472639611e-05, + "loss": 0.0993, + "step": 50400 + }, + { + "epoch": 1.08, + "learning_rate": 1.2818102301899407e-05, + "loss": 0.1095, + "step": 50500 + }, + { + "epoch": 1.08, + "learning_rate": 1.2803879877402702e-05, + "loss": 0.1098, + "step": 50600 + }, + { + "epoch": 1.08, + "learning_rate": 1.2789657452905997e-05, + "loss": 0.1148, + "step": 50700 + }, + { + "epoch": 1.08, + "learning_rate": 1.2775435028409292e-05, + "loss": 0.1088, + "step": 50800 + }, + { + "epoch": 1.09, + "learning_rate": 1.2761212603912589e-05, + "loss": 0.0996, + "step": 50900 + }, + { + "epoch": 1.09, + "learning_rate": 1.2746990179415886e-05, + "loss": 0.1093, + "step": 51000 + }, + { + "epoch": 1.09, + "learning_rate": 1.2732767754919182e-05, + "loss": 0.103, + "step": 51100 + }, + { + "epoch": 1.09, + "learning_rate": 1.2718545330422477e-05, + "loss": 0.1029, + "step": 51200 + }, + { + "epoch": 1.09, + "learning_rate": 1.2704322905925774e-05, + "loss": 0.1012, + "step": 51300 + }, + { + "epoch": 1.1, + "learning_rate": 1.269010048142907e-05, + "loss": 0.1077, + "step": 51400 + }, + { + "epoch": 1.1, + "learning_rate": 1.2675878056932367e-05, + "loss": 0.1115, + "step": 51500 + }, + { + "epoch": 1.1, + "learning_rate": 1.2661655632435664e-05, + "loss": 0.0979, + "step": 51600 + }, + { + "epoch": 1.1, + "learning_rate": 1.2647433207938959e-05, + "loss": 0.1038, + "step": 51700 + }, + { + "epoch": 1.11, + "learning_rate": 1.2633210783442254e-05, + "loss": 0.1101, + "step": 51800 + }, + { + "epoch": 1.11, + "learning_rate": 1.2618988358945549e-05, + "loss": 0.1114, + "step": 51900 + }, + { + "epoch": 1.11, + "learning_rate": 1.2604765934448846e-05, + "loss": 0.1026, + "step": 52000 + }, + { + "epoch": 1.11, + "learning_rate": 1.2590543509952142e-05, + "loss": 0.0995, + "step": 52100 + }, + { + "epoch": 1.11, + "learning_rate": 1.2576321085455439e-05, + "loss": 0.1126, + "step": 52200 + }, + { + "epoch": 1.12, + "learning_rate": 1.2562098660958734e-05, + "loss": 0.1111, + "step": 52300 + }, + { + "epoch": 1.12, + "learning_rate": 1.254787623646203e-05, + "loss": 0.1033, + "step": 52400 + }, + { + "epoch": 1.12, + "learning_rate": 1.2533653811965327e-05, + "loss": 0.1048, + "step": 52500 + }, + { + "epoch": 1.12, + "learning_rate": 1.2519431387468624e-05, + "loss": 0.1, + "step": 52600 + }, + { + "epoch": 1.12, + "learning_rate": 1.250520896297192e-05, + "loss": 0.1056, + "step": 52700 + }, + { + "epoch": 1.13, + "learning_rate": 1.2490986538475214e-05, + "loss": 0.1104, + "step": 52800 + }, + { + "epoch": 1.13, + "learning_rate": 1.247676411397851e-05, + "loss": 0.1109, + "step": 52900 + }, + { + "epoch": 1.13, + "learning_rate": 1.2462541689481806e-05, + "loss": 0.1053, + "step": 53000 + }, + { + "epoch": 1.13, + "learning_rate": 1.2448319264985102e-05, + "loss": 0.0941, + "step": 53100 + }, + { + "epoch": 1.13, + "learning_rate": 1.2434096840488399e-05, + "loss": 0.1011, + "step": 53200 + }, + { + "epoch": 1.14, + "learning_rate": 1.2419874415991696e-05, + "loss": 0.1049, + "step": 53300 + }, + { + "epoch": 1.14, + "learning_rate": 1.2405651991494992e-05, + "loss": 0.106, + "step": 53400 + }, + { + "epoch": 1.14, + "learning_rate": 1.2391429566998287e-05, + "loss": 0.1047, + "step": 53500 + }, + { + "epoch": 1.14, + "learning_rate": 1.2377207142501584e-05, + "loss": 0.1015, + "step": 53600 + }, + { + "epoch": 1.15, + "learning_rate": 1.236298471800488e-05, + "loss": 0.1021, + "step": 53700 + }, + { + "epoch": 1.15, + "learning_rate": 1.2348762293508177e-05, + "loss": 0.1016, + "step": 53800 + }, + { + "epoch": 1.15, + "learning_rate": 1.233453986901147e-05, + "loss": 0.0967, + "step": 53900 + }, + { + "epoch": 1.15, + "learning_rate": 1.2320317444514767e-05, + "loss": 0.1061, + "step": 54000 + }, + { + "epoch": 1.15, + "learning_rate": 1.2306095020018062e-05, + "loss": 0.0994, + "step": 54100 + }, + { + "epoch": 1.16, + "learning_rate": 1.2291872595521359e-05, + "loss": 0.0998, + "step": 54200 + }, + { + "epoch": 1.16, + "learning_rate": 1.2277650171024655e-05, + "loss": 0.1068, + "step": 54300 + }, + { + "epoch": 1.16, + "learning_rate": 1.2263427746527952e-05, + "loss": 0.1015, + "step": 54400 + }, + { + "epoch": 1.16, + "learning_rate": 1.2249205322031249e-05, + "loss": 0.1051, + "step": 54500 + }, + { + "epoch": 1.16, + "learning_rate": 1.2234982897534544e-05, + "loss": 0.1101, + "step": 54600 + }, + { + "epoch": 1.17, + "learning_rate": 1.222076047303784e-05, + "loss": 0.1021, + "step": 54700 + }, + { + "epoch": 1.17, + "learning_rate": 1.2206538048541137e-05, + "loss": 0.1047, + "step": 54800 + }, + { + "epoch": 1.17, + "learning_rate": 1.2192315624044434e-05, + "loss": 0.0966, + "step": 54900 + }, + { + "epoch": 1.17, + "learning_rate": 1.2178093199547727e-05, + "loss": 0.1028, + "step": 55000 + }, + { + "epoch": 1.17, + "eval_loss": 0.19255822896957397, + "eval_runtime": 34.1404, + "eval_samples_per_second": 146.454, + "eval_steps_per_second": 1.172, + "step": 55000 + }, + { + "epoch": 1.18, + "learning_rate": 1.2163870775051024e-05, + "loss": 0.1049, + "step": 55100 + }, + { + "epoch": 1.18, + "learning_rate": 1.2149648350554319e-05, + "loss": 0.0977, + "step": 55200 + }, + { + "epoch": 1.18, + "learning_rate": 1.2135425926057615e-05, + "loss": 0.1011, + "step": 55300 + }, + { + "epoch": 1.18, + "learning_rate": 1.2121203501560912e-05, + "loss": 0.1003, + "step": 55400 + }, + { + "epoch": 1.18, + "learning_rate": 1.2106981077064209e-05, + "loss": 0.1109, + "step": 55500 + }, + { + "epoch": 1.19, + "learning_rate": 1.2092758652567505e-05, + "loss": 0.1085, + "step": 55600 + }, + { + "epoch": 1.19, + "learning_rate": 1.20785362280708e-05, + "loss": 0.1081, + "step": 55700 + }, + { + "epoch": 1.19, + "learning_rate": 1.2064313803574097e-05, + "loss": 0.1012, + "step": 55800 + }, + { + "epoch": 1.19, + "learning_rate": 1.2050091379077394e-05, + "loss": 0.1019, + "step": 55900 + }, + { + "epoch": 1.19, + "learning_rate": 1.2035868954580687e-05, + "loss": 0.1036, + "step": 56000 + }, + { + "epoch": 1.2, + "learning_rate": 1.2021646530083984e-05, + "loss": 0.1055, + "step": 56100 + }, + { + "epoch": 1.2, + "learning_rate": 1.200742410558728e-05, + "loss": 0.1105, + "step": 56200 + }, + { + "epoch": 1.2, + "learning_rate": 1.1993201681090575e-05, + "loss": 0.1006, + "step": 56300 + }, + { + "epoch": 1.2, + "learning_rate": 1.1978979256593872e-05, + "loss": 0.0995, + "step": 56400 + }, + { + "epoch": 1.21, + "learning_rate": 1.1964756832097169e-05, + "loss": 0.1044, + "step": 56500 + }, + { + "epoch": 1.21, + "learning_rate": 1.1950534407600465e-05, + "loss": 0.1021, + "step": 56600 + }, + { + "epoch": 1.21, + "learning_rate": 1.1936311983103762e-05, + "loss": 0.1049, + "step": 56700 + }, + { + "epoch": 1.21, + "learning_rate": 1.1922089558607057e-05, + "loss": 0.1102, + "step": 56800 + }, + { + "epoch": 1.21, + "learning_rate": 1.1907867134110354e-05, + "loss": 0.1017, + "step": 56900 + }, + { + "epoch": 1.22, + "learning_rate": 1.189364470961365e-05, + "loss": 0.0983, + "step": 57000 + }, + { + "epoch": 1.22, + "learning_rate": 1.1879422285116943e-05, + "loss": 0.1003, + "step": 57100 + }, + { + "epoch": 1.22, + "learning_rate": 1.186519986062024e-05, + "loss": 0.1, + "step": 57200 + }, + { + "epoch": 1.22, + "learning_rate": 1.1850977436123537e-05, + "loss": 0.109, + "step": 57300 + }, + { + "epoch": 1.22, + "learning_rate": 1.1836755011626832e-05, + "loss": 0.0958, + "step": 57400 + }, + { + "epoch": 1.23, + "learning_rate": 1.1822532587130128e-05, + "loss": 0.1057, + "step": 57500 + }, + { + "epoch": 1.23, + "learning_rate": 1.1808310162633425e-05, + "loss": 0.1067, + "step": 57600 + }, + { + "epoch": 1.23, + "learning_rate": 1.1794087738136722e-05, + "loss": 0.1108, + "step": 57700 + }, + { + "epoch": 1.23, + "learning_rate": 1.1779865313640018e-05, + "loss": 0.1089, + "step": 57800 + }, + { + "epoch": 1.24, + "learning_rate": 1.1765642889143313e-05, + "loss": 0.1035, + "step": 57900 + }, + { + "epoch": 1.24, + "learning_rate": 1.175142046464661e-05, + "loss": 0.1025, + "step": 58000 + }, + { + "epoch": 1.24, + "learning_rate": 1.1737198040149907e-05, + "loss": 0.1002, + "step": 58100 + }, + { + "epoch": 1.24, + "learning_rate": 1.17229756156532e-05, + "loss": 0.1018, + "step": 58200 + }, + { + "epoch": 1.24, + "learning_rate": 1.1708753191156497e-05, + "loss": 0.108, + "step": 58300 + }, + { + "epoch": 1.25, + "learning_rate": 1.1694530766659793e-05, + "loss": 0.0987, + "step": 58400 + }, + { + "epoch": 1.25, + "learning_rate": 1.1680308342163088e-05, + "loss": 0.1099, + "step": 58500 + }, + { + "epoch": 1.25, + "learning_rate": 1.1666085917666385e-05, + "loss": 0.1024, + "step": 58600 + }, + { + "epoch": 1.25, + "learning_rate": 1.1651863493169682e-05, + "loss": 0.0957, + "step": 58700 + }, + { + "epoch": 1.25, + "learning_rate": 1.1637641068672978e-05, + "loss": 0.1044, + "step": 58800 + }, + { + "epoch": 1.26, + "learning_rate": 1.1623418644176275e-05, + "loss": 0.1007, + "step": 58900 + }, + { + "epoch": 1.26, + "learning_rate": 1.160919621967957e-05, + "loss": 0.1042, + "step": 59000 + }, + { + "epoch": 1.26, + "learning_rate": 1.1594973795182867e-05, + "loss": 0.1059, + "step": 59100 + }, + { + "epoch": 1.26, + "learning_rate": 1.158075137068616e-05, + "loss": 0.1014, + "step": 59200 + }, + { + "epoch": 1.27, + "learning_rate": 1.1566528946189457e-05, + "loss": 0.1, + "step": 59300 + }, + { + "epoch": 1.27, + "learning_rate": 1.1552306521692753e-05, + "loss": 0.0947, + "step": 59400 + }, + { + "epoch": 1.27, + "learning_rate": 1.153808409719605e-05, + "loss": 0.1025, + "step": 59500 + }, + { + "epoch": 1.27, + "learning_rate": 1.1523861672699345e-05, + "loss": 0.1032, + "step": 59600 + }, + { + "epoch": 1.27, + "learning_rate": 1.1509639248202642e-05, + "loss": 0.1048, + "step": 59700 + }, + { + "epoch": 1.28, + "learning_rate": 1.1495416823705938e-05, + "loss": 0.0976, + "step": 59800 + }, + { + "epoch": 1.28, + "learning_rate": 1.1481194399209235e-05, + "loss": 0.1036, + "step": 59900 + }, + { + "epoch": 1.28, + "learning_rate": 1.1466971974712532e-05, + "loss": 0.1097, + "step": 60000 + }, + { + "epoch": 1.28, + "eval_loss": 0.18599912524223328, + "eval_runtime": 34.2806, + "eval_samples_per_second": 145.855, + "eval_steps_per_second": 1.167, + "step": 60000 + }, + { + "epoch": 1.28, + "learning_rate": 1.1452749550215827e-05, + "loss": 0.1038, + "step": 60100 + }, + { + "epoch": 1.28, + "learning_rate": 1.1438527125719123e-05, + "loss": 0.1013, + "step": 60200 + }, + { + "epoch": 1.29, + "learning_rate": 1.1424304701222416e-05, + "loss": 0.098, + "step": 60300 + }, + { + "epoch": 1.29, + "learning_rate": 1.1410082276725713e-05, + "loss": 0.1002, + "step": 60400 + }, + { + "epoch": 1.29, + "learning_rate": 1.139585985222901e-05, + "loss": 0.105, + "step": 60500 + }, + { + "epoch": 1.29, + "learning_rate": 1.1381637427732307e-05, + "loss": 0.1002, + "step": 60600 + }, + { + "epoch": 1.29, + "learning_rate": 1.1367415003235601e-05, + "loss": 0.0973, + "step": 60700 + }, + { + "epoch": 1.3, + "learning_rate": 1.1353192578738898e-05, + "loss": 0.1038, + "step": 60800 + }, + { + "epoch": 1.3, + "learning_rate": 1.1338970154242195e-05, + "loss": 0.0989, + "step": 60900 + }, + { + "epoch": 1.3, + "learning_rate": 1.1324747729745491e-05, + "loss": 0.1096, + "step": 61000 + }, + { + "epoch": 1.3, + "learning_rate": 1.1310525305248788e-05, + "loss": 0.0869, + "step": 61100 + }, + { + "epoch": 1.31, + "learning_rate": 1.1296302880752083e-05, + "loss": 0.1003, + "step": 61200 + }, + { + "epoch": 1.31, + "learning_rate": 1.128208045625538e-05, + "loss": 0.0945, + "step": 61300 + }, + { + "epoch": 1.31, + "learning_rate": 1.1267858031758673e-05, + "loss": 0.1004, + "step": 61400 + }, + { + "epoch": 1.31, + "learning_rate": 1.125363560726197e-05, + "loss": 0.0984, + "step": 61500 + }, + { + "epoch": 1.31, + "learning_rate": 1.1239413182765266e-05, + "loss": 0.098, + "step": 61600 + }, + { + "epoch": 1.32, + "learning_rate": 1.1225190758268563e-05, + "loss": 0.0982, + "step": 61700 + }, + { + "epoch": 1.32, + "learning_rate": 1.121096833377186e-05, + "loss": 0.0988, + "step": 61800 + }, + { + "epoch": 1.32, + "learning_rate": 1.1196745909275155e-05, + "loss": 0.0999, + "step": 61900 + }, + { + "epoch": 1.32, + "learning_rate": 1.1182523484778451e-05, + "loss": 0.1028, + "step": 62000 + }, + { + "epoch": 1.32, + "learning_rate": 1.1168301060281748e-05, + "loss": 0.0966, + "step": 62100 + }, + { + "epoch": 1.33, + "learning_rate": 1.1154078635785045e-05, + "loss": 0.0881, + "step": 62200 + }, + { + "epoch": 1.33, + "learning_rate": 1.113985621128834e-05, + "loss": 0.1008, + "step": 62300 + }, + { + "epoch": 1.33, + "learning_rate": 1.1125633786791635e-05, + "loss": 0.1029, + "step": 62400 + }, + { + "epoch": 1.33, + "learning_rate": 1.111141136229493e-05, + "loss": 0.0981, + "step": 62500 + }, + { + "epoch": 1.34, + "learning_rate": 1.1097188937798226e-05, + "loss": 0.093, + "step": 62600 + }, + { + "epoch": 1.34, + "learning_rate": 1.1082966513301523e-05, + "loss": 0.1009, + "step": 62700 + }, + { + "epoch": 1.34, + "learning_rate": 1.106874408880482e-05, + "loss": 0.0988, + "step": 62800 + }, + { + "epoch": 1.34, + "learning_rate": 1.1054521664308116e-05, + "loss": 0.0947, + "step": 62900 + }, + { + "epoch": 1.34, + "learning_rate": 1.1040299239811411e-05, + "loss": 0.1021, + "step": 63000 + }, + { + "epoch": 1.35, + "learning_rate": 1.1026076815314708e-05, + "loss": 0.0948, + "step": 63100 + }, + { + "epoch": 1.35, + "learning_rate": 1.1011854390818005e-05, + "loss": 0.0934, + "step": 63200 + }, + { + "epoch": 1.35, + "learning_rate": 1.0997631966321301e-05, + "loss": 0.0919, + "step": 63300 + }, + { + "epoch": 1.35, + "learning_rate": 1.0983409541824596e-05, + "loss": 0.0959, + "step": 63400 + }, + { + "epoch": 1.35, + "learning_rate": 1.0969187117327891e-05, + "loss": 0.0964, + "step": 63500 + }, + { + "epoch": 1.36, + "learning_rate": 1.0954964692831186e-05, + "loss": 0.1008, + "step": 63600 + }, + { + "epoch": 1.36, + "learning_rate": 1.0940742268334483e-05, + "loss": 0.1005, + "step": 63700 + }, + { + "epoch": 1.36, + "learning_rate": 1.092651984383778e-05, + "loss": 0.0891, + "step": 63800 + }, + { + "epoch": 1.36, + "learning_rate": 1.0912297419341076e-05, + "loss": 0.0962, + "step": 63900 + }, + { + "epoch": 1.37, + "learning_rate": 1.0898074994844373e-05, + "loss": 0.0891, + "step": 64000 + }, + { + "epoch": 1.37, + "learning_rate": 1.0883852570347668e-05, + "loss": 0.0946, + "step": 64100 + }, + { + "epoch": 1.37, + "learning_rate": 1.0869630145850965e-05, + "loss": 0.0977, + "step": 64200 + }, + { + "epoch": 1.37, + "learning_rate": 1.0855407721354261e-05, + "loss": 0.0999, + "step": 64300 + }, + { + "epoch": 1.37, + "learning_rate": 1.0841185296857558e-05, + "loss": 0.1032, + "step": 64400 + }, + { + "epoch": 1.38, + "learning_rate": 1.0826962872360853e-05, + "loss": 0.087, + "step": 64500 + }, + { + "epoch": 1.38, + "learning_rate": 1.0812740447864148e-05, + "loss": 0.0976, + "step": 64600 + }, + { + "epoch": 1.38, + "learning_rate": 1.0798518023367443e-05, + "loss": 0.1002, + "step": 64700 + }, + { + "epoch": 1.38, + "learning_rate": 1.078429559887074e-05, + "loss": 0.0991, + "step": 64800 + }, + { + "epoch": 1.38, + "learning_rate": 1.0770073174374036e-05, + "loss": 0.0928, + "step": 64900 + }, + { + "epoch": 1.39, + "learning_rate": 1.0755850749877333e-05, + "loss": 0.0966, + "step": 65000 + }, + { + "epoch": 1.39, + "learning_rate": 1.074162832538063e-05, + "loss": 0.0916, + "step": 65100 + }, + { + "epoch": 1.39, + "learning_rate": 1.0727405900883924e-05, + "loss": 0.0984, + "step": 65200 + }, + { + "epoch": 1.39, + "learning_rate": 1.0713183476387221e-05, + "loss": 0.1021, + "step": 65300 + }, + { + "epoch": 1.4, + "learning_rate": 1.0698961051890518e-05, + "loss": 0.092, + "step": 65400 + }, + { + "epoch": 1.4, + "learning_rate": 1.0684738627393814e-05, + "loss": 0.0997, + "step": 65500 + }, + { + "epoch": 1.4, + "learning_rate": 1.0670516202897108e-05, + "loss": 0.0862, + "step": 65600 + }, + { + "epoch": 1.4, + "learning_rate": 1.0656293778400404e-05, + "loss": 0.091, + "step": 65700 + }, + { + "epoch": 1.4, + "learning_rate": 1.06420713539037e-05, + "loss": 0.0945, + "step": 65800 + }, + { + "epoch": 1.41, + "learning_rate": 1.0627848929406996e-05, + "loss": 0.089, + "step": 65900 + }, + { + "epoch": 1.41, + "learning_rate": 1.0613626504910293e-05, + "loss": 0.0943, + "step": 66000 + }, + { + "epoch": 1.41, + "learning_rate": 1.059940408041359e-05, + "loss": 0.0919, + "step": 66100 + }, + { + "epoch": 1.41, + "learning_rate": 1.0585181655916886e-05, + "loss": 0.0947, + "step": 66200 + }, + { + "epoch": 1.41, + "learning_rate": 1.0570959231420181e-05, + "loss": 0.0915, + "step": 66300 + }, + { + "epoch": 1.42, + "learning_rate": 1.0556736806923478e-05, + "loss": 0.1016, + "step": 66400 + }, + { + "epoch": 1.42, + "learning_rate": 1.0542514382426774e-05, + "loss": 0.0953, + "step": 66500 + }, + { + "epoch": 1.42, + "learning_rate": 1.0528291957930071e-05, + "loss": 0.0953, + "step": 66600 + }, + { + "epoch": 1.42, + "learning_rate": 1.0514069533433364e-05, + "loss": 0.0943, + "step": 66700 + }, + { + "epoch": 1.43, + "learning_rate": 1.0499847108936661e-05, + "loss": 0.0983, + "step": 66800 + }, + { + "epoch": 1.43, + "learning_rate": 1.0485624684439956e-05, + "loss": 0.0929, + "step": 66900 + }, + { + "epoch": 1.43, + "learning_rate": 1.0471402259943253e-05, + "loss": 0.0983, + "step": 67000 + }, + { + "epoch": 1.43, + "learning_rate": 1.045717983544655e-05, + "loss": 0.0928, + "step": 67100 + }, + { + "epoch": 1.43, + "learning_rate": 1.0442957410949846e-05, + "loss": 0.0887, + "step": 67200 + }, + { + "epoch": 1.44, + "learning_rate": 1.0428734986453143e-05, + "loss": 0.0927, + "step": 67300 + }, + { + "epoch": 1.44, + "learning_rate": 1.0414512561956438e-05, + "loss": 0.0957, + "step": 67400 + }, + { + "epoch": 1.44, + "learning_rate": 1.0400290137459734e-05, + "loss": 0.0905, + "step": 67500 + }, + { + "epoch": 1.44, + "learning_rate": 1.0386067712963031e-05, + "loss": 0.0943, + "step": 67600 + }, + { + "epoch": 1.44, + "learning_rate": 1.0371845288466328e-05, + "loss": 0.093, + "step": 67700 + }, + { + "epoch": 1.45, + "learning_rate": 1.035762286396962e-05, + "loss": 0.0922, + "step": 67800 + }, + { + "epoch": 1.45, + "learning_rate": 1.0343400439472917e-05, + "loss": 0.095, + "step": 67900 + }, + { + "epoch": 1.45, + "learning_rate": 1.0329178014976212e-05, + "loss": 0.0904, + "step": 68000 + }, + { + "epoch": 1.45, + "learning_rate": 1.0314955590479509e-05, + "loss": 0.0981, + "step": 68100 + }, + { + "epoch": 1.45, + "learning_rate": 1.0300733165982806e-05, + "loss": 0.0973, + "step": 68200 + }, + { + "epoch": 1.46, + "learning_rate": 1.0286510741486102e-05, + "loss": 0.0891, + "step": 68300 + }, + { + "epoch": 1.46, + "learning_rate": 1.0272288316989399e-05, + "loss": 0.0942, + "step": 68400 + }, + { + "epoch": 1.46, + "learning_rate": 1.0258065892492694e-05, + "loss": 0.0869, + "step": 68500 + }, + { + "epoch": 1.46, + "learning_rate": 1.024384346799599e-05, + "loss": 0.1023, + "step": 68600 + }, + { + "epoch": 1.47, + "learning_rate": 1.0229621043499287e-05, + "loss": 0.1025, + "step": 68700 + }, + { + "epoch": 1.47, + "learning_rate": 1.021539861900258e-05, + "loss": 0.0878, + "step": 68800 + }, + { + "epoch": 1.47, + "learning_rate": 1.0201176194505877e-05, + "loss": 0.0921, + "step": 68900 + }, + { + "epoch": 1.47, + "learning_rate": 1.0186953770009174e-05, + "loss": 0.09, + "step": 69000 + }, + { + "epoch": 1.47, + "learning_rate": 1.0172731345512469e-05, + "loss": 0.0925, + "step": 69100 + }, + { + "epoch": 1.48, + "learning_rate": 1.0158508921015766e-05, + "loss": 0.0873, + "step": 69200 + }, + { + "epoch": 1.48, + "learning_rate": 1.0144286496519062e-05, + "loss": 0.089, + "step": 69300 + }, + { + "epoch": 1.48, + "learning_rate": 1.0130064072022359e-05, + "loss": 0.092, + "step": 69400 + }, + { + "epoch": 1.48, + "learning_rate": 1.0115841647525656e-05, + "loss": 0.0985, + "step": 69500 + }, + { + "epoch": 1.48, + "learning_rate": 1.010161922302895e-05, + "loss": 0.0895, + "step": 69600 + }, + { + "epoch": 1.49, + "learning_rate": 1.0087396798532247e-05, + "loss": 0.0845, + "step": 69700 + }, + { + "epoch": 1.49, + "learning_rate": 1.0073174374035544e-05, + "loss": 0.0905, + "step": 69800 + }, + { + "epoch": 1.49, + "learning_rate": 1.0058951949538837e-05, + "loss": 0.0901, + "step": 69900 + }, + { + "epoch": 1.49, + "learning_rate": 1.0044729525042134e-05, + "loss": 0.0955, + "step": 70000 + }, + { + "epoch": 1.49, + "eval_loss": 0.16799671947956085, + "eval_runtime": 34.201, + "eval_samples_per_second": 146.194, + "eval_steps_per_second": 1.17, + "step": 70000 + }, + { + "epoch": 1.5, + "learning_rate": 1.003050710054543e-05, + "loss": 0.0864, + "step": 70100 + }, + { + "epoch": 1.5, + "learning_rate": 1.0016284676048727e-05, + "loss": 0.0888, + "step": 70200 + }, + { + "epoch": 1.5, + "learning_rate": 1.0002062251552022e-05, + "loss": 0.0898, + "step": 70300 + }, + { + "epoch": 1.5, + "learning_rate": 9.987839827055319e-06, + "loss": 0.0887, + "step": 70400 + }, + { + "epoch": 1.5, + "learning_rate": 9.973617402558616e-06, + "loss": 0.0928, + "step": 70500 + }, + { + "epoch": 1.51, + "learning_rate": 9.959394978061912e-06, + "loss": 0.0897, + "step": 70600 + }, + { + "epoch": 1.51, + "learning_rate": 9.945172553565207e-06, + "loss": 0.0889, + "step": 70700 + }, + { + "epoch": 1.51, + "learning_rate": 9.930950129068502e-06, + "loss": 0.0922, + "step": 70800 + }, + { + "epoch": 1.51, + "learning_rate": 9.916727704571799e-06, + "loss": 0.0969, + "step": 70900 + }, + { + "epoch": 1.51, + "learning_rate": 9.902505280075096e-06, + "loss": 0.0905, + "step": 71000 + }, + { + "epoch": 1.52, + "learning_rate": 9.888282855578392e-06, + "loss": 0.0882, + "step": 71100 + }, + { + "epoch": 1.52, + "learning_rate": 9.874060431081687e-06, + "loss": 0.0953, + "step": 71200 + }, + { + "epoch": 1.52, + "learning_rate": 9.859838006584984e-06, + "loss": 0.0907, + "step": 71300 + }, + { + "epoch": 1.52, + "learning_rate": 9.845615582088279e-06, + "loss": 0.0932, + "step": 71400 + }, + { + "epoch": 1.53, + "learning_rate": 9.831393157591576e-06, + "loss": 0.0937, + "step": 71500 + }, + { + "epoch": 1.53, + "learning_rate": 9.817170733094872e-06, + "loss": 0.0932, + "step": 71600 + }, + { + "epoch": 1.53, + "learning_rate": 9.802948308598169e-06, + "loss": 0.0891, + "step": 71700 + }, + { + "epoch": 1.53, + "learning_rate": 9.788725884101464e-06, + "loss": 0.0924, + "step": 71800 + }, + { + "epoch": 1.53, + "learning_rate": 9.774503459604759e-06, + "loss": 0.0882, + "step": 71900 + }, + { + "epoch": 1.54, + "learning_rate": 9.760281035108055e-06, + "loss": 0.0901, + "step": 72000 + }, + { + "epoch": 1.54, + "learning_rate": 9.746058610611352e-06, + "loss": 0.0899, + "step": 72100 + }, + { + "epoch": 1.54, + "learning_rate": 9.731836186114649e-06, + "loss": 0.0897, + "step": 72200 + }, + { + "epoch": 1.54, + "learning_rate": 9.717613761617944e-06, + "loss": 0.0957, + "step": 72300 + }, + { + "epoch": 1.54, + "learning_rate": 9.70339133712124e-06, + "loss": 0.0909, + "step": 72400 + }, + { + "epoch": 1.55, + "learning_rate": 9.689168912624535e-06, + "loss": 0.0907, + "step": 72500 + }, + { + "epoch": 1.55, + "learning_rate": 9.674946488127832e-06, + "loss": 0.0873, + "step": 72600 + }, + { + "epoch": 1.55, + "learning_rate": 9.660724063631129e-06, + "loss": 0.093, + "step": 72700 + }, + { + "epoch": 1.55, + "learning_rate": 9.646501639134424e-06, + "loss": 0.0951, + "step": 72800 + }, + { + "epoch": 1.56, + "learning_rate": 9.63227921463772e-06, + "loss": 0.0905, + "step": 72900 + }, + { + "epoch": 1.56, + "learning_rate": 9.618056790141015e-06, + "loss": 0.0884, + "step": 73000 + }, + { + "epoch": 1.56, + "learning_rate": 9.603834365644312e-06, + "loss": 0.0898, + "step": 73100 + }, + { + "epoch": 1.56, + "learning_rate": 9.589611941147609e-06, + "loss": 0.0863, + "step": 73200 + }, + { + "epoch": 1.56, + "learning_rate": 9.575389516650905e-06, + "loss": 0.0922, + "step": 73300 + }, + { + "epoch": 1.57, + "learning_rate": 9.5611670921542e-06, + "loss": 0.0836, + "step": 73400 + }, + { + "epoch": 1.57, + "learning_rate": 9.546944667657497e-06, + "loss": 0.0892, + "step": 73500 + }, + { + "epoch": 1.57, + "learning_rate": 9.532722243160792e-06, + "loss": 0.0907, + "step": 73600 + }, + { + "epoch": 1.57, + "learning_rate": 9.518499818664089e-06, + "loss": 0.0852, + "step": 73700 + }, + { + "epoch": 1.57, + "learning_rate": 9.504277394167385e-06, + "loss": 0.0877, + "step": 73800 + }, + { + "epoch": 1.58, + "learning_rate": 9.49005496967068e-06, + "loss": 0.0912, + "step": 73900 + }, + { + "epoch": 1.58, + "learning_rate": 9.475832545173977e-06, + "loss": 0.0874, + "step": 74000 + }, + { + "epoch": 1.58, + "learning_rate": 9.461610120677272e-06, + "loss": 0.0844, + "step": 74100 + }, + { + "epoch": 1.58, + "learning_rate": 9.447387696180569e-06, + "loss": 0.084, + "step": 74200 + }, + { + "epoch": 1.59, + "learning_rate": 9.433165271683865e-06, + "loss": 0.0834, + "step": 74300 + }, + { + "epoch": 1.59, + "learning_rate": 9.41894284718716e-06, + "loss": 0.0897, + "step": 74400 + }, + { + "epoch": 1.59, + "learning_rate": 9.404720422690457e-06, + "loss": 0.0994, + "step": 74500 + }, + { + "epoch": 1.59, + "learning_rate": 9.390497998193754e-06, + "loss": 0.0936, + "step": 74600 + }, + { + "epoch": 1.59, + "learning_rate": 9.376275573697049e-06, + "loss": 0.0931, + "step": 74700 + }, + { + "epoch": 1.6, + "learning_rate": 9.362053149200345e-06, + "loss": 0.0835, + "step": 74800 + }, + { + "epoch": 1.6, + "learning_rate": 9.347830724703642e-06, + "loss": 0.0933, + "step": 74900 + }, + { + "epoch": 1.6, + "learning_rate": 9.333608300206937e-06, + "loss": 0.0936, + "step": 75000 + }, + { + "epoch": 1.6, + "learning_rate": 9.319385875710234e-06, + "loss": 0.0892, + "step": 75100 + }, + { + "epoch": 1.6, + "learning_rate": 9.305163451213528e-06, + "loss": 0.0857, + "step": 75200 + }, + { + "epoch": 1.61, + "learning_rate": 9.290941026716825e-06, + "loss": 0.0857, + "step": 75300 + }, + { + "epoch": 1.61, + "learning_rate": 9.276718602220122e-06, + "loss": 0.0806, + "step": 75400 + }, + { + "epoch": 1.61, + "learning_rate": 9.262496177723417e-06, + "loss": 0.0908, + "step": 75500 + }, + { + "epoch": 1.61, + "learning_rate": 9.248273753226713e-06, + "loss": 0.0869, + "step": 75600 + }, + { + "epoch": 1.61, + "learning_rate": 9.23405132873001e-06, + "loss": 0.0747, + "step": 75700 + }, + { + "epoch": 1.62, + "learning_rate": 9.219828904233305e-06, + "loss": 0.0844, + "step": 75800 + }, + { + "epoch": 1.62, + "learning_rate": 9.205606479736602e-06, + "loss": 0.0815, + "step": 75900 + }, + { + "epoch": 1.62, + "learning_rate": 9.191384055239897e-06, + "loss": 0.0819, + "step": 76000 + }, + { + "epoch": 1.62, + "learning_rate": 9.177161630743193e-06, + "loss": 0.0849, + "step": 76100 + }, + { + "epoch": 1.63, + "learning_rate": 9.16293920624649e-06, + "loss": 0.0864, + "step": 76200 + }, + { + "epoch": 1.63, + "learning_rate": 9.148716781749785e-06, + "loss": 0.0922, + "step": 76300 + }, + { + "epoch": 1.63, + "learning_rate": 9.134494357253082e-06, + "loss": 0.0853, + "step": 76400 + }, + { + "epoch": 1.63, + "learning_rate": 9.120271932756378e-06, + "loss": 0.0849, + "step": 76500 + }, + { + "epoch": 1.63, + "learning_rate": 9.106049508259673e-06, + "loss": 0.0857, + "step": 76600 + }, + { + "epoch": 1.64, + "learning_rate": 9.09182708376297e-06, + "loss": 0.0821, + "step": 76700 + }, + { + "epoch": 1.64, + "learning_rate": 9.077604659266267e-06, + "loss": 0.0887, + "step": 76800 + }, + { + "epoch": 1.64, + "learning_rate": 9.063382234769562e-06, + "loss": 0.0864, + "step": 76900 + }, + { + "epoch": 1.64, + "learning_rate": 9.049159810272858e-06, + "loss": 0.0858, + "step": 77000 + }, + { + "epoch": 1.64, + "learning_rate": 9.034937385776153e-06, + "loss": 0.0892, + "step": 77100 + }, + { + "epoch": 1.65, + "learning_rate": 9.02071496127945e-06, + "loss": 0.0804, + "step": 77200 + }, + { + "epoch": 1.65, + "learning_rate": 9.006492536782747e-06, + "loss": 0.0833, + "step": 77300 + }, + { + "epoch": 1.65, + "learning_rate": 8.992270112286042e-06, + "loss": 0.0843, + "step": 77400 + }, + { + "epoch": 1.65, + "learning_rate": 8.978047687789338e-06, + "loss": 0.0869, + "step": 77500 + }, + { + "epoch": 1.66, + "learning_rate": 8.963825263292633e-06, + "loss": 0.0907, + "step": 77600 + }, + { + "epoch": 1.66, + "learning_rate": 8.94960283879593e-06, + "loss": 0.0844, + "step": 77700 + }, + { + "epoch": 1.66, + "learning_rate": 8.935380414299227e-06, + "loss": 0.0795, + "step": 77800 + }, + { + "epoch": 1.66, + "learning_rate": 8.921157989802523e-06, + "loss": 0.0873, + "step": 77900 + }, + { + "epoch": 1.66, + "learning_rate": 8.906935565305818e-06, + "loss": 0.0829, + "step": 78000 + }, + { + "epoch": 1.67, + "learning_rate": 8.892713140809115e-06, + "loss": 0.0814, + "step": 78100 + }, + { + "epoch": 1.67, + "learning_rate": 8.87849071631241e-06, + "loss": 0.0844, + "step": 78200 + }, + { + "epoch": 1.67, + "learning_rate": 8.864268291815707e-06, + "loss": 0.0848, + "step": 78300 + }, + { + "epoch": 1.67, + "learning_rate": 8.850045867319003e-06, + "loss": 0.0911, + "step": 78400 + }, + { + "epoch": 1.67, + "learning_rate": 8.835823442822298e-06, + "loss": 0.0842, + "step": 78500 + }, + { + "epoch": 1.68, + "learning_rate": 8.821601018325595e-06, + "loss": 0.079, + "step": 78600 + }, + { + "epoch": 1.68, + "learning_rate": 8.80737859382889e-06, + "loss": 0.0835, + "step": 78700 + }, + { + "epoch": 1.68, + "learning_rate": 8.793156169332186e-06, + "loss": 0.0871, + "step": 78800 + }, + { + "epoch": 1.68, + "learning_rate": 8.778933744835483e-06, + "loss": 0.0809, + "step": 78900 + }, + { + "epoch": 1.69, + "learning_rate": 8.76471132033878e-06, + "loss": 0.0906, + "step": 79000 + }, + { + "epoch": 1.69, + "learning_rate": 8.750488895842075e-06, + "loss": 0.0836, + "step": 79100 + }, + { + "epoch": 1.69, + "learning_rate": 8.73626647134537e-06, + "loss": 0.0768, + "step": 79200 + }, + { + "epoch": 1.69, + "learning_rate": 8.722044046848666e-06, + "loss": 0.0844, + "step": 79300 + }, + { + "epoch": 1.69, + "learning_rate": 8.707821622351963e-06, + "loss": 0.0848, + "step": 79400 + }, + { + "epoch": 1.7, + "learning_rate": 8.69359919785526e-06, + "loss": 0.0862, + "step": 79500 + }, + { + "epoch": 1.7, + "learning_rate": 8.679376773358555e-06, + "loss": 0.0778, + "step": 79600 + }, + { + "epoch": 1.7, + "learning_rate": 8.665154348861851e-06, + "loss": 0.0813, + "step": 79700 + }, + { + "epoch": 1.7, + "learning_rate": 8.650931924365146e-06, + "loss": 0.0874, + "step": 79800 + }, + { + "epoch": 1.7, + "learning_rate": 8.636709499868443e-06, + "loss": 0.0772, + "step": 79900 + }, + { + "epoch": 1.71, + "learning_rate": 8.62248707537174e-06, + "loss": 0.0801, + "step": 80000 + }, + { + "epoch": 1.71, + "eval_loss": 0.14860820770263672, + "eval_runtime": 34.2128, + "eval_samples_per_second": 146.144, + "eval_steps_per_second": 1.169, + "step": 80000 + }, + { + "epoch": 1.71, + "learning_rate": 8.608264650875036e-06, + "loss": 0.087, + "step": 80100 + }, + { + "epoch": 1.71, + "learning_rate": 8.594042226378331e-06, + "loss": 0.0758, + "step": 80200 + }, + { + "epoch": 1.71, + "learning_rate": 8.579819801881626e-06, + "loss": 0.0855, + "step": 80300 + }, + { + "epoch": 1.72, + "learning_rate": 8.565597377384923e-06, + "loss": 0.0834, + "step": 80400 + }, + { + "epoch": 1.72, + "learning_rate": 8.55137495288822e-06, + "loss": 0.0846, + "step": 80500 + }, + { + "epoch": 1.72, + "learning_rate": 8.537152528391516e-06, + "loss": 0.079, + "step": 80600 + }, + { + "epoch": 1.72, + "learning_rate": 8.522930103894811e-06, + "loss": 0.0838, + "step": 80700 + }, + { + "epoch": 1.72, + "learning_rate": 8.508707679398108e-06, + "loss": 0.0868, + "step": 80800 + }, + { + "epoch": 1.73, + "learning_rate": 8.494485254901403e-06, + "loss": 0.0923, + "step": 80900 + }, + { + "epoch": 1.73, + "learning_rate": 8.4802628304047e-06, + "loss": 0.0851, + "step": 81000 + }, + { + "epoch": 1.73, + "learning_rate": 8.466040405907996e-06, + "loss": 0.0839, + "step": 81100 + }, + { + "epoch": 1.73, + "learning_rate": 8.451817981411293e-06, + "loss": 0.0832, + "step": 81200 + }, + { + "epoch": 1.73, + "learning_rate": 8.437595556914588e-06, + "loss": 0.0794, + "step": 81300 + }, + { + "epoch": 1.74, + "learning_rate": 8.423373132417883e-06, + "loss": 0.0772, + "step": 81400 + }, + { + "epoch": 1.74, + "learning_rate": 8.40915070792118e-06, + "loss": 0.0801, + "step": 81500 + }, + { + "epoch": 1.74, + "learning_rate": 8.394928283424476e-06, + "loss": 0.0806, + "step": 81600 + }, + { + "epoch": 1.74, + "learning_rate": 8.380705858927773e-06, + "loss": 0.0799, + "step": 81700 + }, + { + "epoch": 1.75, + "learning_rate": 8.366483434431068e-06, + "loss": 0.0823, + "step": 81800 + }, + { + "epoch": 1.75, + "learning_rate": 8.352261009934365e-06, + "loss": 0.0781, + "step": 81900 + }, + { + "epoch": 1.75, + "learning_rate": 8.33803858543766e-06, + "loss": 0.0872, + "step": 82000 + }, + { + "epoch": 1.75, + "learning_rate": 8.323816160940956e-06, + "loss": 0.0776, + "step": 82100 + }, + { + "epoch": 1.75, + "learning_rate": 8.309593736444253e-06, + "loss": 0.0801, + "step": 82200 + }, + { + "epoch": 1.76, + "learning_rate": 8.29537131194755e-06, + "loss": 0.0869, + "step": 82300 + }, + { + "epoch": 1.76, + "learning_rate": 8.281148887450845e-06, + "loss": 0.0837, + "step": 82400 + }, + { + "epoch": 1.76, + "learning_rate": 8.26692646295414e-06, + "loss": 0.0871, + "step": 82500 + }, + { + "epoch": 1.76, + "learning_rate": 8.252704038457436e-06, + "loss": 0.0787, + "step": 82600 + }, + { + "epoch": 1.76, + "learning_rate": 8.238481613960733e-06, + "loss": 0.0811, + "step": 82700 + }, + { + "epoch": 1.77, + "learning_rate": 8.22425918946403e-06, + "loss": 0.0779, + "step": 82800 + }, + { + "epoch": 1.77, + "learning_rate": 8.210036764967324e-06, + "loss": 0.0781, + "step": 82900 + }, + { + "epoch": 1.77, + "learning_rate": 8.195814340470621e-06, + "loss": 0.0797, + "step": 83000 + }, + { + "epoch": 1.77, + "learning_rate": 8.181591915973916e-06, + "loss": 0.0873, + "step": 83100 + }, + { + "epoch": 1.77, + "learning_rate": 8.167369491477213e-06, + "loss": 0.0769, + "step": 83200 + }, + { + "epoch": 1.78, + "learning_rate": 8.15314706698051e-06, + "loss": 0.0859, + "step": 83300 + }, + { + "epoch": 1.78, + "learning_rate": 8.138924642483806e-06, + "loss": 0.0745, + "step": 83400 + }, + { + "epoch": 1.78, + "learning_rate": 8.124702217987101e-06, + "loss": 0.0789, + "step": 83500 + }, + { + "epoch": 1.78, + "learning_rate": 8.110479793490396e-06, + "loss": 0.091, + "step": 83600 + }, + { + "epoch": 1.79, + "learning_rate": 8.096257368993693e-06, + "loss": 0.0758, + "step": 83700 + }, + { + "epoch": 1.79, + "learning_rate": 8.08203494449699e-06, + "loss": 0.0815, + "step": 83800 + }, + { + "epoch": 1.79, + "learning_rate": 8.067812520000286e-06, + "loss": 0.0852, + "step": 83900 + }, + { + "epoch": 1.79, + "learning_rate": 8.053590095503581e-06, + "loss": 0.0742, + "step": 84000 + }, + { + "epoch": 1.79, + "learning_rate": 8.039367671006878e-06, + "loss": 0.0806, + "step": 84100 + }, + { + "epoch": 1.8, + "learning_rate": 8.025145246510173e-06, + "loss": 0.0836, + "step": 84200 + }, + { + "epoch": 1.8, + "learning_rate": 8.01092282201347e-06, + "loss": 0.0771, + "step": 84300 + }, + { + "epoch": 1.8, + "learning_rate": 7.996700397516766e-06, + "loss": 0.0745, + "step": 84400 + }, + { + "epoch": 1.8, + "learning_rate": 7.982477973020063e-06, + "loss": 0.0795, + "step": 84500 + }, + { + "epoch": 1.8, + "learning_rate": 7.968255548523358e-06, + "loss": 0.0784, + "step": 84600 + }, + { + "epoch": 1.81, + "learning_rate": 7.954033124026653e-06, + "loss": 0.0762, + "step": 84700 + }, + { + "epoch": 1.81, + "learning_rate": 7.93981069952995e-06, + "loss": 0.0822, + "step": 84800 + }, + { + "epoch": 1.81, + "learning_rate": 7.925588275033246e-06, + "loss": 0.0775, + "step": 84900 + }, + { + "epoch": 1.81, + "learning_rate": 7.911365850536543e-06, + "loss": 0.0784, + "step": 85000 + }, + { + "epoch": 1.82, + "learning_rate": 7.897143426039838e-06, + "loss": 0.0823, + "step": 85100 + }, + { + "epoch": 1.82, + "learning_rate": 7.882921001543134e-06, + "loss": 0.077, + "step": 85200 + }, + { + "epoch": 1.82, + "learning_rate": 7.86869857704643e-06, + "loss": 0.0737, + "step": 85300 + }, + { + "epoch": 1.82, + "learning_rate": 7.854476152549726e-06, + "loss": 0.0769, + "step": 85400 + }, + { + "epoch": 1.82, + "learning_rate": 7.840253728053023e-06, + "loss": 0.0783, + "step": 85500 + }, + { + "epoch": 1.83, + "learning_rate": 7.826031303556318e-06, + "loss": 0.0817, + "step": 85600 + }, + { + "epoch": 1.83, + "learning_rate": 7.811808879059614e-06, + "loss": 0.0759, + "step": 85700 + }, + { + "epoch": 1.83, + "learning_rate": 7.79758645456291e-06, + "loss": 0.0772, + "step": 85800 + }, + { + "epoch": 1.83, + "learning_rate": 7.783364030066206e-06, + "loss": 0.0792, + "step": 85900 + }, + { + "epoch": 1.83, + "learning_rate": 7.769141605569503e-06, + "loss": 0.0738, + "step": 86000 + }, + { + "epoch": 1.84, + "learning_rate": 7.7549191810728e-06, + "loss": 0.0694, + "step": 86100 + }, + { + "epoch": 1.84, + "learning_rate": 7.740696756576094e-06, + "loss": 0.0825, + "step": 86200 + }, + { + "epoch": 1.84, + "learning_rate": 7.72647433207939e-06, + "loss": 0.0809, + "step": 86300 + }, + { + "epoch": 1.84, + "learning_rate": 7.712251907582686e-06, + "loss": 0.0751, + "step": 86400 + }, + { + "epoch": 1.85, + "learning_rate": 7.698029483085982e-06, + "loss": 0.0775, + "step": 86500 + }, + { + "epoch": 1.85, + "learning_rate": 7.683807058589279e-06, + "loss": 0.0771, + "step": 86600 + }, + { + "epoch": 1.85, + "learning_rate": 7.669584634092574e-06, + "loss": 0.0777, + "step": 86700 + }, + { + "epoch": 1.85, + "learning_rate": 7.65536220959587e-06, + "loss": 0.0773, + "step": 86800 + }, + { + "epoch": 1.85, + "learning_rate": 7.641139785099166e-06, + "loss": 0.0749, + "step": 86900 + }, + { + "epoch": 1.86, + "learning_rate": 7.626917360602462e-06, + "loss": 0.0773, + "step": 87000 + }, + { + "epoch": 1.86, + "learning_rate": 7.612694936105759e-06, + "loss": 0.0859, + "step": 87100 + }, + { + "epoch": 1.86, + "learning_rate": 7.598472511609054e-06, + "loss": 0.0798, + "step": 87200 + }, + { + "epoch": 1.86, + "learning_rate": 7.584250087112351e-06, + "loss": 0.0755, + "step": 87300 + }, + { + "epoch": 1.86, + "learning_rate": 7.5700276626156465e-06, + "loss": 0.0808, + "step": 87400 + }, + { + "epoch": 1.87, + "learning_rate": 7.555805238118943e-06, + "loss": 0.0725, + "step": 87500 + }, + { + "epoch": 1.87, + "learning_rate": 7.541582813622239e-06, + "loss": 0.0812, + "step": 87600 + }, + { + "epoch": 1.87, + "learning_rate": 7.527360389125536e-06, + "loss": 0.0794, + "step": 87700 + }, + { + "epoch": 1.87, + "learning_rate": 7.513137964628831e-06, + "loss": 0.0692, + "step": 87800 + }, + { + "epoch": 1.88, + "learning_rate": 7.4989155401321265e-06, + "loss": 0.0773, + "step": 87900 + }, + { + "epoch": 1.88, + "learning_rate": 7.484693115635423e-06, + "loss": 0.0695, + "step": 88000 + }, + { + "epoch": 1.88, + "learning_rate": 7.470470691138719e-06, + "loss": 0.0853, + "step": 88100 + }, + { + "epoch": 1.88, + "learning_rate": 7.456248266642016e-06, + "loss": 0.0759, + "step": 88200 + }, + { + "epoch": 1.88, + "learning_rate": 7.442025842145311e-06, + "loss": 0.0731, + "step": 88300 + }, + { + "epoch": 1.89, + "learning_rate": 7.427803417648607e-06, + "loss": 0.0776, + "step": 88400 + }, + { + "epoch": 1.89, + "learning_rate": 7.413580993151903e-06, + "loss": 0.0799, + "step": 88500 + }, + { + "epoch": 1.89, + "learning_rate": 7.3993585686552e-06, + "loss": 0.083, + "step": 88600 + }, + { + "epoch": 1.89, + "learning_rate": 7.385136144158496e-06, + "loss": 0.0833, + "step": 88700 + }, + { + "epoch": 1.89, + "learning_rate": 7.3709137196617906e-06, + "loss": 0.072, + "step": 88800 + }, + { + "epoch": 1.9, + "learning_rate": 7.356691295165087e-06, + "loss": 0.0755, + "step": 88900 + }, + { + "epoch": 1.9, + "learning_rate": 7.342468870668383e-06, + "loss": 0.0788, + "step": 89000 + }, + { + "epoch": 1.9, + "learning_rate": 7.32824644617168e-06, + "loss": 0.0782, + "step": 89100 + }, + { + "epoch": 1.9, + "learning_rate": 7.3140240216749755e-06, + "loss": 0.0697, + "step": 89200 + }, + { + "epoch": 1.91, + "learning_rate": 7.299801597178272e-06, + "loss": 0.0764, + "step": 89300 + }, + { + "epoch": 1.91, + "learning_rate": 7.285579172681567e-06, + "loss": 0.0711, + "step": 89400 + }, + { + "epoch": 1.91, + "learning_rate": 7.271356748184864e-06, + "loss": 0.0691, + "step": 89500 + }, + { + "epoch": 1.91, + "learning_rate": 7.25713432368816e-06, + "loss": 0.0708, + "step": 89600 + }, + { + "epoch": 1.91, + "learning_rate": 7.242911899191456e-06, + "loss": 0.076, + "step": 89700 + }, + { + "epoch": 1.92, + "learning_rate": 7.228689474694752e-06, + "loss": 0.0721, + "step": 89800 + }, + { + "epoch": 1.92, + "learning_rate": 7.214467050198047e-06, + "loss": 0.0758, + "step": 89900 + }, + { + "epoch": 1.92, + "learning_rate": 7.200244625701344e-06, + "loss": 0.0762, + "step": 90000 + }, + { + "epoch": 1.92, + "eval_loss": 0.1308322250843048, + "eval_runtime": 34.2018, + "eval_samples_per_second": 146.191, + "eval_steps_per_second": 1.17, + "step": 90000 + }, + { + "epoch": 1.92, + "learning_rate": 7.18602220120464e-06, + "loss": 0.0774, + "step": 90100 + }, + { + "epoch": 1.92, + "learning_rate": 7.171799776707936e-06, + "loss": 0.0753, + "step": 90200 + }, + { + "epoch": 1.93, + "learning_rate": 7.157577352211232e-06, + "loss": 0.0731, + "step": 90300 + }, + { + "epoch": 1.93, + "learning_rate": 7.143354927714528e-06, + "loss": 0.0793, + "step": 90400 + }, + { + "epoch": 1.93, + "learning_rate": 7.129132503217824e-06, + "loss": 0.0665, + "step": 90500 + }, + { + "epoch": 1.93, + "learning_rate": 7.11491007872112e-06, + "loss": 0.0732, + "step": 90600 + }, + { + "epoch": 1.93, + "learning_rate": 7.100687654224416e-06, + "loss": 0.0768, + "step": 90700 + }, + { + "epoch": 1.94, + "learning_rate": 7.086465229727713e-06, + "loss": 0.0716, + "step": 90800 + }, + { + "epoch": 1.94, + "learning_rate": 7.072242805231009e-06, + "loss": 0.0715, + "step": 90900 + }, + { + "epoch": 1.94, + "learning_rate": 7.058020380734304e-06, + "loss": 0.0748, + "step": 91000 + }, + { + "epoch": 1.94, + "learning_rate": 7.0437979562376e-06, + "loss": 0.0672, + "step": 91100 + }, + { + "epoch": 1.95, + "learning_rate": 7.029575531740896e-06, + "loss": 0.0775, + "step": 91200 + }, + { + "epoch": 1.95, + "learning_rate": 7.015353107244193e-06, + "loss": 0.0696, + "step": 91300 + }, + { + "epoch": 1.95, + "learning_rate": 7.001130682747489e-06, + "loss": 0.0688, + "step": 91400 + }, + { + "epoch": 1.95, + "learning_rate": 6.9869082582507845e-06, + "loss": 0.067, + "step": 91500 + }, + { + "epoch": 1.95, + "learning_rate": 6.97268583375408e-06, + "loss": 0.079, + "step": 91600 + }, + { + "epoch": 1.96, + "learning_rate": 6.958463409257377e-06, + "loss": 0.0729, + "step": 91700 + }, + { + "epoch": 1.96, + "learning_rate": 6.944240984760673e-06, + "loss": 0.0701, + "step": 91800 + }, + { + "epoch": 1.96, + "learning_rate": 6.9300185602639695e-06, + "loss": 0.0699, + "step": 91900 + }, + { + "epoch": 1.96, + "learning_rate": 6.9157961357672645e-06, + "loss": 0.0743, + "step": 92000 + }, + { + "epoch": 1.96, + "learning_rate": 6.90157371127056e-06, + "loss": 0.0657, + "step": 92100 + }, + { + "epoch": 1.97, + "learning_rate": 6.887351286773857e-06, + "loss": 0.071, + "step": 92200 + }, + { + "epoch": 1.97, + "learning_rate": 6.873128862277153e-06, + "loss": 0.0685, + "step": 92300 + }, + { + "epoch": 1.97, + "learning_rate": 6.8589064377804494e-06, + "loss": 0.079, + "step": 92400 + }, + { + "epoch": 1.97, + "learning_rate": 6.844684013283745e-06, + "loss": 0.0709, + "step": 92500 + }, + { + "epoch": 1.98, + "learning_rate": 6.830461588787041e-06, + "loss": 0.0693, + "step": 92600 + }, + { + "epoch": 1.98, + "learning_rate": 6.816239164290337e-06, + "loss": 0.0703, + "step": 92700 + }, + { + "epoch": 1.98, + "learning_rate": 6.8020167397936336e-06, + "loss": 0.0698, + "step": 92800 + }, + { + "epoch": 1.98, + "learning_rate": 6.787794315296929e-06, + "loss": 0.0727, + "step": 92900 + }, + { + "epoch": 1.98, + "learning_rate": 6.773571890800226e-06, + "loss": 0.0641, + "step": 93000 + }, + { + "epoch": 1.99, + "learning_rate": 6.759349466303521e-06, + "loss": 0.0765, + "step": 93100 + }, + { + "epoch": 1.99, + "learning_rate": 6.745127041806817e-06, + "loss": 0.0653, + "step": 93200 + }, + { + "epoch": 1.99, + "learning_rate": 6.7309046173101135e-06, + "loss": 0.0687, + "step": 93300 + }, + { + "epoch": 1.99, + "learning_rate": 6.716682192813409e-06, + "loss": 0.0759, + "step": 93400 + }, + { + "epoch": 1.99, + "learning_rate": 6.702459768316706e-06, + "loss": 0.0688, + "step": 93500 + }, + { + "epoch": 2.0, + "learning_rate": 6.688237343820001e-06, + "loss": 0.0673, + "step": 93600 + }, + { + "epoch": 2.0, + "learning_rate": 6.674014919323298e-06, + "loss": 0.0666, + "step": 93700 + }, + { + "epoch": 2.0, + "learning_rate": 6.6597924948265935e-06, + "loss": 0.0588, + "step": 93800 + }, + { + "epoch": 2.0, + "learning_rate": 6.64557007032989e-06, + "loss": 0.0303, + "step": 93900 + }, + { + "epoch": 2.01, + "learning_rate": 6.631347645833186e-06, + "loss": 0.0387, + "step": 94000 + }, + { + "epoch": 2.01, + "learning_rate": 6.617125221336483e-06, + "loss": 0.0344, + "step": 94100 + }, + { + "epoch": 2.01, + "learning_rate": 6.602902796839778e-06, + "loss": 0.0354, + "step": 94200 + }, + { + "epoch": 2.01, + "learning_rate": 6.588680372343073e-06, + "loss": 0.0306, + "step": 94300 + }, + { + "epoch": 2.01, + "learning_rate": 6.57445794784637e-06, + "loss": 0.0331, + "step": 94400 + }, + { + "epoch": 2.02, + "learning_rate": 6.560235523349666e-06, + "loss": 0.0325, + "step": 94500 + }, + { + "epoch": 2.02, + "learning_rate": 6.5460130988529626e-06, + "loss": 0.0319, + "step": 94600 + }, + { + "epoch": 2.02, + "learning_rate": 6.5317906743562575e-06, + "loss": 0.0337, + "step": 94700 + }, + { + "epoch": 2.02, + "learning_rate": 6.517568249859554e-06, + "loss": 0.0338, + "step": 94800 + }, + { + "epoch": 2.02, + "learning_rate": 6.50334582536285e-06, + "loss": 0.0362, + "step": 94900 + }, + { + "epoch": 2.03, + "learning_rate": 6.489123400866147e-06, + "loss": 0.0363, + "step": 95000 + }, + { + "epoch": 2.03, + "learning_rate": 6.4749009763694425e-06, + "loss": 0.0354, + "step": 95100 + }, + { + "epoch": 2.03, + "learning_rate": 6.4606785518727375e-06, + "loss": 0.0275, + "step": 95200 + }, + { + "epoch": 2.03, + "learning_rate": 6.446456127376034e-06, + "loss": 0.0328, + "step": 95300 + }, + { + "epoch": 2.04, + "learning_rate": 6.43223370287933e-06, + "loss": 0.0353, + "step": 95400 + }, + { + "epoch": 2.04, + "learning_rate": 6.418011278382627e-06, + "loss": 0.0342, + "step": 95500 + }, + { + "epoch": 2.04, + "learning_rate": 6.4037888538859225e-06, + "loss": 0.0341, + "step": 95600 + }, + { + "epoch": 2.04, + "learning_rate": 6.389566429389219e-06, + "loss": 0.0317, + "step": 95700 + }, + { + "epoch": 2.04, + "learning_rate": 6.375344004892514e-06, + "loss": 0.033, + "step": 95800 + }, + { + "epoch": 2.05, + "learning_rate": 6.361121580395811e-06, + "loss": 0.0334, + "step": 95900 + }, + { + "epoch": 2.05, + "learning_rate": 6.346899155899107e-06, + "loss": 0.0361, + "step": 96000 + }, + { + "epoch": 2.05, + "learning_rate": 6.332676731402403e-06, + "loss": 0.0319, + "step": 96100 + }, + { + "epoch": 2.05, + "learning_rate": 6.318454306905699e-06, + "loss": 0.0347, + "step": 96200 + }, + { + "epoch": 2.05, + "learning_rate": 6.304231882408994e-06, + "loss": 0.0333, + "step": 96300 + }, + { + "epoch": 2.06, + "learning_rate": 6.290009457912291e-06, + "loss": 0.037, + "step": 96400 + }, + { + "epoch": 2.06, + "learning_rate": 6.2757870334155865e-06, + "loss": 0.0343, + "step": 96500 + }, + { + "epoch": 2.06, + "learning_rate": 6.261564608918883e-06, + "loss": 0.0334, + "step": 96600 + }, + { + "epoch": 2.06, + "learning_rate": 6.247342184422179e-06, + "loss": 0.0312, + "step": 96700 + }, + { + "epoch": 2.07, + "learning_rate": 6.233119759925475e-06, + "loss": 0.0315, + "step": 96800 + }, + { + "epoch": 2.07, + "learning_rate": 6.218897335428771e-06, + "loss": 0.0268, + "step": 96900 + }, + { + "epoch": 2.07, + "learning_rate": 6.204674910932067e-06, + "loss": 0.0263, + "step": 97000 + }, + { + "epoch": 2.07, + "learning_rate": 6.190452486435363e-06, + "loss": 0.0303, + "step": 97100 + }, + { + "epoch": 2.07, + "learning_rate": 6.17623006193866e-06, + "loss": 0.0345, + "step": 97200 + }, + { + "epoch": 2.08, + "learning_rate": 6.162007637441956e-06, + "loss": 0.0358, + "step": 97300 + }, + { + "epoch": 2.08, + "learning_rate": 6.147785212945251e-06, + "loss": 0.0369, + "step": 97400 + }, + { + "epoch": 2.08, + "learning_rate": 6.133562788448547e-06, + "loss": 0.0392, + "step": 97500 + }, + { + "epoch": 2.08, + "learning_rate": 6.119340363951843e-06, + "loss": 0.0319, + "step": 97600 + }, + { + "epoch": 2.08, + "learning_rate": 6.10511793945514e-06, + "loss": 0.0372, + "step": 97700 + }, + { + "epoch": 2.09, + "learning_rate": 6.090895514958436e-06, + "loss": 0.0317, + "step": 97800 + }, + { + "epoch": 2.09, + "learning_rate": 6.076673090461731e-06, + "loss": 0.0321, + "step": 97900 + }, + { + "epoch": 2.09, + "learning_rate": 6.062450665965027e-06, + "loss": 0.0369, + "step": 98000 + }, + { + "epoch": 2.09, + "learning_rate": 6.048228241468324e-06, + "loss": 0.0332, + "step": 98100 + }, + { + "epoch": 2.09, + "learning_rate": 6.03400581697162e-06, + "loss": 0.0306, + "step": 98200 + }, + { + "epoch": 2.1, + "learning_rate": 6.019783392474916e-06, + "loss": 0.0337, + "step": 98300 + }, + { + "epoch": 2.1, + "learning_rate": 6.005560967978211e-06, + "loss": 0.0332, + "step": 98400 + }, + { + "epoch": 2.1, + "learning_rate": 5.991338543481507e-06, + "loss": 0.0337, + "step": 98500 + }, + { + "epoch": 2.1, + "learning_rate": 5.977116118984804e-06, + "loss": 0.0296, + "step": 98600 + }, + { + "epoch": 2.11, + "learning_rate": 5.9628936944881e-06, + "loss": 0.0341, + "step": 98700 + }, + { + "epoch": 2.11, + "learning_rate": 5.948671269991396e-06, + "loss": 0.0287, + "step": 98800 + }, + { + "epoch": 2.11, + "learning_rate": 5.934448845494692e-06, + "loss": 0.0313, + "step": 98900 + }, + { + "epoch": 2.11, + "learning_rate": 5.920226420997988e-06, + "loss": 0.0335, + "step": 99000 + }, + { + "epoch": 2.11, + "learning_rate": 5.906003996501284e-06, + "loss": 0.0327, + "step": 99100 + }, + { + "epoch": 2.12, + "learning_rate": 5.8917815720045805e-06, + "loss": 0.0363, + "step": 99200 + }, + { + "epoch": 2.12, + "learning_rate": 5.877559147507876e-06, + "loss": 0.0339, + "step": 99300 + }, + { + "epoch": 2.12, + "learning_rate": 5.863336723011173e-06, + "loss": 0.0282, + "step": 99400 + }, + { + "epoch": 2.12, + "learning_rate": 5.849114298514468e-06, + "loss": 0.0339, + "step": 99500 + }, + { + "epoch": 2.12, + "learning_rate": 5.834891874017764e-06, + "loss": 0.0297, + "step": 99600 + }, + { + "epoch": 2.13, + "learning_rate": 5.82066944952106e-06, + "loss": 0.0287, + "step": 99700 + }, + { + "epoch": 2.13, + "learning_rate": 5.806447025024356e-06, + "loss": 0.0321, + "step": 99800 + }, + { + "epoch": 2.13, + "learning_rate": 5.792224600527653e-06, + "loss": 0.0304, + "step": 99900 + }, + { + "epoch": 2.13, + "learning_rate": 5.778002176030948e-06, + "loss": 0.0351, + "step": 100000 + }, + { + "epoch": 2.13, + "eval_loss": 0.13596704602241516, + "eval_runtime": 34.4302, + "eval_samples_per_second": 145.221, + "eval_steps_per_second": 1.162, + "step": 100000 + }, + { + "epoch": 2.14, + "learning_rate": 5.7637797515342445e-06, + "loss": 0.0314, + "step": 100100 + }, + { + "epoch": 2.14, + "learning_rate": 5.74955732703754e-06, + "loss": 0.033, + "step": 100200 + }, + { + "epoch": 2.14, + "learning_rate": 5.735334902540837e-06, + "loss": 0.0358, + "step": 100300 + }, + { + "epoch": 2.14, + "learning_rate": 5.721112478044133e-06, + "loss": 0.0345, + "step": 100400 + }, + { + "epoch": 2.14, + "learning_rate": 5.7068900535474295e-06, + "loss": 0.0353, + "step": 100500 + }, + { + "epoch": 2.15, + "learning_rate": 5.6926676290507245e-06, + "loss": 0.0347, + "step": 100600 + }, + { + "epoch": 2.15, + "learning_rate": 5.67844520455402e-06, + "loss": 0.0301, + "step": 100700 + }, + { + "epoch": 2.15, + "learning_rate": 5.664222780057317e-06, + "loss": 0.0337, + "step": 100800 + }, + { + "epoch": 2.15, + "learning_rate": 5.650000355560613e-06, + "loss": 0.0317, + "step": 100900 + }, + { + "epoch": 2.15, + "learning_rate": 5.6357779310639095e-06, + "loss": 0.0309, + "step": 101000 + }, + { + "epoch": 2.16, + "learning_rate": 5.6215555065672044e-06, + "loss": 0.0307, + "step": 101100 + }, + { + "epoch": 2.16, + "learning_rate": 5.607333082070501e-06, + "loss": 0.0321, + "step": 101200 + }, + { + "epoch": 2.16, + "learning_rate": 5.593110657573797e-06, + "loss": 0.0326, + "step": 101300 + }, + { + "epoch": 2.16, + "learning_rate": 5.578888233077094e-06, + "loss": 0.0286, + "step": 101400 + }, + { + "epoch": 2.17, + "learning_rate": 5.564665808580389e-06, + "loss": 0.0345, + "step": 101500 + }, + { + "epoch": 2.17, + "learning_rate": 5.550443384083684e-06, + "loss": 0.0285, + "step": 101600 + }, + { + "epoch": 2.17, + "learning_rate": 5.536220959586981e-06, + "loss": 0.0332, + "step": 101700 + }, + { + "epoch": 2.17, + "learning_rate": 5.521998535090277e-06, + "loss": 0.0293, + "step": 101800 + }, + { + "epoch": 2.17, + "learning_rate": 5.5077761105935736e-06, + "loss": 0.0347, + "step": 101900 + }, + { + "epoch": 2.18, + "learning_rate": 5.493553686096869e-06, + "loss": 0.0353, + "step": 102000 + }, + { + "epoch": 2.18, + "learning_rate": 5.479331261600166e-06, + "loss": 0.0297, + "step": 102100 + }, + { + "epoch": 2.18, + "learning_rate": 5.465108837103461e-06, + "loss": 0.0377, + "step": 102200 + }, + { + "epoch": 2.18, + "learning_rate": 5.450886412606758e-06, + "loss": 0.0387, + "step": 102300 + }, + { + "epoch": 2.18, + "learning_rate": 5.4366639881100535e-06, + "loss": 0.0255, + "step": 102400 + }, + { + "epoch": 2.19, + "learning_rate": 5.42244156361335e-06, + "loss": 0.027, + "step": 102500 + }, + { + "epoch": 2.19, + "learning_rate": 5.408219139116646e-06, + "loss": 0.0325, + "step": 102600 + }, + { + "epoch": 2.19, + "learning_rate": 5.393996714619941e-06, + "loss": 0.0302, + "step": 102700 + }, + { + "epoch": 2.19, + "learning_rate": 5.379774290123238e-06, + "loss": 0.0358, + "step": 102800 + }, + { + "epoch": 2.2, + "learning_rate": 5.3655518656265335e-06, + "loss": 0.031, + "step": 102900 + }, + { + "epoch": 2.2, + "learning_rate": 5.35132944112983e-06, + "loss": 0.0322, + "step": 103000 + }, + { + "epoch": 2.2, + "learning_rate": 5.337107016633126e-06, + "loss": 0.0321, + "step": 103100 + }, + { + "epoch": 2.2, + "learning_rate": 5.322884592136422e-06, + "loss": 0.0327, + "step": 103200 + }, + { + "epoch": 2.2, + "learning_rate": 5.308662167639718e-06, + "loss": 0.0348, + "step": 103300 + }, + { + "epoch": 2.21, + "learning_rate": 5.294439743143014e-06, + "loss": 0.0339, + "step": 103400 + }, + { + "epoch": 2.21, + "learning_rate": 5.28021731864631e-06, + "loss": 0.0319, + "step": 103500 + }, + { + "epoch": 2.21, + "learning_rate": 5.265994894149607e-06, + "loss": 0.0315, + "step": 103600 + }, + { + "epoch": 2.21, + "learning_rate": 5.2517724696529026e-06, + "loss": 0.0319, + "step": 103700 + }, + { + "epoch": 2.21, + "learning_rate": 5.2375500451561975e-06, + "loss": 0.0305, + "step": 103800 + }, + { + "epoch": 2.22, + "learning_rate": 5.223327620659494e-06, + "loss": 0.0294, + "step": 103900 + }, + { + "epoch": 2.22, + "learning_rate": 5.20910519616279e-06, + "loss": 0.033, + "step": 104000 + }, + { + "epoch": 2.22, + "learning_rate": 5.194882771666087e-06, + "loss": 0.0303, + "step": 104100 + }, + { + "epoch": 2.22, + "learning_rate": 5.1806603471693825e-06, + "loss": 0.0353, + "step": 104200 + }, + { + "epoch": 2.23, + "learning_rate": 5.166437922672678e-06, + "loss": 0.0307, + "step": 104300 + }, + { + "epoch": 2.23, + "learning_rate": 5.152215498175974e-06, + "loss": 0.0321, + "step": 104400 + }, + { + "epoch": 2.23, + "learning_rate": 5.137993073679271e-06, + "loss": 0.0328, + "step": 104500 + }, + { + "epoch": 2.23, + "learning_rate": 5.123770649182567e-06, + "loss": 0.0357, + "step": 104600 + }, + { + "epoch": 2.23, + "learning_rate": 5.109548224685863e-06, + "loss": 0.0302, + "step": 104700 + }, + { + "epoch": 2.24, + "learning_rate": 5.095325800189158e-06, + "loss": 0.0349, + "step": 104800 + }, + { + "epoch": 2.24, + "learning_rate": 5.081103375692454e-06, + "loss": 0.0295, + "step": 104900 + }, + { + "epoch": 2.24, + "learning_rate": 5.066880951195751e-06, + "loss": 0.0276, + "step": 105000 + }, + { + "epoch": 2.24, + "learning_rate": 5.052658526699047e-06, + "loss": 0.0328, + "step": 105100 + }, + { + "epoch": 2.24, + "learning_rate": 5.038436102202343e-06, + "loss": 0.0372, + "step": 105200 + }, + { + "epoch": 2.25, + "learning_rate": 5.024213677705639e-06, + "loss": 0.035, + "step": 105300 + }, + { + "epoch": 2.25, + "learning_rate": 5.009991253208935e-06, + "loss": 0.0294, + "step": 105400 + }, + { + "epoch": 2.25, + "learning_rate": 4.995768828712231e-06, + "loss": 0.0345, + "step": 105500 + }, + { + "epoch": 2.25, + "learning_rate": 4.981546404215527e-06, + "loss": 0.0373, + "step": 105600 + }, + { + "epoch": 2.25, + "learning_rate": 4.967323979718823e-06, + "loss": 0.0305, + "step": 105700 + }, + { + "epoch": 2.26, + "learning_rate": 4.953101555222119e-06, + "loss": 0.0323, + "step": 105800 + }, + { + "epoch": 2.26, + "learning_rate": 4.938879130725415e-06, + "loss": 0.0275, + "step": 105900 + }, + { + "epoch": 2.26, + "learning_rate": 4.924656706228711e-06, + "loss": 0.0323, + "step": 106000 + }, + { + "epoch": 2.26, + "learning_rate": 4.910434281732007e-06, + "loss": 0.0342, + "step": 106100 + }, + { + "epoch": 2.27, + "learning_rate": 4.896211857235303e-06, + "loss": 0.0315, + "step": 106200 + }, + { + "epoch": 2.27, + "learning_rate": 4.881989432738599e-06, + "loss": 0.0286, + "step": 106300 + }, + { + "epoch": 2.27, + "learning_rate": 4.867767008241896e-06, + "loss": 0.0296, + "step": 106400 + }, + { + "epoch": 2.27, + "learning_rate": 4.8535445837451915e-06, + "loss": 0.0416, + "step": 106500 + }, + { + "epoch": 2.27, + "learning_rate": 4.839322159248487e-06, + "loss": 0.0329, + "step": 106600 + }, + { + "epoch": 2.28, + "learning_rate": 4.825099734751784e-06, + "loss": 0.0301, + "step": 106700 + }, + { + "epoch": 2.28, + "learning_rate": 4.810877310255079e-06, + "loss": 0.0327, + "step": 106800 + }, + { + "epoch": 2.28, + "learning_rate": 4.796654885758376e-06, + "loss": 0.032, + "step": 106900 + }, + { + "epoch": 2.28, + "learning_rate": 4.782432461261671e-06, + "loss": 0.0292, + "step": 107000 + }, + { + "epoch": 2.28, + "learning_rate": 4.768210036764967e-06, + "loss": 0.0327, + "step": 107100 + }, + { + "epoch": 2.29, + "learning_rate": 4.753987612268264e-06, + "loss": 0.0288, + "step": 107200 + }, + { + "epoch": 2.29, + "learning_rate": 4.73976518777156e-06, + "loss": 0.0323, + "step": 107300 + }, + { + "epoch": 2.29, + "learning_rate": 4.7255427632748555e-06, + "loss": 0.0328, + "step": 107400 + }, + { + "epoch": 2.29, + "learning_rate": 4.711320338778152e-06, + "loss": 0.032, + "step": 107500 + }, + { + "epoch": 2.3, + "learning_rate": 4.697097914281448e-06, + "loss": 0.0321, + "step": 107600 + }, + { + "epoch": 2.3, + "learning_rate": 4.682875489784744e-06, + "loss": 0.0355, + "step": 107700 + }, + { + "epoch": 2.3, + "learning_rate": 4.6686530652880405e-06, + "loss": 0.0314, + "step": 107800 + }, + { + "epoch": 2.3, + "learning_rate": 4.6544306407913355e-06, + "loss": 0.0329, + "step": 107900 + }, + { + "epoch": 2.3, + "learning_rate": 4.640208216294632e-06, + "loss": 0.0322, + "step": 108000 + }, + { + "epoch": 2.31, + "learning_rate": 4.625985791797928e-06, + "loss": 0.0306, + "step": 108100 + }, + { + "epoch": 2.31, + "learning_rate": 4.611763367301224e-06, + "loss": 0.0362, + "step": 108200 + }, + { + "epoch": 2.31, + "learning_rate": 4.5975409428045205e-06, + "loss": 0.0304, + "step": 108300 + }, + { + "epoch": 2.31, + "learning_rate": 4.583318518307816e-06, + "loss": 0.028, + "step": 108400 + }, + { + "epoch": 2.31, + "learning_rate": 4.569096093811112e-06, + "loss": 0.0309, + "step": 108500 + }, + { + "epoch": 2.32, + "learning_rate": 4.554873669314409e-06, + "loss": 0.0294, + "step": 108600 + }, + { + "epoch": 2.32, + "learning_rate": 4.540651244817705e-06, + "loss": 0.0337, + "step": 108700 + }, + { + "epoch": 2.32, + "learning_rate": 4.526428820321e-06, + "loss": 0.0268, + "step": 108800 + }, + { + "epoch": 2.32, + "learning_rate": 4.512206395824297e-06, + "loss": 0.0291, + "step": 108900 + }, + { + "epoch": 2.33, + "learning_rate": 4.497983971327592e-06, + "loss": 0.0335, + "step": 109000 + }, + { + "epoch": 2.33, + "learning_rate": 4.483761546830889e-06, + "loss": 0.0321, + "step": 109100 + }, + { + "epoch": 2.33, + "learning_rate": 4.4695391223341845e-06, + "loss": 0.0277, + "step": 109200 + }, + { + "epoch": 2.33, + "learning_rate": 4.45531669783748e-06, + "loss": 0.0316, + "step": 109300 + }, + { + "epoch": 2.33, + "learning_rate": 4.441094273340777e-06, + "loss": 0.033, + "step": 109400 + }, + { + "epoch": 2.34, + "learning_rate": 4.426871848844073e-06, + "loss": 0.0337, + "step": 109500 + }, + { + "epoch": 2.34, + "learning_rate": 4.412649424347369e-06, + "loss": 0.0315, + "step": 109600 + }, + { + "epoch": 2.34, + "learning_rate": 4.398426999850665e-06, + "loss": 0.0301, + "step": 109700 + }, + { + "epoch": 2.34, + "learning_rate": 4.384204575353961e-06, + "loss": 0.0265, + "step": 109800 + }, + { + "epoch": 2.34, + "learning_rate": 4.369982150857257e-06, + "loss": 0.0282, + "step": 109900 + }, + { + "epoch": 2.35, + "learning_rate": 4.355759726360553e-06, + "loss": 0.0347, + "step": 110000 + }, + { + "epoch": 2.35, + "eval_loss": 0.13324007391929626, + "eval_runtime": 34.3057, + "eval_samples_per_second": 145.748, + "eval_steps_per_second": 1.166, + "step": 110000 + }, + { + "epoch": 2.35, + "learning_rate": 4.341537301863849e-06, + "loss": 0.032, + "step": 110100 + }, + { + "epoch": 2.35, + "learning_rate": 4.327314877367145e-06, + "loss": 0.0281, + "step": 110200 + }, + { + "epoch": 2.35, + "learning_rate": 4.313092452870441e-06, + "loss": 0.0327, + "step": 110300 + }, + { + "epoch": 2.36, + "learning_rate": 4.298870028373737e-06, + "loss": 0.0325, + "step": 110400 + }, + { + "epoch": 2.36, + "learning_rate": 4.284647603877034e-06, + "loss": 0.0283, + "step": 110500 + }, + { + "epoch": 2.36, + "learning_rate": 4.270425179380329e-06, + "loss": 0.0346, + "step": 110600 + }, + { + "epoch": 2.36, + "learning_rate": 4.256202754883625e-06, + "loss": 0.036, + "step": 110700 + }, + { + "epoch": 2.36, + "learning_rate": 4.241980330386921e-06, + "loss": 0.0275, + "step": 110800 + }, + { + "epoch": 2.37, + "learning_rate": 4.227757905890218e-06, + "loss": 0.0329, + "step": 110900 + }, + { + "epoch": 2.37, + "learning_rate": 4.2135354813935135e-06, + "loss": 0.0332, + "step": 111000 + }, + { + "epoch": 2.37, + "learning_rate": 4.199313056896809e-06, + "loss": 0.0315, + "step": 111100 + }, + { + "epoch": 2.37, + "learning_rate": 4.185090632400105e-06, + "loss": 0.0302, + "step": 111200 + }, + { + "epoch": 2.37, + "learning_rate": 4.170868207903402e-06, + "loss": 0.0308, + "step": 111300 + }, + { + "epoch": 2.38, + "learning_rate": 4.156645783406698e-06, + "loss": 0.0311, + "step": 111400 + }, + { + "epoch": 2.38, + "learning_rate": 4.1424233589099935e-06, + "loss": 0.0295, + "step": 111500 + }, + { + "epoch": 2.38, + "learning_rate": 4.128200934413289e-06, + "loss": 0.037, + "step": 111600 + }, + { + "epoch": 2.38, + "learning_rate": 4.113978509916586e-06, + "loss": 0.0348, + "step": 111700 + }, + { + "epoch": 2.39, + "learning_rate": 4.099756085419882e-06, + "loss": 0.0331, + "step": 111800 + }, + { + "epoch": 2.39, + "learning_rate": 4.085533660923178e-06, + "loss": 0.0295, + "step": 111900 + }, + { + "epoch": 2.39, + "learning_rate": 4.071311236426474e-06, + "loss": 0.0311, + "step": 112000 + }, + { + "epoch": 2.39, + "learning_rate": 4.05708881192977e-06, + "loss": 0.0316, + "step": 112100 + }, + { + "epoch": 2.39, + "learning_rate": 4.042866387433066e-06, + "loss": 0.0292, + "step": 112200 + }, + { + "epoch": 2.4, + "learning_rate": 4.028643962936362e-06, + "loss": 0.0261, + "step": 112300 + }, + { + "epoch": 2.4, + "learning_rate": 4.014421538439658e-06, + "loss": 0.03, + "step": 112400 + }, + { + "epoch": 2.4, + "learning_rate": 4.000199113942954e-06, + "loss": 0.0324, + "step": 112500 + }, + { + "epoch": 2.4, + "learning_rate": 3.98597668944625e-06, + "loss": 0.0288, + "step": 112600 + }, + { + "epoch": 2.4, + "learning_rate": 3.971754264949546e-06, + "loss": 0.0274, + "step": 112700 + }, + { + "epoch": 2.41, + "learning_rate": 3.9575318404528426e-06, + "loss": 0.0288, + "step": 112800 + }, + { + "epoch": 2.41, + "learning_rate": 3.943309415956138e-06, + "loss": 0.0308, + "step": 112900 + }, + { + "epoch": 2.41, + "learning_rate": 3.929086991459434e-06, + "loss": 0.0284, + "step": 113000 + }, + { + "epoch": 2.41, + "learning_rate": 3.914864566962731e-06, + "loss": 0.0245, + "step": 113100 + }, + { + "epoch": 2.41, + "learning_rate": 3.900642142466026e-06, + "loss": 0.026, + "step": 113200 + }, + { + "epoch": 2.42, + "learning_rate": 3.8864197179693225e-06, + "loss": 0.0326, + "step": 113300 + }, + { + "epoch": 2.42, + "learning_rate": 3.872197293472618e-06, + "loss": 0.0321, + "step": 113400 + }, + { + "epoch": 2.42, + "learning_rate": 3.857974868975914e-06, + "loss": 0.0351, + "step": 113500 + }, + { + "epoch": 2.42, + "learning_rate": 3.843752444479211e-06, + "loss": 0.0362, + "step": 113600 + }, + { + "epoch": 2.43, + "learning_rate": 3.829530019982507e-06, + "loss": 0.0338, + "step": 113700 + }, + { + "epoch": 2.43, + "learning_rate": 3.8153075954858025e-06, + "loss": 0.032, + "step": 113800 + }, + { + "epoch": 2.43, + "learning_rate": 3.8010851709890987e-06, + "loss": 0.031, + "step": 113900 + }, + { + "epoch": 2.43, + "learning_rate": 3.7868627464923945e-06, + "loss": 0.0326, + "step": 114000 + }, + { + "epoch": 2.43, + "learning_rate": 3.7726403219956908e-06, + "loss": 0.0355, + "step": 114100 + }, + { + "epoch": 2.44, + "learning_rate": 3.758417897498987e-06, + "loss": 0.0285, + "step": 114200 + }, + { + "epoch": 2.44, + "learning_rate": 3.744195473002283e-06, + "loss": 0.0346, + "step": 114300 + }, + { + "epoch": 2.44, + "learning_rate": 3.729973048505579e-06, + "loss": 0.0369, + "step": 114400 + }, + { + "epoch": 2.44, + "learning_rate": 3.7157506240088753e-06, + "loss": 0.0324, + "step": 114500 + }, + { + "epoch": 2.44, + "learning_rate": 3.701528199512171e-06, + "loss": 0.0317, + "step": 114600 + }, + { + "epoch": 2.45, + "learning_rate": 3.6873057750154674e-06, + "loss": 0.0289, + "step": 114700 + }, + { + "epoch": 2.45, + "learning_rate": 3.6730833505187628e-06, + "loss": 0.0312, + "step": 114800 + }, + { + "epoch": 2.45, + "learning_rate": 3.658860926022059e-06, + "loss": 0.0322, + "step": 114900 + }, + { + "epoch": 2.45, + "learning_rate": 3.6446385015253557e-06, + "loss": 0.0266, + "step": 115000 + }, + { + "epoch": 2.46, + "learning_rate": 3.630416077028651e-06, + "loss": 0.0289, + "step": 115100 + }, + { + "epoch": 2.46, + "learning_rate": 3.6161936525319473e-06, + "loss": 0.036, + "step": 115200 + }, + { + "epoch": 2.46, + "learning_rate": 3.6019712280352436e-06, + "loss": 0.028, + "step": 115300 + }, + { + "epoch": 2.46, + "learning_rate": 3.5877488035385394e-06, + "loss": 0.029, + "step": 115400 + }, + { + "epoch": 2.46, + "learning_rate": 3.5735263790418356e-06, + "loss": 0.0314, + "step": 115500 + }, + { + "epoch": 2.47, + "learning_rate": 3.5593039545451315e-06, + "loss": 0.0227, + "step": 115600 + }, + { + "epoch": 2.47, + "learning_rate": 3.5450815300484277e-06, + "loss": 0.0317, + "step": 115700 + }, + { + "epoch": 2.47, + "learning_rate": 3.530859105551724e-06, + "loss": 0.0324, + "step": 115800 + }, + { + "epoch": 2.47, + "learning_rate": 3.5166366810550193e-06, + "loss": 0.0309, + "step": 115900 + }, + { + "epoch": 2.47, + "learning_rate": 3.5024142565583156e-06, + "loss": 0.0287, + "step": 116000 + }, + { + "epoch": 2.48, + "learning_rate": 3.4881918320616123e-06, + "loss": 0.027, + "step": 116100 + }, + { + "epoch": 2.48, + "learning_rate": 3.4739694075649077e-06, + "loss": 0.0331, + "step": 116200 + }, + { + "epoch": 2.48, + "learning_rate": 3.459746983068204e-06, + "loss": 0.0307, + "step": 116300 + }, + { + "epoch": 2.48, + "learning_rate": 3.4455245585714997e-06, + "loss": 0.0286, + "step": 116400 + }, + { + "epoch": 2.49, + "learning_rate": 3.431302134074796e-06, + "loss": 0.0332, + "step": 116500 + }, + { + "epoch": 2.49, + "learning_rate": 3.417079709578092e-06, + "loss": 0.0281, + "step": 116600 + }, + { + "epoch": 2.49, + "learning_rate": 3.402857285081388e-06, + "loss": 0.0323, + "step": 116700 + }, + { + "epoch": 2.49, + "learning_rate": 3.3886348605846843e-06, + "loss": 0.0303, + "step": 116800 + }, + { + "epoch": 2.49, + "learning_rate": 3.3744124360879805e-06, + "loss": 0.0336, + "step": 116900 + }, + { + "epoch": 2.5, + "learning_rate": 3.360190011591276e-06, + "loss": 0.0333, + "step": 117000 + }, + { + "epoch": 2.5, + "learning_rate": 3.3459675870945726e-06, + "loss": 0.0283, + "step": 117100 + }, + { + "epoch": 2.5, + "learning_rate": 3.331745162597868e-06, + "loss": 0.0312, + "step": 117200 + }, + { + "epoch": 2.5, + "learning_rate": 3.3175227381011642e-06, + "loss": 0.0319, + "step": 117300 + }, + { + "epoch": 2.5, + "learning_rate": 3.3033003136044605e-06, + "loss": 0.0281, + "step": 117400 + }, + { + "epoch": 2.51, + "learning_rate": 3.2890778891077563e-06, + "loss": 0.0331, + "step": 117500 + }, + { + "epoch": 2.51, + "learning_rate": 3.2748554646110525e-06, + "loss": 0.0348, + "step": 117600 + }, + { + "epoch": 2.51, + "learning_rate": 3.2606330401143488e-06, + "loss": 0.0245, + "step": 117700 + }, + { + "epoch": 2.51, + "learning_rate": 3.2464106156176446e-06, + "loss": 0.0279, + "step": 117800 + }, + { + "epoch": 2.52, + "learning_rate": 3.232188191120941e-06, + "loss": 0.0285, + "step": 117900 + }, + { + "epoch": 2.52, + "learning_rate": 3.2179657666242362e-06, + "loss": 0.0288, + "step": 118000 + }, + { + "epoch": 2.52, + "learning_rate": 3.2037433421275325e-06, + "loss": 0.0293, + "step": 118100 + }, + { + "epoch": 2.52, + "learning_rate": 3.189520917630829e-06, + "loss": 0.0296, + "step": 118200 + }, + { + "epoch": 2.52, + "learning_rate": 3.1752984931341245e-06, + "loss": 0.0303, + "step": 118300 + }, + { + "epoch": 2.53, + "learning_rate": 3.1610760686374208e-06, + "loss": 0.0309, + "step": 118400 + }, + { + "epoch": 2.53, + "learning_rate": 3.146853644140717e-06, + "loss": 0.0348, + "step": 118500 + }, + { + "epoch": 2.53, + "learning_rate": 3.132631219644013e-06, + "loss": 0.0299, + "step": 118600 + }, + { + "epoch": 2.53, + "learning_rate": 3.118408795147309e-06, + "loss": 0.0318, + "step": 118700 + }, + { + "epoch": 2.53, + "learning_rate": 3.104186370650605e-06, + "loss": 0.0266, + "step": 118800 + }, + { + "epoch": 2.54, + "learning_rate": 3.089963946153901e-06, + "loss": 0.0294, + "step": 118900 + }, + { + "epoch": 2.54, + "learning_rate": 3.0757415216571974e-06, + "loss": 0.0281, + "step": 119000 + }, + { + "epoch": 2.54, + "learning_rate": 3.061519097160493e-06, + "loss": 0.0251, + "step": 119100 + }, + { + "epoch": 2.54, + "learning_rate": 3.0472966726637895e-06, + "loss": 0.0288, + "step": 119200 + }, + { + "epoch": 2.55, + "learning_rate": 3.0330742481670857e-06, + "loss": 0.0313, + "step": 119300 + }, + { + "epoch": 2.55, + "learning_rate": 3.018851823670381e-06, + "loss": 0.0295, + "step": 119400 + }, + { + "epoch": 2.55, + "learning_rate": 3.0046293991736774e-06, + "loss": 0.0334, + "step": 119500 + }, + { + "epoch": 2.55, + "learning_rate": 2.990406974676973e-06, + "loss": 0.0243, + "step": 119600 + }, + { + "epoch": 2.55, + "learning_rate": 2.9761845501802694e-06, + "loss": 0.0335, + "step": 119700 + }, + { + "epoch": 2.56, + "learning_rate": 2.9619621256835657e-06, + "loss": 0.0327, + "step": 119800 + }, + { + "epoch": 2.56, + "learning_rate": 2.9477397011868615e-06, + "loss": 0.0304, + "step": 119900 + }, + { + "epoch": 2.56, + "learning_rate": 2.9335172766901577e-06, + "loss": 0.0302, + "step": 120000 + }, + { + "epoch": 2.56, + "eval_loss": 0.13005784153938293, + "eval_runtime": 34.1867, + "eval_samples_per_second": 146.256, + "eval_steps_per_second": 1.17, + "step": 120000 + }, + { + "epoch": 2.56, + "learning_rate": 2.919294852193454e-06, + "loss": 0.0274, + "step": 120100 + }, + { + "epoch": 2.56, + "learning_rate": 2.9050724276967494e-06, + "loss": 0.0289, + "step": 120200 + }, + { + "epoch": 2.57, + "learning_rate": 2.890850003200046e-06, + "loss": 0.0267, + "step": 120300 + }, + { + "epoch": 2.57, + "learning_rate": 2.8766275787033414e-06, + "loss": 0.0307, + "step": 120400 + }, + { + "epoch": 2.57, + "learning_rate": 2.8624051542066377e-06, + "loss": 0.0268, + "step": 120500 + }, + { + "epoch": 2.57, + "learning_rate": 2.848182729709934e-06, + "loss": 0.0314, + "step": 120600 + }, + { + "epoch": 2.57, + "learning_rate": 2.8339603052132297e-06, + "loss": 0.0306, + "step": 120700 + }, + { + "epoch": 2.58, + "learning_rate": 2.819737880716526e-06, + "loss": 0.03, + "step": 120800 + }, + { + "epoch": 2.58, + "learning_rate": 2.8055154562198222e-06, + "loss": 0.0276, + "step": 120900 + }, + { + "epoch": 2.58, + "learning_rate": 2.791293031723118e-06, + "loss": 0.0315, + "step": 121000 + }, + { + "epoch": 2.58, + "learning_rate": 2.7770706072264143e-06, + "loss": 0.0345, + "step": 121100 + }, + { + "epoch": 2.59, + "learning_rate": 2.7628481827297097e-06, + "loss": 0.03, + "step": 121200 + }, + { + "epoch": 2.59, + "learning_rate": 2.7486257582330064e-06, + "loss": 0.0265, + "step": 121300 + }, + { + "epoch": 2.59, + "learning_rate": 2.7344033337363026e-06, + "loss": 0.0317, + "step": 121400 + }, + { + "epoch": 2.59, + "learning_rate": 2.720180909239598e-06, + "loss": 0.0362, + "step": 121500 + }, + { + "epoch": 2.59, + "learning_rate": 2.7059584847428942e-06, + "loss": 0.0293, + "step": 121600 + }, + { + "epoch": 2.6, + "learning_rate": 2.6917360602461905e-06, + "loss": 0.0262, + "step": 121700 + }, + { + "epoch": 2.6, + "learning_rate": 2.6775136357494863e-06, + "loss": 0.0306, + "step": 121800 + }, + { + "epoch": 2.6, + "learning_rate": 2.6632912112527826e-06, + "loss": 0.0288, + "step": 121900 + }, + { + "epoch": 2.6, + "learning_rate": 2.6490687867560784e-06, + "loss": 0.0309, + "step": 122000 + }, + { + "epoch": 2.6, + "learning_rate": 2.6348463622593746e-06, + "loss": 0.0259, + "step": 122100 + }, + { + "epoch": 2.61, + "learning_rate": 2.620623937762671e-06, + "loss": 0.0288, + "step": 122200 + }, + { + "epoch": 2.61, + "learning_rate": 2.6064015132659663e-06, + "loss": 0.0359, + "step": 122300 + }, + { + "epoch": 2.61, + "learning_rate": 2.592179088769263e-06, + "loss": 0.0277, + "step": 122400 + }, + { + "epoch": 2.61, + "learning_rate": 2.577956664272559e-06, + "loss": 0.0314, + "step": 122500 + }, + { + "epoch": 2.62, + "learning_rate": 2.5637342397758546e-06, + "loss": 0.0277, + "step": 122600 + }, + { + "epoch": 2.62, + "learning_rate": 2.549511815279151e-06, + "loss": 0.0348, + "step": 122700 + }, + { + "epoch": 2.62, + "learning_rate": 2.5352893907824466e-06, + "loss": 0.0295, + "step": 122800 + }, + { + "epoch": 2.62, + "learning_rate": 2.521066966285743e-06, + "loss": 0.0325, + "step": 122900 + }, + { + "epoch": 2.62, + "learning_rate": 2.506844541789039e-06, + "loss": 0.032, + "step": 123000 + }, + { + "epoch": 2.63, + "learning_rate": 2.492622117292335e-06, + "loss": 0.028, + "step": 123100 + }, + { + "epoch": 2.63, + "learning_rate": 2.478399692795631e-06, + "loss": 0.0292, + "step": 123200 + }, + { + "epoch": 2.63, + "learning_rate": 2.464177268298927e-06, + "loss": 0.0348, + "step": 123300 + }, + { + "epoch": 2.63, + "learning_rate": 2.4499548438022232e-06, + "loss": 0.0271, + "step": 123400 + }, + { + "epoch": 2.63, + "learning_rate": 2.4357324193055195e-06, + "loss": 0.0285, + "step": 123500 + }, + { + "epoch": 2.64, + "learning_rate": 2.4215099948088153e-06, + "loss": 0.0278, + "step": 123600 + }, + { + "epoch": 2.64, + "learning_rate": 2.407287570312111e-06, + "loss": 0.0298, + "step": 123700 + }, + { + "epoch": 2.64, + "learning_rate": 2.3930651458154074e-06, + "loss": 0.0298, + "step": 123800 + }, + { + "epoch": 2.64, + "learning_rate": 2.3788427213187036e-06, + "loss": 0.0309, + "step": 123900 + }, + { + "epoch": 2.65, + "learning_rate": 2.3646202968219994e-06, + "loss": 0.0299, + "step": 124000 + }, + { + "epoch": 2.65, + "learning_rate": 2.3503978723252953e-06, + "loss": 0.0266, + "step": 124100 + }, + { + "epoch": 2.65, + "learning_rate": 2.3361754478285915e-06, + "loss": 0.0249, + "step": 124200 + }, + { + "epoch": 2.65, + "learning_rate": 2.3219530233318877e-06, + "loss": 0.0286, + "step": 124300 + }, + { + "epoch": 2.65, + "learning_rate": 2.3077305988351836e-06, + "loss": 0.0262, + "step": 124400 + }, + { + "epoch": 2.66, + "learning_rate": 2.29350817433848e-06, + "loss": 0.0276, + "step": 124500 + }, + { + "epoch": 2.66, + "learning_rate": 2.2792857498417756e-06, + "loss": 0.028, + "step": 124600 + }, + { + "epoch": 2.66, + "learning_rate": 2.265063325345072e-06, + "loss": 0.0287, + "step": 124700 + }, + { + "epoch": 2.66, + "learning_rate": 2.2508409008483677e-06, + "loss": 0.0313, + "step": 124800 + }, + { + "epoch": 2.66, + "learning_rate": 2.236618476351664e-06, + "loss": 0.0281, + "step": 124900 + }, + { + "epoch": 2.67, + "learning_rate": 2.2223960518549598e-06, + "loss": 0.0229, + "step": 125000 + }, + { + "epoch": 2.67, + "learning_rate": 2.208173627358256e-06, + "loss": 0.027, + "step": 125100 + }, + { + "epoch": 2.67, + "learning_rate": 2.193951202861552e-06, + "loss": 0.0279, + "step": 125200 + }, + { + "epoch": 2.67, + "learning_rate": 2.179728778364848e-06, + "loss": 0.0298, + "step": 125300 + }, + { + "epoch": 2.68, + "learning_rate": 2.165506353868144e-06, + "loss": 0.0295, + "step": 125400 + }, + { + "epoch": 2.68, + "learning_rate": 2.15128392937144e-06, + "loss": 0.0223, + "step": 125500 + }, + { + "epoch": 2.68, + "learning_rate": 2.1370615048747364e-06, + "loss": 0.0298, + "step": 125600 + }, + { + "epoch": 2.68, + "learning_rate": 2.122839080378032e-06, + "loss": 0.0322, + "step": 125700 + }, + { + "epoch": 2.68, + "learning_rate": 2.108616655881328e-06, + "loss": 0.0282, + "step": 125800 + }, + { + "epoch": 2.69, + "learning_rate": 2.0943942313846243e-06, + "loss": 0.0296, + "step": 125900 + }, + { + "epoch": 2.69, + "learning_rate": 2.0801718068879205e-06, + "loss": 0.0258, + "step": 126000 + }, + { + "epoch": 2.69, + "learning_rate": 2.0659493823912163e-06, + "loss": 0.0277, + "step": 126100 + }, + { + "epoch": 2.69, + "learning_rate": 2.051726957894512e-06, + "loss": 0.0285, + "step": 126200 + }, + { + "epoch": 2.69, + "learning_rate": 2.0375045333978084e-06, + "loss": 0.0314, + "step": 126300 + }, + { + "epoch": 2.7, + "learning_rate": 2.0232821089011046e-06, + "loss": 0.0312, + "step": 126400 + }, + { + "epoch": 2.7, + "learning_rate": 2.0090596844044005e-06, + "loss": 0.0285, + "step": 126500 + }, + { + "epoch": 2.7, + "learning_rate": 1.9948372599076967e-06, + "loss": 0.0291, + "step": 126600 + }, + { + "epoch": 2.7, + "learning_rate": 1.980614835410993e-06, + "loss": 0.0253, + "step": 126700 + }, + { + "epoch": 2.71, + "learning_rate": 1.9663924109142888e-06, + "loss": 0.0242, + "step": 126800 + }, + { + "epoch": 2.71, + "learning_rate": 1.9521699864175846e-06, + "loss": 0.0316, + "step": 126900 + }, + { + "epoch": 2.71, + "learning_rate": 1.937947561920881e-06, + "loss": 0.0305, + "step": 127000 + }, + { + "epoch": 2.71, + "learning_rate": 1.923725137424177e-06, + "loss": 0.0244, + "step": 127100 + }, + { + "epoch": 2.71, + "learning_rate": 1.909502712927473e-06, + "loss": 0.0263, + "step": 127200 + }, + { + "epoch": 2.72, + "learning_rate": 1.895280288430769e-06, + "loss": 0.0281, + "step": 127300 + }, + { + "epoch": 2.72, + "learning_rate": 1.881057863934065e-06, + "loss": 0.027, + "step": 127400 + }, + { + "epoch": 2.72, + "learning_rate": 1.8668354394373612e-06, + "loss": 0.0265, + "step": 127500 + }, + { + "epoch": 2.72, + "learning_rate": 1.852613014940657e-06, + "loss": 0.0302, + "step": 127600 + }, + { + "epoch": 2.72, + "learning_rate": 1.838390590443953e-06, + "loss": 0.0273, + "step": 127700 + }, + { + "epoch": 2.73, + "learning_rate": 1.824168165947249e-06, + "loss": 0.0243, + "step": 127800 + }, + { + "epoch": 2.73, + "learning_rate": 1.8099457414505453e-06, + "loss": 0.0246, + "step": 127900 + }, + { + "epoch": 2.73, + "learning_rate": 1.7957233169538414e-06, + "loss": 0.0246, + "step": 128000 + }, + { + "epoch": 2.73, + "learning_rate": 1.7815008924571372e-06, + "loss": 0.0307, + "step": 128100 + }, + { + "epoch": 2.73, + "learning_rate": 1.7672784679604332e-06, + "loss": 0.0255, + "step": 128200 + }, + { + "epoch": 2.74, + "learning_rate": 1.7530560434637295e-06, + "loss": 0.0231, + "step": 128300 + }, + { + "epoch": 2.74, + "learning_rate": 1.7388336189670255e-06, + "loss": 0.0261, + "step": 128400 + }, + { + "epoch": 2.74, + "learning_rate": 1.7246111944703215e-06, + "loss": 0.0277, + "step": 128500 + }, + { + "epoch": 2.74, + "learning_rate": 1.7103887699736173e-06, + "loss": 0.0289, + "step": 128600 + }, + { + "epoch": 2.75, + "learning_rate": 1.6961663454769136e-06, + "loss": 0.0272, + "step": 128700 + }, + { + "epoch": 2.75, + "learning_rate": 1.6819439209802096e-06, + "loss": 0.0304, + "step": 128800 + }, + { + "epoch": 2.75, + "learning_rate": 1.6677214964835057e-06, + "loss": 0.0253, + "step": 128900 + }, + { + "epoch": 2.75, + "learning_rate": 1.6534990719868017e-06, + "loss": 0.0289, + "step": 129000 + }, + { + "epoch": 2.75, + "learning_rate": 1.639276647490098e-06, + "loss": 0.0287, + "step": 129100 + }, + { + "epoch": 2.76, + "learning_rate": 1.6250542229933938e-06, + "loss": 0.0302, + "step": 129200 + }, + { + "epoch": 2.76, + "learning_rate": 1.6108317984966898e-06, + "loss": 0.0227, + "step": 129300 + }, + { + "epoch": 2.76, + "learning_rate": 1.5966093739999858e-06, + "loss": 0.0302, + "step": 129400 + }, + { + "epoch": 2.76, + "learning_rate": 1.582386949503282e-06, + "loss": 0.0242, + "step": 129500 + }, + { + "epoch": 2.76, + "learning_rate": 1.568164525006578e-06, + "loss": 0.0281, + "step": 129600 + }, + { + "epoch": 2.77, + "learning_rate": 1.553942100509874e-06, + "loss": 0.032, + "step": 129700 + }, + { + "epoch": 2.77, + "learning_rate": 1.53971967601317e-06, + "loss": 0.0251, + "step": 129800 + }, + { + "epoch": 2.77, + "learning_rate": 1.5254972515164662e-06, + "loss": 0.0262, + "step": 129900 + }, + { + "epoch": 2.77, + "learning_rate": 1.5112748270197622e-06, + "loss": 0.029, + "step": 130000 + }, + { + "epoch": 2.77, + "eval_loss": 0.12821832299232483, + "eval_runtime": 34.334, + "eval_samples_per_second": 145.628, + "eval_steps_per_second": 1.165, + "step": 130000 + }, + { + "epoch": 2.78, + "learning_rate": 1.4970524025230583e-06, + "loss": 0.0269, + "step": 130100 + }, + { + "epoch": 2.78, + "learning_rate": 1.482829978026354e-06, + "loss": 0.0303, + "step": 130200 + }, + { + "epoch": 2.78, + "learning_rate": 1.4686075535296503e-06, + "loss": 0.0309, + "step": 130300 + }, + { + "epoch": 2.78, + "learning_rate": 1.4543851290329464e-06, + "loss": 0.0262, + "step": 130400 + }, + { + "epoch": 2.78, + "learning_rate": 1.4401627045362424e-06, + "loss": 0.0252, + "step": 130500 + }, + { + "epoch": 2.79, + "learning_rate": 1.4259402800395384e-06, + "loss": 0.024, + "step": 130600 + }, + { + "epoch": 2.79, + "learning_rate": 1.4117178555428347e-06, + "loss": 0.0292, + "step": 130700 + }, + { + "epoch": 2.79, + "learning_rate": 1.3974954310461305e-06, + "loss": 0.0264, + "step": 130800 + }, + { + "epoch": 2.79, + "learning_rate": 1.3832730065494265e-06, + "loss": 0.0245, + "step": 130900 + }, + { + "epoch": 2.79, + "learning_rate": 1.3690505820527225e-06, + "loss": 0.0281, + "step": 131000 + }, + { + "epoch": 2.8, + "learning_rate": 1.3548281575560188e-06, + "loss": 0.0302, + "step": 131100 + }, + { + "epoch": 2.8, + "learning_rate": 1.3406057330593148e-06, + "loss": 0.0278, + "step": 131200 + }, + { + "epoch": 2.8, + "learning_rate": 1.3263833085626106e-06, + "loss": 0.0277, + "step": 131300 + }, + { + "epoch": 2.8, + "learning_rate": 1.3121608840659067e-06, + "loss": 0.0252, + "step": 131400 + }, + { + "epoch": 2.81, + "learning_rate": 1.297938459569203e-06, + "loss": 0.027, + "step": 131500 + }, + { + "epoch": 2.81, + "learning_rate": 1.283716035072499e-06, + "loss": 0.0276, + "step": 131600 + }, + { + "epoch": 2.81, + "learning_rate": 1.269493610575795e-06, + "loss": 0.0234, + "step": 131700 + }, + { + "epoch": 2.81, + "learning_rate": 1.2552711860790908e-06, + "loss": 0.0285, + "step": 131800 + }, + { + "epoch": 2.81, + "learning_rate": 1.241048761582387e-06, + "loss": 0.0298, + "step": 131900 + }, + { + "epoch": 2.82, + "learning_rate": 1.226826337085683e-06, + "loss": 0.027, + "step": 132000 + }, + { + "epoch": 2.82, + "learning_rate": 1.2126039125889791e-06, + "loss": 0.0309, + "step": 132100 + }, + { + "epoch": 2.82, + "learning_rate": 1.1983814880922751e-06, + "loss": 0.0257, + "step": 132200 + }, + { + "epoch": 2.82, + "learning_rate": 1.1841590635955712e-06, + "loss": 0.0263, + "step": 132300 + }, + { + "epoch": 2.82, + "learning_rate": 1.1699366390988672e-06, + "loss": 0.024, + "step": 132400 + }, + { + "epoch": 2.83, + "learning_rate": 1.1557142146021632e-06, + "loss": 0.0313, + "step": 132500 + }, + { + "epoch": 2.83, + "learning_rate": 1.1414917901054593e-06, + "loss": 0.0214, + "step": 132600 + }, + { + "epoch": 2.83, + "learning_rate": 1.1272693656087553e-06, + "loss": 0.027, + "step": 132700 + }, + { + "epoch": 2.83, + "learning_rate": 1.1130469411120516e-06, + "loss": 0.0286, + "step": 132800 + }, + { + "epoch": 2.84, + "learning_rate": 1.0988245166153474e-06, + "loss": 0.0242, + "step": 132900 + }, + { + "epoch": 2.84, + "learning_rate": 1.0846020921186436e-06, + "loss": 0.0314, + "step": 133000 + }, + { + "epoch": 2.84, + "learning_rate": 1.0703796676219394e-06, + "loss": 0.0247, + "step": 133100 + }, + { + "epoch": 2.84, + "learning_rate": 1.0561572431252357e-06, + "loss": 0.0281, + "step": 133200 + }, + { + "epoch": 2.84, + "learning_rate": 1.0419348186285317e-06, + "loss": 0.0214, + "step": 133300 + }, + { + "epoch": 2.85, + "learning_rate": 1.0277123941318277e-06, + "loss": 0.0313, + "step": 133400 + }, + { + "epoch": 2.85, + "learning_rate": 1.0134899696351238e-06, + "loss": 0.0291, + "step": 133500 + }, + { + "epoch": 2.85, + "learning_rate": 9.992675451384198e-07, + "loss": 0.0268, + "step": 133600 + }, + { + "epoch": 2.85, + "learning_rate": 9.850451206417158e-07, + "loss": 0.0285, + "step": 133700 + }, + { + "epoch": 2.85, + "learning_rate": 9.708226961450119e-07, + "loss": 0.0266, + "step": 133800 + }, + { + "epoch": 2.86, + "learning_rate": 9.56600271648308e-07, + "loss": 0.0266, + "step": 133900 + }, + { + "epoch": 2.86, + "learning_rate": 9.42377847151604e-07, + "loss": 0.0226, + "step": 134000 + }, + { + "epoch": 2.86, + "learning_rate": 9.281554226549e-07, + "loss": 0.0273, + "step": 134100 + }, + { + "epoch": 2.86, + "learning_rate": 9.139329981581961e-07, + "loss": 0.0278, + "step": 134200 + }, + { + "epoch": 2.87, + "learning_rate": 8.99710573661492e-07, + "loss": 0.031, + "step": 134300 + }, + { + "epoch": 2.87, + "learning_rate": 8.854881491647882e-07, + "loss": 0.0274, + "step": 134400 + }, + { + "epoch": 2.87, + "learning_rate": 8.712657246680842e-07, + "loss": 0.0245, + "step": 134500 + }, + { + "epoch": 2.87, + "learning_rate": 8.570433001713803e-07, + "loss": 0.0264, + "step": 134600 + }, + { + "epoch": 2.87, + "learning_rate": 8.428208756746763e-07, + "loss": 0.0314, + "step": 134700 + }, + { + "epoch": 2.88, + "learning_rate": 8.285984511779724e-07, + "loss": 0.0283, + "step": 134800 + }, + { + "epoch": 2.88, + "learning_rate": 8.143760266812683e-07, + "loss": 0.0281, + "step": 134900 + }, + { + "epoch": 2.88, + "learning_rate": 8.001536021845645e-07, + "loss": 0.0278, + "step": 135000 + }, + { + "epoch": 2.88, + "learning_rate": 7.859311776878604e-07, + "loss": 0.0334, + "step": 135100 + }, + { + "epoch": 2.88, + "learning_rate": 7.717087531911565e-07, + "loss": 0.0235, + "step": 135200 + }, + { + "epoch": 2.89, + "learning_rate": 7.574863286944526e-07, + "loss": 0.0337, + "step": 135300 + }, + { + "epoch": 2.89, + "learning_rate": 7.432639041977487e-07, + "loss": 0.0221, + "step": 135400 + }, + { + "epoch": 2.89, + "learning_rate": 7.290414797010446e-07, + "loss": 0.0293, + "step": 135500 + }, + { + "epoch": 2.89, + "learning_rate": 7.148190552043408e-07, + "loss": 0.0227, + "step": 135600 + }, + { + "epoch": 2.89, + "learning_rate": 7.005966307076367e-07, + "loss": 0.0248, + "step": 135700 + }, + { + "epoch": 2.9, + "learning_rate": 6.863742062109328e-07, + "loss": 0.0291, + "step": 135800 + }, + { + "epoch": 2.9, + "learning_rate": 6.721517817142289e-07, + "loss": 0.0261, + "step": 135900 + }, + { + "epoch": 2.9, + "learning_rate": 6.579293572175249e-07, + "loss": 0.029, + "step": 136000 + }, + { + "epoch": 2.9, + "learning_rate": 6.437069327208209e-07, + "loss": 0.0266, + "step": 136100 + }, + { + "epoch": 2.91, + "learning_rate": 6.294845082241171e-07, + "loss": 0.0277, + "step": 136200 + }, + { + "epoch": 2.91, + "learning_rate": 6.152620837274131e-07, + "loss": 0.0231, + "step": 136300 + }, + { + "epoch": 2.91, + "learning_rate": 6.010396592307091e-07, + "loss": 0.0284, + "step": 136400 + }, + { + "epoch": 2.91, + "learning_rate": 5.868172347340052e-07, + "loss": 0.0269, + "step": 136500 + }, + { + "epoch": 2.91, + "learning_rate": 5.725948102373012e-07, + "loss": 0.0254, + "step": 136600 + }, + { + "epoch": 2.92, + "learning_rate": 5.583723857405972e-07, + "loss": 0.0267, + "step": 136700 + }, + { + "epoch": 2.92, + "learning_rate": 5.441499612438933e-07, + "loss": 0.0254, + "step": 136800 + }, + { + "epoch": 2.92, + "learning_rate": 5.299275367471893e-07, + "loss": 0.0271, + "step": 136900 + }, + { + "epoch": 2.92, + "learning_rate": 5.157051122504854e-07, + "loss": 0.0281, + "step": 137000 + }, + { + "epoch": 2.92, + "learning_rate": 5.014826877537815e-07, + "loss": 0.0326, + "step": 137100 + }, + { + "epoch": 2.93, + "learning_rate": 4.872602632570775e-07, + "loss": 0.0275, + "step": 137200 + }, + { + "epoch": 2.93, + "learning_rate": 4.7303783876037353e-07, + "loss": 0.0312, + "step": 137300 + }, + { + "epoch": 2.93, + "learning_rate": 4.5881541426366957e-07, + "loss": 0.0265, + "step": 137400 + }, + { + "epoch": 2.93, + "learning_rate": 4.445929897669656e-07, + "loss": 0.0282, + "step": 137500 + }, + { + "epoch": 2.94, + "learning_rate": 4.303705652702617e-07, + "loss": 0.0279, + "step": 137600 + }, + { + "epoch": 2.94, + "learning_rate": 4.161481407735577e-07, + "loss": 0.0283, + "step": 137700 + }, + { + "epoch": 2.94, + "learning_rate": 4.0192571627685375e-07, + "loss": 0.0271, + "step": 137800 + }, + { + "epoch": 2.94, + "learning_rate": 3.877032917801498e-07, + "loss": 0.0297, + "step": 137900 + }, + { + "epoch": 2.94, + "learning_rate": 3.7348086728344587e-07, + "loss": 0.0288, + "step": 138000 + }, + { + "epoch": 2.95, + "learning_rate": 3.592584427867419e-07, + "loss": 0.0259, + "step": 138100 + }, + { + "epoch": 2.95, + "learning_rate": 3.4503601829003793e-07, + "loss": 0.0293, + "step": 138200 + }, + { + "epoch": 2.95, + "learning_rate": 3.3081359379333396e-07, + "loss": 0.0228, + "step": 138300 + }, + { + "epoch": 2.95, + "learning_rate": 3.1659116929663005e-07, + "loss": 0.0234, + "step": 138400 + }, + { + "epoch": 2.95, + "learning_rate": 3.023687447999261e-07, + "loss": 0.0228, + "step": 138500 + }, + { + "epoch": 2.96, + "learning_rate": 2.881463203032221e-07, + "loss": 0.0196, + "step": 138600 + }, + { + "epoch": 2.96, + "learning_rate": 2.7392389580651815e-07, + "loss": 0.0264, + "step": 138700 + }, + { + "epoch": 2.96, + "learning_rate": 2.5970147130981423e-07, + "loss": 0.029, + "step": 138800 + }, + { + "epoch": 2.96, + "learning_rate": 2.4547904681311026e-07, + "loss": 0.0236, + "step": 138900 + }, + { + "epoch": 2.97, + "learning_rate": 2.312566223164063e-07, + "loss": 0.0216, + "step": 139000 + }, + { + "epoch": 2.97, + "learning_rate": 2.1703419781970235e-07, + "loss": 0.0275, + "step": 139100 + }, + { + "epoch": 2.97, + "learning_rate": 2.0281177332299839e-07, + "loss": 0.0243, + "step": 139200 + }, + { + "epoch": 2.97, + "learning_rate": 1.8858934882629444e-07, + "loss": 0.027, + "step": 139300 + }, + { + "epoch": 2.97, + "learning_rate": 1.7436692432959048e-07, + "loss": 0.033, + "step": 139400 + }, + { + "epoch": 2.98, + "learning_rate": 1.6014449983288654e-07, + "loss": 0.0267, + "step": 139500 + }, + { + "epoch": 2.98, + "learning_rate": 1.4592207533618257e-07, + "loss": 0.0283, + "step": 139600 + }, + { + "epoch": 2.98, + "learning_rate": 1.3169965083947863e-07, + "loss": 0.0244, + "step": 139700 + }, + { + "epoch": 2.98, + "learning_rate": 1.1747722634277467e-07, + "loss": 0.03, + "step": 139800 + }, + { + "epoch": 2.98, + "learning_rate": 1.0325480184607072e-07, + "loss": 0.0277, + "step": 139900 + }, + { + "epoch": 2.99, + "learning_rate": 8.903237734936676e-08, + "loss": 0.03, + "step": 140000 + }, + { + "epoch": 2.99, + "eval_loss": 0.1257346123456955, + "eval_runtime": 34.2432, + "eval_samples_per_second": 146.014, + "eval_steps_per_second": 1.168, + "step": 140000 + } + ], + "logging_steps": 100, + "max_steps": 140625, + "num_train_epochs": 3, + "save_steps": 10000, + "total_flos": 3822663594147840.0, + "trial_name": null, + "trial_params": null +}