{
  "best_metric": 2.4689557552337646,
  "best_model_checkpoint": "./output/training_results/C017_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-800",
  "epoch": 4.0,
  "eval_steps": 200,
  "global_step": 3944,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010141987829614604,
      "grad_norm": 4.267137538119642,
      "learning_rate": 7.5e-07,
      "loss": 2.7134,
      "step": 1
    },
    {
      "epoch": 0.005070993914807302,
      "grad_norm": 4.879489677016923,
      "learning_rate": 2.25e-06,
      "loss": 2.7254,
      "step": 5
    },
    {
      "epoch": 0.010141987829614604,
      "grad_norm": 2.7621009561709564,
      "learning_rate": 6e-06,
      "loss": 2.707,
      "step": 10
    },
    {
      "epoch": 0.015212981744421906,
      "grad_norm": 2.404100845677231,
      "learning_rate": 9e-06,
      "loss": 2.6421,
      "step": 15
    },
    {
      "epoch": 0.02028397565922921,
      "grad_norm": 2.4429846538599254,
      "learning_rate": 1.275e-05,
      "loss": 2.6682,
      "step": 20
    },
    {
      "epoch": 0.02535496957403651,
      "grad_norm": 2.8575493026010625,
      "learning_rate": 1.4916395742870319e-05,
      "loss": 2.6639,
      "step": 25
    },
    {
      "epoch": 0.030425963488843813,
      "grad_norm": 2.4347171369214538,
      "learning_rate": 1.4709241308404976e-05,
      "loss": 2.6624,
      "step": 30
    },
    {
      "epoch": 0.035496957403651115,
      "grad_norm": 2.5792627004512942,
      "learning_rate": 1.4504714365262738e-05,
      "loss": 2.6351,
      "step": 35
    },
    {
      "epoch": 0.04056795131845842,
      "grad_norm": 2.1789139866654366,
      "learning_rate": 1.4302784881547452e-05,
      "loss": 2.6055,
      "step": 40
    },
    {
      "epoch": 0.04563894523326572,
      "grad_norm": 2.232485210798856,
      "learning_rate": 1.4103423130872168e-05,
      "loss": 2.5938,
      "step": 45
    },
    {
      "epoch": 0.05070993914807302,
      "grad_norm": 2.2896589926745814,
      "learning_rate": 1.390659968963626e-05,
      "loss": 2.6334,
      "step": 50
    },
    {
      "epoch": 0.055780933062880324,
      "grad_norm": 2.7780457428021985,
      "learning_rate": 1.3712285434323396e-05,
      "loss": 2.646,
      "step": 55
    },
    {
      "epoch": 0.060851926977687626,
      "grad_norm": 1.9399001575023072,
      "learning_rate": 1.352045153882017e-05,
      "loss": 2.6182,
      "step": 60
    },
    {
      "epoch": 0.06592292089249494,
      "grad_norm": 1.9083156579424998,
      "learning_rate": 1.3331069471755332e-05,
      "loss": 2.6056,
      "step": 65
    },
    {
      "epoch": 0.07099391480730223,
      "grad_norm": 2.2298396560554683,
      "learning_rate": 1.314411099385942e-05,
      "loss": 2.6043,
      "step": 70
    },
    {
      "epoch": 0.07606490872210954,
      "grad_norm": 1.9661711744318215,
      "learning_rate": 1.2959548155344706e-05,
      "loss": 2.6321,
      "step": 75
    },
    {
      "epoch": 0.08113590263691683,
      "grad_norm": 2.1260634398939438,
      "learning_rate": 1.2777353293305311e-05,
      "loss": 2.5744,
      "step": 80
    },
    {
      "epoch": 0.08620689655172414,
      "grad_norm": 2.171189842092272,
      "learning_rate": 1.2597499029137354e-05,
      "loss": 2.6102,
      "step": 85
    },
    {
      "epoch": 0.09127789046653144,
      "grad_norm": 2.118995328928547,
      "learning_rate": 1.2419958265979023e-05,
      "loss": 2.6056,
      "step": 90
    },
    {
      "epoch": 0.09634888438133875,
      "grad_norm": 2.1743656445294466,
      "learning_rate": 1.2244704186170414e-05,
      "loss": 2.591,
      "step": 95
    },
    {
      "epoch": 0.10141987829614604,
      "grad_norm": 2.100620832387391,
      "learning_rate": 1.2106129489565247e-05,
      "loss": 2.6461,
      "step": 100
    },
    {
      "epoch": 0.10649087221095335,
      "grad_norm": 2.02911049207023,
      "learning_rate": 1.1934924740853141e-05,
      "loss": 2.5878,
      "step": 105
    },
    {
      "epoch": 0.11156186612576065,
      "grad_norm": 2.12870974325018,
      "learning_rate": 1.1765933050017452e-05,
      "loss": 2.5793,
      "step": 110
    },
    {
      "epoch": 0.11663286004056796,
      "grad_norm": 1.9038783159180614,
      "learning_rate": 1.1599128637544344e-05,
      "loss": 2.5612,
      "step": 115
    },
    {
      "epoch": 0.12170385395537525,
      "grad_norm": 1.9647399779959451,
      "learning_rate": 1.1434485991200533e-05,
      "loss": 2.6083,
      "step": 120
    },
    {
      "epoch": 0.12677484787018256,
      "grad_norm": 1.88937427094592,
      "learning_rate": 1.1271979863605386e-05,
      "loss": 2.5561,
      "step": 125
    },
    {
      "epoch": 0.13184584178498987,
      "grad_norm": 1.8208051471693176,
      "learning_rate": 1.111158526982193e-05,
      "loss": 2.5884,
      "step": 130
    },
    {
      "epoch": 0.13691683569979715,
      "grad_norm": 1.771422341312915,
      "learning_rate": 1.0953277484966689e-05,
      "loss": 2.5509,
      "step": 135
    },
    {
      "epoch": 0.14198782961460446,
      "grad_norm": 1.8296701053391813,
      "learning_rate": 1.0797032041838185e-05,
      "loss": 2.5784,
      "step": 140
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 1.8139046565289612,
      "learning_rate": 1.0642824728564022e-05,
      "loss": 2.5624,
      "step": 145
    },
    {
      "epoch": 0.15212981744421908,
      "grad_norm": 1.9862915107502803,
      "learning_rate": 1.0490631586266381e-05,
      "loss": 2.6007,
      "step": 150
    },
    {
      "epoch": 0.15720081135902636,
      "grad_norm": 1.8392246134083736,
      "learning_rate": 1.0340428906745863e-05,
      "loss": 2.5775,
      "step": 155
    },
    {
      "epoch": 0.16227180527383367,
      "grad_norm": 1.9250085841598776,
      "learning_rate": 1.0192193230183505e-05,
      "loss": 2.6045,
      "step": 160
    },
    {
      "epoch": 0.16734279918864098,
      "grad_norm": 2.1119936162911825,
      "learning_rate": 1.0045901342860905e-05,
      "loss": 2.5838,
      "step": 165
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 1.9416866546338962,
      "learning_rate": 9.901530274898272e-06,
      "loss": 2.5643,
      "step": 170
    },
    {
      "epoch": 0.17748478701825557,
      "grad_norm": 1.871570899679003,
      "learning_rate": 9.75905729801036e-06,
      "loss": 2.5549,
      "step": 175
    },
    {
      "epoch": 0.18255578093306288,
      "grad_norm": 2.0672616615182897,
      "learning_rate": 9.61845992328009e-06,
      "loss": 2.561,
      "step": 180
    },
    {
      "epoch": 0.1876267748478702,
      "grad_norm": 1.8373271363293353,
      "learning_rate": 9.479715898949807e-06,
      "loss": 2.5728,
      "step": 185
    },
    {
      "epoch": 0.1926977687626775,
      "grad_norm": 1.9497106449021773,
      "learning_rate": 9.342803208230014e-06,
      "loss": 2.5535,
      "step": 190
    },
    {
      "epoch": 0.19776876267748478,
      "grad_norm": 1.913646357656738,
      "learning_rate": 9.207700067125492e-06,
      "loss": 2.5411,
      "step": 195
    },
    {
      "epoch": 0.2028397565922921,
      "grad_norm": 1.7027113982701332,
      "learning_rate": 9.074384922278684e-06,
      "loss": 2.5442,
      "step": 200
    },
    {
      "epoch": 0.2028397565922921,
      "eval_loss": 2.55521821975708,
      "eval_runtime": 81.0607,
      "eval_samples_per_second": 86.429,
      "eval_steps_per_second": 0.679,
      "step": 200
    },
    {
      "epoch": 0.2079107505070994,
      "grad_norm": 1.753576639879344,
      "learning_rate": 8.942836448830213e-06,
      "loss": 2.5264,
      "step": 205
    },
    {
      "epoch": 0.2129817444219067,
      "grad_norm": 1.7785092188900598,
      "learning_rate": 8.813033548296443e-06,
      "loss": 2.5645,
      "step": 210
    },
    {
      "epoch": 0.21805273833671399,
      "grad_norm": 1.7915296631060966,
      "learning_rate": 8.684955346463971e-06,
      "loss": 2.555,
      "step": 215
    },
    {
      "epoch": 0.2231237322515213,
      "grad_norm": 1.7452346531223148,
      "learning_rate": 8.558581191300906e-06,
      "loss": 2.6118,
      "step": 220
    },
    {
      "epoch": 0.2281947261663286,
      "grad_norm": 2.339774136223256,
      "learning_rate": 8.433890650884857e-06,
      "loss": 2.5284,
      "step": 225
    },
    {
      "epoch": 0.2332657200811359,
      "grad_norm": 1.7961229516339332,
      "learning_rate": 8.310863511347508e-06,
      "loss": 2.558,
      "step": 230
    },
    {
      "epoch": 0.2383367139959432,
      "grad_norm": 2.000305491613022,
      "learning_rate": 8.189479774835651e-06,
      "loss": 2.5312,
      "step": 235
    },
    {
      "epoch": 0.2434077079107505,
      "grad_norm": 1.9162907270104979,
      "learning_rate": 8.069719657488614e-06,
      "loss": 2.4983,
      "step": 240
    },
    {
      "epoch": 0.2484787018255578,
      "grad_norm": 1.9447544732938296,
      "learning_rate": 7.951563587431902e-06,
      "loss": 2.5462,
      "step": 245
    },
    {
      "epoch": 0.2535496957403651,
      "grad_norm": 1.8244106804572084,
      "learning_rate": 7.834992202787018e-06,
      "loss": 2.5354,
      "step": 250
    },
    {
      "epoch": 0.25862068965517243,
      "grad_norm": 1.714609238517639,
      "learning_rate": 7.719986349697309e-06,
      "loss": 2.5386,
      "step": 255
    },
    {
      "epoch": 0.26369168356997974,
      "grad_norm": 1.795436681758725,
      "learning_rate": 7.606527080369728e-06,
      "loss": 2.5388,
      "step": 260
    },
    {
      "epoch": 0.268762677484787,
      "grad_norm": 1.7081706265027667,
      "learning_rate": 7.494595651132443e-06,
      "loss": 2.568,
      "step": 265
    },
    {
      "epoch": 0.2738336713995943,
      "grad_norm": 1.6958291617768828,
      "learning_rate": 7.384173520508138e-06,
      "loss": 2.5489,
      "step": 270
    },
    {
      "epoch": 0.2789046653144016,
      "grad_norm": 1.6677502189962874,
      "learning_rate": 7.275242347302937e-06,
      "loss": 2.5666,
      "step": 275
    },
    {
      "epoch": 0.2839756592292089,
      "grad_norm": 1.6916519769077745,
      "learning_rate": 7.167783988710829e-06,
      "loss": 2.5161,
      "step": 280
    },
    {
      "epoch": 0.28904665314401623,
      "grad_norm": 1.9276199368209956,
      "learning_rate": 7.061780498433485e-06,
      "loss": 2.5461,
      "step": 285
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 1.721858200785338,
      "learning_rate": 6.957214124815376e-06,
      "loss": 2.56,
      "step": 290
    },
    {
      "epoch": 0.29918864097363085,
      "grad_norm": 1.7023218265873687,
      "learning_rate": 6.854067308994081e-06,
      "loss": 2.5252,
      "step": 295
    },
    {
      "epoch": 0.30425963488843816,
      "grad_norm": 1.7702063060142263,
      "learning_rate": 6.752322683065677e-06,
      "loss": 2.5365,
      "step": 300
    },
    {
      "epoch": 0.3093306288032454,
      "grad_norm": 1.807175965887596,
      "learning_rate": 6.651963068265119e-06,
      "loss": 2.5351,
      "step": 305
    },
    {
      "epoch": 0.3144016227180527,
      "grad_norm": 1.7687398862728192,
      "learning_rate": 6.5529714731614995e-06,
      "loss": 2.5184,
      "step": 310
    },
    {
      "epoch": 0.31947261663286003,
      "grad_norm": 1.808664958617461,
      "learning_rate": 6.455331091868087e-06,
      "loss": 2.5062,
      "step": 315
    },
    {
      "epoch": 0.32454361054766734,
      "grad_norm": 1.9021979000655393,
      "learning_rate": 6.359025302267049e-06,
      "loss": 2.5225,
      "step": 320
    },
    {
      "epoch": 0.32961460446247465,
      "grad_norm": 1.704473391712384,
      "learning_rate": 6.264037664248752e-06,
      "loss": 2.5233,
      "step": 325
    },
    {
      "epoch": 0.33468559837728196,
      "grad_norm": 1.751379362669565,
      "learning_rate": 6.17035191796554e-06,
      "loss": 2.4854,
      "step": 330
    },
    {
      "epoch": 0.33975659229208927,
      "grad_norm": 1.6980009285341724,
      "learning_rate": 6.077951982099886e-06,
      "loss": 2.5008,
      "step": 335
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 1.6987141788770321,
      "learning_rate": 5.986821952146847e-06,
      "loss": 2.5438,
      "step": 340
    },
    {
      "epoch": 0.34989858012170383,
      "grad_norm": 1.6781775461943316,
      "learning_rate": 5.89694609871067e-06,
      "loss": 2.5417,
      "step": 345
    },
    {
      "epoch": 0.35496957403651114,
      "grad_norm": 1.7326892052245193,
      "learning_rate": 5.808308865815513e-06,
      "loss": 2.5185,
      "step": 350
    },
    {
      "epoch": 0.36004056795131845,
      "grad_norm": 1.743645811121294,
      "learning_rate": 5.720894869230136e-06,
      "loss": 2.5094,
      "step": 355
    },
    {
      "epoch": 0.36511156186612576,
      "grad_norm": 1.7256678519147217,
      "learning_rate": 5.634688894806482e-06,
      "loss": 2.5316,
      "step": 360
    },
    {
      "epoch": 0.37018255578093306,
      "grad_norm": 1.6209115792712339,
      "learning_rate": 5.549675896832072e-06,
      "loss": 2.5164,
      "step": 365
    },
    {
      "epoch": 0.3752535496957404,
      "grad_norm": 1.6497735310259896,
      "learning_rate": 5.465840996396076e-06,
      "loss": 2.5363,
      "step": 370
    },
    {
      "epoch": 0.3803245436105477,
      "grad_norm": 1.665747208014539,
      "learning_rate": 5.383169479769005e-06,
      "loss": 2.5015,
      "step": 375
    },
    {
      "epoch": 0.385395537525355,
      "grad_norm": 1.8360023746562857,
      "learning_rate": 5.301646796795905e-06,
      "loss": 2.4465,
      "step": 380
    },
    {
      "epoch": 0.39046653144016225,
      "grad_norm": 1.721788501212322,
      "learning_rate": 5.221258559302969e-06,
      "loss": 2.5104,
      "step": 385
    },
    {
      "epoch": 0.39553752535496955,
      "grad_norm": 1.7896539066797603,
      "learning_rate": 5.141990539517474e-06,
      "loss": 2.5406,
      "step": 390
    },
    {
      "epoch": 0.40060851926977686,
      "grad_norm": 1.7026594592165973,
      "learning_rate": 5.0638286685009445e-06,
      "loss": 2.5403,
      "step": 395
    },
    {
      "epoch": 0.4056795131845842,
      "grad_norm": 1.7666645373608338,
      "learning_rate": 4.986759034595453e-06,
      "loss": 2.5376,
      "step": 400
    },
    {
      "epoch": 0.4056795131845842,
      "eval_loss": 2.509550094604492,
      "eval_runtime": 81.0126,
      "eval_samples_per_second": 86.48,
      "eval_steps_per_second": 0.679,
      "step": 400
    },
    {
      "epoch": 0.4107505070993915,
      "grad_norm": 1.702454460655481,
      "learning_rate": 4.910767881882966e-06,
      "loss": 2.5017,
      "step": 405
    },
    {
      "epoch": 0.4158215010141988,
      "grad_norm": 1.6625424708509573,
      "learning_rate": 4.83584160865765e-06,
      "loss": 2.5271,
      "step": 410
    },
    {
      "epoch": 0.4208924949290061,
      "grad_norm": 1.6622717975288752,
      "learning_rate": 4.761966765911026e-06,
      "loss": 2.5238,
      "step": 415
    },
    {
      "epoch": 0.4259634888438134,
      "grad_norm": 1.6256800857720881,
      "learning_rate": 4.689130055829907e-06,
      "loss": 2.5191,
      "step": 420
    },
    {
      "epoch": 0.43103448275862066,
      "grad_norm": 1.7950911413498376,
      "learning_rate": 4.617318330307044e-06,
      "loss": 2.4909,
      "step": 425
    },
    {
      "epoch": 0.43610547667342797,
      "grad_norm": 1.5866160053351177,
      "learning_rate": 4.5465185894642715e-06,
      "loss": 2.5128,
      "step": 430
    },
    {
      "epoch": 0.4411764705882353,
      "grad_norm": 1.6754882575554404,
      "learning_rate": 4.476717980188313e-06,
      "loss": 2.5028,
      "step": 435
    },
    {
      "epoch": 0.4462474645030426,
      "grad_norm": 1.6606915353792953,
      "learning_rate": 4.407903794678819e-06,
      "loss": 2.5207,
      "step": 440
    },
    {
      "epoch": 0.4513184584178499,
      "grad_norm": 1.8160247477825882,
      "learning_rate": 4.340063469008923e-06,
      "loss": 2.5017,
      "step": 445
    },
    {
      "epoch": 0.4563894523326572,
      "grad_norm": 1.7663094048322825,
      "learning_rate": 4.2731845816978475e-06,
      "loss": 2.5021,
      "step": 450
    },
    {
      "epoch": 0.4614604462474645,
      "grad_norm": 1.7799998175038592,
      "learning_rate": 4.207254852295854e-06,
      "loss": 2.4953,
      "step": 455
    },
    {
      "epoch": 0.4665314401622718,
      "grad_norm": 1.6715645487953392,
      "learning_rate": 4.142262139981073e-06,
      "loss": 2.4435,
      "step": 460
    },
    {
      "epoch": 0.4716024340770791,
      "grad_norm": 1.7256265015398793,
      "learning_rate": 4.078194442168494e-06,
      "loss": 2.5146,
      "step": 465
    },
    {
      "epoch": 0.4766734279918864,
      "grad_norm": 1.6662015811964308,
      "learning_rate": 4.015039893130705e-06,
      "loss": 2.5187,
      "step": 470
    },
    {
      "epoch": 0.4817444219066937,
      "grad_norm": 1.7649431318197315,
      "learning_rate": 3.952786762630535e-06,
      "loss": 2.5223,
      "step": 475
    },
    {
      "epoch": 0.486815415821501,
      "grad_norm": 1.679617464261057,
      "learning_rate": 3.891423454565385e-06,
      "loss": 2.4394,
      "step": 480
    },
    {
      "epoch": 0.4918864097363083,
      "grad_norm": 1.6233085596184735,
      "learning_rate": 3.830938505623211e-06,
      "loss": 2.512,
      "step": 485
    },
    {
      "epoch": 0.4969574036511156,
      "grad_norm": 1.7195900327055993,
      "learning_rate": 3.7713205839500707e-06,
      "loss": 2.4649,
      "step": 490
    },
    {
      "epoch": 0.5020283975659229,
      "grad_norm": 1.7034828407083669,
      "learning_rate": 3.7125584878291374e-06,
      "loss": 2.497,
      "step": 495
    },
    {
      "epoch": 0.5070993914807302,
      "grad_norm": 1.7618287486879018,
      "learning_rate": 3.6546411443711164e-06,
      "loss": 2.5353,
      "step": 500
    },
    {
      "epoch": 0.5121703853955375,
      "grad_norm": 1.6191614066287776,
      "learning_rate": 3.597557608215969e-06,
      "loss": 2.5052,
      "step": 505
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 1.6450813134062763,
      "learning_rate": 3.54129706024587e-06,
      "loss": 2.5106,
      "step": 510
    },
    {
      "epoch": 0.5223123732251521,
      "grad_norm": 1.7767916102532666,
      "learning_rate": 3.4858488063093135e-06,
      "loss": 2.4651,
      "step": 515
    },
    {
      "epoch": 0.5273833671399595,
      "grad_norm": 1.6720237829560067,
      "learning_rate": 3.431202275956285e-06,
      "loss": 2.4908,
      "step": 520
    },
    {
      "epoch": 0.5324543610547667,
      "grad_norm": 1.6484154917054958,
      "learning_rate": 3.3773470211844283e-06,
      "loss": 2.4856,
      "step": 525
    },
    {
      "epoch": 0.537525354969574,
      "grad_norm": 1.651838194240797,
      "learning_rate": 3.324272715196116e-06,
      "loss": 2.4675,
      "step": 530
    },
    {
      "epoch": 0.5425963488843814,
      "grad_norm": 1.6241151521510617,
      "learning_rate": 3.2719691511663524e-06,
      "loss": 2.4896,
      "step": 535
    },
    {
      "epoch": 0.5476673427991886,
      "grad_norm": 1.6894175077795812,
      "learning_rate": 3.2204262410214273e-06,
      "loss": 2.4556,
      "step": 540
    },
    {
      "epoch": 0.552738336713996,
      "grad_norm": 1.6686417855987385,
      "learning_rate": 3.1696340142282437e-06,
      "loss": 2.5062,
      "step": 545
    },
    {
      "epoch": 0.5578093306288032,
      "grad_norm": 1.7200856267540612,
      "learning_rate": 3.119582616594238e-06,
      "loss": 2.4878,
      "step": 550
    },
    {
      "epoch": 0.5628803245436106,
      "grad_norm": 1.672252633477676,
      "learning_rate": 3.0702623090778174e-06,
      "loss": 2.5077,
      "step": 555
    },
    {
      "epoch": 0.5679513184584178,
      "grad_norm": 1.7008466667698958,
      "learning_rate": 3.021663466609246e-06,
      "loss": 2.4837,
      "step": 560
    },
    {
      "epoch": 0.5730223123732252,
      "grad_norm": 1.6805676799462346,
      "learning_rate": 2.973776576921883e-06,
      "loss": 2.5062,
      "step": 565
    },
    {
      "epoch": 0.5780933062880325,
      "grad_norm": 1.6136103005628197,
      "learning_rate": 2.9265922393937183e-06,
      "loss": 2.5035,
      "step": 570
    },
    {
      "epoch": 0.5831643002028397,
      "grad_norm": 1.6014078073339035,
      "learning_rate": 2.880101163899116e-06,
      "loss": 2.5101,
      "step": 575
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 1.7220406203120746,
      "learning_rate": 2.8342941696706994e-06,
      "loss": 2.5217,
      "step": 580
    },
    {
      "epoch": 0.5933062880324543,
      "grad_norm": 1.6605964063316545,
      "learning_rate": 2.789162184171294e-06,
      "loss": 2.4756,
      "step": 585
    },
    {
      "epoch": 0.5983772819472617,
      "grad_norm": 1.6566249973518374,
      "learning_rate": 2.7446962419758632e-06,
      "loss": 2.4739,
      "step": 590
    },
    {
      "epoch": 0.603448275862069,
      "grad_norm": 1.6340883136536262,
      "learning_rate": 2.700887483663357e-06,
      "loss": 2.4869,
      "step": 595
    },
    {
      "epoch": 0.6085192697768763,
      "grad_norm": 1.6233109361058542,
      "learning_rate": 2.657727154718401e-06,
      "loss": 2.4487,
      "step": 600
    },
    {
      "epoch": 0.6085192697768763,
      "eval_loss": 2.4831416606903076,
      "eval_runtime": 80.984,
      "eval_samples_per_second": 86.511,
      "eval_steps_per_second": 0.679,
      "step": 600
    },
    {
      "epoch": 0.6135902636916836,
      "grad_norm": 1.616769928055098,
      "learning_rate": 2.615206604442756e-06,
      "loss": 2.4638,
      "step": 605
    },
    {
      "epoch": 0.6186612576064908,
      "grad_norm": 1.6396235170920117,
      "learning_rate": 2.5733172848764733e-06,
      "loss": 2.4891,
      "step": 610
    },
    {
      "epoch": 0.6237322515212982,
      "grad_norm": 1.5936144163067276,
      "learning_rate": 2.5320507497286705e-06,
      "loss": 2.4902,
      "step": 615
    },
    {
      "epoch": 0.6288032454361054,
      "grad_norm": 1.6679977682798468,
      "learning_rate": 2.491398653317866e-06,
      "loss": 2.4695,
      "step": 620
    },
    {
      "epoch": 0.6338742393509128,
      "grad_norm": 1.7008178983911084,
      "learning_rate": 2.4513527495217875e-06,
      "loss": 2.4626,
      "step": 625
    },
    {
      "epoch": 0.6389452332657201,
      "grad_norm": 1.610985443276998,
      "learning_rate": 2.4119048907365937e-06,
      "loss": 2.4934,
      "step": 630
    },
    {
      "epoch": 0.6440162271805274,
      "grad_norm": 1.6323121910464156,
      "learning_rate": 2.3730470268454385e-06,
      "loss": 2.4819,
      "step": 635
    },
    {
      "epoch": 0.6490872210953347,
      "grad_norm": 1.6525382291119861,
      "learning_rate": 2.3347712041962997e-06,
      "loss": 2.5046,
      "step": 640
    },
    {
      "epoch": 0.654158215010142,
      "grad_norm": 1.6380351817927594,
      "learning_rate": 2.297069564589013e-06,
      "loss": 2.4864,
      "step": 645
    },
    {
      "epoch": 0.6592292089249493,
      "grad_norm": 1.6579813340009797,
      "learning_rate": 2.259934344271433e-06,
      "loss": 2.4715,
      "step": 650
    },
    {
      "epoch": 0.6643002028397565,
      "grad_norm": 1.7919239246160015,
      "learning_rate": 2.22335787294466e-06,
      "loss": 2.4972,
      "step": 655
    },
    {
      "epoch": 0.6693711967545639,
      "grad_norm": 1.586961409961355,
      "learning_rate": 2.18733257277726e-06,
      "loss": 2.4787,
      "step": 660
    },
    {
      "epoch": 0.6744421906693712,
      "grad_norm": 1.684301176230389,
      "learning_rate": 2.1518509574284106e-06,
      "loss": 2.4158,
      "step": 665
    },
    {
      "epoch": 0.6795131845841785,
      "grad_norm": 1.6178388175493554,
      "learning_rate": 2.123852145211829e-06,
      "loss": 2.5152,
      "step": 670
    },
    {
      "epoch": 0.6845841784989858,
      "grad_norm": 1.704137336957441,
      "learning_rate": 2.089330585293108e-06,
      "loss": 2.4807,
      "step": 675
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 1.653288753563856,
      "learning_rate": 2.055332226962747e-06,
      "loss": 2.4781,
      "step": 680
    },
    {
      "epoch": 0.6947261663286004,
      "grad_norm": 1.6910190620923418,
      "learning_rate": 2.0218499227907136e-06,
      "loss": 2.5114,
      "step": 685
    },
    {
      "epoch": 0.6997971602434077,
      "grad_norm": 1.6297896630103186,
      "learning_rate": 1.988876612270826e-06,
      "loss": 2.4963,
      "step": 690
    },
    {
      "epoch": 0.704868154158215,
      "grad_norm": 1.6254042637268307,
      "learning_rate": 1.9564053208943578e-06,
      "loss": 2.4651,
      "step": 695
    },
    {
      "epoch": 0.7099391480730223,
      "grad_norm": 1.849820644961665,
      "learning_rate": 1.924429159232111e-06,
      "loss": 2.4625,
      "step": 700
    },
    {
      "epoch": 0.7150101419878296,
      "grad_norm": 1.6947938784926828,
      "learning_rate": 1.892941322024907e-06,
      "loss": 2.4683,
      "step": 705
    },
    {
      "epoch": 0.7200811359026369,
      "grad_norm": 1.6500218076608433,
      "learning_rate": 1.861935087282421e-06,
      "loss": 2.474,
      "step": 710
    },
    {
      "epoch": 0.7251521298174443,
      "grad_norm": 1.5695461599237197,
      "learning_rate": 1.8314038153902991e-06,
      "loss": 2.4626,
      "step": 715
    },
    {
      "epoch": 0.7302231237322515,
      "grad_norm": 1.661274439764298,
      "learning_rate": 1.8013409482254947e-06,
      "loss": 2.4901,
      "step": 720
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 1.5971717624468098,
      "learning_rate": 1.7717400082797614e-06,
      "loss": 2.498,
      "step": 725
    },
    {
      "epoch": 0.7403651115618661,
      "grad_norm": 1.6006841184664817,
      "learning_rate": 1.7425945977912387e-06,
      "loss": 2.5096,
      "step": 730
    },
    {
      "epoch": 0.7454361054766734,
      "grad_norm": 1.8078007149616142,
      "learning_rate": 1.7138983978840686e-06,
      "loss": 2.4733,
      "step": 735
    },
    {
      "epoch": 0.7505070993914807,
      "grad_norm": 1.6080637102108633,
      "learning_rate": 1.685645167715982e-06,
      "loss": 2.4645,
      "step": 740
    },
    {
      "epoch": 0.755578093306288,
      "grad_norm": 1.6034092883417612,
      "learning_rate": 1.6578287436337897e-06,
      "loss": 2.4874,
      "step": 745
    },
    {
      "epoch": 0.7606490872210954,
      "grad_norm": 1.6562691168973722,
      "learning_rate": 1.6304430383367233e-06,
      "loss": 2.5147,
      "step": 750
    },
    {
      "epoch": 0.7657200811359026,
      "grad_norm": 1.631836734297837,
      "learning_rate": 1.6034820400475576e-06,
      "loss": 2.449,
      "step": 755
    },
    {
      "epoch": 0.77079107505071,
      "grad_norm": 2.633902381426751,
      "learning_rate": 1.5769398116914607e-06,
      "loss": 2.4502,
      "step": 760
    },
    {
      "epoch": 0.7758620689655172,
      "grad_norm": 1.6338196504524252,
      "learning_rate": 1.550810490082507e-06,
      "loss": 2.4375,
      "step": 765
    },
    {
      "epoch": 0.7809330628803245,
      "grad_norm": 1.6881605246261733,
      "learning_rate": 1.5250882851177956e-06,
      "loss": 2.4623,
      "step": 770
    },
    {
      "epoch": 0.7860040567951319,
      "grad_norm": 1.7430128340035491,
      "learning_rate": 1.4997674789791142e-06,
      "loss": 2.4592,
      "step": 775
    },
    {
      "epoch": 0.7910750507099391,
      "grad_norm": 1.6974037503954427,
      "learning_rate": 1.4748424253420905e-06,
      "loss": 2.5001,
      "step": 780
    },
    {
      "epoch": 0.7961460446247465,
      "grad_norm": 1.6057434981804433,
      "learning_rate": 1.4503075485927704e-06,
      "loss": 2.4603,
      "step": 785
    },
    {
      "epoch": 0.8012170385395537,
      "grad_norm": 1.5564356238507298,
      "learning_rate": 1.4261573430515669e-06,
      "loss": 2.4357,
      "step": 790
    },
    {
      "epoch": 0.8062880324543611,
      "grad_norm": 1.7042405076576008,
      "learning_rate": 1.4023863722045201e-06,
      "loss": 2.4747,
      "step": 795
    },
    {
      "epoch": 0.8113590263691683,
      "grad_norm": 1.5640034942530554,
      "learning_rate": 1.3789892679418134e-06,
      "loss": 2.5324,
      "step": 800
    },
    {
      "epoch": 0.8113590263691683,
      "eval_loss": 2.4689557552337646,
      "eval_runtime": 81.0232,
      "eval_samples_per_second": 86.469,
      "eval_steps_per_second": 0.679,
      "step": 800
    },
    {
      "epoch": 0.8164300202839757,
      "grad_norm": 1.7227060519078905,
      "learning_rate": 1.3559607298034838e-06,
      "loss": 2.4806,
      "step": 805
    },
    {
      "epoch": 0.821501014198783,
      "grad_norm": 1.5855673393298833,
      "learning_rate": 1.333295524232277e-06,
      "loss": 2.4642,
      "step": 810
    },
    {
      "epoch": 0.8265720081135902,
      "grad_norm": 1.8155636812941185,
      "learning_rate": 1.310988483833583e-06,
      "loss": 2.4746,
      "step": 815
    },
    {
      "epoch": 0.8316430020283976,
      "grad_norm": 1.6824796691575312,
      "learning_rate": 1.289034506642401e-06,
      "loss": 2.5168,
      "step": 820
    },
    {
      "epoch": 0.8367139959432048,
      "grad_norm": 1.6084122349859742,
      "learning_rate": 1.2674285553972776e-06,
      "loss": 2.4112,
      "step": 825
    },
    {
      "epoch": 0.8417849898580122,
      "grad_norm": 1.6807591569306923,
      "learning_rate": 1.2461656568211607e-06,
      "loss": 2.4555,
      "step": 830
    },
    {
      "epoch": 0.8468559837728195,
      "grad_norm": 1.64520194930749,
      "learning_rate": 1.2252409009091154e-06,
      "loss": 2.5222,
      "step": 835
    },
    {
      "epoch": 0.8519269776876268,
      "grad_norm": 1.642941398726877,
      "learning_rate": 1.2046494402228485e-06,
      "loss": 2.4607,
      "step": 840
    },
    {
      "epoch": 0.8569979716024341,
      "grad_norm": 1.6323907187692908,
      "learning_rate": 1.1843864891919843e-06,
      "loss": 2.4724,
      "step": 845
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 1.6489728444762863,
      "learning_rate": 1.1644473234220412e-06,
      "loss": 2.483,
      "step": 850
    },
    {
      "epoch": 0.8671399594320487,
      "grad_norm": 1.5735584816022383,
      "learning_rate": 1.1448272790090529e-06,
      "loss": 2.4423,
      "step": 855
    },
    {
      "epoch": 0.8722109533468559,
      "grad_norm": 1.6290164674794758,
      "learning_rate": 1.1255217518607806e-06,
      "loss": 2.4745,
      "step": 860
    },
    {
      "epoch": 0.8772819472616633,
      "grad_norm": 1.9631129344699565,
      "learning_rate": 1.1065261970244678e-06,
      "loss": 2.4595,
      "step": 865
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 1.8876833985138877,
      "learning_rate": 1.0878361280210782e-06,
      "loss": 2.4761,
      "step": 870
    },
    {
      "epoch": 0.8874239350912779,
      "grad_norm": 1.7449962901668679,
      "learning_rate": 1.0694471161859696e-06,
      "loss": 2.4726,
      "step": 875
    },
    {
      "epoch": 0.8924949290060852,
      "grad_norm": 1.6608657901001447,
      "learning_rate": 1.051354790015952e-06,
      "loss": 2.4817,
      "step": 880
    },
    {
      "epoch": 0.8975659229208925,
      "grad_norm": 1.6370419920913908,
      "learning_rate": 1.0335548345226733e-06,
      "loss": 2.4861,
      "step": 885
    },
    {
      "epoch": 0.9026369168356998,
      "grad_norm": 1.6266725844295284,
      "learning_rate": 1.016042990592287e-06,
      "loss": 2.4437,
      "step": 890
    },
    {
      "epoch": 0.907707910750507,
      "grad_norm": 1.5909779389607082,
      "learning_rate": 9.988150543513476e-07,
      "loss": 2.4605,
      "step": 895
    },
    {
      "epoch": 0.9127789046653144,
      "grad_norm": 1.5796802186393568,
      "learning_rate": 9.818668765388872e-07,
      "loss": 2.4863,
      "step": 900
    },
    {
      "epoch": 0.9178498985801217,
      "grad_norm": 1.5779871460684796,
      "learning_rate": 9.651943618846152e-07,
      "loss": 2.4514,
      "step": 905
    },
    {
      "epoch": 0.922920892494929,
      "grad_norm": 1.605102383763968,
      "learning_rate": 9.487934684931995e-07,
      "loss": 2.474,
      "step": 910
    },
    {
      "epoch": 0.9279918864097363,
      "grad_norm": 1.6069103870683263,
      "learning_rate": 9.326602072345758e-07,
      "loss": 2.4828,
      "step": 915
    },
    {
      "epoch": 0.9330628803245437,
      "grad_norm": 1.6236038441464034,
      "learning_rate": 9.167906411402357e-07,
      "loss": 2.4501,
      "step": 920
    },
    {
      "epoch": 0.9381338742393509,
      "grad_norm": 1.6140284100171378,
      "learning_rate": 9.011808848054445e-07,
      "loss": 2.4441,
      "step": 925
    },
    {
      "epoch": 0.9432048681541582,
      "grad_norm": 1.9823289784825078,
      "learning_rate": 8.858271037973411e-07,
      "loss": 2.4834,
      "step": 930
    },
    {
      "epoch": 0.9482758620689655,
      "grad_norm": 1.7094985628575186,
      "learning_rate": 8.707255140688767e-07,
      "loss": 2.4428,
      "step": 935
    },
    {
      "epoch": 0.9533468559837728,
      "grad_norm": 1.5851821971427773,
      "learning_rate": 8.558723813785198e-07,
      "loss": 2.4459,
      "step": 940
    },
    {
      "epoch": 0.9584178498985801,
      "grad_norm": 1.8489283203955083,
      "learning_rate": 8.412640207157327e-07,
      "loss": 2.4671,
      "step": 945
    },
    {
      "epoch": 0.9634888438133874,
      "grad_norm": 1.565327828926634,
      "learning_rate": 8.268967957320976e-07,
      "loss": 2.4762,
      "step": 950
    },
    {
      "epoch": 0.9685598377281948,
      "grad_norm": 1.5753092524917698,
      "learning_rate": 8.127671181781262e-07,
      "loss": 2.487,
      "step": 955
    },
    {
      "epoch": 0.973630831643002,
      "grad_norm": 1.5627741498336793,
      "learning_rate": 7.988714473456279e-07,
      "loss": 2.4899,
      "step": 960
    },
    {
      "epoch": 0.9787018255578094,
      "grad_norm": 1.7322054425536324,
      "learning_rate": 7.852062895156654e-07,
      "loss": 2.4328,
      "step": 965
    },
    {
      "epoch": 0.9837728194726166,
      "grad_norm": 1.5912533141539165,
      "learning_rate": 7.717681974119764e-07,
      "loss": 2.4887,
      "step": 970
    },
    {
      "epoch": 0.9888438133874239,
      "grad_norm": 1.7127177872013957,
      "learning_rate": 7.585537696598922e-07,
      "loss": 2.4414,
      "step": 975
    },
    {
      "epoch": 0.9939148073022313,
      "grad_norm": 1.6239111267541033,
      "learning_rate": 7.455596502506312e-07,
      "loss": 2.4962,
      "step": 980
    },
    {
      "epoch": 0.9989858012170385,
      "grad_norm": 1.6117561424503084,
      "learning_rate": 7.327825280109957e-07,
      "loss": 2.4738,
      "step": 985
    },
    {
      "epoch": 1.0040567951318458,
      "grad_norm": 1.9019039739296713,
      "learning_rate": 7.20219136078357e-07,
      "loss": 2.27,
      "step": 990
    },
    {
      "epoch": 1.0091277890466532,
      "grad_norm": 1.7075178009820928,
      "learning_rate": 7.078662513809528e-07,
      "loss": 2.3072,
      "step": 995
    },
    {
      "epoch": 1.0141987829614605,
      "grad_norm": 1.7844249258995124,
      "learning_rate": 6.957206941233838e-07,
      "loss": 2.265,
      "step": 1000
    },
    {
      "epoch": 1.0141987829614605,
      "eval_loss": 2.473280668258667,
      "eval_runtime": 81.0085,
      "eval_samples_per_second": 86.485,
      "eval_steps_per_second": 0.679,
      "step": 1000
    },
    {
      "epoch": 1.0192697768762677,
      "grad_norm": 1.833316481131949,
      "learning_rate": 6.837793272773345e-07,
      "loss": 2.3069,
      "step": 1005
    },
    {
      "epoch": 1.024340770791075,
      "grad_norm": 1.7388775994426842,
      "learning_rate": 6.720390560774066e-07,
      "loss": 2.266,
      "step": 1010
    },
    {
      "epoch": 1.0294117647058822,
      "grad_norm": 1.6270190329782648,
      "learning_rate": 6.604968275220875e-07,
      "loss": 2.2664,
      "step": 1015
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 1.7956207149367391,
      "learning_rate": 6.491496298797458e-07,
      "loss": 2.2394,
      "step": 1020
    },
    {
      "epoch": 1.039553752535497,
      "grad_norm": 1.6994135825189252,
      "learning_rate": 6.379944921996764e-07,
      "loss": 2.2727,
      "step": 1025
    },
    {
      "epoch": 1.0446247464503042,
      "grad_norm": 1.677197538792328,
      "learning_rate": 6.270284838280882e-07,
      "loss": 2.2072,
      "step": 1030
    },
    {
      "epoch": 1.0496957403651115,
      "grad_norm": 1.719327046611783,
      "learning_rate": 6.162487139290532e-07,
      "loss": 2.3021,
      "step": 1035
    },
    {
      "epoch": 1.054766734279919,
      "grad_norm": 1.7292340128968464,
      "learning_rate": 6.056523310103172e-07,
      "loss": 2.2737,
      "step": 1040
    },
    {
      "epoch": 1.0598377281947262,
      "grad_norm": 1.7428974260955565,
      "learning_rate": 5.95236522453988e-07,
      "loss": 2.2556,
      "step": 1045
    },
    {
      "epoch": 1.0649087221095335,
      "grad_norm": 1.694959472171586,
      "learning_rate": 5.849985140519998e-07,
      "loss": 2.2992,
      "step": 1050
    },
    {
      "epoch": 1.0699797160243407,
      "grad_norm": 1.7439692178448947,
      "learning_rate": 5.749355695463754e-07,
      "loss": 2.2557,
      "step": 1055
    },
    {
      "epoch": 1.075050709939148,
      "grad_norm": 1.7558636029085997,
      "learning_rate": 5.650449901741813e-07,
      "loss": 2.2474,
      "step": 1060
    },
    {
      "epoch": 1.0801217038539555,
      "grad_norm": 1.785367595963534,
      "learning_rate": 5.553241142171985e-07,
      "loss": 2.267,
      "step": 1065
    },
    {
      "epoch": 1.0851926977687627,
      "grad_norm": 1.7537584511707027,
      "learning_rate": 5.45770316556211e-07,
      "loss": 2.2823,
      "step": 1070
    },
    {
      "epoch": 1.09026369168357,
      "grad_norm": 1.6825060417395732,
      "learning_rate": 5.363810082299148e-07,
      "loss": 2.2525,
      "step": 1075
    },
    {
      "epoch": 1.0953346855983772,
      "grad_norm": 1.7339475460772475,
      "learning_rate": 5.27153635998387e-07,
      "loss": 2.3006,
      "step": 1080
    },
    {
      "epoch": 1.1004056795131847,
      "grad_norm": 1.6977028436147512,
      "learning_rate": 5.180856819110773e-07,
      "loss": 2.2862,
      "step": 1085
    },
    {
      "epoch": 1.105476673427992,
      "grad_norm": 1.7119437312783958,
      "learning_rate": 5.091746628792904e-07,
      "loss": 2.243,
      "step": 1090
    },
    {
      "epoch": 1.1105476673427992,
      "grad_norm": 1.7918277133466605,
      "learning_rate": 5.004181302531108e-07,
      "loss": 2.2653,
      "step": 1095
    },
    {
      "epoch": 1.1156186612576064,
      "grad_norm": 1.7198038075584687,
      "learning_rate": 4.918136694027396e-07,
      "loss": 2.2741,
      "step": 1100
    },
    {
      "epoch": 1.1206896551724137,
      "grad_norm": 1.7122122501534425,
      "learning_rate": 4.833588993041994e-07,
      "loss": 2.2757,
      "step": 1105
    },
    {
      "epoch": 1.1257606490872212,
      "grad_norm": 1.6934117050919777,
      "learning_rate": 4.750514721293719e-07,
      "loss": 2.2484,
      "step": 1110
    },
    {
      "epoch": 1.1308316430020284,
      "grad_norm": 1.8096755323665539,
      "learning_rate": 4.6688907284032994e-07,
      "loss": 2.2329,
      "step": 1115
    },
    {
      "epoch": 1.1359026369168357,
      "grad_norm": 1.7732841203420067,
      "learning_rate": 4.588694187879258e-07,
      "loss": 2.2636,
      "step": 1120
    },
    {
      "epoch": 1.140973630831643,
      "grad_norm": 1.70514589311023,
      "learning_rate": 4.5099025931459913e-07,
      "loss": 2.2778,
      "step": 1125
    },
    {
      "epoch": 1.1460446247464504,
      "grad_norm": 1.7135354540773058,
      "learning_rate": 4.4324937536136735e-07,
      "loss": 2.2905,
      "step": 1130
    },
    {
      "epoch": 1.1511156186612577,
      "grad_norm": 1.6901713268949445,
      "learning_rate": 4.3564457907896125e-07,
      "loss": 2.302,
      "step": 1135
    },
    {
      "epoch": 1.156186612576065,
      "grad_norm": 1.7350424488382163,
      "learning_rate": 4.281737134430704e-07,
      "loss": 2.2441,
      "step": 1140
    },
    {
      "epoch": 1.1612576064908722,
      "grad_norm": 1.7433418190612922,
      "learning_rate": 4.208346518736604e-07,
      "loss": 2.2639,
      "step": 1145
    },
    {
      "epoch": 1.1663286004056794,
      "grad_norm": 1.7278183208713844,
      "learning_rate": 4.136252978583281e-07,
      "loss": 2.272,
      "step": 1150
    },
    {
      "epoch": 1.171399594320487,
      "grad_norm": 1.7049575091462312,
      "learning_rate": 4.0654358457965706e-07,
      "loss": 2.2822,
      "step": 1155
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 1.7614119208994081,
      "learning_rate": 3.995874745465392e-07,
      "loss": 2.2882,
      "step": 1160
    },
    {
      "epoch": 1.1815415821501014,
      "grad_norm": 1.7783667378053016,
      "learning_rate": 3.927549592294267e-07,
      "loss": 2.2779,
      "step": 1165
    },
    {
      "epoch": 1.1866125760649087,
      "grad_norm": 1.7857803604726208,
      "learning_rate": 3.8604405869947905e-07,
      "loss": 2.2504,
      "step": 1170
    },
    {
      "epoch": 1.1916835699797161,
      "grad_norm": 1.7894737586957659,
      "learning_rate": 3.794528212715714e-07,
      "loss": 2.2896,
      "step": 1175
    },
    {
      "epoch": 1.1967545638945234,
      "grad_norm": 1.7605294591830605,
      "learning_rate": 3.7297932315112855e-07,
      "loss": 2.2803,
      "step": 1180
    },
    {
      "epoch": 1.2018255578093306,
      "grad_norm": 1.7037189312181982,
      "learning_rate": 3.6662166808475126e-07,
      "loss": 2.2595,
      "step": 1185
    },
    {
      "epoch": 1.206896551724138,
      "grad_norm": 1.802568691083643,
      "learning_rate": 3.6037798701460037e-07,
      "loss": 2.3097,
      "step": 1190
    },
    {
      "epoch": 1.2119675456389452,
      "grad_norm": 1.7227242510965723,
      "learning_rate": 3.5424643773650545e-07,
      "loss": 2.2473,
      "step": 1195
    },
    {
      "epoch": 1.2170385395537526,
      "grad_norm": 1.7126735182979083,
      "learning_rate": 3.482252045617637e-07,
      "loss": 2.3002,
      "step": 1200
    },
    {
      "epoch": 1.2170385395537526,
      "eval_loss": 2.4735846519470215,
      "eval_runtime": 81.0924,
      "eval_samples_per_second": 86.395,
      "eval_steps_per_second": 0.678,
      "step": 1200
    },
    {
      "epoch": 1.2221095334685599,
      "grad_norm": 1.7418672417675343,
      "learning_rate": 3.423124979825969e-07,
      "loss": 2.2259,
      "step": 1205
    },
    {
      "epoch": 1.2271805273833671,
      "grad_norm": 1.7536106052680211,
      "learning_rate": 3.365065543412324e-07,
      "loss": 2.2625,
      "step": 1210
    },
    {
      "epoch": 1.2322515212981744,
      "grad_norm": 1.6738354256007202,
      "learning_rate": 3.3080563550257607e-07,
      "loss": 2.2762,
      "step": 1215
    },
    {
      "epoch": 1.2373225152129819,
      "grad_norm": 1.7304199756653005,
      "learning_rate": 3.2520802853044393e-07,
      "loss": 2.2864,
      "step": 1220
    },
    {
      "epoch": 1.2423935091277891,
      "grad_norm": 1.761088776037141,
      "learning_rate": 3.197120453673215e-07,
      "loss": 2.2665,
      "step": 1225
    },
    {
      "epoch": 1.2474645030425964,
      "grad_norm": 1.7101358055188194,
      "learning_rate": 3.143160225176168e-07,
      "loss": 2.2775,
      "step": 1230
    },
    {
      "epoch": 1.2525354969574036,
      "grad_norm": 1.7571854143932952,
      "learning_rate": 3.0901832073437713e-07,
      "loss": 2.2979,
      "step": 1235
    },
    {
      "epoch": 1.2576064908722109,
      "grad_norm": 1.7216743809437804,
      "learning_rate": 3.0381732470943653e-07,
      "loss": 2.3094,
      "step": 1240
    },
    {
      "epoch": 1.2626774847870181,
      "grad_norm": 1.6935950803242086,
      "learning_rate": 2.9871144276696387e-07,
      "loss": 2.2707,
      "step": 1245
    },
    {
      "epoch": 1.2677484787018256,
      "grad_norm": 1.7158452472154153,
      "learning_rate": 2.9369910656037903e-07,
      "loss": 2.2532,
      "step": 1250
    },
    {
      "epoch": 1.2728194726166329,
      "grad_norm": 1.7587458046328184,
      "learning_rate": 2.8877877077260676e-07,
      "loss": 2.2968,
      "step": 1255
    },
    {
      "epoch": 1.2778904665314401,
      "grad_norm": 1.7348605445713965,
      "learning_rate": 2.839489128196406e-07,
      "loss": 2.2596,
      "step": 1260
    },
    {
      "epoch": 1.2829614604462476,
      "grad_norm": 1.6962275978449755,
      "learning_rate": 2.7920803255737635e-07,
      "loss": 2.2579,
      "step": 1265
    },
    {
      "epoch": 1.2880324543610548,
      "grad_norm": 1.7562952815143784,
      "learning_rate": 2.7455465199170286e-07,
      "loss": 2.2518,
      "step": 1270
    },
    {
      "epoch": 1.293103448275862,
      "grad_norm": 1.6974150722131578,
      "learning_rate": 2.699873149917968e-07,
      "loss": 2.2504,
      "step": 1275
    },
    {
      "epoch": 1.2981744421906694,
      "grad_norm": 1.7036916845012207,
      "learning_rate": 2.655045870066172e-07,
      "loss": 2.2861,
      "step": 1280
    },
    {
      "epoch": 1.3032454361054766,
      "grad_norm": 1.7486208966066876,
      "learning_rate": 2.6110505478454324e-07,
      "loss": 2.2467,
      "step": 1285
    },
    {
      "epoch": 1.3083164300202839,
      "grad_norm": 1.712258524308874,
      "learning_rate": 2.5678732609615423e-07,
      "loss": 2.2515,
      "step": 1290
    },
    {
      "epoch": 1.3133874239350913,
      "grad_norm": 1.7341023622582277,
      "learning_rate": 2.525500294600939e-07,
      "loss": 2.2757,
      "step": 1295
    },
    {
      "epoch": 1.3184584178498986,
      "grad_norm": 1.889990239211246,
      "learning_rate": 2.4839181387201796e-07,
      "loss": 2.2791,
      "step": 1300
    },
    {
      "epoch": 1.3235294117647058,
      "grad_norm": 1.798861207791198,
      "learning_rate": 2.4431134853656976e-07,
      "loss": 2.2817,
      "step": 1305
    },
    {
      "epoch": 1.3286004056795133,
      "grad_norm": 1.7472239831698717,
      "learning_rate": 2.4030732260238086e-07,
      "loss": 2.2521,
      "step": 1310
    },
    {
      "epoch": 1.3336713995943206,
      "grad_norm": 1.782522588407923,
      "learning_rate": 2.3637844490004408e-07,
      "loss": 2.2316,
      "step": 1315
    },
    {
      "epoch": 1.3387423935091278,
      "grad_norm": 1.6996053792107884,
      "learning_rate": 2.325234436830538e-07,
      "loss": 2.2734,
      "step": 1320
    },
    {
      "epoch": 1.343813387423935,
      "grad_norm": 1.7994805518930097,
      "learning_rate": 2.2874106637166403e-07,
      "loss": 2.2484,
      "step": 1325
    },
    {
      "epoch": 1.3488843813387423,
      "grad_norm": 1.7489331509437775,
      "learning_rate": 2.2503007929965749e-07,
      "loss": 2.28,
      "step": 1330
    },
    {
      "epoch": 1.3539553752535496,
      "grad_norm": 1.7160678233869127,
      "learning_rate": 2.2138926746397777e-07,
      "loss": 2.2565,
      "step": 1335
    },
    {
      "epoch": 1.359026369168357,
      "grad_norm": 1.814687918697313,
      "learning_rate": 2.178174342772177e-07,
      "loss": 2.2517,
      "step": 1340
    },
    {
      "epoch": 1.3640973630831643,
      "grad_norm": 1.6987256946879317,
      "learning_rate": 2.143134013229167e-07,
      "loss": 2.2672,
      "step": 1345
    },
    {
      "epoch": 1.3691683569979716,
      "grad_norm": 1.7371785897491874,
      "learning_rate": 2.1087600811366032e-07,
      "loss": 2.2628,
      "step": 1350
    },
    {
      "epoch": 1.3742393509127788,
      "grad_norm": 1.745926263655127,
      "learning_rate": 2.075041118519355e-07,
      "loss": 2.2532,
      "step": 1355
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 1.700613383279488,
      "learning_rate": 2.0419658719373504e-07,
      "loss": 2.2617,
      "step": 1360
    },
    {
      "epoch": 1.3843813387423936,
      "grad_norm": 1.691103098158946,
      "learning_rate": 2.009523260148652e-07,
      "loss": 2.2391,
      "step": 1365
    },
    {
      "epoch": 1.3894523326572008,
      "grad_norm": 1.6917956046319294,
      "learning_rate": 1.977702371799498e-07,
      "loss": 2.2973,
      "step": 1370
    },
    {
      "epoch": 1.394523326572008,
      "grad_norm": 1.7504566996070137,
      "learning_rate": 1.946492463140869e-07,
      "loss": 2.3102,
      "step": 1375
    },
    {
      "epoch": 1.3995943204868153,
      "grad_norm": 1.838843879022522,
      "learning_rate": 1.9158829557714903e-07,
      "loss": 2.2819,
      "step": 1380
    },
    {
      "epoch": 1.4046653144016228,
      "grad_norm": 1.7034157869918263,
      "learning_rate": 1.8858634344068625e-07,
      "loss": 2.2463,
      "step": 1385
    },
    {
      "epoch": 1.40973630831643,
      "grad_norm": 1.7726664220307162,
      "learning_rate": 1.8564236446742146e-07,
      "loss": 2.2458,
      "step": 1390
    },
    {
      "epoch": 1.4148073022312373,
      "grad_norm": 1.7584441947795304,
      "learning_rate": 1.8275534909329853e-07,
      "loss": 2.2663,
      "step": 1395
    },
    {
      "epoch": 1.4198782961460445,
      "grad_norm": 1.7548926938859895,
      "learning_rate": 1.7992430341207304e-07,
      "loss": 2.29,
      "step": 1400
    },
    {
      "epoch": 1.4198782961460445,
      "eval_loss": 2.4734323024749756,
      "eval_runtime": 81.002,
      "eval_samples_per_second": 86.492,
      "eval_steps_per_second": 0.679,
      "step": 1400
    },
    {
      "epoch": 1.424949290060852,
      "grad_norm": 1.691411914276979,
      "learning_rate": 1.7714824896240595e-07,
      "loss": 2.2565,
      "step": 1405
    },
    {
      "epoch": 1.4300202839756593,
      "grad_norm": 1.7523279327159709,
      "learning_rate": 1.7442622251745125e-07,
      "loss": 2.2582,
      "step": 1410
    },
    {
      "epoch": 1.4350912778904665,
      "grad_norm": 1.6844227513504313,
      "learning_rate": 1.717572758768978e-07,
      "loss": 2.2416,
      "step": 1415
    },
    {
      "epoch": 1.4401622718052738,
      "grad_norm": 2.2030630647830245,
      "learning_rate": 1.6914047566145662e-07,
      "loss": 2.2289,
      "step": 1420
    },
    {
      "epoch": 1.445233265720081,
      "grad_norm": 1.7795541841017355,
      "learning_rate": 1.6657490310975468e-07,
      "loss": 2.2841,
      "step": 1425
    },
    {
      "epoch": 1.4503042596348885,
      "grad_norm": 1.8134633165357201,
      "learning_rate": 1.6405965387762636e-07,
      "loss": 2.2542,
      "step": 1430
    },
    {
      "epoch": 1.4553752535496958,
      "grad_norm": 1.7604092301048675,
      "learning_rate": 1.615938378397648e-07,
      "loss": 2.2493,
      "step": 1435
    },
    {
      "epoch": 1.460446247464503,
      "grad_norm": 1.8595724042593027,
      "learning_rate": 1.5917657889372315e-07,
      "loss": 2.2484,
      "step": 1440
    },
    {
      "epoch": 1.4655172413793103,
      "grad_norm": 1.7081713686615858,
      "learning_rate": 1.568070147662311e-07,
      "loss": 2.2744,
      "step": 1445
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 14.41030902656843,
      "learning_rate": 1.5448429682181186e-07,
      "loss": 2.2609,
      "step": 1450
    },
    {
      "epoch": 1.475659229208925,
      "grad_norm": 1.7702111899429174,
      "learning_rate": 1.5220758987367309e-07,
      "loss": 2.2955,
      "step": 1455
    },
    {
      "epoch": 1.4807302231237323,
      "grad_norm": 1.7932941724173908,
      "learning_rate": 1.4997607199684964e-07,
      "loss": 2.2478,
      "step": 1460
    },
    {
      "epoch": 1.4858012170385395,
      "grad_norm": 1.7327449633169845,
      "learning_rate": 1.477889343435765e-07,
      "loss": 2.2713,
      "step": 1465
    },
    {
      "epoch": 1.4908722109533468,
      "grad_norm": 1.7047486187689578,
      "learning_rate": 1.456453809608691e-07,
      "loss": 2.2586,
      "step": 1470
    },
    {
      "epoch": 1.495943204868154,
      "grad_norm": 1.7085975289965103,
      "learning_rate": 1.4354462861028889e-07,
      "loss": 2.2602,
      "step": 1475
    },
    {
      "epoch": 1.5010141987829615,
      "grad_norm": 1.7708851051604204,
      "learning_rate": 1.414859065898731e-07,
      "loss": 2.2913,
      "step": 1480
    },
    {
      "epoch": 1.5060851926977687,
      "grad_norm": 1.6849008491575197,
      "learning_rate": 1.3946845655820588e-07,
      "loss": 2.2129,
      "step": 1485
    },
    {
      "epoch": 1.5111561866125762,
      "grad_norm": 1.6770410018579935,
      "learning_rate": 1.374915323606102e-07,
      "loss": 2.2641,
      "step": 1490
    },
    {
      "epoch": 1.5162271805273835,
      "grad_norm": 1.7333889728562109,
      "learning_rate": 1.3555439985743863e-07,
      "loss": 2.3096,
      "step": 1495
    },
    {
      "epoch": 1.5212981744421907,
      "grad_norm": 1.7381149429179856,
      "learning_rate": 1.3365633675444236e-07,
      "loss": 2.2449,
      "step": 1500
    },
    {
      "epoch": 1.526369168356998,
      "grad_norm": 1.7508604376509869,
      "learning_rate": 1.317966324351968e-07,
      "loss": 2.3006,
      "step": 1505
    },
    {
      "epoch": 1.5314401622718052,
      "grad_norm": 1.731173156378831,
      "learning_rate": 1.2997458779556342e-07,
      "loss": 2.2721,
      "step": 1510
    },
    {
      "epoch": 1.5365111561866125,
      "grad_norm": 1.7880722742651989,
      "learning_rate": 1.2818951508016706e-07,
      "loss": 2.2839,
      "step": 1515
    },
    {
      "epoch": 1.5415821501014197,
      "grad_norm": 1.766456825336907,
      "learning_rate": 1.264407377208682e-07,
      "loss": 2.2542,
      "step": 1520
    },
    {
      "epoch": 1.5466531440162272,
      "grad_norm": 1.793293076179441,
      "learning_rate": 1.2472759017720967e-07,
      "loss": 2.2345,
      "step": 1525
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 1.7255231286858488,
      "learning_rate": 1.2304941777881816e-07,
      "loss": 2.2587,
      "step": 1530
    },
    {
      "epoch": 1.556795131845842,
      "grad_norm": 1.7107497208562314,
      "learning_rate": 1.214055765697399e-07,
      "loss": 2.2587,
      "step": 1535
    },
    {
      "epoch": 1.5618661257606492,
      "grad_norm": 1.7448234273922532,
      "learning_rate": 1.197954331546911e-07,
      "loss": 2.2493,
      "step": 1540
    },
    {
      "epoch": 1.5669371196754565,
      "grad_norm": 1.713933005233849,
      "learning_rate": 1.1821836454720342e-07,
      "loss": 2.3028,
      "step": 1545
    },
    {
      "epoch": 1.5720081135902637,
      "grad_norm": 1.8430768650069782,
      "learning_rate": 1.1667375801964492e-07,
      "loss": 2.2595,
      "step": 1550
    },
    {
      "epoch": 1.577079107505071,
      "grad_norm": 1.7903141506679578,
      "learning_rate": 1.15161010955097e-07,
      "loss": 2.2555,
      "step": 1555
    },
    {
      "epoch": 1.5821501014198782,
      "grad_norm": 1.810165731715535,
      "learning_rate": 1.136795307010685e-07,
      "loss": 2.2728,
      "step": 1560
    },
    {
      "epoch": 1.5872210953346855,
      "grad_norm": 1.7357274884238136,
      "learning_rate": 1.1222873442502753e-07,
      "loss": 2.2741,
      "step": 1565
    },
    {
      "epoch": 1.592292089249493,
      "grad_norm": 1.7545984913046129,
      "learning_rate": 1.108080489717326e-07,
      "loss": 2.2609,
      "step": 1570
    },
    {
      "epoch": 1.5973630831643002,
      "grad_norm": 1.8639925458297812,
      "learning_rate": 1.0941691072234387e-07,
      "loss": 2.2349,
      "step": 1575
    },
    {
      "epoch": 1.6024340770791075,
      "grad_norm": 1.7125402909483072,
      "learning_rate": 1.080547654552963e-07,
      "loss": 2.2929,
      "step": 1580
    },
    {
      "epoch": 1.607505070993915,
      "grad_norm": 1.7300627575439524,
      "learning_rate": 1.0672106820891631e-07,
      "loss": 2.2823,
      "step": 1585
    },
    {
      "epoch": 1.6125760649087222,
      "grad_norm": 1.7190554348875562,
      "learning_rate": 1.0541528314576339e-07,
      "loss": 2.2708,
      "step": 1590
    },
    {
      "epoch": 1.6176470588235294,
      "grad_norm": 1.724918915538896,
      "learning_rate": 1.04136883418679e-07,
      "loss": 2.2491,
      "step": 1595
    },
    {
      "epoch": 1.6227180527383367,
      "grad_norm": 1.7342048226287368,
      "learning_rate": 1.0288535103852444e-07,
      "loss": 2.2566,
      "step": 1600
    },
    {
      "epoch": 1.6227180527383367,
      "eval_loss": 2.472487688064575,
      "eval_runtime": 81.0795,
      "eval_samples_per_second": 86.409,
      "eval_steps_per_second": 0.678,
      "step": 1600
    },
    {
      "epoch": 1.627789046653144,
      "grad_norm": 1.752725508386252,
      "learning_rate": 1.0166017674359012e-07,
      "loss": 2.2115,
      "step": 1605
    },
    {
      "epoch": 1.6328600405679512,
      "grad_norm": 1.7053034674622713,
      "learning_rate": 1.0046085987065856e-07,
      "loss": 2.2349,
      "step": 1610
    },
    {
      "epoch": 1.6379310344827587,
      "grad_norm": 1.6910767224745546,
      "learning_rate": 9.928690822770361e-08,
      "loss": 2.2661,
      "step": 1615
    },
    {
      "epoch": 1.643002028397566,
      "grad_norm": 1.9415101732879068,
      "learning_rate": 9.81378379682085e-08,
      "loss": 2.2355,
      "step": 1620
    },
    {
      "epoch": 1.6480730223123732,
      "grad_norm": 1.7692640477521646,
      "learning_rate": 9.70131734670856e-08,
      "loss": 2.2605,
      "step": 1625
    },
    {
      "epoch": 1.6531440162271807,
      "grad_norm": 1.7825871200246013,
      "learning_rate": 9.59124471981808e-08,
      "loss": 2.2842,
      "step": 1630
    },
    {
      "epoch": 1.658215010141988,
      "grad_norm": 1.805395258521555,
      "learning_rate": 9.483519961334607e-08,
      "loss": 2.2543,
      "step": 1635
    },
    {
      "epoch": 1.6632860040567952,
      "grad_norm": 1.7151309029731219,
      "learning_rate": 9.378097902306157e-08,
      "loss": 2.2507,
      "step": 1640
    },
    {
      "epoch": 1.6683569979716024,
      "grad_norm": 1.7662462146082336,
      "learning_rate": 9.274934147859458e-08,
      "loss": 2.2822,
      "step": 1645
    },
    {
      "epoch": 1.6734279918864097,
      "grad_norm": 1.7065430440445857,
      "learning_rate": 9.173985065567343e-08,
      "loss": 2.2727,
      "step": 1650
    },
    {
      "epoch": 1.678498985801217,
      "grad_norm": 1.8167004072102202,
      "learning_rate": 9.075207773966592e-08,
      "loss": 2.2582,
      "step": 1655
    },
    {
      "epoch": 1.6835699797160242,
      "grad_norm": 1.7276973068156511,
      "learning_rate": 8.978560131224021e-08,
      "loss": 2.2451,
      "step": 1660
    },
    {
      "epoch": 1.6886409736308317,
      "grad_norm": 1.7787413203893692,
      "learning_rate": 8.88400072394981e-08,
      "loss": 2.2421,
      "step": 1665
    },
    {
      "epoch": 1.693711967545639,
      "grad_norm": 0.8868153668800921,
      "learning_rate": 8.791488856155857e-08,
      "loss": 2.2354,
      "step": 1670
    },
    {
      "epoch": 1.6987829614604464,
      "grad_norm": 1.6998265742091707,
      "learning_rate": 8.700984538358205e-08,
      "loss": 2.264,
      "step": 1675
    },
    {
      "epoch": 1.7038539553752536,
      "grad_norm": 1.7045446815412617,
      "learning_rate": 8.612448476821393e-08,
      "loss": 2.2775,
      "step": 1680
    },
    {
      "epoch": 1.708924949290061,
      "grad_norm": 1.7898247009022359,
      "learning_rate": 8.525842062943714e-08,
      "loss": 2.2733,
      "step": 1685
    },
    {
      "epoch": 1.7139959432048681,
      "grad_norm": 1.7604334600933766,
      "learning_rate": 8.441127362781345e-08,
      "loss": 2.2704,
      "step": 1690
    },
    {
      "epoch": 1.7190669371196754,
      "grad_norm": 1.8108867949678853,
      "learning_rate": 8.358267106710315e-08,
      "loss": 2.2626,
      "step": 1695
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 1.6881452920332736,
      "learning_rate": 8.277224679224312e-08,
      "loss": 2.2694,
      "step": 1700
    },
    {
      "epoch": 1.72920892494929,
      "grad_norm": 1.7530216839199022,
      "learning_rate": 8.197964108867328e-08,
      "loss": 2.2622,
      "step": 1705
    },
    {
      "epoch": 1.7342799188640974,
      "grad_norm": 1.7278497657123897,
      "learning_rate": 8.12045005829916e-08,
      "loss": 2.2471,
      "step": 1710
    },
    {
      "epoch": 1.7393509127789046,
      "grad_norm": 1.8213327178561642,
      "learning_rate": 8.044647814492792e-08,
      "loss": 2.2313,
      "step": 1715
    },
    {
      "epoch": 1.744421906693712,
      "grad_norm": 1.8304362576609268,
      "learning_rate": 7.970523279061717e-08,
      "loss": 2.2738,
      "step": 1720
    },
    {
      "epoch": 1.7494929006085194,
      "grad_norm": 1.7718300765439339,
      "learning_rate": 7.898042958716228e-08,
      "loss": 2.2308,
      "step": 1725
    },
    {
      "epoch": 1.7545638945233266,
      "grad_norm": 1.7305535723288619,
      "learning_rate": 7.827173955846786e-08,
      "loss": 2.2513,
      "step": 1730
    },
    {
      "epoch": 1.7596348884381339,
      "grad_norm": 1.7402125464421778,
      "learning_rate": 7.757883959233495e-08,
      "loss": 2.2429,
      "step": 1735
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 1.8175975710441392,
      "learning_rate": 7.690141234879847e-08,
      "loss": 2.288,
      "step": 1740
    },
    {
      "epoch": 1.7697768762677484,
      "grad_norm": 1.851991292226803,
      "learning_rate": 7.623914616969753e-08,
      "loss": 2.2644,
      "step": 1745
    },
    {
      "epoch": 1.7748478701825556,
      "grad_norm": 1.6602366231900278,
      "learning_rate": 7.559173498946088e-08,
      "loss": 2.2733,
      "step": 1750
    },
    {
      "epoch": 1.779918864097363,
      "grad_norm": 1.7034994512549433,
      "learning_rate": 7.495887824709769e-08,
      "loss": 2.2674,
      "step": 1755
    },
    {
      "epoch": 1.7849898580121704,
      "grad_norm": 1.7102833212058115,
      "learning_rate": 7.434028079937624e-08,
      "loss": 2.2752,
      "step": 1760
    },
    {
      "epoch": 1.7900608519269778,
      "grad_norm": 2.1016603731428067,
      "learning_rate": 7.373565283518085e-08,
      "loss": 2.2726,
      "step": 1765
    },
    {
      "epoch": 1.795131845841785,
      "grad_norm": 1.7876491597075783,
      "learning_rate": 7.314470979103019e-08,
      "loss": 2.2188,
      "step": 1770
    },
    {
      "epoch": 1.8002028397565923,
      "grad_norm": 1.7984832581935817,
      "learning_rate": 7.256717226774701e-08,
      "loss": 2.2772,
      "step": 1775
    },
    {
      "epoch": 1.8052738336713996,
      "grad_norm": 1.7621637378160073,
      "learning_rate": 7.200276594826329e-08,
      "loss": 2.2466,
      "step": 1780
    },
    {
      "epoch": 1.8103448275862069,
      "grad_norm": 1.7255493399444854,
      "learning_rate": 7.145122151655066e-08,
      "loss": 2.2633,
      "step": 1785
    },
    {
      "epoch": 1.815415821501014,
|
"grad_norm": 1.7774418294615342, |
|
"learning_rate": 7.101906869364121e-08, |
|
"loss": 2.2966, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.8204868154158214, |
|
"grad_norm": 1.7397631305330485, |
|
"learning_rate": 7.049001264123894e-08, |
|
"loss": 2.2644, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.8255578093306288, |
|
"grad_norm": 1.7641738767791946, |
|
"learning_rate": 6.997309032084255e-08, |
|
"loss": 2.3052, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.8255578093306288, |
|
"eval_loss": 2.4720866680145264, |
|
"eval_runtime": 81.0596, |
|
"eval_samples_per_second": 86.43, |
|
"eval_steps_per_second": 0.679, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.830628803245436, |
|
"grad_norm": 1.730995593445214, |
|
"learning_rate": 6.946805070044455e-08, |
|
"loss": 2.2748, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.8356997971602436, |
|
"grad_norm": 1.708076665562477, |
|
"learning_rate": 6.897464737518235e-08, |
|
"loss": 2.2709, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.8407707910750508, |
|
"grad_norm": 1.7961247246527527, |
|
"learning_rate": 6.849263849253629e-08, |
|
"loss": 2.2756, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.845841784989858, |
|
"grad_norm": 1.7873259024447121, |
|
"learning_rate": 6.802178667856782e-08, |
|
"loss": 2.2619, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.8509127789046653, |
|
"grad_norm": 1.7208578483390204, |
|
"learning_rate": 6.756185896518329e-08, |
|
"loss": 2.2563, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.8559837728194726, |
|
"grad_norm": 1.6824119656694438, |
|
"learning_rate": 6.711262671841385e-08, |
|
"loss": 2.2524, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.8610547667342798, |
|
"grad_norm": 1.717042060961093, |
|
"learning_rate": 6.667386556769717e-08, |
|
"loss": 2.3135, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.866125760649087, |
|
"grad_norm": 1.736419652896857, |
|
"learning_rate": 6.624535533615173e-08, |
|
"loss": 2.288, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.8711967545638946, |
|
"grad_norm": 1.75637188785577, |
|
"learning_rate": 6.582687997182971e-08, |
|
"loss": 2.2392, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.8762677484787018, |
|
"grad_norm": 1.7282509939601418, |
|
"learning_rate": 6.54182274799391e-08, |
|
"loss": 2.2662, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.8813387423935093, |
|
"grad_norm": 1.7060962855685544, |
|
"learning_rate": 6.501918985602177e-08, |
|
"loss": 2.2935, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.8864097363083165, |
|
"grad_norm": 1.7581616823404618, |
|
"learning_rate": 6.462956302007797e-08, |
|
"loss": 2.2478, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.8914807302231238, |
|
"grad_norm": 1.7987997676993257, |
|
"learning_rate": 6.424914675162432e-08, |
|
"loss": 2.2853, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.896551724137931, |
|
"grad_norm": 1.7116689993633696, |
|
"learning_rate": 6.387774462567602e-08, |
|
"loss": 2.2503, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.9016227180527383, |
|
"grad_norm": 1.7086258587789072, |
|
"learning_rate": 6.351516394964051e-08, |
|
"loss": 2.2822, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.9066937119675456, |
|
"grad_norm": 1.8235148496074345, |
|
"learning_rate": 6.31612157011135e-08, |
|
"loss": 2.2879, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.9117647058823528, |
|
"grad_norm": 1.7448709638927917, |
|
"learning_rate": 6.281571446656485e-08, |
|
"loss": 2.2586, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.9168356997971603, |
|
"grad_norm": 1.7421662505581106, |
|
"learning_rate": 6.247847838090545e-08, |
|
"loss": 2.2791, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.9219066937119675, |
|
"grad_norm": 1.825830026911039, |
|
"learning_rate": 6.21493290679226e-08, |
|
"loss": 2.2385, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.9269776876267748, |
|
"grad_norm": 1.796187481606512, |
|
"learning_rate": 6.182809158157558e-08, |
|
"loss": 2.2756, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.9320486815415823, |
|
"grad_norm": 1.7552941496595575, |
|
"learning_rate": 6.151459434813879e-08, |
|
"loss": 2.2587, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.9371196754563895, |
|
"grad_norm": 1.7522494947057408, |
|
"learning_rate": 6.120866910918446e-08, |
|
"loss": 2.2585, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.9421906693711968, |
|
"grad_norm": 1.7522459962159465, |
|
"learning_rate": 6.091015086539273e-08, |
|
"loss": 2.251, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.947261663286004, |
|
"grad_norm": 1.702096284758162, |
|
"learning_rate": 6.061887782118077e-08, |
|
"loss": 2.285, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.9523326572008113, |
|
"grad_norm": 1.7643281133012019, |
|
"learning_rate": 6.033469133013957e-08, |
|
"loss": 2.2846, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.9574036511156185, |
|
"grad_norm": 1.6926355627529537, |
|
"learning_rate": 6.005743584126981e-08, |
|
"loss": 2.2124, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.962474645030426, |
|
"grad_norm": 1.6991484085258466, |
|
"learning_rate": 5.984051918509233e-08, |
|
"loss": 2.2919, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.9675456389452333, |
|
"grad_norm": 1.6959402402475394, |
|
"learning_rate": 5.957535718971899e-08, |
|
"loss": 2.2133, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.9726166328600405, |
|
"grad_norm": 1.7435422008262311, |
|
"learning_rate": 5.931670667334593e-08, |
|
"loss": 2.2272, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.977687626774848, |
|
"grad_norm": 1.7235339509485863, |
|
"learning_rate": 5.906442337098544e-08, |
|
"loss": 2.2566, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.9827586206896552, |
|
"grad_norm": 1.8046591422600013, |
|
"learning_rate": 5.881836586579961e-08, |
|
"loss": 2.295, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.9878296146044625, |
|
"grad_norm": 1.8447312096680564, |
|
"learning_rate": 5.8578395539777033e-08, |
|
"loss": 2.29, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.9929006085192698, |
|
"grad_norm": 1.6943108398877464, |
|
"learning_rate": 5.834437652514426e-08, |
|
"loss": 2.2188, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.997971602434077, |
|
"grad_norm": 1.7174652428188777, |
|
"learning_rate": 5.811617565650129e-08, |
|
"loss": 2.2692, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.0030425963488843, |
|
"grad_norm": 1.6831299340128894, |
|
"learning_rate": 5.7893662423673665e-08, |
|
"loss": 2.2025, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.0081135902636915, |
|
"grad_norm": 1.826795197065323, |
|
"learning_rate": 5.767670892527061e-08, |
|
"loss": 2.2579, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.0131845841784988, |
|
"grad_norm": 1.7520235012361185, |
|
"learning_rate": 5.746518982294192e-08, |
|
"loss": 2.2388, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 2.0182555780933065, |
|
"grad_norm": 1.8440219249964744, |
|
"learning_rate": 5.72589822963234e-08, |
|
"loss": 2.2582, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.0233265720081137, |
|
"grad_norm": 1.7151060194819, |
|
"learning_rate": 5.705796599866345e-08, |
|
"loss": 2.2156, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 2.028397565922921, |
|
"grad_norm": 1.7333738899068507, |
|
"learning_rate": 5.686202301312118e-08, |
|
"loss": 2.2702, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.028397565922921, |
|
"eval_loss": 2.4733877182006836, |
|
"eval_runtime": 81.1205, |
|
"eval_samples_per_second": 86.365, |
|
"eval_steps_per_second": 0.678, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.0334685598377282, |
|
"grad_norm": 1.7637474983877708, |
|
"learning_rate": 5.667103780972823e-08, |
|
"loss": 2.2378, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 2.0385395537525355, |
|
"grad_norm": 1.7730571315134518, |
|
"learning_rate": 5.648489720300554e-08, |
|
"loss": 2.2513, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.0436105476673427, |
|
"grad_norm": 1.774271074894755, |
|
"learning_rate": 5.630349031022691e-08, |
|
"loss": 2.2518, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 2.04868154158215, |
|
"grad_norm": 1.6997020509374097, |
|
"learning_rate": 5.6126708510320976e-08, |
|
"loss": 2.2464, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.0537525354969572, |
|
"grad_norm": 1.7833382557650153, |
|
"learning_rate": 5.595444540340353e-08, |
|
"loss": 2.2317, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.0588235294117645, |
|
"grad_norm": 1.7296871432561252, |
|
"learning_rate": 5.578659677093205e-08, |
|
"loss": 2.231, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.063894523326572, |
|
"grad_norm": 1.7166463945290173, |
|
"learning_rate": 5.562306053647459e-08, |
|
"loss": 2.2347, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.0689655172413794, |
|
"grad_norm": 1.7948324654757548, |
|
"learning_rate": 5.546373672708482e-08, |
|
"loss": 2.2458, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.0740365111561867, |
|
"grad_norm": 1.745646645076283, |
|
"learning_rate": 5.530852743527571e-08, |
|
"loss": 2.2504, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.079107505070994, |
|
"grad_norm": 1.7778201657756552, |
|
"learning_rate": 5.515733678158393e-08, |
|
"loss": 2.26, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.084178498985801, |
|
"grad_norm": 1.7226724662159607, |
|
"learning_rate": 5.5010070877717374e-08, |
|
"loss": 2.24, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.0892494929006085, |
|
"grad_norm": 1.737085412071484, |
|
"learning_rate": 5.486663779027808e-08, |
|
"loss": 2.2138, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.0943204868154157, |
|
"grad_norm": 1.7680067007098665, |
|
"learning_rate": 5.4726947505053265e-08, |
|
"loss": 2.2688, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.099391480730223, |
|
"grad_norm": 1.7414742255329991, |
|
"learning_rate": 5.459091189186688e-08, |
|
"loss": 2.2591, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.1044624746450302, |
|
"grad_norm": 1.7804223600059563, |
|
"learning_rate": 5.4458444669984314e-08, |
|
"loss": 2.2337, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.109533468559838, |
|
"grad_norm": 1.7481822321590552, |
|
"learning_rate": 5.432946137406314e-08, |
|
"loss": 2.2792, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.114604462474645, |
|
"grad_norm": 1.7497391573214505, |
|
"learning_rate": 5.420387932064249e-08, |
|
"loss": 2.2927, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 2.1196754563894524, |
|
"grad_norm": 1.7279168540890797, |
|
"learning_rate": 5.408161757516413e-08, |
|
"loss": 2.2451, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.1247464503042597, |
|
"grad_norm": 1.7394662730899328, |
|
"learning_rate": 5.396259691951805e-08, |
|
"loss": 2.2424, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 2.129817444219067, |
|
"grad_norm": 1.77875077601377, |
|
"learning_rate": 5.384673982010568e-08, |
|
"loss": 2.2402, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.134888438133874, |
|
"grad_norm": 1.7319261658863345, |
|
"learning_rate": 5.373397039641377e-08, |
|
"loss": 2.2287, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 2.1399594320486814, |
|
"grad_norm": 1.751571162082358, |
|
"learning_rate": 5.362421439009217e-08, |
|
"loss": 2.2334, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.1450304259634887, |
|
"grad_norm": 1.8093044605440316, |
|
"learning_rate": 5.351739913452874e-08, |
|
"loss": 2.271, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 2.150101419878296, |
|
"grad_norm": 1.8469881188013633, |
|
"learning_rate": 5.341345352491468e-08, |
|
"loss": 2.2284, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.1551724137931036, |
|
"grad_norm": 1.7711139740473771, |
|
"learning_rate": 5.331230798879373e-08, |
|
"loss": 2.2644, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 2.160243407707911, |
|
"grad_norm": 1.7271859975777568, |
|
"learning_rate": 5.3213894457088646e-08, |
|
"loss": 2.2378, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.165314401622718, |
|
"grad_norm": 1.8925272013685321, |
|
"learning_rate": 5.3118146335598536e-08, |
|
"loss": 2.265, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 2.1703853955375254, |
|
"grad_norm": 1.7527393142771752, |
|
"learning_rate": 5.3024998476960626e-08, |
|
"loss": 2.2183, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.1754563894523327, |
|
"grad_norm": 1.7698628867396988, |
|
"learning_rate": 5.293438715307019e-08, |
|
"loss": 2.233, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 2.18052738336714, |
|
"grad_norm": 1.724950058777004, |
|
"learning_rate": 5.2846250027952295e-08, |
|
"loss": 2.249, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.185598377281947, |
|
"grad_norm": 1.9072718835854334, |
|
"learning_rate": 5.276052613107927e-08, |
|
"loss": 2.2342, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 2.1906693711967544, |
|
"grad_norm": 1.7983471937343785, |
|
"learning_rate": 5.2677155831127696e-08, |
|
"loss": 2.2707, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.1957403651115617, |
|
"grad_norm": 1.7092533410568467, |
|
"learning_rate": 5.259608081016899e-08, |
|
"loss": 2.2479, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 2.2008113590263694, |
|
"grad_norm": 1.7921254707864127, |
|
"learning_rate": 5.2517244038287416e-08, |
|
"loss": 2.229, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.2058823529411766, |
|
"grad_norm": 1.75489401951672, |
|
"learning_rate": 5.244058974861976e-08, |
|
"loss": 2.2772, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 2.210953346855984, |
|
"grad_norm": 1.8175479517709452, |
|
"learning_rate": 5.236606341281078e-08, |
|
"loss": 2.2356, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.216024340770791, |
|
"grad_norm": 1.808556074117745, |
|
"learning_rate": 5.229361171687859e-08, |
|
"loss": 2.2553, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 2.2210953346855984, |
|
"grad_norm": 1.7664667006627157, |
|
"learning_rate": 5.2223182537484316e-08, |
|
"loss": 2.2719, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.2261663286004056, |
|
"grad_norm": 1.7502392717778497, |
|
"learning_rate": 5.2154724918600314e-08, |
|
"loss": 2.2583, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.231237322515213, |
|
"grad_norm": 1.7242967584463027, |
|
"learning_rate": 5.208818904857144e-08, |
|
"loss": 2.2411, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.231237322515213, |
|
"eval_loss": 2.474597930908203, |
|
"eval_runtime": 81.0438, |
|
"eval_samples_per_second": 86.447, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.23630831643002, |
|
"grad_norm": 1.760326712726159, |
|
"learning_rate": 5.202352623756371e-08, |
|
"loss": 2.2356, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.2413793103448274, |
|
"grad_norm": 1.7625638663030738, |
|
"learning_rate": 5.1960688895395006e-08, |
|
"loss": 2.2441, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.2464503042596347, |
|
"grad_norm": 1.7518142596486186, |
|
"learning_rate": 5.189963050974238e-08, |
|
"loss": 2.2674, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.2515212981744424, |
|
"grad_norm": 1.8040378121090448, |
|
"learning_rate": 5.184030562472053e-08, |
|
"loss": 2.2233, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.2565922920892496, |
|
"grad_norm": 1.769147010660197, |
|
"learning_rate": 5.1782669819826294e-08, |
|
"loss": 2.2445, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.261663286004057, |
|
"grad_norm": 1.802360281392845, |
|
"learning_rate": 5.1726679689243875e-08, |
|
"loss": 2.234, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.266734279918864, |
|
"grad_norm": 1.763707867667644, |
|
"learning_rate": 5.1672292821505586e-08, |
|
"loss": 2.2132, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.2718052738336714, |
|
"grad_norm": 1.75034581686763, |
|
"learning_rate": 5.161946777950308e-08, |
|
"loss": 2.2381, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.2768762677484786, |
|
"grad_norm": 1.7401836199474783, |
|
"learning_rate": 5.1568164080844036e-08, |
|
"loss": 2.2416, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.281947261663286, |
|
"grad_norm": 1.7713650977668527, |
|
"learning_rate": 5.1518342178549174e-08, |
|
"loss": 2.224, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.287018255578093, |
|
"grad_norm": 1.7671231076913356, |
|
"learning_rate": 5.146996344208486e-08, |
|
"loss": 2.2183, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.292089249492901, |
|
"grad_norm": 1.7464419032652747, |
|
"learning_rate": 5.142299013872629e-08, |
|
"loss": 2.2419, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.297160243407708, |
|
"grad_norm": 1.7990294085116565, |
|
"learning_rate": 5.1377385415246445e-08, |
|
"loss": 2.2311, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.3022312373225153, |
|
"grad_norm": 1.7543351264072877, |
|
"learning_rate": 5.1333113279926185e-08, |
|
"loss": 2.238, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.3073022312373226, |
|
"grad_norm": 1.6898279670163325, |
|
"learning_rate": 5.129013858488057e-08, |
|
"loss": 2.2308, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.31237322515213, |
|
"grad_norm": 1.7334567047607963, |
|
"learning_rate": 5.124842700869695e-08, |
|
"loss": 2.3031, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.317444219066937, |
|
"grad_norm": 1.760983319309442, |
|
"learning_rate": 5.120794503938012e-08, |
|
"loss": 2.2455, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.3225152129817443, |
|
"grad_norm": 1.7621675205518297, |
|
"learning_rate": 5.116865995760006e-08, |
|
"loss": 2.228, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.3275862068965516, |
|
"grad_norm": 1.8080633887862172, |
|
"learning_rate": 5.113053982023768e-08, |
|
"loss": 2.284, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.332657200811359, |
|
"grad_norm": 1.7592998081055247, |
|
"learning_rate": 5.1093553444224286e-08, |
|
"loss": 2.2196, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.337728194726166, |
|
"grad_norm": 1.7831607571885368, |
|
"learning_rate": 5.105767039067024e-08, |
|
"loss": 2.269, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.342799188640974, |
|
"grad_norm": 1.7176459519033709, |
|
"learning_rate": 5.102286094927856e-08, |
|
"loss": 2.2435, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.347870182555781, |
|
"grad_norm": 1.7512756209003166, |
|
"learning_rate": 5.098909612303925e-08, |
|
"loss": 2.2579, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.3529411764705883, |
|
"grad_norm": 1.7419259056225642, |
|
"learning_rate": 5.095634761319991e-08, |
|
"loss": 2.268, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.3580121703853956, |
|
"grad_norm": 1.7461469979215953, |
|
"learning_rate": 5.092458780450876e-08, |
|
"loss": 2.2252, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.363083164300203, |
|
"grad_norm": 1.745083473021831, |
|
"learning_rate": 5.089378975072569e-08, |
|
"loss": 2.2591, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.36815415821501, |
|
"grad_norm": 1.8343705825023535, |
|
"learning_rate": 5.086392716039744e-08, |
|
"loss": 2.2626, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.3732251521298173, |
|
"grad_norm": 1.7515682941502182, |
|
"learning_rate": 5.0834974382892763e-08, |
|
"loss": 2.2378, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.3782961460446246, |
|
"grad_norm": 1.772483228062822, |
|
"learning_rate": 5.080690639469371e-08, |
|
"loss": 2.2906, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.3833671399594323, |
|
"grad_norm": 1.8298309311035177, |
|
"learning_rate": 5.077969878593903e-08, |
|
"loss": 2.2782, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.3884381338742395, |
|
"grad_norm": 1.778228901931638, |
|
"learning_rate": 5.0753327747215805e-08, |
|
"loss": 2.2687, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.393509127789047, |
|
"grad_norm": 1.9355725485663295, |
|
"learning_rate": 5.0727770056595594e-08, |
|
"loss": 2.25, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.398580121703854, |
|
"grad_norm": 1.7876677525732199, |
|
"learning_rate": 5.070300306691114e-08, |
|
"loss": 2.2811, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.4036511156186613, |
|
"grad_norm": 1.766450812020173, |
|
"learning_rate": 5.067900469327011e-08, |
|
"loss": 2.265, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.4087221095334685, |
|
"grad_norm": 1.6988211316677768, |
|
"learning_rate": 5.065575340080193e-08, |
|
"loss": 2.2458, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.413793103448276, |
|
"grad_norm": 1.777565241311822, |
|
"learning_rate": 5.063322819263436e-08, |
|
"loss": 2.289, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.418864097363083, |
|
"grad_norm": 1.766648317811343, |
|
"learning_rate": 5.061140859809592e-08, |
|
"loss": 2.2263, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.4239350912778903, |
|
"grad_norm": 1.760808570512941, |
|
"learning_rate": 5.059027466114087e-08, |
|
"loss": 2.2371, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.4290060851926976, |
|
"grad_norm": 1.7497881623660254, |
|
"learning_rate": 5.056980692899308e-08, |
|
"loss": 2.2186, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.4340770791075053, |
|
"grad_norm": 1.904368651484495, |
|
"learning_rate": 5.0549986441005356e-08, |
|
"loss": 2.2413, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.4340770791075053, |
|
"eval_loss": 2.4748759269714355, |
|
"eval_runtime": 81.0832, |
|
"eval_samples_per_second": 86.405, |
|
"eval_steps_per_second": 0.678, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.4391480730223125, |
|
"grad_norm": 1.7410363640013542, |
|
"learning_rate": 5.053079471773089e-08, |
|
"loss": 2.2531, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.4442190669371198, |
|
"grad_norm": 1.7518018775000213, |
|
"learning_rate": 5.0512213750203305e-08, |
|
"loss": 2.2473, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.449290060851927, |
|
"grad_norm": 1.7662222396602074, |
|
"learning_rate": 5.049422598942212e-08, |
|
"loss": 2.2389, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.4543610547667343, |
|
"grad_norm": 1.780666367007688, |
|
"learning_rate": 5.0476814336040274e-08, |
|
"loss": 2.197, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.4594320486815415, |
|
"grad_norm": 1.7499711395815145, |
|
"learning_rate": 5.04599621302504e-08, |
|
"loss": 2.2261, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.464503042596349, |
|
"grad_norm": 1.7882713122146334, |
|
"learning_rate": 5.04436531418668e-08, |
|
"loss": 2.2393, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.469574036511156, |
|
"grad_norm": 1.75643986036064, |
|
"learning_rate": 5.042787156059982e-08, |
|
"loss": 2.2439, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.4746450304259637, |
|
"grad_norm": 1.7353199942499, |
|
"learning_rate": 5.041260198651953e-08, |
|
"loss": 2.2275, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.479716024340771, |
|
"grad_norm": 1.7683236873580634, |
|
"learning_rate": 5.039782942070575e-08, |
|
"loss": 2.2378, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.4847870182555782, |
|
"grad_norm": 1.7482878827223234, |
|
"learning_rate": 5.038353925608112e-08, |
|
"loss": 2.2655, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.4898580121703855, |
|
"grad_norm": 1.7553465772492238, |
|
"learning_rate": 5.036971726842454e-08, |
|
"loss": 2.2509, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.4949290060851927, |
|
"grad_norm": 1.7194051175937297, |
|
"learning_rate": 5.035634960756173e-08, |
|
"loss": 2.2246, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.780820717878673, |
|
"learning_rate": 5.0345973520341744e-08, |
|
"loss": 2.3116, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.5050709939148073, |
|
"grad_norm": 1.7092302368812895, |
|
"learning_rate": 5.0333389906255366e-08, |
|
"loss": 2.2434, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.5101419878296145, |
|
"grad_norm": 1.6995993050400164, |
|
"learning_rate": 5.03212237555571e-08, |
|
"loss": 2.234, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.5152129817444218, |
|
"grad_norm": 1.7916125090755124, |
|
"learning_rate": 5.030946256214713e-08, |
|
"loss": 2.2365, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.520283975659229, |
|
"grad_norm": 1.743409123646943, |
|
"learning_rate": 5.0298094154063516e-08, |
|
"loss": 2.2778, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.5253549695740363, |
|
"grad_norm": 1.7989761193864806, |
|
"learning_rate": 5.028710668564437e-08, |
|
"loss": 2.2698, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.530425963488844, |
|
"grad_norm": 1.768436463277154, |
|
"learning_rate": 5.027648862984817e-08, |
|
"loss": 2.2295, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.535496957403651, |
|
"grad_norm": 1.7762161444449078, |
|
"learning_rate": 5.026622877072948e-08, |
|
"loss": 2.2772, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.5405679513184585, |
|
"grad_norm": 1.7325943514517332, |
|
"learning_rate": 5.0256316196067565e-08, |
|
"loss": 2.2326, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.5456389452332657, |
|
"grad_norm": 1.7568007182157335, |
|
"learning_rate": 5.024674029014512e-08, |
|
"loss": 2.2575, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.550709939148073, |
|
"grad_norm": 1.7465474101311085, |
|
"learning_rate": 5.023749072667476e-08, |
|
"loss": 2.2398, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.5557809330628802, |
|
"grad_norm": 1.7105972624166814, |
|
"learning_rate": 5.022855746187064e-08, |
|
"loss": 2.2348, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.5608519269776875, |
|
"grad_norm": 1.759196327867933, |
|
"learning_rate": 5.021993072766265e-08, |
|
"loss": 2.2302, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.565922920892495, |
|
"grad_norm": 1.7618696598564434, |
|
"learning_rate": 5.0211601025050875e-08, |
|
"loss": 2.2783, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.5709939148073024, |
|
"grad_norm": 1.7357397604845723, |
|
"learning_rate": 5.020355911759782e-08, |
|
"loss": 2.2399, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.5760649087221097, |
|
"grad_norm": 1.7797963559349856, |
|
"learning_rate": 5.019579602505595e-08, |
|
"loss": 2.3119, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.581135902636917, |
|
"grad_norm": 1.7476476267237637, |
|
"learning_rate": 5.0188303017128396e-08, |
|
"loss": 2.2362, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.586206896551724, |
|
"grad_norm": 1.7871655712678034, |
|
"learning_rate": 5.018107160736018e-08, |
|
"loss": 2.2684, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.5912778904665315, |
|
"grad_norm": 1.8564365985849263, |
|
"learning_rate": 5.0174093547158035e-08, |
|
"loss": 2.2683, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.5963488843813387, |
|
"grad_norm": 1.7498854511370805, |
|
"learning_rate": 5.016736081993624e-08, |
|
"loss": 2.2518, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.601419878296146, |
|
"grad_norm": 1.7966977533010748, |
|
"learning_rate": 5.016086563538651e-08, |
|
"loss": 2.2218, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.606490872210953, |
|
"grad_norm": 1.7558979371137615, |
|
"learning_rate": 5.015460042386951e-08, |
|
"loss": 2.2658, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.6115618661257605, |
|
"grad_norm": 1.7805268954368878, |
|
"learning_rate": 5.014855783092602e-08, |
|
"loss": 2.2324, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.6166328600405677, |
|
"grad_norm": 1.7547744035144406, |
|
"learning_rate": 5.0142730711905564e-08, |
|
"loss": 2.2635, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.6217038539553754, |
|
"grad_norm": 1.7892043381738651, |
|
"learning_rate": 5.013711212671024e-08, |
|
"loss": 2.2174, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.6267748478701827, |
|
"grad_norm": 1.7661048483256172, |
|
"learning_rate": 5.013169533465201e-08, |
|
"loss": 2.2411, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.63184584178499, |
|
"grad_norm": 1.7714992602824393, |
|
"learning_rate": 5.012647378942108e-08, |
|
"loss": 2.2379, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.636916835699797, |
|
"grad_norm": 1.757980523509378, |
|
"learning_rate": 5.0121441134163554e-08, |
|
"loss": 2.216, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.636916835699797, |
|
"eval_loss": 2.4749209880828857, |
|
"eval_runtime": 81.0391, |
|
"eval_samples_per_second": 86.452, |
|
"eval_steps_per_second": 0.679, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.6419878296146044, |
|
"grad_norm": 1.8485215916583273, |
|
"learning_rate": 5.011659119666631e-08, |
|
"loss": 2.2233, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.6470588235294117, |
|
"grad_norm": 1.7863067371305124, |
|
"learning_rate": 5.0111917984647157e-08, |
|
"loss": 2.244, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.652129817444219, |
|
"grad_norm": 1.7146816358296353, |
|
"learning_rate": 5.010741568114834e-08, |
|
"loss": 2.2351, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.6572008113590266, |
|
"grad_norm": 1.831188399230356, |
|
"learning_rate": 5.0103078640031516e-08, |
|
"loss": 2.2269, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.662271805273834, |
|
"grad_norm": 1.7724728214531387, |
|
"learning_rate": 5.009890138157231e-08, |
|
"loss": 2.2075, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.667342799188641, |
|
"grad_norm": 1.782021890238949, |
|
"learning_rate": 5.009487858815262e-08, |
|
"loss": 2.217, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.6724137931034484, |
|
"grad_norm": 1.7481328251498853, |
|
"learning_rate": 5.0091005100048845e-08, |
|
"loss": 2.2719, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.6774847870182557, |
|
"grad_norm": 1.7906104909059064, |
|
"learning_rate": 5.0087275911314286e-08, |
|
"loss": 2.236, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.682555780933063, |
|
"grad_norm": 1.7602535674283515, |
|
"learning_rate": 5.008368616575389e-08, |
|
"loss": 2.2479, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.68762677484787, |
|
"grad_norm": 1.775336072092801, |
|
"learning_rate": 5.00802311529897e-08, |
|
"loss": 2.2651, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.6926977687626774, |
|
"grad_norm": 1.7553544981528668, |
|
"learning_rate": 5.00769063046152e-08, |
|
"loss": 2.2695, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.6977687626774847, |
|
"grad_norm": 1.827043155040219, |
|
"learning_rate": 5.0073707190436947e-08, |
|
"loss": 2.2565, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.702839756592292, |
|
"grad_norm": 1.7286161050152862, |
|
"learning_rate": 5.00706295148018e-08, |
|
"loss": 2.2447, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.707910750507099, |
|
"grad_norm": 1.818175461042268, |
|
"learning_rate": 5.0067669113008144e-08, |
|
"loss": 2.2437, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.7129817444219064, |
|
"grad_norm": 1.8017061603291116, |
|
"learning_rate": 5.006482194779946e-08, |
|
"loss": 2.2557, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.718052738336714, |
|
"grad_norm": 1.7866064039916518, |
|
"learning_rate": 5.006208410593867e-08, |
|
"loss": 2.2752, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.7231237322515214, |
|
"grad_norm": 1.7655940160674672, |
|
"learning_rate": 5.0059451794861766e-08, |
|
"loss": 2.2834, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.7281947261663286, |
|
"grad_norm": 1.7936324116014108, |
|
"learning_rate": 5.005692133940906e-08, |
|
"loss": 2.2634, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.733265720081136, |
|
"grad_norm": 1.7857563825463283, |
|
"learning_rate": 5.00544891786327e-08, |
|
"loss": 2.2741, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.738336713995943, |
|
"grad_norm": 1.7472045814339527, |
|
"learning_rate": 5.005215186267882e-08, |
|
"loss": 2.2644, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.7434077079107504, |
|
"grad_norm": 1.8795177703424921, |
|
"learning_rate": 5.0049906049743e-08, |
|
"loss": 2.3007, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.7484787018255576, |
|
"grad_norm": 1.8521743861085576, |
|
"learning_rate": 5.004774850309745e-08, |
|
"loss": 2.2366, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.7535496957403653, |
|
"grad_norm": 1.7735396381086006, |
|
"learning_rate": 5.0045676088188616e-08, |
|
"loss": 2.2481, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 1.750426755759642, |
|
"learning_rate": 5.004368576980381e-08, |
|
"loss": 2.2235, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.76369168356998, |
|
"grad_norm": 1.7041388090684644, |
|
"learning_rate": 5.004177460930539e-08, |
|
"loss": 2.2231, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.768762677484787, |
|
"grad_norm": 1.8140115420681437, |
|
"learning_rate": 5.003993976193124e-08, |
|
"loss": 2.2138, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.7738336713995944, |
|
"grad_norm": 1.822513477317258, |
|
"learning_rate": 5.0038178474160234e-08, |
|
"loss": 2.2612, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.7789046653144016, |
|
"grad_norm": 1.7108014551704207, |
|
"learning_rate": 5.003648808114121e-08, |
|
"loss": 2.2464, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.783975659229209, |
|
"grad_norm": 1.7880353168056893, |
|
"learning_rate": 5.0034866004184443e-08, |
|
"loss": 2.2571, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.789046653144016, |
|
"grad_norm": 1.738078469289302, |
|
"learning_rate": 5.003330974831406e-08, |
|
"loss": 2.2712, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.7941176470588234, |
|
"grad_norm": 1.851997917147577, |
|
"learning_rate": 5.0031816899880413e-08, |
|
"loss": 2.266, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 2.7991886409736306, |
|
"grad_norm": 1.7297614602052127, |
|
"learning_rate": 5.0030385124230966e-08, |
|
"loss": 2.2423, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.804259634888438, |
|
"grad_norm": 1.8006816107770167, |
|
"learning_rate": 5.002901216343864e-08, |
|
"loss": 2.2506, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.8093306288032456, |
|
"grad_norm": 1.8037373860257597, |
|
"learning_rate": 5.002769583408638e-08, |
|
"loss": 2.2504, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.814401622718053, |
|
"grad_norm": 1.7406557827783702, |
|
"learning_rate": 5.002643402510677e-08, |
|
"loss": 2.2676, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.81947261663286, |
|
"grad_norm": 1.7784795193672072, |
|
"learning_rate": 5.0025224695675576e-08, |
|
"loss": 2.2052, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.8245436105476673, |
|
"grad_norm": 1.7627831810019972, |
|
"learning_rate": 5.002406587315805e-08, |
|
"loss": 2.2315, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.8296146044624746, |
|
"grad_norm": 1.798869752268086, |
|
"learning_rate": 5.0022955651106973e-08, |
|
"loss": 2.2436, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.834685598377282, |
|
"grad_norm": 1.712097491290732, |
|
"learning_rate": 5.00218921873112e-08, |
|
"loss": 2.274, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.839756592292089, |
|
"grad_norm": 1.8197661388888422, |
|
"learning_rate": 5.002087370189384e-08, |
|
"loss": 2.2696, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.839756592292089, |
|
"eval_loss": 2.4746689796447754, |
|
"eval_runtime": 80.933, |
|
"eval_samples_per_second": 86.565, |
|
"eval_steps_per_second": 0.68, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.844827586206897, |
|
"grad_norm": 1.7693694208988924, |
|
"learning_rate": 5.001989847545882e-08, |
|
"loss": 2.2054, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.849898580121704, |
|
"grad_norm": 1.8223549799119019, |
|
"learning_rate": 5.001896484728491e-08, |
|
"loss": 2.2656, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.8549695740365113, |
|
"grad_norm": 1.805868445642325, |
|
"learning_rate": 5.00180712135662e-08, |
|
"loss": 2.26, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 2.8600405679513186, |
|
"grad_norm": 1.7505054153674502, |
|
"learning_rate": 5.001721602569797e-08, |
|
"loss": 2.2465, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.865111561866126, |
|
"grad_norm": 1.8486977309170785, |
|
"learning_rate": 5.0016397788606984e-08, |
|
"loss": 2.2764, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.870182555780933, |
|
"grad_norm": 1.7740829866432102, |
|
"learning_rate": 5.0015615059125324e-08, |
|
"loss": 2.2303, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.8752535496957403, |
|
"grad_norm": 1.7656514305652502, |
|
"learning_rate": 5.00148664444067e-08, |
|
"loss": 2.238, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.8803245436105476, |
|
"grad_norm": 1.7634420973902674, |
|
"learning_rate": 5.001415060038435e-08, |
|
"loss": 2.2489, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.885395537525355, |
|
"grad_norm": 1.8143454888420456, |
|
"learning_rate": 5.0013466230269694e-08, |
|
"loss": 2.2607, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 2.890466531440162, |
|
"grad_norm": 1.7405623983796592, |
|
"learning_rate": 5.001281208309067e-08, |
|
"loss": 2.2677, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.8955375253549693, |
|
"grad_norm": 1.7692613071607504, |
|
"learning_rate": 5.0012186952269086e-08, |
|
"loss": 2.2499, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 2.900608519269777, |
|
"grad_norm": 1.8007487263191868, |
|
"learning_rate": 5.0011589674235926e-08, |
|
"loss": 2.277, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.9056795131845843, |
|
"grad_norm": 1.7487914626739638, |
|
"learning_rate": 5.001101912708386e-08, |
|
"loss": 2.2377, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 2.9107505070993915, |
|
"grad_norm": 1.7555747509644022, |
|
"learning_rate": 5.0010474229256126e-08, |
|
"loss": 2.2532, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.915821501014199, |
|
"grad_norm": 1.791874000591728, |
|
"learning_rate": 5.0009953938270927e-08, |
|
"loss": 2.234, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.920892494929006, |
|
"grad_norm": 1.8071787232301668, |
|
"learning_rate": 5.0009457249480536e-08, |
|
"loss": 2.2316, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.9259634888438133, |
|
"grad_norm": 1.7814343272445903, |
|
"learning_rate": 5.000898319486436e-08, |
|
"loss": 2.2427, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 2.9310344827586206, |
|
"grad_norm": 1.8248593697919109, |
|
"learning_rate": 5.000853084185513e-08, |
|
"loss": 2.2027, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.9361054766734282, |
|
"grad_norm": 1.7986268547334479, |
|
"learning_rate": 5.00080992921975e-08, |
|
"loss": 2.244, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 1.8701642658692874, |
|
"learning_rate": 5.0007687680838296e-08, |
|
"loss": 2.2341, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.9462474645030428, |
|
"grad_norm": 1.7265239787323012, |
|
"learning_rate": 5.000729517484766e-08, |
|
"loss": 2.2781, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 2.95131845841785, |
|
"grad_norm": 1.7596094154490194, |
|
"learning_rate": 5.0006920972370384e-08, |
|
"loss": 2.2184, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.9563894523326573, |
|
"grad_norm": 1.775542548895703, |
|
"learning_rate": 5.000656430160671e-08, |
|
"loss": 2.2404, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 2.9614604462474645, |
|
"grad_norm": 1.7859302210997496, |
|
"learning_rate": 5.0006224419821984e-08, |
|
"loss": 2.2567, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.9665314401622718, |
|
"grad_norm": 1.8410867262560875, |
|
"learning_rate": 5.000590061238431e-08, |
|
"loss": 2.2288, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.971602434077079, |
|
"grad_norm": 1.79261063919542, |
|
"learning_rate": 5.0005592191829755e-08, |
|
"loss": 2.2421, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.9766734279918863, |
|
"grad_norm": 1.787266539908181, |
|
"learning_rate": 5.0005298496954236e-08, |
|
"loss": 2.2713, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 2.9817444219066935, |
|
"grad_norm": 1.8046073077938924, |
|
"learning_rate": 5.000501889193161e-08, |
|
"loss": 2.2292, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.986815415821501, |
|
"grad_norm": 1.785150585134779, |
|
"learning_rate": 5.0004752765457286e-08, |
|
"loss": 2.2557, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 2.991886409736308, |
|
"grad_norm": 1.7007836630596234, |
|
"learning_rate": 5.000449952991666e-08, |
|
"loss": 2.2913, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.9969574036511157, |
|
"grad_norm": 1.7834634363941848, |
|
"learning_rate": 5.000425862057791e-08, |
|
"loss": 2.2178, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 3.002028397565923, |
|
"grad_norm": 1.7711499203458665, |
|
"learning_rate": 5.000402949480845e-08, |
|
"loss": 2.2302, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 3.0070993914807302, |
|
"grad_norm": 1.757400702100505, |
|
"learning_rate": 5.000381163131448e-08, |
|
"loss": 2.228, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 3.0121703853955375, |
|
"grad_norm": 1.7587243027978727, |
|
"learning_rate": 5.0003604529403105e-08, |
|
"loss": 2.2532, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 3.0172413793103448, |
|
"grad_norm": 1.8076763012567914, |
|
"learning_rate": 5.000340770826644e-08, |
|
"loss": 2.2812, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 3.022312373225152, |
|
"grad_norm": 1.7710168575859588, |
|
"learning_rate": 5.000322070628711e-08, |
|
"loss": 2.2227, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 3.0273833671399593, |
|
"grad_norm": 1.7518567665908418, |
|
"learning_rate": 5.0003043080364665e-08, |
|
"loss": 2.267, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 3.032454361054767, |
|
"grad_norm": 1.75371879782544, |
|
"learning_rate": 5.0002874405262365e-08, |
|
"loss": 2.2748, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 3.037525354969574, |
|
"grad_norm": 1.7604102341237111, |
|
"learning_rate": 5.000271427297382e-08, |
|
"loss": 2.244, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 3.0425963488843815, |
|
"grad_norm": 1.7473066315528492, |
|
"learning_rate": 5.0002562292108974e-08, |
|
"loss": 2.2455, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.0425963488843815, |
|
"eval_loss": 2.475208282470703, |
|
"eval_runtime": 81.0816, |
|
"eval_samples_per_second": 86.407, |
|
"eval_steps_per_second": 0.678, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.0476673427991887, |
|
"grad_norm": 1.8183626105425974, |
|
"learning_rate": 5.000241808729891e-08, |
|
"loss": 2.2598, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 3.052738336713996, |
|
"grad_norm": 1.776003383845723, |
|
"learning_rate": 5.00022812986191e-08, |
|
"loss": 2.2749, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 3.0578093306288032, |
|
"grad_norm": 1.8405505191800016, |
|
"learning_rate": 5.0002151581030434e-08, |
|
"loss": 2.2201, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 3.0628803245436105, |
|
"grad_norm": 1.7687042107524293, |
|
"learning_rate": 5.00020286038378e-08, |
|
"loss": 2.2398, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 3.0679513184584177, |
|
"grad_norm": 1.7504153888466234, |
|
"learning_rate": 5.000191205016553e-08, |
|
"loss": 2.2221, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 3.073022312373225, |
|
"grad_norm": 1.7642074409964643, |
|
"learning_rate": 5.000180161644944e-08, |
|
"loss": 2.2223, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 3.0780933062880322, |
|
"grad_norm": 1.7392036544850287, |
|
"learning_rate": 5.000169701194494e-08, |
|
"loss": 2.2192, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 3.08316430020284, |
|
"grad_norm": 1.720350344708903, |
|
"learning_rate": 5.0001597958250776e-08, |
|
"loss": 2.2315, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 3.088235294117647, |
|
"grad_norm": 1.7724706443214726, |
|
"learning_rate": 5.000150418884808e-08, |
|
"loss": 2.2501, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 3.0933062880324544, |
|
"grad_norm": 1.7924639073969963, |
|
"learning_rate": 5.000141544865421e-08, |
|
"loss": 2.2446, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 3.0983772819472617, |
|
"grad_norm": 1.736852243176053, |
|
"learning_rate": 5.000133149359102e-08, |
|
"loss": 2.2457, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 3.103448275862069, |
|
"grad_norm": 1.784090807966895, |
|
"learning_rate": 5.000125209016723e-08, |
|
"loss": 2.2521, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 3.108519269776876, |
|
"grad_norm": 1.7552195819841987, |
|
"learning_rate": 5.000117701507439e-08, |
|
"loss": 2.2331, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 3.1135902636916835, |
|
"grad_norm": 1.7588419707647238, |
|
"learning_rate": 5.0001106054796176e-08, |
|
"loss": 2.2465, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 3.1186612576064907, |
|
"grad_norm": 1.731249391051153, |
|
"learning_rate": 5.000103900523059e-08, |
|
"loss": 2.2154, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 3.123732251521298, |
|
"grad_norm": 1.86107961069035, |
|
"learning_rate": 5.0000975671324725e-08, |
|
"loss": 2.2498, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 3.1288032454361057, |
|
"grad_norm": 1.7453958505335196, |
|
"learning_rate": 5.000091586672176e-08, |
|
"loss": 2.213, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 3.133874239350913, |
|
"grad_norm": 1.739107722358469, |
|
"learning_rate": 5.000085941341981e-08, |
|
"loss": 2.2703, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 3.13894523326572, |
|
"grad_norm": 1.723031377500322, |
|
"learning_rate": 5.000080614144228e-08, |
|
"loss": 2.256, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 3.1440162271805274, |
|
"grad_norm": 1.7859618571141844, |
|
"learning_rate": 5.0000755888519526e-08, |
|
"loss": 2.2446, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.1490872210953347, |
|
"grad_norm": 1.7642645902841112, |
|
"learning_rate": 5.0000708499781274e-08, |
|
"loss": 2.2365, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 3.154158215010142, |
|
"grad_norm": 1.8188951223969028, |
|
"learning_rate": 5.000066382745973e-08, |
|
"loss": 2.2743, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 3.159229208924949, |
|
"grad_norm": 1.8017937041457348, |
|
"learning_rate": 5.000062173060291e-08, |
|
"loss": 2.2501, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 3.1643002028397564, |
|
"grad_norm": 1.7816544045204796, |
|
"learning_rate": 5.0000582074797944e-08, |
|
"loss": 2.2025, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 3.1693711967545637, |
|
"grad_norm": 1.7911385432695703, |
|
"learning_rate": 5.0000544731904076e-08, |
|
"loss": 2.2284, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 3.1744421906693714, |
|
"grad_norm": 1.9232399576032946, |
|
"learning_rate": 5.000050957979507e-08, |
|
"loss": 2.2407, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 3.1795131845841786, |
|
"grad_norm": 1.7293397524348884, |
|
"learning_rate": 5.000047650211071e-08, |
|
"loss": 2.2468, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 3.184584178498986, |
|
"grad_norm": 1.7870474846773756, |
|
"learning_rate": 5.000044538801721e-08, |
|
"loss": 2.2432, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 3.189655172413793, |
|
"grad_norm": 1.7179456705770244, |
|
"learning_rate": 5.000041613197611e-08, |
|
"loss": 2.2478, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 3.1947261663286004, |
|
"grad_norm": 1.782930312662543, |
|
"learning_rate": 5.0000388633521626e-08, |
|
"loss": 2.219, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 3.1997971602434077, |
|
"grad_norm": 1.8396726211182168, |
|
"learning_rate": 5.000036279704598e-08, |
|
"loss": 2.2131, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 3.204868154158215, |
|
"grad_norm": 1.7441223394696925, |
|
"learning_rate": 5.000033853159261e-08, |
|
"loss": 2.216, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 3.209939148073022, |
|
"grad_norm": 1.79701015495686, |
|
"learning_rate": 5.000031575065695e-08, |
|
"loss": 2.2423, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 3.2150101419878294, |
|
"grad_norm": 1.7824241551117812, |
|
"learning_rate": 5.000029437199458e-08, |
|
"loss": 2.245, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 3.220081135902637, |
|
"grad_norm": 1.7859671284571614, |
|
"learning_rate": 5.000027431743653e-08, |
|
"loss": 2.2466, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 3.2251521298174444, |
|
"grad_norm": 1.7508641392805016, |
|
"learning_rate": 5.000025551271141e-08, |
|
"loss": 2.2123, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 3.2302231237322516, |
|
"grad_norm": 1.790375251718636, |
|
"learning_rate": 5.000023788727435e-08, |
|
"loss": 2.2387, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 3.235294117647059, |
|
"grad_norm": 1.8347285573544698, |
|
"learning_rate": 5.0000221374142326e-08, |
|
"loss": 2.2024, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 3.240365111561866, |
|
"grad_norm": 1.766020664546832, |
|
"learning_rate": 5.0000205909735805e-08, |
|
"loss": 2.25, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 3.2454361054766734, |
|
"grad_norm": 1.7685652184853669, |
|
"learning_rate": 5.000019143372644e-08, |
|
"loss": 2.216, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.2454361054766734, |
|
"eval_loss": 2.475315809249878, |
|
"eval_runtime": 81.0728, |
|
"eval_samples_per_second": 86.416, |
|
"eval_steps_per_second": 0.678, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.2505070993914806, |
|
"grad_norm": 1.8114020440458831, |
|
"learning_rate": 5.000017788889067e-08, |
|
"loss": 2.2909, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 3.255578093306288, |
|
"grad_norm": 1.8044780174846506, |
|
"learning_rate": 5.0000165220969006e-08, |
|
"loss": 2.2682, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 3.260649087221095, |
|
"grad_norm": 1.8227060747974817, |
|
"learning_rate": 5.0000153378530776e-08, |
|
"loss": 2.2551, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 3.2657200811359024, |
|
"grad_norm": 1.712746112733307, |
|
"learning_rate": 5.000014231284425e-08, |
|
"loss": 2.2085, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 3.27079107505071, |
|
"grad_norm": 1.7693643379563115, |
|
"learning_rate": 5.000013197775189e-08, |
|
"loss": 2.2089, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 3.2758620689655173, |
|
"grad_norm": 1.742416891486272, |
|
"learning_rate": 5.000012232955056e-08, |
|
"loss": 2.2256, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 3.2809330628803246, |
|
"grad_norm": 1.7588332712006007, |
|
"learning_rate": 5.000011332687656e-08, |
|
"loss": 2.2411, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 3.286004056795132, |
|
"grad_norm": 1.748987632844159, |
|
"learning_rate": 5.000010493059533e-08, |
|
"loss": 2.2161, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 3.291075050709939, |
|
"grad_norm": 1.7730209178260556, |
|
"learning_rate": 5.000009710369558e-08, |
|
"loss": 2.2454, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 3.2961460446247464, |
|
"grad_norm": 1.7638994477476329, |
|
"learning_rate": 5.000008981118782e-08, |
|
"loss": 2.2762, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 3.3012170385395536, |
|
"grad_norm": 1.8306906774843352, |
|
"learning_rate": 5.000008302000705e-08, |
|
"loss": 2.2484, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 3.306288032454361, |
|
"grad_norm": 1.8155910247025784, |
|
"learning_rate": 5.0000076698919504e-08, |
|
"loss": 2.2172, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 3.3113590263691686, |
|
"grad_norm": 1.9000838772092157, |
|
"learning_rate": 5.0000070818433264e-08, |
|
"loss": 2.2639, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 3.316430020283976, |
|
"grad_norm": 1.8182257588876376, |
|
"learning_rate": 5.000006535071267e-08, |
|
"loss": 2.2302, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 3.321501014198783, |
|
"grad_norm": 1.7421030430480422, |
|
"learning_rate": 5.0000060269496374e-08, |
|
"loss": 2.2618, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 3.3265720081135903, |
|
"grad_norm": 1.7545361773998456, |
|
"learning_rate": 5.0000055550018825e-08, |
|
"loss": 2.2174, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 3.3316430020283976, |
|
"grad_norm": 1.7382589137313635, |
|
"learning_rate": 5.000005116893524e-08, |
|
"loss": 2.2497, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 3.336713995943205, |
|
"grad_norm": 1.7544110577796528, |
|
"learning_rate": 5.000004710424972e-08, |
|
"loss": 2.2386, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 3.341784989858012, |
|
"grad_norm": 1.7756370830140873, |
|
"learning_rate": 5.0000043335246576e-08, |
|
"loss": 2.2124, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 3.3468559837728193, |
|
"grad_norm": 1.7647740914276824, |
|
"learning_rate": 5.0000039842424645e-08, |
|
"loss": 2.2357, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.3519269776876266, |
|
"grad_norm": 1.7614092517536837, |
|
"learning_rate": 5.000003660743452e-08, |
|
"loss": 2.2823, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 3.356997971602434, |
|
"grad_norm": 1.7889494130903192, |
|
"learning_rate": 5.000003361301858e-08, |
|
"loss": 2.1835, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 3.3620689655172415, |
|
"grad_norm": 1.7154434994558871, |
|
"learning_rate": 5.000003084295374e-08, |
|
"loss": 2.2724, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 3.367139959432049, |
|
"grad_norm": 1.8155130093382392, |
|
"learning_rate": 5.0000028281996743e-08, |
|
"loss": 2.2823, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 3.372210953346856, |
|
"grad_norm": 1.880078020122213, |
|
"learning_rate": 5.0000025915832e-08, |
|
"loss": 2.2421, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 3.3772819472616633, |
|
"grad_norm": 1.7913171122885942, |
|
"learning_rate": 5.000002373102181e-08, |
|
"loss": 2.1806, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 3.3823529411764706, |
|
"grad_norm": 1.8110141267464457, |
|
"learning_rate": 5.000002171495887e-08, |
|
"loss": 2.2315, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 3.387423935091278, |
|
"grad_norm": 1.8187945379716748, |
|
"learning_rate": 5.000001985582107e-08, |
|
"loss": 2.2207, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 3.392494929006085, |
|
"grad_norm": 1.7822827152937282, |
|
"learning_rate": 5.000001814252828e-08, |
|
"loss": 2.2411, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 3.3975659229208923, |
|
"grad_norm": 1.7281310183638643, |
|
"learning_rate": 5.0000016564701364e-08, |
|
"loss": 2.2415, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 3.4026369168357, |
|
"grad_norm": 1.7550793470914747, |
|
"learning_rate": 5.000001511262302e-08, |
|
"loss": 2.2464, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 3.4077079107505073, |
|
"grad_norm": 1.7459578038518018, |
|
"learning_rate": 5.0000013777200565e-08, |
|
"loss": 2.2504, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 3.4127789046653145, |
|
"grad_norm": 1.740338062654503, |
|
"learning_rate": 5.000001254993049e-08, |
|
"loss": 2.2292, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 3.417849898580122, |
|
"grad_norm": 1.8005446847395141, |
|
"learning_rate": 5.000001142286484e-08, |
|
"loss": 2.2646, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 3.422920892494929, |
|
"grad_norm": 1.8075984781615184, |
|
"learning_rate": 5.000001038857911e-08, |
|
"loss": 2.2549, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 3.4279918864097363, |
|
"grad_norm": 1.7944612854944237, |
|
"learning_rate": 5.000000944014192e-08, |
|
"loss": 2.2607, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 3.4330628803245435, |
|
"grad_norm": 1.8042996177778357, |
|
"learning_rate": 5.000000857108604e-08, |
|
"loss": 2.2129, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 3.438133874239351, |
|
"grad_norm": 1.812331539187214, |
|
"learning_rate": 5.0000007775380984e-08, |
|
"loss": 2.247, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 3.443204868154158, |
|
"grad_norm": 1.7634101221518121, |
|
"learning_rate": 5.0000007047407e-08, |
|
"loss": 2.2454, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"grad_norm": 1.8137979752467785, |
|
"learning_rate": 5.000000638193037e-08, |
|
"loss": 2.2348, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.4482758620689653, |
|
"eval_loss": 2.475677013397217, |
|
"eval_runtime": 81.0429, |
|
"eval_samples_per_second": 86.448, |
|
"eval_steps_per_second": 0.679, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.453346855983773, |
|
"grad_norm": 1.7639358988388496, |
|
"learning_rate": 5.0000005774079994e-08, |
|
"loss": 2.2434, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 3.4584178498985803, |
|
"grad_norm": 1.8860372894717414, |
|
"learning_rate": 5.0000005219325215e-08, |
|
"loss": 2.2184, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 3.4634888438133875, |
|
"grad_norm": 1.792302245525526, |
|
"learning_rate": 5.000000471345483e-08, |
|
"loss": 2.2405, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 3.4685598377281948, |
|
"grad_norm": 1.7326646638681342, |
|
"learning_rate": 5.000000425255718e-08, |
|
"loss": 2.2582, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 3.473630831643002, |
|
"grad_norm": 1.7944771245301363, |
|
"learning_rate": 5.0000003833001365e-08, |
|
"loss": 2.202, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 3.4787018255578093, |
|
"grad_norm": 1.8158606522431084, |
|
"learning_rate": 5.000000345141943e-08, |
|
"loss": 2.2533, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 3.4837728194726165, |
|
"grad_norm": 1.8541024781685664, |
|
"learning_rate": 5.0000003104689555e-08, |
|
"loss": 2.2387, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 3.4888438133874238, |
|
"grad_norm": 1.7999921658917655, |
|
"learning_rate": 5.0000002789920174e-08, |
|
"loss": 2.2441, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 3.4939148073022315, |
|
"grad_norm": 1.7685774604287066, |
|
"learning_rate": 5.000000250443497e-08, |
|
"loss": 2.3018, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 3.4989858012170387, |
|
"grad_norm": 1.7777470112493552, |
|
"learning_rate": 5.000000224575872e-08, |
|
"loss": 2.2433, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 3.504056795131846, |
|
"grad_norm": 1.7748253950125374, |
|
"learning_rate": 5.000000201160396e-08, |
|
"loss": 2.2782, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 3.5091277890466532, |
|
"grad_norm": 1.7842700957790634, |
|
"learning_rate": 5.000000179985839e-08, |
|
"loss": 2.2659, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 3.5141987829614605, |
|
"grad_norm": 1.798939281745875, |
|
"learning_rate": 5.000000160857302e-08, |
|
"loss": 2.2396, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 3.5192697768762677, |
|
"grad_norm": 1.8045276757468276, |
|
"learning_rate": 5.000000143595102e-08, |
|
"loss": 2.2325, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 3.524340770791075, |
|
"grad_norm": 1.7262031285233723, |
|
"learning_rate": 5.0000001280337235e-08, |
|
"loss": 2.243, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 3.5294117647058822, |
|
"grad_norm": 1.8375261220518257, |
|
"learning_rate": 5.000000114020828e-08, |
|
"loss": 2.2075, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 3.5344827586206895, |
|
"grad_norm": 1.8163152606406519, |
|
"learning_rate": 5.0000001014163305e-08, |
|
"loss": 2.2494, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 3.5395537525354968, |
|
"grad_norm": 1.8525927462335219, |
|
"learning_rate": 5.0000000900915245e-08, |
|
"loss": 2.2163, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 3.544624746450304, |
|
"grad_norm": 1.7805165281974848, |
|
"learning_rate": 5.000000079928269e-08, |
|
"loss": 2.2525, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 3.5496957403651117, |
|
"grad_norm": 1.7990737454408499, |
|
"learning_rate": 5.000000070818217e-08, |
|
"loss": 2.2874, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.554766734279919, |
|
"grad_norm": 1.8247781997920414, |
|
"learning_rate": 5.000000062662102e-08, |
|
"loss": 2.2215, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 3.559837728194726, |
|
"grad_norm": 1.9826615858248522, |
|
"learning_rate": 5.000000055369062e-08, |
|
"loss": 2.2443, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 3.5649087221095335, |
|
"grad_norm": 1.799487216606698, |
|
"learning_rate": 5.000000048856012e-08, |
|
"loss": 2.2266, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 3.5699797160243407, |
|
"grad_norm": 1.8091696515518445, |
|
"learning_rate": 5.0000000430470526e-08, |
|
"loss": 2.2517, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 3.575050709939148, |
|
"grad_norm": 1.7814535925288772, |
|
"learning_rate": 5.0000000378729234e-08, |
|
"loss": 2.2321, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 3.5801217038539552, |
|
"grad_norm": 1.850742214981416, |
|
"learning_rate": 5.000000033270488e-08, |
|
"loss": 2.2597, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 3.585192697768763, |
|
"grad_norm": 1.7822355084719033, |
|
"learning_rate": 5.000000029182252e-08, |
|
"loss": 2.2963, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 3.59026369168357, |
|
"grad_norm": 1.7548584963433536, |
|
"learning_rate": 5.0000000255559235e-08, |
|
"loss": 2.2669, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 3.5953346855983774, |
|
"grad_norm": 1.8526633444752874, |
|
"learning_rate": 5.0000000223439884e-08, |
|
"loss": 2.2367, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 3.6004056795131847, |
|
"grad_norm": 1.8813498033155052, |
|
"learning_rate": 5.0000000195033304e-08, |
|
"loss": 2.2373, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 3.605476673427992, |
|
"grad_norm": 1.7670822667081592, |
|
"learning_rate": 5.0000000169948675e-08, |
|
"loss": 2.2705, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 3.610547667342799, |
|
"grad_norm": 1.7756286276528583, |
|
"learning_rate": 5.000000014783217e-08, |
|
"loss": 2.2979, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 3.6156186612576064, |
|
"grad_norm": 1.7467172856710016, |
|
"learning_rate": 5.000000012836387e-08, |
|
"loss": 2.2538, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 3.6206896551724137, |
|
"grad_norm": 1.7107623358426811, |
|
"learning_rate": 5.000000011125491e-08, |
|
"loss": 2.2807, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 3.625760649087221, |
|
"grad_norm": 1.8431542462448438, |
|
"learning_rate": 5.000000009624475e-08, |
|
"loss": 2.252, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 3.630831643002028, |
|
"grad_norm": 1.7683303237840782, |
|
"learning_rate": 5.000000008309876e-08, |
|
"loss": 2.2722, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 3.6359026369168355, |
|
"grad_norm": 1.7463535795755278, |
|
"learning_rate": 5.000000007160591e-08, |
|
"loss": 2.2712, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 3.640973630831643, |
|
"grad_norm": 1.8412435208194315, |
|
"learning_rate": 5.0000000061576706e-08, |
|
"loss": 2.2438, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 3.6460446247464504, |
|
"grad_norm": 1.7731354966851007, |
|
"learning_rate": 5.000000005284119e-08, |
|
"loss": 2.2305, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 3.6511156186612577, |
|
"grad_norm": 1.7263977118619886, |
|
"learning_rate": 5.0000000045247174e-08, |
|
"loss": 2.238, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.6511156186612577, |
|
"eval_loss": 2.475299596786499, |
|
"eval_runtime": 81.0503, |
|
"eval_samples_per_second": 86.44, |
|
"eval_steps_per_second": 0.679, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.656186612576065, |
|
"grad_norm": 1.725184319305705, |
|
"learning_rate": 5.000000003865863e-08, |
|
"loss": 2.2283, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 3.661257606490872, |
|
"grad_norm": 1.9023050674895976, |
|
"learning_rate": 5.000000003295409e-08, |
|
"loss": 2.21, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 3.6663286004056794, |
|
"grad_norm": 1.8044353617499143, |
|
"learning_rate": 5.0000000028025353e-08, |
|
"loss": 2.2658, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 3.6713995943204867, |
|
"grad_norm": 1.7560239895320502, |
|
"learning_rate": 5.0000000023776127e-08, |
|
"loss": 2.2558, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 3.6764705882352944, |
|
"grad_norm": 1.9019670084185585, |
|
"learning_rate": 5.00000000201209e-08, |
|
"loss": 2.2154, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 3.6815415821501016, |
|
"grad_norm": 1.835689830804529, |
|
"learning_rate": 5.0000000016983875e-08, |
|
"loss": 2.2586, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 3.686612576064909, |
|
"grad_norm": 1.8589538257906977, |
|
"learning_rate": 5.000000001429796e-08, |
|
"loss": 2.2388, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 3.691683569979716, |
|
"grad_norm": 1.8068715773945243, |
|
"learning_rate": 5.000000001200391e-08, |
|
"loss": 2.2571, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 3.6967545638945234, |
|
"grad_norm": 1.7775448603509494, |
|
"learning_rate": 5.0000000010049494e-08, |
|
"loss": 2.2751, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 3.7018255578093306, |
|
"grad_norm": 1.748064680879759, |
|
"learning_rate": 5.0000000008388774e-08, |
|
"loss": 2.2183, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 3.706896551724138, |
|
"grad_norm": 1.752057568304335, |
|
"learning_rate": 5.000000000698141e-08, |
|
"loss": 2.2532, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 3.711967545638945, |
|
"grad_norm": 1.7976874660325244, |
|
"learning_rate": 5.000000000579206e-08, |
|
"loss": 2.2447, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 3.7170385395537524, |
|
"grad_norm": 1.8361658170177098, |
|
"learning_rate": 5.000000000478986e-08, |
|
"loss": 2.2274, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 3.7221095334685597, |
|
"grad_norm": 1.7595086838837224, |
|
"learning_rate": 5.0000000003947866e-08, |
|
"loss": 2.2704, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 3.727180527383367, |
|
"grad_norm": 1.7772692374868122, |
|
"learning_rate": 5.0000000003242645e-08, |
|
"loss": 2.2394, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 3.732251521298174, |
|
"grad_norm": 1.7860835232171102, |
|
"learning_rate": 5.000000000265387e-08, |
|
"loss": 2.238, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 3.737322515212982, |
|
"grad_norm": 1.7590689183822192, |
|
"learning_rate": 5.000000000216394e-08, |
|
"loss": 2.2764, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 3.742393509127789, |
|
"grad_norm": 1.7659065260707336, |
|
"learning_rate": 5.0000000001757664e-08, |
|
"loss": 2.2459, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 3.7474645030425964, |
|
"grad_norm": 1.8153822365083379, |
|
"learning_rate": 5.0000000001421954e-08, |
|
"loss": 2.2299, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 3.7525354969574036, |
|
"grad_norm": 1.7704864144251407, |
|
"learning_rate": 5.0000000001145583e-08, |
|
"loss": 2.247, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.757606490872211, |
|
"grad_norm": 1.7268180675977047, |
|
"learning_rate": 5.000000000091894e-08, |
|
"loss": 2.2483, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 3.762677484787018, |
|
"grad_norm": 1.7808473093189052, |
|
"learning_rate": 5.000000000073382e-08, |
|
"loss": 2.2774, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 3.767748478701826, |
|
"grad_norm": 1.7999930755140212, |
|
"learning_rate": 5.0000000000583246e-08, |
|
"loss": 2.2209, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 3.772819472616633, |
|
"grad_norm": 1.7741565202241085, |
|
"learning_rate": 5.0000000000461306e-08, |
|
"loss": 2.2353, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 3.7778904665314403, |
|
"grad_norm": 1.8046657930760326, |
|
"learning_rate": 5.0000000000363e-08, |
|
"loss": 2.2255, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 3.7829614604462476, |
|
"grad_norm": 1.8038566902418574, |
|
"learning_rate": 5.000000000028412e-08, |
|
"loss": 2.2781, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 3.788032454361055, |
|
"grad_norm": 1.7944584001789026, |
|
"learning_rate": 5.0000000000221146e-08, |
|
"loss": 2.272, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 3.793103448275862, |
|
"grad_norm": 1.7491739462315397, |
|
"learning_rate": 5.0000000000171125e-08, |
|
"loss": 2.2293, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 3.7981744421906694, |
|
"grad_norm": 1.7505007397811716, |
|
"learning_rate": 5.000000000013161e-08, |
|
"loss": 2.2373, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 3.8032454361054766, |
|
"grad_norm": 1.8014769703402196, |
|
"learning_rate": 5.000000000010057e-08, |
|
"loss": 2.2552, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.808316430020284, |
|
"grad_norm": 1.7608287864741985, |
|
"learning_rate": 5.0000000000076337e-08, |
|
"loss": 2.2277, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 3.813387423935091, |
|
"grad_norm": 1.8323757058256038, |
|
"learning_rate": 5.0000000000057536e-08, |
|
"loss": 2.2341, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.8184584178498984, |
|
"grad_norm": 1.7574657806555387, |
|
"learning_rate": 5.000000000004304e-08, |
|
"loss": 2.2223, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 3.8235294117647056, |
|
"grad_norm": 1.7900689784727426, |
|
"learning_rate": 5.000000000003194e-08, |
|
"loss": 2.2445, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 3.8286004056795133, |
|
"grad_norm": 1.7873969080046235, |
|
"learning_rate": 5.000000000002351e-08, |
|
"loss": 2.2692, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 3.8336713995943206, |
|
"grad_norm": 1.7693343584107923, |
|
"learning_rate": 5.000000000001716e-08, |
|
"loss": 2.1982, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.838742393509128, |
|
"grad_norm": 1.782049247288072, |
|
"learning_rate": 5.00000000000124e-08, |
|
"loss": 2.2417, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 3.843813387423935, |
|
"grad_norm": 1.8357614780582354, |
|
"learning_rate": 5.000000000000888e-08, |
|
"loss": 2.2414, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.8488843813387423, |
|
"grad_norm": 1.7593131764821546, |
|
"learning_rate": 5.0000000000006284e-08, |
|
"loss": 2.2721, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 3.8539553752535496, |
|
"grad_norm": 1.8355045282767246, |
|
"learning_rate": 5.0000000000004405e-08, |
|
"loss": 2.2349, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.8539553752535496, |
|
"eval_loss": 2.475205421447754, |
|
"eval_runtime": 81.089, |
|
"eval_samples_per_second": 86.399, |
|
"eval_steps_per_second": 0.678, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.859026369168357, |
|
"grad_norm": 1.7617334472370734, |
|
"learning_rate": 5.000000000000305e-08, |
|
"loss": 2.2796, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 3.8640973630831645, |
|
"grad_norm": 1.7655616354078496, |
|
"learning_rate": 5.000000000000208e-08, |
|
"loss": 2.2904, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.869168356997972, |
|
"grad_norm": 1.7499887502905194, |
|
"learning_rate": 5.00000000000014e-08, |
|
"loss": 2.2289, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 3.874239350912779, |
|
"grad_norm": 1.7552158736441676, |
|
"learning_rate": 5.000000000000093e-08, |
|
"loss": 2.2524, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.8793103448275863, |
|
"grad_norm": 1.779864718453615, |
|
"learning_rate": 5.0000000000000607e-08, |
|
"loss": 2.2557, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 3.8843813387423936, |
|
"grad_norm": 1.8326086874257492, |
|
"learning_rate": 5.000000000000039e-08, |
|
"loss": 2.2642, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.889452332657201, |
|
"grad_norm": 1.7709614684441606, |
|
"learning_rate": 5.000000000000024e-08, |
|
"loss": 2.2316, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 3.894523326572008, |
|
"grad_norm": 1.8053802580849208, |
|
"learning_rate": 5.000000000000015e-08, |
|
"loss": 2.2568, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.8995943204868153, |
|
"grad_norm": 1.7935470548184194, |
|
"learning_rate": 5.0000000000000104e-08, |
|
"loss": 2.2993, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 3.9046653144016226, |
|
"grad_norm": 1.7497664491493299, |
|
"learning_rate": 5.000000000000006e-08, |
|
"loss": 2.1989, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.90973630831643, |
|
"grad_norm": 1.754972418650299, |
|
"learning_rate": 5.000000000000003e-08, |
|
"loss": 2.2424, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 3.914807302231237, |
|
"grad_norm": 1.7589479994346042, |
|
"learning_rate": 5.000000000000002e-08, |
|
"loss": 2.2632, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.9198782961460448, |
|
"grad_norm": 1.7971848831669277, |
|
"learning_rate": 5.000000000000001e-08, |
|
"loss": 2.2336, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 3.924949290060852, |
|
"grad_norm": 1.7639968737695348, |
|
"learning_rate": 5.0000000000000004e-08, |
|
"loss": 2.2296, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.9300202839756593, |
|
"grad_norm": 1.72827012743299, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2451, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 3.9350912778904665, |
|
"grad_norm": 1.749153588059136, |
|
"learning_rate": 5e-08, |
|
"loss": 2.258, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.940162271805274, |
|
"grad_norm": 1.753206456867822, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2587, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 3.945233265720081, |
|
"grad_norm": 1.7816747777928572, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2532, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.9503042596348883, |
|
"grad_norm": 1.7762615930524053, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2331, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 3.955375253549696, |
|
"grad_norm": 1.8039115341801395, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2271, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.9604462474645032, |
|
"grad_norm": 1.7530354888252304, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2191, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 3.9655172413793105, |
|
"grad_norm": 1.883699780217342, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2059, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 3.9705882352941178, |
|
"grad_norm": 1.7246634345468168, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2482, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 3.975659229208925, |
|
"grad_norm": 1.762677648630269, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2521, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 3.9807302231237323, |
|
"grad_norm": 1.786354894638501, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2763, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 3.9858012170385395, |
|
"grad_norm": 1.81100838850099, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2326, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.9908722109533468, |
|
"grad_norm": 1.8115971845880692, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2409, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 3.995943204868154, |
|
"grad_norm": 1.901268059775357, |
|
"learning_rate": 5e-08, |
|
"loss": 2.2217, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 3944, |
|
"total_flos": 411954472550400.0, |
|
"train_loss": 2.318100369605283, |
|
"train_runtime": 14372.236, |
|
"train_samples_per_second": 17.546, |
|
"train_steps_per_second": 0.274 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3944, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 200, |
|
"total_flos": 411954472550400.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |