|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.0, |
|
"eval_steps": 500, |
|
"global_step": 234, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.038461538461538464, |
|
"grad_norm": 3.6227714421853245, |
|
"learning_rate": 4.9999188892913205e-06, |
|
"loss": 0.9247, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.07692307692307693, |
|
"grad_norm": 2.33743873241245, |
|
"learning_rate": 4.999675562428437e-06, |
|
"loss": 0.8839, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.11538461538461539, |
|
"grad_norm": 1.6117080535089503, |
|
"learning_rate": 4.999270035200483e-06, |
|
"loss": 0.8349, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.15384615384615385, |
|
"grad_norm": 1.5518021106969933, |
|
"learning_rate": 4.998702333921538e-06, |
|
"loss": 0.8214, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.19230769230769232, |
|
"grad_norm": 1.3811790146555043, |
|
"learning_rate": 4.997972495428924e-06, |
|
"loss": 0.844, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.23076923076923078, |
|
"grad_norm": 1.310050810479314, |
|
"learning_rate": 4.9970805670808174e-06, |
|
"loss": 0.8319, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.2692307692307692, |
|
"grad_norm": 1.2786534191408507, |
|
"learning_rate": 4.996026606753167e-06, |
|
"loss": 0.8674, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 1.3148686759764385, |
|
"learning_rate": 4.994810682835951e-06, |
|
"loss": 0.7042, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.34615384615384615, |
|
"grad_norm": 1.090285225955643, |
|
"learning_rate": 4.9934328742287285e-06, |
|
"loss": 0.7071, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 0.907111664124928, |
|
"learning_rate": 4.991893270335526e-06, |
|
"loss": 0.8748, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.4230769230769231, |
|
"grad_norm": 0.6642504831184634, |
|
"learning_rate": 4.990191971059033e-06, |
|
"loss": 0.7435, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.46153846153846156, |
|
"grad_norm": 0.6678557889351041, |
|
"learning_rate": 4.988329086794122e-06, |
|
"loss": 0.6785, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.701381067554106, |
|
"learning_rate": 4.986304738420684e-06, |
|
"loss": 0.6449, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.5384615384615384, |
|
"grad_norm": 0.6790086078752827, |
|
"learning_rate": 4.984119057295783e-06, |
|
"loss": 0.7425, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.5769230769230769, |
|
"grad_norm": 0.6905224183064627, |
|
"learning_rate": 4.981772185245135e-06, |
|
"loss": 0.7057, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 0.7984130598075898, |
|
"learning_rate": 4.979264274553906e-06, |
|
"loss": 0.8067, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.6538461538461539, |
|
"grad_norm": 0.6497176581551809, |
|
"learning_rate": 4.976595487956824e-06, |
|
"loss": 0.7255, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.6923076923076923, |
|
"grad_norm": 0.6629822563364289, |
|
"learning_rate": 4.973765998627628e-06, |
|
"loss": 0.798, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.7307692307692307, |
|
"grad_norm": 0.6132162810922874, |
|
"learning_rate": 4.970775990167826e-06, |
|
"loss": 0.632, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 0.5554781405012508, |
|
"learning_rate": 4.967625656594782e-06, |
|
"loss": 0.6248, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.8076923076923077, |
|
"grad_norm": 0.6049571126552993, |
|
"learning_rate": 4.964315202329127e-06, |
|
"loss": 0.8391, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.8461538461538461, |
|
"grad_norm": 0.5430982822611681, |
|
"learning_rate": 4.9608448421814944e-06, |
|
"loss": 0.7089, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.8846153846153846, |
|
"grad_norm": 0.48739605520739077, |
|
"learning_rate": 4.9572148013385815e-06, |
|
"loss": 0.8009, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 0.45997941430542894, |
|
"learning_rate": 4.953425315348534e-06, |
|
"loss": 0.583, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 0.47230353827776894, |
|
"learning_rate": 4.94947663010567e-06, |
|
"loss": 0.5849, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.47082502311551994, |
|
"learning_rate": 4.9453690018345144e-06, |
|
"loss": 0.7537, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.0384615384615385, |
|
"grad_norm": 0.46211533472097527, |
|
"learning_rate": 4.941102697073181e-06, |
|
"loss": 0.5991, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.0769230769230769, |
|
"grad_norm": 0.46162874786523983, |
|
"learning_rate": 4.9366779926560705e-06, |
|
"loss": 0.7157, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.1153846153846154, |
|
"grad_norm": 0.43803956195742527, |
|
"learning_rate": 4.932095175695911e-06, |
|
"loss": 0.7947, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 0.43236553483657136, |
|
"learning_rate": 4.927354543565131e-06, |
|
"loss": 0.5883, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.1923076923076923, |
|
"grad_norm": 0.40276363608769367, |
|
"learning_rate": 4.922456403876552e-06, |
|
"loss": 0.6482, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 0.4722131511905768, |
|
"learning_rate": 4.917401074463441e-06, |
|
"loss": 0.72, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.2692307692307692, |
|
"grad_norm": 0.4248174640747636, |
|
"learning_rate": 4.912188883358879e-06, |
|
"loss": 0.7177, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.3076923076923077, |
|
"grad_norm": 0.42868915417283154, |
|
"learning_rate": 4.9068201687744774e-06, |
|
"loss": 0.602, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.3461538461538463, |
|
"grad_norm": 0.42437464257989843, |
|
"learning_rate": 4.901295279078431e-06, |
|
"loss": 0.684, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.3846153846153846, |
|
"grad_norm": 0.4148057850825471, |
|
"learning_rate": 4.895614572772916e-06, |
|
"loss": 0.5163, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.4230769230769231, |
|
"grad_norm": 0.441534358834555, |
|
"learning_rate": 4.889778418470823e-06, |
|
"loss": 0.5569, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.4615384615384617, |
|
"grad_norm": 0.41185704733550094, |
|
"learning_rate": 4.883787194871841e-06, |
|
"loss": 0.5279, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.4257014048854468, |
|
"learning_rate": 4.8776412907378845e-06, |
|
"loss": 0.6732, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.4609674066796317, |
|
"learning_rate": 4.8713411048678635e-06, |
|
"loss": 0.68, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.5769230769230769, |
|
"grad_norm": 0.4027959547024447, |
|
"learning_rate": 4.864887046071814e-06, |
|
"loss": 0.7983, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.6153846153846154, |
|
"grad_norm": 0.4569844209382049, |
|
"learning_rate": 4.858279533144358e-06, |
|
"loss": 0.4545, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.6538461538461537, |
|
"grad_norm": 0.4257095872017722, |
|
"learning_rate": 4.851518994837544e-06, |
|
"loss": 0.6318, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.6923076923076923, |
|
"grad_norm": 0.40879419445400555, |
|
"learning_rate": 4.844605869833011e-06, |
|
"loss": 0.5548, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 1.7307692307692308, |
|
"grad_norm": 0.43599839467908014, |
|
"learning_rate": 4.837540606713538e-06, |
|
"loss": 0.519, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 1.7692307692307692, |
|
"grad_norm": 0.445760123696135, |
|
"learning_rate": 4.83032366393392e-06, |
|
"loss": 0.6377, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 1.8076923076923077, |
|
"grad_norm": 0.4456381082157439, |
|
"learning_rate": 4.8229555097912335e-06, |
|
"loss": 0.7013, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 0.40541134413006163, |
|
"learning_rate": 4.815436622394442e-06, |
|
"loss": 0.5183, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 1.8846153846153846, |
|
"grad_norm": 0.44406611064318774, |
|
"learning_rate": 4.807767489633372e-06, |
|
"loss": 0.8679, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 0.48978477026333694, |
|
"learning_rate": 4.799948609147061e-06, |
|
"loss": 0.5619, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.9615384615384617, |
|
"grad_norm": 0.42927653335100224, |
|
"learning_rate": 4.791980488291457e-06, |
|
"loss": 0.5482, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.3993118925497605, |
|
"learning_rate": 4.783863644106502e-06, |
|
"loss": 0.5905, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.0384615384615383, |
|
"grad_norm": 0.45533421207861624, |
|
"learning_rate": 4.775598603282587e-06, |
|
"loss": 0.7045, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.076923076923077, |
|
"grad_norm": 0.4309081724975481, |
|
"learning_rate": 4.7671859021263635e-06, |
|
"loss": 0.5227, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.1153846153846154, |
|
"grad_norm": 0.4300292912476653, |
|
"learning_rate": 4.758626086525956e-06, |
|
"loss": 0.5115, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.38059576443400334, |
|
"learning_rate": 4.749919711915531e-06, |
|
"loss": 0.4959, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.1923076923076925, |
|
"grad_norm": 0.4126674320822358, |
|
"learning_rate": 4.74106734323926e-06, |
|
"loss": 0.6265, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.230769230769231, |
|
"grad_norm": 0.4671745147500316, |
|
"learning_rate": 4.73206955491466e-06, |
|
"loss": 0.5165, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.269230769230769, |
|
"grad_norm": 0.41618486395615545, |
|
"learning_rate": 4.7229269307953235e-06, |
|
"loss": 0.5657, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 0.43433496329970006, |
|
"learning_rate": 4.7136400641330245e-06, |
|
"loss": 0.666, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.3461538461538463, |
|
"grad_norm": 0.39780973220100413, |
|
"learning_rate": 4.704209557539235e-06, |
|
"loss": 0.5704, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.3846153846153846, |
|
"grad_norm": 0.4048550114338101, |
|
"learning_rate": 4.694636022946012e-06, |
|
"loss": 0.5673, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.423076923076923, |
|
"grad_norm": 0.385995975226885, |
|
"learning_rate": 4.684920081566295e-06, |
|
"loss": 0.5959, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.3949623073886239, |
|
"learning_rate": 4.675062363853599e-06, |
|
"loss": 0.6107, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.4330059721657707, |
|
"learning_rate": 4.665063509461098e-06, |
|
"loss": 0.5281, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.5384615384615383, |
|
"grad_norm": 0.44446090274862377, |
|
"learning_rate": 4.654924167200124e-06, |
|
"loss": 0.5839, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 2.5769230769230766, |
|
"grad_norm": 0.4450780587127078, |
|
"learning_rate": 4.6446449949980665e-06, |
|
"loss": 0.5034, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 2.6153846153846154, |
|
"grad_norm": 0.43555773421037586, |
|
"learning_rate": 4.634226659855681e-06, |
|
"loss": 0.62, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 2.6538461538461537, |
|
"grad_norm": 0.44535610498714195, |
|
"learning_rate": 4.623669837803803e-06, |
|
"loss": 0.5418, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 0.4266516883259732, |
|
"learning_rate": 4.612975213859487e-06, |
|
"loss": 0.6767, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 2.730769230769231, |
|
"grad_norm": 0.4170049551133152, |
|
"learning_rate": 4.602143481981556e-06, |
|
"loss": 0.6701, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 0.43191999973483397, |
|
"learning_rate": 4.591175345025567e-06, |
|
"loss": 0.5555, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 2.8076923076923075, |
|
"grad_norm": 0.4361492711034067, |
|
"learning_rate": 4.580071514698211e-06, |
|
"loss": 0.6593, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 2.8461538461538463, |
|
"grad_norm": 0.39471819539317293, |
|
"learning_rate": 4.568832711511125e-06, |
|
"loss": 0.5535, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"grad_norm": 0.38086387704457475, |
|
"learning_rate": 4.5574596647341414e-06, |
|
"loss": 0.4545, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 2.9230769230769234, |
|
"grad_norm": 0.45356707180004513, |
|
"learning_rate": 4.545953112347967e-06, |
|
"loss": 0.6251, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 2.9615384615384617, |
|
"grad_norm": 0.4254117893636373, |
|
"learning_rate": 4.5343138009963e-06, |
|
"loss": 0.5367, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.3871428196474084, |
|
"learning_rate": 4.522542485937369e-06, |
|
"loss": 0.4539, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 3.0384615384615383, |
|
"grad_norm": 0.42671046348757163, |
|
"learning_rate": 4.510639930994942e-06, |
|
"loss": 0.513, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.46427278945358347, |
|
"learning_rate": 4.498606908508754e-06, |
|
"loss": 0.3476, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.1153846153846154, |
|
"grad_norm": 0.3755914435180136, |
|
"learning_rate": 4.486444199284386e-06, |
|
"loss": 0.3924, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 3.1538461538461537, |
|
"grad_norm": 0.46176939111709947, |
|
"learning_rate": 4.474152592542613e-06, |
|
"loss": 0.7687, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 3.1923076923076925, |
|
"grad_norm": 0.5272804561176313, |
|
"learning_rate": 4.4617328858681806e-06, |
|
"loss": 0.5281, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 3.230769230769231, |
|
"grad_norm": 0.3948130729702814, |
|
"learning_rate": 4.449185885158056e-06, |
|
"loss": 0.5106, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 3.269230769230769, |
|
"grad_norm": 0.4087463332644831, |
|
"learning_rate": 4.436512404569136e-06, |
|
"loss": 0.4434, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 3.3076923076923075, |
|
"grad_norm": 0.5269799431074561, |
|
"learning_rate": 4.423713266465415e-06, |
|
"loss": 0.3944, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 3.3461538461538463, |
|
"grad_norm": 0.461938868726041, |
|
"learning_rate": 4.410789301364621e-06, |
|
"loss": 0.5757, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 3.3846153846153846, |
|
"grad_norm": 0.46622397789610276, |
|
"learning_rate": 4.397741347884329e-06, |
|
"loss": 0.6648, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 3.423076923076923, |
|
"grad_norm": 0.43959089723440037, |
|
"learning_rate": 4.384570252687542e-06, |
|
"loss": 0.6123, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 0.4651240638848072, |
|
"learning_rate": 4.3712768704277535e-06, |
|
"loss": 0.511, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.44262354534992776, |
|
"learning_rate": 4.357862063693486e-06, |
|
"loss": 0.4815, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 3.5384615384615383, |
|
"grad_norm": 0.39275121653018796, |
|
"learning_rate": 4.3443267029523265e-06, |
|
"loss": 0.6761, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 3.5769230769230766, |
|
"grad_norm": 0.39829066035096183, |
|
"learning_rate": 4.3306716664944345e-06, |
|
"loss": 0.66, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 3.6153846153846154, |
|
"grad_norm": 0.3972752441251099, |
|
"learning_rate": 4.316897840375558e-06, |
|
"loss": 0.3552, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 3.6538461538461537, |
|
"grad_norm": 0.43087508212959164, |
|
"learning_rate": 4.303006118359536e-06, |
|
"loss": 0.3577, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 3.6923076923076925, |
|
"grad_norm": 0.37367031545629315, |
|
"learning_rate": 4.288997401860303e-06, |
|
"loss": 0.4744, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 3.730769230769231, |
|
"grad_norm": 0.4908959204559879, |
|
"learning_rate": 4.274872599883396e-06, |
|
"loss": 0.8344, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 3.769230769230769, |
|
"grad_norm": 0.479031181871949, |
|
"learning_rate": 4.260632628966974e-06, |
|
"loss": 0.444, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 3.8076923076923075, |
|
"grad_norm": 0.42816486401058296, |
|
"learning_rate": 4.246278413122344e-06, |
|
"loss": 0.4751, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 0.4441229834123768, |
|
"learning_rate": 4.231810883773999e-06, |
|
"loss": 0.6354, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 3.8846153846153846, |
|
"grad_norm": 0.5505544877136986, |
|
"learning_rate": 4.217230979699188e-06, |
|
"loss": 0.404, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 3.9230769230769234, |
|
"grad_norm": 0.428310468582681, |
|
"learning_rate": 4.202539646966993e-06, |
|
"loss": 0.4905, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 3.9615384615384617, |
|
"grad_norm": 0.36741931493755814, |
|
"learning_rate": 4.187737838876941e-06, |
|
"loss": 0.4986, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.3765611133789381, |
|
"learning_rate": 4.172826515897146e-06, |
|
"loss": 0.4131, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 4.038461538461538, |
|
"grad_norm": 0.43538159963075335, |
|
"learning_rate": 4.1578066456019885e-06, |
|
"loss": 0.5491, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 4.076923076923077, |
|
"grad_norm": 0.42184253496128765, |
|
"learning_rate": 4.1426792026093274e-06, |
|
"loss": 0.5411, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 4.115384615384615, |
|
"grad_norm": 0.40484010942448745, |
|
"learning_rate": 4.12744516851726e-06, |
|
"loss": 0.3895, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 4.153846153846154, |
|
"grad_norm": 0.35000742524077344, |
|
"learning_rate": 4.112105531840427e-06, |
|
"loss": 0.548, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 4.1923076923076925, |
|
"grad_norm": 0.45752349995263586, |
|
"learning_rate": 4.09666128794587e-06, |
|
"loss": 0.4775, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 4.230769230769231, |
|
"grad_norm": 0.40329722163324117, |
|
"learning_rate": 4.081113438988443e-06, |
|
"loss": 0.2606, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 4.269230769230769, |
|
"grad_norm": 0.3842172424364366, |
|
"learning_rate": 4.065462993845785e-06, |
|
"loss": 0.3961, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 4.3076923076923075, |
|
"grad_norm": 0.41497732883921246, |
|
"learning_rate": 4.049710968052851e-06, |
|
"loss": 0.4113, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 4.346153846153846, |
|
"grad_norm": 0.39255048207938287, |
|
"learning_rate": 4.0338583837360225e-06, |
|
"loss": 0.4558, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 4.384615384615385, |
|
"grad_norm": 0.42620728115181244, |
|
"learning_rate": 4.017906269546778e-06, |
|
"loss": 0.3251, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 4.423076923076923, |
|
"grad_norm": 0.5169226176212537, |
|
"learning_rate": 4.001855660594948e-06, |
|
"loss": 0.4581, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 4.461538461538462, |
|
"grad_norm": 0.40368787971042785, |
|
"learning_rate": 3.985707598381544e-06, |
|
"loss": 0.4953, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.5423983145992816, |
|
"learning_rate": 3.969463130731183e-06, |
|
"loss": 0.6845, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 4.538461538461538, |
|
"grad_norm": 0.46715408940216974, |
|
"learning_rate": 3.953123311724092e-06, |
|
"loss": 0.5063, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 4.576923076923077, |
|
"grad_norm": 0.4361497261141325, |
|
"learning_rate": 3.93668920162771e-06, |
|
"loss": 0.4339, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.402405096617437, |
|
"learning_rate": 3.92016186682789e-06, |
|
"loss": 0.4568, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 4.653846153846154, |
|
"grad_norm": 0.47183080337248506, |
|
"learning_rate": 3.903542379759703e-06, |
|
"loss": 0.5403, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 4.6923076923076925, |
|
"grad_norm": 0.43474922930243004, |
|
"learning_rate": 3.8868318188378475e-06, |
|
"loss": 0.4105, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 4.730769230769231, |
|
"grad_norm": 0.43857419939588704, |
|
"learning_rate": 3.870031268386676e-06, |
|
"loss": 0.438, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 4.769230769230769, |
|
"grad_norm": 0.537974523699989, |
|
"learning_rate": 3.853141818569829e-06, |
|
"loss": 0.4117, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 4.8076923076923075, |
|
"grad_norm": 0.3969885457122397, |
|
"learning_rate": 3.836164565319503e-06, |
|
"loss": 0.543, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 4.846153846153846, |
|
"grad_norm": 0.36690332072305304, |
|
"learning_rate": 3.819100610265332e-06, |
|
"loss": 0.5541, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 4.884615384615385, |
|
"grad_norm": 0.5119390431012444, |
|
"learning_rate": 3.8019510606629063e-06, |
|
"loss": 0.5643, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 4.923076923076923, |
|
"grad_norm": 0.4592095946696792, |
|
"learning_rate": 3.7847170293219223e-06, |
|
"loss": 0.3905, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 4.961538461538462, |
|
"grad_norm": 0.35650580989737807, |
|
"learning_rate": 3.767399634533976e-06, |
|
"loss": 0.5302, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.3754488505326029, |
|
"learning_rate": 3.7500000000000005e-06, |
|
"loss": 0.3909, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 5.038461538461538, |
|
"grad_norm": 0.41646937067352735, |
|
"learning_rate": 3.732519254757344e-06, |
|
"loss": 0.5944, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 5.076923076923077, |
|
"grad_norm": 0.40757576595922934, |
|
"learning_rate": 3.714958533106515e-06, |
|
"loss": 0.4101, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 5.115384615384615, |
|
"grad_norm": 0.44167101044285684, |
|
"learning_rate": 3.6973189745375772e-06, |
|
"loss": 0.605, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 5.153846153846154, |
|
"grad_norm": 0.8949596690000747, |
|
"learning_rate": 3.679601723656205e-06, |
|
"loss": 0.377, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 5.1923076923076925, |
|
"grad_norm": 0.4894480201337239, |
|
"learning_rate": 3.661807930109422e-06, |
|
"loss": 0.4202, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 5.230769230769231, |
|
"grad_norm": 0.48711144702694403, |
|
"learning_rate": 3.643938748510989e-06, |
|
"loss": 0.4178, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 5.269230769230769, |
|
"grad_norm": 0.40010645794899213, |
|
"learning_rate": 3.6259953383664924e-06, |
|
"loss": 0.5813, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 5.3076923076923075, |
|
"grad_norm": 0.46290191921273705, |
|
"learning_rate": 3.607978863998104e-06, |
|
"loss": 0.3735, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 5.346153846153846, |
|
"grad_norm": 0.4572988531725457, |
|
"learning_rate": 3.5898904944690256e-06, |
|
"loss": 0.2532, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"grad_norm": 0.3937342666166495, |
|
"learning_rate": 3.5717314035076355e-06, |
|
"loss": 0.4399, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 5.423076923076923, |
|
"grad_norm": 0.5865553694046071, |
|
"learning_rate": 3.5535027694313233e-06, |
|
"loss": 0.4118, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 5.461538461538462, |
|
"grad_norm": 0.44868948609399, |
|
"learning_rate": 3.53520577507003e-06, |
|
"loss": 0.5487, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 0.46918871672194645, |
|
"learning_rate": 3.516841607689501e-06, |
|
"loss": 0.3151, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 5.538461538461538, |
|
"grad_norm": 0.4103213544946049, |
|
"learning_rate": 3.4984114589142388e-06, |
|
"loss": 0.3456, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 5.576923076923077, |
|
"grad_norm": 0.4269140275443733, |
|
"learning_rate": 3.479916524650188e-06, |
|
"loss": 0.3889, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 5.615384615384615, |
|
"grad_norm": 0.4390888322307832, |
|
"learning_rate": 3.461358005007128e-06, |
|
"loss": 0.3875, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 5.653846153846154, |
|
"grad_norm": 0.4836520353054927, |
|
"learning_rate": 3.442737104220801e-06, |
|
"loss": 0.4952, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 5.6923076923076925, |
|
"grad_norm": 0.44749171068850996, |
|
"learning_rate": 3.4240550305747776e-06, |
|
"loss": 0.2631, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 5.730769230769231, |
|
"grad_norm": 0.4397924186863106, |
|
"learning_rate": 3.4053129963220423e-06, |
|
"loss": 0.4175, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 0.44733865616762203, |
|
"learning_rate": 3.386512217606339e-06, |
|
"loss": 0.2536, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 5.8076923076923075, |
|
"grad_norm": 0.4416324576002063, |
|
"learning_rate": 3.3676539143832577e-06, |
|
"loss": 0.4575, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 5.846153846153846, |
|
"grad_norm": 0.41183773131044676, |
|
"learning_rate": 3.3487393103410683e-06, |
|
"loss": 0.4202, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 5.884615384615385, |
|
"grad_norm": 0.4177204473083171, |
|
"learning_rate": 3.3297696328213215e-06, |
|
"loss": 0.5144, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 5.923076923076923, |
|
"grad_norm": 0.5632867664875632, |
|
"learning_rate": 3.3107461127392072e-06, |
|
"loss": 0.4375, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 5.961538461538462, |
|
"grad_norm": 0.5035767522506955, |
|
"learning_rate": 3.291669984503682e-06, |
|
"loss": 0.3978, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.38032531941618825, |
|
"learning_rate": 3.272542485937369e-06, |
|
"loss": 0.5376, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 6.038461538461538, |
|
"grad_norm": 0.7171824151944847, |
|
"learning_rate": 3.25336485819624e-06, |
|
"loss": 0.4286, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 6.076923076923077, |
|
"grad_norm": 0.8186367401766156, |
|
"learning_rate": 3.2341383456890776e-06, |
|
"loss": 0.4859, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 6.115384615384615, |
|
"grad_norm": 0.4282358834280859, |
|
"learning_rate": 3.214864195996723e-06, |
|
"loss": 0.3245, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.5299932389205944, |
|
"learning_rate": 3.195543659791132e-06, |
|
"loss": 0.3781, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 6.1923076923076925, |
|
"grad_norm": 0.7312028043937328, |
|
"learning_rate": 3.17617799075421e-06, |
|
"loss": 0.4593, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 6.230769230769231, |
|
"grad_norm": 0.4438755365149523, |
|
"learning_rate": 3.1567684454964674e-06, |
|
"loss": 0.3754, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 6.269230769230769, |
|
"grad_norm": 0.49986780180998563, |
|
"learning_rate": 3.1373162834754835e-06, |
|
"loss": 0.3928, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 6.3076923076923075, |
|
"grad_norm": 0.5018972246724045, |
|
"learning_rate": 3.117822766914174e-06, |
|
"loss": 0.2548, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 6.346153846153846, |
|
"grad_norm": 0.5483100007115429, |
|
"learning_rate": 3.0982891607188948e-06, |
|
"loss": 0.277, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 6.384615384615385, |
|
"grad_norm": 0.49689281955934633, |
|
"learning_rate": 3.0787167323973584e-06, |
|
"loss": 0.2597, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 6.423076923076923, |
|
"grad_norm": 0.4027937354358392, |
|
"learning_rate": 3.0591067519763894e-06, |
|
"loss": 0.3835, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 6.461538461538462, |
|
"grad_norm": 0.5118060463198152, |
|
"learning_rate": 3.039460491919516e-06, |
|
"loss": 0.3728, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 0.47925684071917685, |
|
"learning_rate": 3.019779227044398e-06, |
|
"loss": 0.4208, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 6.538461538461538, |
|
"grad_norm": 0.4352967699287943, |
|
"learning_rate": 3.0000642344401115e-06, |
|
"loss": 0.508, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 6.576923076923077, |
|
"grad_norm": 0.4550075954830929, |
|
"learning_rate": 2.9803167933842712e-06, |
|
"loss": 0.4579, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 6.615384615384615, |
|
"grad_norm": 0.4177250074477428, |
|
"learning_rate": 2.960538185260029e-06, |
|
"loss": 0.2693, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 6.653846153846154, |
|
"grad_norm": 0.485295005073564, |
|
"learning_rate": 2.9407296934729227e-06, |
|
"loss": 0.3715, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 6.6923076923076925, |
|
"grad_norm": 0.3538134050545686, |
|
"learning_rate": 2.920892603367596e-06, |
|
"loss": 0.3562, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 6.730769230769231, |
|
"grad_norm": 0.447652246093713, |
|
"learning_rate": 2.9010282021444008e-06, |
|
"loss": 0.5187, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 6.769230769230769, |
|
"grad_norm": 0.4997227920641816, |
|
"learning_rate": 2.881137778775864e-06, |
|
"loss": 0.3123, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 6.8076923076923075, |
|
"grad_norm": 0.4348269374773686, |
|
"learning_rate": 2.8612226239230536e-06, |
|
"loss": 0.4705, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 6.846153846153846, |
|
"grad_norm": 0.4358691651373222, |
|
"learning_rate": 2.8412840298518295e-06, |
|
"loss": 0.3094, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 6.884615384615385, |
|
"grad_norm": 0.4084037477612026, |
|
"learning_rate": 2.821323290348987e-06, |
|
"loss": 0.3531, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 6.923076923076923, |
|
"grad_norm": 0.37734080915888196, |
|
"learning_rate": 2.8013417006383078e-06, |
|
"loss": 0.3738, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 6.961538461538462, |
|
"grad_norm": 0.37345514337855473, |
|
"learning_rate": 2.781340557296514e-06, |
|
"loss": 0.3986, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.4661469904131123, |
|
"learning_rate": 2.761321158169134e-06, |
|
"loss": 0.3717, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 7.038461538461538, |
|
"grad_norm": 0.7076797880268779, |
|
"learning_rate": 2.7412848022862883e-06, |
|
"loss": 0.4596, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 7.076923076923077, |
|
"grad_norm": 0.4705027460981129, |
|
"learning_rate": 2.7212327897783963e-06, |
|
"loss": 0.2957, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 7.115384615384615, |
|
"grad_norm": 0.3742202895931818, |
|
"learning_rate": 2.7011664217918154e-06, |
|
"loss": 0.1995, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 7.153846153846154, |
|
"grad_norm": 0.6149190541876877, |
|
"learning_rate": 2.6810870004044065e-06, |
|
"loss": 0.3374, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 7.1923076923076925, |
|
"grad_norm": 0.568239675579952, |
|
"learning_rate": 2.6609958285410488e-06, |
|
"loss": 0.3496, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 7.230769230769231, |
|
"grad_norm": 0.501969212073365, |
|
"learning_rate": 2.6408942098890937e-06, |
|
"loss": 0.3381, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 7.269230769230769, |
|
"grad_norm": 0.3800552771656915, |
|
"learning_rate": 2.620783448813768e-06, |
|
"loss": 0.3114, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 7.3076923076923075, |
|
"grad_norm": 0.5327131952318535, |
|
"learning_rate": 2.6006648502735384e-06, |
|
"loss": 0.282, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 7.346153846153846, |
|
"grad_norm": 0.4985133449111499, |
|
"learning_rate": 2.5805397197354333e-06, |
|
"loss": 0.3596, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 7.384615384615385, |
|
"grad_norm": 0.4330481942035506, |
|
"learning_rate": 2.560409363090331e-06, |
|
"loss": 0.3929, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 7.423076923076923, |
|
"grad_norm": 0.5096890385502361, |
|
"learning_rate": 2.5402750865682283e-06, |
|
"loss": 0.4024, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 7.461538461538462, |
|
"grad_norm": 1.1384220300859678, |
|
"learning_rate": 2.5201381966534748e-06, |
|
"loss": 0.3574, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.45277823692409325, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.3694, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 7.538461538461538, |
|
"grad_norm": 0.3598366252810026, |
|
"learning_rate": 2.4798618033465256e-06, |
|
"loss": 0.3475, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 7.576923076923077, |
|
"grad_norm": 0.507605650711227, |
|
"learning_rate": 2.459724913431772e-06, |
|
"loss": 0.227, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 7.615384615384615, |
|
"grad_norm": 0.4176653745984533, |
|
"learning_rate": 2.43959063690967e-06, |
|
"loss": 0.3827, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 7.653846153846154, |
|
"grad_norm": 0.357063614895515, |
|
"learning_rate": 2.4194602802645684e-06, |
|
"loss": 0.4291, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 0.40598350398883626, |
|
"learning_rate": 2.399335149726463e-06, |
|
"loss": 0.2007, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 7.730769230769231, |
|
"grad_norm": 0.4765396454533307, |
|
"learning_rate": 2.379216551186233e-06, |
|
"loss": 0.3043, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 7.769230769230769, |
|
"grad_norm": 0.40291257531450625, |
|
"learning_rate": 2.3591057901109063e-06, |
|
"loss": 0.425, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 7.8076923076923075, |
|
"grad_norm": 0.37654212040684903, |
|
"learning_rate": 2.3390041714589516e-06, |
|
"loss": 0.2046, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 7.846153846153846, |
|
"grad_norm": 0.42635430764759097, |
|
"learning_rate": 2.3189129995955944e-06, |
|
"loss": 0.3833, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 7.884615384615385, |
|
"grad_norm": 0.5211999817060086, |
|
"learning_rate": 2.2988335782081854e-06, |
|
"loss": 0.5149, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 7.923076923076923, |
|
"grad_norm": 0.3750493471230682, |
|
"learning_rate": 2.2787672102216045e-06, |
|
"loss": 0.3424, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 7.961538461538462, |
|
"grad_norm": 0.45738143535125575, |
|
"learning_rate": 2.258715197713712e-06, |
|
"loss": 0.2011, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.41140646800685055, |
|
"learning_rate": 2.238678841830867e-06, |
|
"loss": 0.6172, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 8.038461538461538, |
|
"grad_norm": 0.4519178763903938, |
|
"learning_rate": 2.2186594427034868e-06, |
|
"loss": 0.2643, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 8.076923076923077, |
|
"grad_norm": 0.39353646673207954, |
|
"learning_rate": 2.1986582993616926e-06, |
|
"loss": 0.3688, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 8.115384615384615, |
|
"grad_norm": 0.3325326790913655, |
|
"learning_rate": 2.178676709651014e-06, |
|
"loss": 0.1834, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 8.153846153846153, |
|
"grad_norm": 1.7571111944550344, |
|
"learning_rate": 2.1587159701481718e-06, |
|
"loss": 0.362, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 8.192307692307692, |
|
"grad_norm": 0.41759821313528867, |
|
"learning_rate": 2.1387773760769477e-06, |
|
"loss": 0.2023, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 8.23076923076923, |
|
"grad_norm": 0.4314676228849499, |
|
"learning_rate": 2.1188622212241366e-06, |
|
"loss": 0.1954, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 8.26923076923077, |
|
"grad_norm": 0.3877676711204905, |
|
"learning_rate": 2.0989717978555992e-06, |
|
"loss": 0.4745, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 8.307692307692308, |
|
"grad_norm": 0.47245327189394, |
|
"learning_rate": 2.079107396632404e-06, |
|
"loss": 0.2973, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 8.346153846153847, |
|
"grad_norm": 0.4052393199704439, |
|
"learning_rate": 2.0592703065270777e-06, |
|
"loss": 0.2097, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 8.384615384615385, |
|
"grad_norm": 0.4313505494454412, |
|
"learning_rate": 2.0394618147399713e-06, |
|
"loss": 0.2667, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 8.423076923076923, |
|
"grad_norm": 0.4454566796379792, |
|
"learning_rate": 2.019683206615729e-06, |
|
"loss": 0.2783, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 8.461538461538462, |
|
"grad_norm": 0.42931541141385865, |
|
"learning_rate": 1.9999357655598894e-06, |
|
"loss": 0.331, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.5368759318605051, |
|
"learning_rate": 1.9802207729556023e-06, |
|
"loss": 0.4092, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 8.538461538461538, |
|
"grad_norm": 0.39319040108908987, |
|
"learning_rate": 1.960539508080485e-06, |
|
"loss": 0.3224, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 8.576923076923077, |
|
"grad_norm": 0.33398610783086696, |
|
"learning_rate": 1.940893248023612e-06, |
|
"loss": 0.3013, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 8.615384615384615, |
|
"grad_norm": 0.41372509168979466, |
|
"learning_rate": 1.921283267602643e-06, |
|
"loss": 0.4431, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 8.653846153846153, |
|
"grad_norm": 0.45779278911697124, |
|
"learning_rate": 1.9017108392811065e-06, |
|
"loss": 0.2524, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 8.692307692307692, |
|
"grad_norm": 0.40817506076516025, |
|
"learning_rate": 1.8821772330858259e-06, |
|
"loss": 0.2038, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 8.73076923076923, |
|
"grad_norm": 0.37963645007803587, |
|
"learning_rate": 1.8626837165245165e-06, |
|
"loss": 0.385, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 8.76923076923077, |
|
"grad_norm": 0.48116684628673423, |
|
"learning_rate": 1.8432315545035328e-06, |
|
"loss": 0.2815, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 8.807692307692308, |
|
"grad_norm": 0.5907769436853136, |
|
"learning_rate": 1.8238220092457909e-06, |
|
"loss": 0.3713, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 8.846153846153847, |
|
"grad_norm": 0.6129928044002855, |
|
"learning_rate": 1.8044563402088686e-06, |
|
"loss": 0.3014, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 8.884615384615385, |
|
"grad_norm": 0.4955121873754913, |
|
"learning_rate": 1.7851358040032773e-06, |
|
"loss": 0.2634, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 8.923076923076923, |
|
"grad_norm": 0.48937225000365864, |
|
"learning_rate": 1.7658616543109237e-06, |
|
"loss": 0.2948, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 8.961538461538462, |
|
"grad_norm": 0.6666285440555229, |
|
"learning_rate": 1.7466351418037608e-06, |
|
"loss": 0.4233, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.370440162908677, |
|
"learning_rate": 1.7274575140626318e-06, |
|
"loss": 0.3876, |
|
"step": 234 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 390, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 368952499765248.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|