sos-sft-base / trainer_state.json
simonycl's picture
Upload folder using huggingface_hub
e287f47 verified
{
"best_metric": 0.04752533510327339,
"best_model_checkpoint": "results/checkpoint-35000",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 36070,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02772387025228722,
"grad_norm": 1.750556230545044,
"learning_rate": 9.999814117181637e-06,
"loss": 6.049,
"step": 100
},
{
"epoch": 0.05544774050457444,
"grad_norm": 0.824866533279419,
"learning_rate": 9.999248953493363e-06,
"loss": 3.0817,
"step": 200
},
{
"epoch": 0.08317161075686166,
"grad_norm": 0.4907461702823639,
"learning_rate": 9.998304532844263e-06,
"loss": 2.3969,
"step": 300
},
{
"epoch": 0.11089548100914888,
"grad_norm": 0.4534800946712494,
"learning_rate": 9.996980926880713e-06,
"loss": 2.0935,
"step": 400
},
{
"epoch": 0.1386193512614361,
"grad_norm": 0.47491493821144104,
"learning_rate": 9.995278236015153e-06,
"loss": 1.9245,
"step": 500
},
{
"epoch": 0.1386193512614361,
"eval_valid_loss": 1.7945984601974487,
"eval_valid_runtime": 6.4498,
"eval_valid_samples_per_second": 214.426,
"eval_valid_steps_per_second": 6.822,
"step": 500
},
{
"epoch": 0.1386193512614361,
"eval_valid_target_loss": 1.875697374343872,
"eval_valid_target_runtime": 6.5527,
"eval_valid_target_samples_per_second": 218.841,
"eval_valid_target_steps_per_second": 6.867,
"step": 500
},
{
"epoch": 0.16634322151372333,
"grad_norm": 0.5983259677886963,
"learning_rate": 9.99319658941846e-06,
"loss": 1.8294,
"step": 600
},
{
"epoch": 0.19406709176601053,
"grad_norm": 0.6906803846359253,
"learning_rate": 9.990736145010146e-06,
"loss": 1.7625,
"step": 700
},
{
"epoch": 0.22179096201829776,
"grad_norm": 1.4024661779403687,
"learning_rate": 9.987897089446381e-06,
"loss": 1.709,
"step": 800
},
{
"epoch": 0.24951483227058496,
"grad_norm": 1.073205590248108,
"learning_rate": 9.984679638105837e-06,
"loss": 1.6595,
"step": 900
},
{
"epoch": 0.2772387025228722,
"grad_norm": 1.280462384223938,
"learning_rate": 9.981084035073337e-06,
"loss": 1.6153,
"step": 1000
},
{
"epoch": 0.2772387025228722,
"eval_valid_loss": 1.5186923742294312,
"eval_valid_runtime": 6.4198,
"eval_valid_samples_per_second": 215.427,
"eval_valid_steps_per_second": 6.854,
"step": 1000
},
{
"epoch": 0.2772387025228722,
"eval_valid_target_loss": 1.5994268655776978,
"eval_valid_target_runtime": 6.5778,
"eval_valid_target_samples_per_second": 218.006,
"eval_valid_target_steps_per_second": 6.841,
"step": 1000
},
{
"epoch": 0.3049625727751594,
"grad_norm": 0.9407665133476257,
"learning_rate": 9.977110553121353e-06,
"loss": 1.567,
"step": 1100
},
{
"epoch": 0.33268644302744665,
"grad_norm": 1.5439337491989136,
"learning_rate": 9.972759493689301e-06,
"loss": 1.5275,
"step": 1200
},
{
"epoch": 0.36041031327973383,
"grad_norm": 2.2176036834716797,
"learning_rate": 9.968031186860677e-06,
"loss": 1.4833,
"step": 1300
},
{
"epoch": 0.38813418353202106,
"grad_norm": 1.6237233877182007,
"learning_rate": 9.962925991338018e-06,
"loss": 1.4457,
"step": 1400
},
{
"epoch": 0.4158580537843083,
"grad_norm": 1.3075989484786987,
"learning_rate": 9.957444294415685e-06,
"loss": 1.407,
"step": 1500
},
{
"epoch": 0.4158580537843083,
"eval_valid_loss": 1.326136589050293,
"eval_valid_runtime": 6.413,
"eval_valid_samples_per_second": 215.655,
"eval_valid_steps_per_second": 6.861,
"step": 1500
},
{
"epoch": 0.4158580537843083,
"eval_valid_target_loss": 1.3982958793640137,
"eval_valid_target_runtime": 6.5728,
"eval_valid_target_samples_per_second": 218.172,
"eval_valid_target_steps_per_second": 6.846,
"step": 1500
},
{
"epoch": 0.4435819240365955,
"grad_norm": 1.379807472229004,
"learning_rate": 9.951586511950491e-06,
"loss": 1.3768,
"step": 1600
},
{
"epoch": 0.47130579428888275,
"grad_norm": 0.737086832523346,
"learning_rate": 9.945353088330137e-06,
"loss": 1.347,
"step": 1700
},
{
"epoch": 0.4990296645411699,
"grad_norm": 0.6332296133041382,
"learning_rate": 9.93874449643952e-06,
"loss": 1.3188,
"step": 1800
},
{
"epoch": 0.5267535347934572,
"grad_norm": 0.6948099732398987,
"learning_rate": 9.931761237624833e-06,
"loss": 1.2903,
"step": 1900
},
{
"epoch": 0.5544774050457444,
"grad_norm": 0.9397527575492859,
"learning_rate": 9.924403841655565e-06,
"loss": 1.2671,
"step": 2000
},
{
"epoch": 0.5544774050457444,
"eval_valid_loss": 1.2014020681381226,
"eval_valid_runtime": 6.4367,
"eval_valid_samples_per_second": 214.861,
"eval_valid_steps_per_second": 6.836,
"step": 2000
},
{
"epoch": 0.5544774050457444,
"eval_valid_target_loss": 1.2820453643798828,
"eval_valid_target_runtime": 6.5614,
"eval_valid_target_samples_per_second": 218.55,
"eval_valid_target_steps_per_second": 6.858,
"step": 2000
},
{
"epoch": 0.5822012752980316,
"grad_norm": 0.5302172303199768,
"learning_rate": 9.916672866684275e-06,
"loss": 1.2439,
"step": 2100
},
{
"epoch": 0.6099251455503188,
"grad_norm": 0.5439279675483704,
"learning_rate": 9.908568899204281e-06,
"loss": 1.2231,
"step": 2200
},
{
"epoch": 0.637649015802606,
"grad_norm": 0.7026234865188599,
"learning_rate": 9.90009255400514e-06,
"loss": 1.2027,
"step": 2300
},
{
"epoch": 0.6653728860548933,
"grad_norm": 0.642803430557251,
"learning_rate": 9.89124447412603e-06,
"loss": 1.1864,
"step": 2400
},
{
"epoch": 0.6930967563071805,
"grad_norm": 1.3601601123809814,
"learning_rate": 9.882025330806952e-06,
"loss": 1.1654,
"step": 2500
},
{
"epoch": 0.6930967563071805,
"eval_valid_loss": 1.1063387393951416,
"eval_valid_runtime": 6.4314,
"eval_valid_samples_per_second": 215.037,
"eval_valid_steps_per_second": 6.841,
"step": 2500
},
{
"epoch": 0.6930967563071805,
"eval_valid_target_loss": 1.208246111869812,
"eval_valid_target_runtime": 6.5564,
"eval_valid_target_samples_per_second": 218.719,
"eval_valid_target_steps_per_second": 6.864,
"step": 2500
},
{
"epoch": 0.7208206265594677,
"grad_norm": 0.7053922414779663,
"learning_rate": 9.872435823437816e-06,
"loss": 1.1433,
"step": 2600
},
{
"epoch": 0.748544496811755,
"grad_norm": 0.6601741909980774,
"learning_rate": 9.862476679505384e-06,
"loss": 1.1193,
"step": 2700
},
{
"epoch": 0.7762683670640421,
"grad_norm": 0.7706498503684998,
"learning_rate": 9.852148654538072e-06,
"loss": 1.0954,
"step": 2800
},
{
"epoch": 0.8039922373163294,
"grad_norm": 0.8355486392974854,
"learning_rate": 9.841452532048648e-06,
"loss": 1.069,
"step": 2900
},
{
"epoch": 0.8317161075686166,
"grad_norm": 0.8369494676589966,
"learning_rate": 9.830389123474773e-06,
"loss": 1.0384,
"step": 3000
},
{
"epoch": 0.8317161075686166,
"eval_valid_loss": 0.9615023732185364,
"eval_valid_runtime": 6.4156,
"eval_valid_samples_per_second": 215.57,
"eval_valid_steps_per_second": 6.858,
"step": 3000
},
{
"epoch": 0.8317161075686166,
"eval_valid_target_loss": 1.0947415828704834,
"eval_valid_target_runtime": 6.5753,
"eval_valid_target_samples_per_second": 218.088,
"eval_valid_target_steps_per_second": 6.844,
"step": 3000
},
{
"epoch": 0.8594399778209038,
"grad_norm": 1.4864110946655273,
"learning_rate": 9.818959268117464e-06,
"loss": 1.0103,
"step": 3100
},
{
"epoch": 0.887163848073191,
"grad_norm": 0.7728907465934753,
"learning_rate": 9.807163833077407e-06,
"loss": 0.982,
"step": 3200
},
{
"epoch": 0.9148877183254782,
"grad_norm": 0.6881595253944397,
"learning_rate": 9.795003713189187e-06,
"loss": 0.9492,
"step": 3300
},
{
"epoch": 0.9426115885777655,
"grad_norm": 1.0222816467285156,
"learning_rate": 9.782479830953388e-06,
"loss": 0.9142,
"step": 3400
},
{
"epoch": 0.9703354588300527,
"grad_norm": 0.6671555042266846,
"learning_rate": 9.769593136466633e-06,
"loss": 0.8838,
"step": 3500
},
{
"epoch": 0.9703354588300527,
"eval_valid_loss": 0.8037808537483215,
"eval_valid_runtime": 6.4314,
"eval_valid_samples_per_second": 215.038,
"eval_valid_steps_per_second": 6.841,
"step": 3500
},
{
"epoch": 0.9703354588300527,
"eval_valid_target_loss": 0.9639121294021606,
"eval_valid_target_runtime": 6.6053,
"eval_valid_target_samples_per_second": 217.1,
"eval_valid_target_steps_per_second": 6.813,
"step": 3500
},
{
"epoch": 0.9980593290823399,
"grad_norm": 0.7793981432914734,
"learning_rate": 9.756344607349483e-06,
"loss": 0.8496,
"step": 3600
},
{
"epoch": 1.0257831993346271,
"grad_norm": 0.7545821070671082,
"learning_rate": 9.74273524867229e-06,
"loss": 0.8117,
"step": 3700
},
{
"epoch": 1.0535070695869144,
"grad_norm": 0.631118893623352,
"learning_rate": 9.728766092878934e-06,
"loss": 0.7749,
"step": 3800
},
{
"epoch": 1.0812309398392015,
"grad_norm": 0.7934292554855347,
"learning_rate": 9.714438199708516e-06,
"loss": 0.7321,
"step": 3900
},
{
"epoch": 1.1089548100914888,
"grad_norm": 0.6160613298416138,
"learning_rate": 9.699752656114947e-06,
"loss": 0.6891,
"step": 4000
},
{
"epoch": 1.1089548100914888,
"eval_valid_loss": 0.5853330492973328,
"eval_valid_runtime": 6.4069,
"eval_valid_samples_per_second": 215.861,
"eval_valid_steps_per_second": 6.868,
"step": 4000
},
{
"epoch": 1.1089548100914888,
"eval_valid_target_loss": 0.7543638944625854,
"eval_valid_target_runtime": 6.5591,
"eval_valid_target_samples_per_second": 218.627,
"eval_valid_target_steps_per_second": 6.861,
"step": 4000
},
{
"epoch": 1.136678680343776,
"grad_norm": 0.4765689969062805,
"learning_rate": 9.684710576184504e-06,
"loss": 0.6383,
"step": 4100
},
{
"epoch": 1.1644025505960631,
"grad_norm": 0.7610909938812256,
"learning_rate": 9.669313101051295e-06,
"loss": 0.5894,
"step": 4200
},
{
"epoch": 1.1921264208483504,
"grad_norm": 0.5010733008384705,
"learning_rate": 9.653561398810706e-06,
"loss": 0.5446,
"step": 4300
},
{
"epoch": 1.2198502911006377,
"grad_norm": 0.6305666565895081,
"learning_rate": 9.637456664430776e-06,
"loss": 0.5097,
"step": 4400
},
{
"epoch": 1.247574161352925,
"grad_norm": 0.8064519762992859,
"learning_rate": 9.621000119661545e-06,
"loss": 0.4678,
"step": 4500
},
{
"epoch": 1.247574161352925,
"eval_valid_loss": 0.38276800513267517,
"eval_valid_runtime": 6.4349,
"eval_valid_samples_per_second": 214.922,
"eval_valid_steps_per_second": 6.838,
"step": 4500
},
{
"epoch": 1.247574161352925,
"eval_valid_target_loss": 0.4976137578487396,
"eval_valid_target_runtime": 6.5738,
"eval_valid_target_samples_per_second": 218.139,
"eval_valid_target_steps_per_second": 6.845,
"step": 4500
},
{
"epoch": 1.275298031605212,
"grad_norm": 0.49154090881347656,
"learning_rate": 9.604193012942375e-06,
"loss": 0.4326,
"step": 4600
},
{
"epoch": 1.3030219018574993,
"grad_norm": 0.5592367053031921,
"learning_rate": 9.587036619307226e-06,
"loss": 0.4054,
"step": 4700
},
{
"epoch": 1.3307457721097866,
"grad_norm": 0.48195400834083557,
"learning_rate": 9.569532240287946e-06,
"loss": 0.3828,
"step": 4800
},
{
"epoch": 1.3584696423620737,
"grad_norm": 0.5364578366279602,
"learning_rate": 9.551681203815517e-06,
"loss": 0.3595,
"step": 4900
},
{
"epoch": 1.386193512614361,
"grad_norm": 0.5409713387489319,
"learning_rate": 9.533484864119327e-06,
"loss": 0.3405,
"step": 5000
},
{
"epoch": 1.386193512614361,
"eval_valid_loss": 0.2857649326324463,
"eval_valid_runtime": 6.4118,
"eval_valid_samples_per_second": 215.697,
"eval_valid_steps_per_second": 6.862,
"step": 5000
},
{
"epoch": 1.386193512614361,
"eval_valid_target_loss": 0.33146464824676514,
"eval_valid_target_runtime": 6.5717,
"eval_valid_target_samples_per_second": 218.209,
"eval_valid_target_steps_per_second": 6.848,
"step": 5000
},
{
"epoch": 1.4139173828666483,
"grad_norm": 0.7294422388076782,
"learning_rate": 9.514944601624427e-06,
"loss": 0.328,
"step": 5100
},
{
"epoch": 1.4416412531189353,
"grad_norm": 0.4695785343647003,
"learning_rate": 9.49606182284681e-06,
"loss": 0.3095,
"step": 5200
},
{
"epoch": 1.4693651233712226,
"grad_norm": 0.5484552979469299,
"learning_rate": 9.476837960286707e-06,
"loss": 0.3016,
"step": 5300
},
{
"epoch": 1.49708899362351,
"grad_norm": 0.38614729046821594,
"learning_rate": 9.457274472319919e-06,
"loss": 0.2875,
"step": 5400
},
{
"epoch": 1.524812863875797,
"grad_norm": 0.3303731381893158,
"learning_rate": 9.437372843087175e-06,
"loss": 0.2821,
"step": 5500
},
{
"epoch": 1.524812863875797,
"eval_valid_loss": 0.23669035732746124,
"eval_valid_runtime": 6.4303,
"eval_valid_samples_per_second": 215.074,
"eval_valid_steps_per_second": 6.843,
"step": 5500
},
{
"epoch": 1.524812863875797,
"eval_valid_target_loss": 0.2617432773113251,
"eval_valid_target_runtime": 6.5556,
"eval_valid_target_samples_per_second": 218.744,
"eval_valid_target_steps_per_second": 6.864,
"step": 5500
},
{
"epoch": 1.5525367341280842,
"grad_norm": 0.5144414305686951,
"learning_rate": 9.417134582381548e-06,
"loss": 0.2696,
"step": 5600
},
{
"epoch": 1.5802606043803715,
"grad_norm": 0.5522892475128174,
"learning_rate": 9.396561225533902e-06,
"loss": 0.2617,
"step": 5700
},
{
"epoch": 1.6079844746326586,
"grad_norm": 0.4152807295322418,
"learning_rate": 9.37565433329644e-06,
"loss": 0.2522,
"step": 5800
},
{
"epoch": 1.635708344884946,
"grad_norm": 0.3866608142852783,
"learning_rate": 9.35441549172428e-06,
"loss": 0.2469,
"step": 5900
},
{
"epoch": 1.6634322151372332,
"grad_norm": 0.3131564259529114,
"learning_rate": 9.33284631205515e-06,
"loss": 0.2425,
"step": 6000
},
{
"epoch": 1.6634322151372332,
"eval_valid_loss": 0.20471729338169098,
"eval_valid_runtime": 6.4284,
"eval_valid_samples_per_second": 215.138,
"eval_valid_steps_per_second": 6.845,
"step": 6000
},
{
"epoch": 1.6634322151372332,
"eval_valid_target_loss": 0.2232024222612381,
"eval_valid_target_runtime": 6.5873,
"eval_valid_target_samples_per_second": 217.69,
"eval_valid_target_steps_per_second": 6.831,
"step": 6000
},
{
"epoch": 1.6911560853895202,
"grad_norm": 0.4385012090206146,
"learning_rate": 9.31094843058714e-06,
"loss": 0.2346,
"step": 6100
},
{
"epoch": 1.7188799556418077,
"grad_norm": 0.3904290497303009,
"learning_rate": 9.28872350855458e-06,
"loss": 0.2279,
"step": 6200
},
{
"epoch": 1.7466038258940948,
"grad_norm": 0.4294661581516266,
"learning_rate": 9.266173232002005e-06,
"loss": 0.2218,
"step": 6300
},
{
"epoch": 1.774327696146382,
"grad_norm": 0.40256062150001526,
"learning_rate": 9.243299311656253e-06,
"loss": 0.2189,
"step": 6400
},
{
"epoch": 1.8020515663986694,
"grad_norm": 0.39798569679260254,
"learning_rate": 9.220103482796683e-06,
"loss": 0.2154,
"step": 6500
},
{
"epoch": 1.8020515663986694,
"eval_valid_loss": 0.18116505444049835,
"eval_valid_runtime": 6.4306,
"eval_valid_samples_per_second": 215.065,
"eval_valid_steps_per_second": 6.842,
"step": 6500
},
{
"epoch": 1.8020515663986694,
"eval_valid_target_loss": 0.19611063599586487,
"eval_valid_target_runtime": 6.5521,
"eval_valid_target_samples_per_second": 218.86,
"eval_valid_target_steps_per_second": 6.868,
"step": 6500
},
{
"epoch": 1.8297754366509564,
"grad_norm": 0.2555886507034302,
"learning_rate": 9.196587505123526e-06,
"loss": 0.2082,
"step": 6600
},
{
"epoch": 1.8574993069032437,
"grad_norm": 0.278145968914032,
"learning_rate": 9.172753162624401e-06,
"loss": 0.2025,
"step": 6700
},
{
"epoch": 1.885223177155531,
"grad_norm": 0.43592485785484314,
"learning_rate": 9.148602263438967e-06,
"loss": 0.2006,
"step": 6800
},
{
"epoch": 1.912947047407818,
"grad_norm": 0.3828723430633545,
"learning_rate": 9.124136639721757e-06,
"loss": 0.1963,
"step": 6900
},
{
"epoch": 1.9406709176601054,
"grad_norm": 0.3468044102191925,
"learning_rate": 9.09935814750318e-06,
"loss": 0.1928,
"step": 7000
},
{
"epoch": 1.9406709176601054,
"eval_valid_loss": 0.16255635023117065,
"eval_valid_runtime": 6.4262,
"eval_valid_samples_per_second": 215.213,
"eval_valid_steps_per_second": 6.847,
"step": 7000
},
{
"epoch": 1.9406709176601054,
"eval_valid_target_loss": 0.17588204145431519,
"eval_valid_target_runtime": 6.5759,
"eval_valid_target_samples_per_second": 218.07,
"eval_valid_target_steps_per_second": 6.843,
"step": 7000
},
{
"epoch": 1.9683947879123926,
"grad_norm": 0.28793609142303467,
"learning_rate": 9.074268666548728e-06,
"loss": 0.1868,
"step": 7100
},
{
"epoch": 1.9961186581646797,
"grad_norm": 0.4627343714237213,
"learning_rate": 9.04887010021636e-06,
"loss": 0.1857,
"step": 7200
},
{
"epoch": 2.023842528416967,
"grad_norm": 0.4490989148616791,
"learning_rate": 9.023164375312117e-06,
"loss": 0.1786,
"step": 7300
},
{
"epoch": 2.0515663986692543,
"grad_norm": 0.319859117269516,
"learning_rate": 8.997153441943944e-06,
"loss": 0.1779,
"step": 7400
},
{
"epoch": 2.0792902689215413,
"grad_norm": 0.3379845917224884,
"learning_rate": 8.970839273373748e-06,
"loss": 0.1717,
"step": 7500
},
{
"epoch": 2.0792902689215413,
"eval_valid_loss": 0.1455078125,
"eval_valid_runtime": 6.4396,
"eval_valid_samples_per_second": 214.766,
"eval_valid_steps_per_second": 6.833,
"step": 7500
},
{
"epoch": 2.0792902689215413,
"eval_valid_target_loss": 0.15758885443210602,
"eval_valid_target_runtime": 6.5627,
"eval_valid_target_samples_per_second": 218.508,
"eval_valid_target_steps_per_second": 6.857,
"step": 7500
},
{
"epoch": 2.107014139173829,
"grad_norm": 0.3079555928707123,
"learning_rate": 8.944223865867712e-06,
"loss": 0.1688,
"step": 7600
},
{
"epoch": 2.134738009426116,
"grad_norm": 0.346603125333786,
"learning_rate": 8.917309238544834e-06,
"loss": 0.1661,
"step": 7700
},
{
"epoch": 2.162461879678403,
"grad_norm": 0.3899448812007904,
"learning_rate": 8.890097433223766e-06,
"loss": 0.1653,
"step": 7800
},
{
"epoch": 2.1901857499306905,
"grad_norm": 0.31352731585502625,
"learning_rate": 8.862590514267915e-06,
"loss": 0.1609,
"step": 7900
},
{
"epoch": 2.2179096201829775,
"grad_norm": 0.29558128118515015,
"learning_rate": 8.834790568428827e-06,
"loss": 0.158,
"step": 8000
},
{
"epoch": 2.2179096201829775,
"eval_valid_loss": 0.1319538652896881,
"eval_valid_runtime": 6.417,
"eval_valid_samples_per_second": 215.521,
"eval_valid_steps_per_second": 6.857,
"step": 8000
},
{
"epoch": 2.2179096201829775,
"eval_valid_target_loss": 0.1427442878484726,
"eval_valid_target_runtime": 6.5854,
"eval_valid_target_samples_per_second": 217.754,
"eval_valid_target_steps_per_second": 6.833,
"step": 8000
},
{
"epoch": 2.2456334904352646,
"grad_norm": 0.29061177372932434,
"learning_rate": 8.80669970468788e-06,
"loss": 0.1545,
"step": 8100
},
{
"epoch": 2.273357360687552,
"grad_norm": 0.3253875970840454,
"learning_rate": 8.778320054096306e-06,
"loss": 0.1528,
"step": 8200
},
{
"epoch": 2.301081230939839,
"grad_norm": 0.2402360886335373,
"learning_rate": 8.749653769613502e-06,
"loss": 0.1511,
"step": 8300
},
{
"epoch": 2.3288051011921262,
"grad_norm": 0.31634458899497986,
"learning_rate": 8.720703025943717e-06,
"loss": 0.1461,
"step": 8400
},
{
"epoch": 2.3565289714444138,
"grad_norm": 0.21685920655727386,
"learning_rate": 8.691470019371065e-06,
"loss": 0.143,
"step": 8500
},
{
"epoch": 2.3565289714444138,
"eval_valid_loss": 0.12121625989675522,
"eval_valid_runtime": 6.4171,
"eval_valid_samples_per_second": 215.519,
"eval_valid_steps_per_second": 6.857,
"step": 8500
},
{
"epoch": 2.3565289714444138,
"eval_valid_target_loss": 0.1312141716480255,
"eval_valid_target_runtime": 6.57,
"eval_valid_target_samples_per_second": 218.266,
"eval_valid_target_steps_per_second": 6.849,
"step": 8500
},
{
"epoch": 2.384252841696701,
"grad_norm": 0.24635937809944153,
"learning_rate": 8.661956967592907e-06,
"loss": 0.1424,
"step": 8600
},
{
"epoch": 2.411976711948988,
"grad_norm": 0.21958141028881073,
"learning_rate": 8.632166109551623e-06,
"loss": 0.1388,
"step": 8700
},
{
"epoch": 2.4397005822012754,
"grad_norm": 0.2693657875061035,
"learning_rate": 8.60209970526474e-06,
"loss": 0.1392,
"step": 8800
},
{
"epoch": 2.4674244524535625,
"grad_norm": 0.22512082755565643,
"learning_rate": 8.5717600356535e-06,
"loss": 0.1356,
"step": 8900
},
{
"epoch": 2.49514832270585,
"grad_norm": 0.3446211516857147,
"learning_rate": 8.541149402369806e-06,
"loss": 0.1324,
"step": 9000
},
{
"epoch": 2.49514832270585,
"eval_valid_loss": 0.11042323708534241,
"eval_valid_runtime": 6.4273,
"eval_valid_samples_per_second": 215.176,
"eval_valid_steps_per_second": 6.846,
"step": 9000
},
{
"epoch": 2.49514832270585,
"eval_valid_target_loss": 0.11918216943740845,
"eval_valid_target_runtime": 6.5885,
"eval_valid_target_samples_per_second": 217.651,
"eval_valid_target_steps_per_second": 6.83,
"step": 9000
},
{
"epoch": 2.522872192958137,
"grad_norm": 0.21913643181324005,
"learning_rate": 8.51027012762163e-06,
"loss": 0.1303,
"step": 9100
},
{
"epoch": 2.550596063210424,
"grad_norm": 0.24243904650211334,
"learning_rate": 8.479124553996824e-06,
"loss": 0.1268,
"step": 9200
},
{
"epoch": 2.578319933462711,
"grad_norm": 0.22184187173843384,
"learning_rate": 8.447715044285425e-06,
"loss": 0.1251,
"step": 9300
},
{
"epoch": 2.6060438037149987,
"grad_norm": 0.22888724505901337,
"learning_rate": 8.41604398130039e-06,
"loss": 0.1221,
"step": 9400
},
{
"epoch": 2.6337676739672857,
"grad_norm": 0.24152572453022003,
"learning_rate": 8.384113767696838e-06,
"loss": 0.121,
"step": 9500
},
{
"epoch": 2.6337676739672857,
"eval_valid_loss": 0.10074004530906677,
"eval_valid_runtime": 6.4317,
"eval_valid_samples_per_second": 215.03,
"eval_valid_steps_per_second": 6.841,
"step": 9500
},
{
"epoch": 2.6337676739672857,
"eval_valid_target_loss": 0.10891123861074448,
"eval_valid_target_runtime": 6.5593,
"eval_valid_target_samples_per_second": 218.622,
"eval_valid_target_steps_per_second": 6.861,
"step": 9500
},
{
"epoch": 2.6614915442195732,
"grad_norm": 0.2756216526031494,
"learning_rate": 8.35192682578978e-06,
"loss": 0.1195,
"step": 9600
},
{
"epoch": 2.6892154144718603,
"grad_norm": 0.24438254535198212,
"learning_rate": 8.319485597370348e-06,
"loss": 0.1157,
"step": 9700
},
{
"epoch": 2.7169392847241474,
"grad_norm": 0.35991132259368896,
"learning_rate": 8.286792543520556e-06,
"loss": 0.115,
"step": 9800
},
{
"epoch": 2.744663154976435,
"grad_norm": 0.22763152420520782,
"learning_rate": 8.253850144426606e-06,
"loss": 0.1134,
"step": 9900
},
{
"epoch": 2.772387025228722,
"grad_norm": 0.24357567727565765,
"learning_rate": 8.220660899190712e-06,
"loss": 0.1106,
"step": 10000
},
{
"epoch": 2.772387025228722,
"eval_valid_loss": 0.092686228454113,
"eval_valid_runtime": 6.4287,
"eval_valid_samples_per_second": 215.129,
"eval_valid_steps_per_second": 6.844,
"step": 10000
},
{
"epoch": 2.772387025228722,
"eval_valid_target_loss": 0.1005280539393425,
"eval_valid_target_runtime": 6.5902,
"eval_valid_target_samples_per_second": 217.596,
"eval_valid_target_steps_per_second": 6.828,
"step": 10000
},
{
"epoch": 2.800110895481009,
"grad_norm": 0.20446299016475677,
"learning_rate": 8.187227325641534e-06,
"loss": 0.109,
"step": 10100
},
{
"epoch": 2.8278347657332965,
"grad_norm": 0.24309873580932617,
"learning_rate": 8.153551960143157e-06,
"loss": 0.1087,
"step": 10200
},
{
"epoch": 2.8555586359855836,
"grad_norm": 0.21243679523468018,
"learning_rate": 8.119637357402676e-06,
"loss": 0.1063,
"step": 10300
},
{
"epoch": 2.8832825062378706,
"grad_norm": 0.2227753847837448,
"learning_rate": 8.085486090276391e-06,
"loss": 0.1057,
"step": 10400
},
{
"epoch": 2.911006376490158,
"grad_norm": 0.1933346837759018,
"learning_rate": 8.05110074957462e-06,
"loss": 0.1037,
"step": 10500
},
{
"epoch": 2.911006376490158,
"eval_valid_loss": 0.08755628019571304,
"eval_valid_runtime": 6.4374,
"eval_valid_samples_per_second": 214.84,
"eval_valid_steps_per_second": 6.835,
"step": 10500
},
{
"epoch": 2.911006376490158,
"eval_valid_target_loss": 0.09479602426290512,
"eval_valid_target_runtime": 6.5624,
"eval_valid_target_samples_per_second": 218.517,
"eval_valid_target_steps_per_second": 6.857,
"step": 10500
},
{
"epoch": 2.938730246742445,
"grad_norm": 0.24507193267345428,
"learning_rate": 8.016483943865158e-06,
"loss": 0.1026,
"step": 10600
},
{
"epoch": 2.9664541169947327,
"grad_norm": 0.16903254389762878,
"learning_rate": 7.98163829927538e-06,
"loss": 0.1019,
"step": 10700
},
{
"epoch": 2.99417798724702,
"grad_norm": 0.21406187117099762,
"learning_rate": 7.946566459293014e-06,
"loss": 0.1016,
"step": 10800
},
{
"epoch": 3.021901857499307,
"grad_norm": 0.17749078571796417,
"learning_rate": 7.911271084565603e-06,
"loss": 0.0988,
"step": 10900
},
{
"epoch": 3.049625727751594,
"grad_norm": 0.2052767425775528,
"learning_rate": 7.875754852698658e-06,
"loss": 0.099,
"step": 11000
},
{
"epoch": 3.049625727751594,
"eval_valid_loss": 0.08359777182340622,
"eval_valid_runtime": 6.4134,
"eval_valid_samples_per_second": 215.643,
"eval_valid_steps_per_second": 6.861,
"step": 11000
},
{
"epoch": 3.049625727751594,
"eval_valid_target_loss": 0.09044167399406433,
"eval_valid_target_runtime": 6.5678,
"eval_valid_target_samples_per_second": 218.336,
"eval_valid_target_steps_per_second": 6.852,
"step": 11000
},
{
"epoch": 3.0773495980038814,
"grad_norm": 0.20621031522750854,
"learning_rate": 7.840020458052529e-06,
"loss": 0.0961,
"step": 11100
},
{
"epoch": 3.1050734682561685,
"grad_norm": 0.18608888983726501,
"learning_rate": 7.804070611538001e-06,
"loss": 0.0964,
"step": 11200
},
{
"epoch": 3.132797338508456,
"grad_norm": 0.14550629258155823,
"learning_rate": 7.767908040410642e-06,
"loss": 0.0957,
"step": 11300
},
{
"epoch": 3.160521208760743,
"grad_norm": 0.21664443612098694,
"learning_rate": 7.731535488063895e-06,
"loss": 0.0948,
"step": 11400
},
{
"epoch": 3.18824507901303,
"grad_norm": 0.17702756822109222,
"learning_rate": 7.694955713820974e-06,
"loss": 0.0935,
"step": 11500
},
{
"epoch": 3.18824507901303,
"eval_valid_loss": 0.07985392957925797,
"eval_valid_runtime": 6.4194,
"eval_valid_samples_per_second": 215.442,
"eval_valid_steps_per_second": 6.854,
"step": 11500
},
{
"epoch": 3.18824507901303,
"eval_valid_target_loss": 0.08640262484550476,
"eval_valid_target_runtime": 6.5608,
"eval_valid_target_samples_per_second": 218.572,
"eval_valid_target_steps_per_second": 6.859,
"step": 11500
},
{
"epoch": 3.2159689492653176,
"grad_norm": 0.19913919270038605,
"learning_rate": 7.658171492725513e-06,
"loss": 0.0936,
"step": 11600
},
{
"epoch": 3.2436928195176047,
"grad_norm": 0.18789726495742798,
"learning_rate": 7.621185615331061e-06,
"loss": 0.0924,
"step": 11700
},
{
"epoch": 3.2714166897698918,
"grad_norm": 0.18376338481903076,
"learning_rate": 7.584000887489373e-06,
"loss": 0.0911,
"step": 11800
},
{
"epoch": 3.2991405600221793,
"grad_norm": 0.19736219942569733,
"learning_rate": 7.546620130137557e-06,
"loss": 0.0912,
"step": 11900
},
{
"epoch": 3.3268644302744663,
"grad_norm": 0.19527922570705414,
"learning_rate": 7.509046179084061e-06,
"loss": 0.0912,
"step": 12000
},
{
"epoch": 3.3268644302744663,
"eval_valid_loss": 0.07622889429330826,
"eval_valid_runtime": 6.4437,
"eval_valid_samples_per_second": 214.627,
"eval_valid_steps_per_second": 6.828,
"step": 12000
},
{
"epoch": 3.3268644302744663,
"eval_valid_target_loss": 0.0823676660656929,
"eval_valid_target_runtime": 6.5589,
"eval_valid_target_samples_per_second": 218.635,
"eval_valid_target_steps_per_second": 6.861,
"step": 12000
},
{
"epoch": 3.3545883005267534,
"grad_norm": 0.18916228413581848,
"learning_rate": 7.471281884793544e-06,
"loss": 0.0896,
"step": 12100
},
{
"epoch": 3.382312170779041,
"grad_norm": 0.1649465262889862,
"learning_rate": 7.4333301121706445e-06,
"loss": 0.0881,
"step": 12200
},
{
"epoch": 3.410036041031328,
"grad_norm": 0.18362993001937866,
"learning_rate": 7.3951937403426186e-06,
"loss": 0.0892,
"step": 12300
},
{
"epoch": 3.437759911283615,
"grad_norm": 0.19268861413002014,
"learning_rate": 7.356875662440939e-06,
"loss": 0.0879,
"step": 12400
},
{
"epoch": 3.4654837815359025,
"grad_norm": 0.17124581336975098,
"learning_rate": 7.318378785381802e-06,
"loss": 0.086,
"step": 12500
},
{
"epoch": 3.4654837815359025,
"eval_valid_loss": 0.07317828387022018,
"eval_valid_runtime": 6.4273,
"eval_valid_samples_per_second": 215.177,
"eval_valid_steps_per_second": 6.846,
"step": 12500
},
{
"epoch": 3.4654837815359025,
"eval_valid_target_loss": 0.07900213450193405,
"eval_valid_target_runtime": 6.5852,
"eval_valid_target_samples_per_second": 217.76,
"eval_valid_target_steps_per_second": 6.833,
"step": 12500
},
{
"epoch": 3.4932076517881896,
"grad_norm": 0.23004941642284393,
"learning_rate": 7.279706029645615e-06,
"loss": 0.0855,
"step": 12600
},
{
"epoch": 3.5209315220404767,
"grad_norm": 0.16131635010242462,
"learning_rate": 7.240860329055422e-06,
"loss": 0.0848,
"step": 12700
},
{
"epoch": 3.548655392292764,
"grad_norm": 0.19867731630802155,
"learning_rate": 7.201844630554353e-06,
"loss": 0.0851,
"step": 12800
},
{
"epoch": 3.5763792625450512,
"grad_norm": 0.17405714094638824,
"learning_rate": 7.162661893982052e-06,
"loss": 0.0839,
"step": 12900
},
{
"epoch": 3.6041031327973387,
"grad_norm": 0.19404906034469604,
"learning_rate": 7.123315091850136e-06,
"loss": 0.0839,
"step": 13000
},
{
"epoch": 3.6041031327973387,
"eval_valid_loss": 0.07132507115602493,
"eval_valid_runtime": 6.4118,
"eval_valid_samples_per_second": 215.695,
"eval_valid_steps_per_second": 6.862,
"step": 13000
},
{
"epoch": 3.6041031327973387,
"eval_valid_target_loss": 0.0771123468875885,
"eval_valid_target_runtime": 6.5745,
"eval_valid_target_samples_per_second": 218.117,
"eval_valid_target_steps_per_second": 6.845,
"step": 13000
},
{
"epoch": 3.631827003049626,
"grad_norm": 0.15152141451835632,
"learning_rate": 7.083807209116689e-06,
"loss": 0.0836,
"step": 13100
},
{
"epoch": 3.659550873301913,
"grad_norm": 0.18368007242679596,
"learning_rate": 7.044141242959826e-06,
"loss": 0.0827,
"step": 13200
},
{
"epoch": 3.6872747435542,
"grad_norm": 0.18081355094909668,
"learning_rate": 7.004320202550303e-06,
"loss": 0.0823,
"step": 13300
},
{
"epoch": 3.7149986138064874,
"grad_norm": 0.15222586691379547,
"learning_rate": 6.9643471088232506e-06,
"loss": 0.0801,
"step": 13400
},
{
"epoch": 3.7427224840587745,
"grad_norm": 0.1571241021156311,
"learning_rate": 6.9242249942489755e-06,
"loss": 0.0807,
"step": 13500
},
{
"epoch": 3.7427224840587745,
"eval_valid_loss": 0.06911951303482056,
"eval_valid_runtime": 6.4701,
"eval_valid_samples_per_second": 213.752,
"eval_valid_steps_per_second": 6.8,
"step": 13500
},
{
"epoch": 3.7427224840587745,
"eval_valid_target_loss": 0.07482416182756424,
"eval_valid_target_runtime": 6.5611,
"eval_valid_target_samples_per_second": 218.56,
"eval_valid_target_steps_per_second": 6.859,
"step": 13500
},
{
"epoch": 3.770446354311062,
"grad_norm": 0.1546078324317932,
"learning_rate": 6.883956902602933e-06,
"loss": 0.0811,
"step": 13600
},
{
"epoch": 3.798170224563349,
"grad_norm": 0.1428447812795639,
"learning_rate": 6.843545888734801e-06,
"loss": 0.0795,
"step": 13700
},
{
"epoch": 3.825894094815636,
"grad_norm": 0.1369272619485855,
"learning_rate": 6.802995018336736e-06,
"loss": 0.0794,
"step": 13800
},
{
"epoch": 3.8536179650679236,
"grad_norm": 0.1972970962524414,
"learning_rate": 6.762307367710797e-06,
"loss": 0.0785,
"step": 13900
},
{
"epoch": 3.8813418353202107,
"grad_norm": 0.15961000323295593,
"learning_rate": 6.721486023535577e-06,
"loss": 0.0787,
"step": 14000
},
{
"epoch": 3.8813418353202107,
"eval_valid_loss": 0.06712613999843597,
"eval_valid_runtime": 6.4106,
"eval_valid_samples_per_second": 215.737,
"eval_valid_steps_per_second": 6.864,
"step": 14000
},
{
"epoch": 3.8813418353202107,
"eval_valid_target_loss": 0.07271508872509003,
"eval_valid_target_runtime": 6.5891,
"eval_valid_target_samples_per_second": 217.633,
"eval_valid_target_steps_per_second": 6.829,
"step": 14000
},
{
"epoch": 3.9090657055724978,
"grad_norm": 0.15836742520332336,
"learning_rate": 6.680534082632036e-06,
"loss": 0.0779,
"step": 14100
},
{
"epoch": 3.9367895758247853,
"grad_norm": 0.1906501203775406,
"learning_rate": 6.639454651728561e-06,
"loss": 0.0772,
"step": 14200
},
{
"epoch": 3.9645134460770723,
"grad_norm": 0.1872212439775467,
"learning_rate": 6.598250847225286e-06,
"loss": 0.0772,
"step": 14300
},
{
"epoch": 3.9922373163293594,
"grad_norm": 0.1689438670873642,
"learning_rate": 6.556925794957678e-06,
"loss": 0.0769,
"step": 14400
},
{
"epoch": 4.0199611865816465,
"grad_norm": 0.1830626279115677,
"learning_rate": 6.515482629959392e-06,
"loss": 0.0764,
"step": 14500
},
{
"epoch": 4.0199611865816465,
"eval_valid_loss": 0.0653899684548378,
"eval_valid_runtime": 6.4271,
"eval_valid_samples_per_second": 215.181,
"eval_valid_steps_per_second": 6.846,
"step": 14500
},
{
"epoch": 4.0199611865816465,
"eval_valid_target_loss": 0.0708317682147026,
"eval_valid_target_runtime": 6.5574,
"eval_valid_target_samples_per_second": 218.684,
"eval_valid_target_steps_per_second": 6.862,
"step": 14500
},
{
"epoch": 4.047685056833934,
"grad_norm": 0.1517285257577896,
"learning_rate": 6.473924496224447e-06,
"loss": 0.0757,
"step": 14600
},
{
"epoch": 4.0754089270862215,
"grad_norm": 0.15981799364089966,
"learning_rate": 6.432254546468708e-06,
"loss": 0.0751,
"step": 14700
},
{
"epoch": 4.1031327973385086,
"grad_norm": 0.14974670112133026,
"learning_rate": 6.3904759418907194e-06,
"loss": 0.0755,
"step": 14800
},
{
"epoch": 4.130856667590796,
"grad_norm": 0.15918827056884766,
"learning_rate": 6.348591851931879e-06,
"loss": 0.0743,
"step": 14900
},
{
"epoch": 4.158580537843083,
"grad_norm": 0.17248332500457764,
"learning_rate": 6.306605454036001e-06,
"loss": 0.0747,
"step": 15000
},
{
"epoch": 4.158580537843083,
"eval_valid_loss": 0.06470626592636108,
"eval_valid_runtime": 6.4429,
"eval_valid_samples_per_second": 214.654,
"eval_valid_steps_per_second": 6.829,
"step": 15000
},
{
"epoch": 4.158580537843083,
"eval_valid_target_loss": 0.07004554569721222,
"eval_valid_target_runtime": 6.5941,
"eval_valid_target_samples_per_second": 217.468,
"eval_valid_target_steps_per_second": 6.824,
"step": 15000
},
{
"epoch": 4.18630440809537,
"grad_norm": 0.18200209736824036,
"learning_rate": 6.2645199334082674e-06,
"loss": 0.0735,
"step": 15100
},
{
"epoch": 4.214028278347658,
"grad_norm": 0.12851852178573608,
"learning_rate": 6.222338482773584e-06,
"loss": 0.0736,
"step": 15200
},
{
"epoch": 4.241752148599945,
"grad_norm": 0.15132804214954376,
"learning_rate": 6.180064302134374e-06,
"loss": 0.0738,
"step": 15300
},
{
"epoch": 4.269476018852232,
"grad_norm": 0.15047667920589447,
"learning_rate": 6.1377005985278205e-06,
"loss": 0.073,
"step": 15400
},
{
"epoch": 4.297199889104519,
"grad_norm": 0.19985252618789673,
"learning_rate": 6.095250585782562e-06,
"loss": 0.0732,
"step": 15500
},
{
"epoch": 4.297199889104519,
"eval_valid_loss": 0.062382254749536514,
"eval_valid_runtime": 6.4347,
"eval_valid_samples_per_second": 214.927,
"eval_valid_steps_per_second": 6.838,
"step": 15500
},
{
"epoch": 4.297199889104519,
"eval_valid_target_loss": 0.06759324669837952,
"eval_valid_target_runtime": 6.5646,
"eval_valid_target_samples_per_second": 218.446,
"eval_valid_target_steps_per_second": 6.855,
"step": 15500
},
{
"epoch": 4.324923759356806,
"grad_norm": 0.16384641826152802,
"learning_rate": 6.0527174842748994e-06,
"loss": 0.0716,
"step": 15600
},
{
"epoch": 4.352647629609093,
"grad_norm": 0.14244656264781952,
"learning_rate": 6.0101045206844676e-06,
"loss": 0.0716,
"step": 15700
},
{
"epoch": 4.380371499861381,
"grad_norm": 0.16209416091442108,
"learning_rate": 5.9674149277494694e-06,
"loss": 0.0714,
"step": 15800
},
{
"epoch": 4.408095370113668,
"grad_norm": 0.17041273415088654,
"learning_rate": 5.92465194402142e-06,
"loss": 0.0715,
"step": 15900
},
{
"epoch": 4.435819240365955,
"grad_norm": 0.16730940341949463,
"learning_rate": 5.881818813619463e-06,
"loss": 0.0714,
"step": 16000
},
{
"epoch": 4.435819240365955,
"eval_valid_loss": 0.061134014278650284,
"eval_valid_runtime": 6.4104,
"eval_valid_samples_per_second": 215.742,
"eval_valid_steps_per_second": 6.864,
"step": 16000
},
{
"epoch": 4.435819240365955,
"eval_valid_target_loss": 0.06638547778129578,
"eval_valid_target_runtime": 6.5651,
"eval_valid_target_samples_per_second": 218.427,
"eval_valid_target_steps_per_second": 6.854,
"step": 16000
},
{
"epoch": 4.463543110618242,
"grad_norm": 0.13161396980285645,
"learning_rate": 5.8389187859842675e-06,
"loss": 0.0703,
"step": 16100
},
{
"epoch": 4.491266980870529,
"grad_norm": 0.13423210382461548,
"learning_rate": 5.7959551156315156e-06,
"loss": 0.0707,
"step": 16200
},
{
"epoch": 4.518990851122817,
"grad_norm": 0.20051045715808868,
"learning_rate": 5.752931061904994e-06,
"loss": 0.0699,
"step": 16300
},
{
"epoch": 4.546714721375104,
"grad_norm": 0.15945318341255188,
"learning_rate": 5.709849888729351e-06,
"loss": 0.0697,
"step": 16400
},
{
"epoch": 4.574438591627391,
"grad_norm": 0.13749030232429504,
"learning_rate": 5.666714864362468e-06,
"loss": 0.0704,
"step": 16500
},
{
"epoch": 4.574438591627391,
"eval_valid_loss": 0.06001834571361542,
"eval_valid_runtime": 6.4467,
"eval_valid_samples_per_second": 214.529,
"eval_valid_steps_per_second": 6.825,
"step": 16500
},
{
"epoch": 4.574438591627391,
"eval_valid_target_loss": 0.06535307317972183,
"eval_valid_target_runtime": 6.5686,
"eval_valid_target_samples_per_second": 218.311,
"eval_valid_target_steps_per_second": 6.851,
"step": 16500
},
{
"epoch": 4.602162461879678,
"grad_norm": 0.133077010512352,
"learning_rate": 5.6235292611475326e-06,
"loss": 0.0693,
"step": 16600
},
{
"epoch": 4.629886332131965,
"grad_norm": 0.1508035957813263,
"learning_rate": 5.580296355264783e-06,
"loss": 0.069,
"step": 16700
},
{
"epoch": 4.6576102023842525,
"grad_norm": 0.14195485413074493,
"learning_rate": 5.537019426482966e-06,
"loss": 0.0695,
"step": 16800
},
{
"epoch": 4.6853340726365404,
"grad_norm": 0.16586261987686157,
"learning_rate": 5.493701757910536e-06,
"loss": 0.0684,
"step": 16900
},
{
"epoch": 4.7130579428888275,
"grad_norm": 0.13865657150745392,
"learning_rate": 5.4503466357465765e-06,
"loss": 0.0682,
"step": 17000
},
{
"epoch": 4.7130579428888275,
"eval_valid_loss": 0.0584811232984066,
"eval_valid_runtime": 6.422,
"eval_valid_samples_per_second": 215.352,
"eval_valid_steps_per_second": 6.851,
"step": 17000
},
{
"epoch": 4.7130579428888275,
"eval_valid_target_loss": 0.06370435655117035,
"eval_valid_target_runtime": 6.5705,
"eval_valid_target_samples_per_second": 218.247,
"eval_valid_target_steps_per_second": 6.849,
"step": 17000
},
{
"epoch": 4.740781813141115,
"grad_norm": 0.1934811919927597,
"learning_rate": 5.406957349031504e-06,
"loss": 0.0686,
"step": 17100
},
{
"epoch": 4.768505683393402,
"grad_norm": 0.16662567853927612,
"learning_rate": 5.363537189397556e-06,
"loss": 0.0682,
"step": 17200
},
{
"epoch": 4.796229553645689,
"grad_norm": 0.15507076680660248,
"learning_rate": 5.320089450819075e-06,
"loss": 0.0673,
"step": 17300
},
{
"epoch": 4.823953423897976,
"grad_norm": 0.12763585150241852,
"learning_rate": 5.276617429362616e-06,
"loss": 0.0671,
"step": 17400
},
{
"epoch": 4.851677294150264,
"grad_norm": 0.15640078485012054,
"learning_rate": 5.233124422936906e-06,
"loss": 0.0669,
"step": 17500
},
{
"epoch": 4.851677294150264,
"eval_valid_loss": 0.05754322186112404,
"eval_valid_runtime": 6.4388,
"eval_valid_samples_per_second": 214.792,
"eval_valid_steps_per_second": 6.834,
"step": 17500
},
{
"epoch": 4.851677294150264,
"eval_valid_target_loss": 0.06262939423322678,
"eval_valid_target_runtime": 6.5536,
"eval_valid_target_samples_per_second": 218.81,
"eval_valid_target_steps_per_second": 6.866,
"step": 17500
},
{
"epoch": 4.879401164402551,
"grad_norm": 0.16545389592647552,
"learning_rate": 5.189613731042645e-06,
"loss": 0.0663,
"step": 17600
},
{
"epoch": 4.907125034654838,
"grad_norm": 0.17085812985897064,
"learning_rate": 5.146088654522208e-06,
"loss": 0.0657,
"step": 17700
},
{
"epoch": 4.934848904907125,
"grad_norm": 0.14638109505176544,
"learning_rate": 5.102552495309222e-06,
"loss": 0.0677,
"step": 17800
},
{
"epoch": 4.962572775159412,
"grad_norm": 0.15568013489246368,
"learning_rate": 5.059008556178079e-06,
"loss": 0.0657,
"step": 17900
},
{
"epoch": 4.9902966454117,
"grad_norm": 0.16898399591445923,
"learning_rate": 5.015460140493381e-06,
"loss": 0.0661,
"step": 18000
},
{
"epoch": 4.9902966454117,
"eval_valid_loss": 0.05648580938577652,
"eval_valid_runtime": 6.4207,
"eval_valid_samples_per_second": 215.397,
"eval_valid_steps_per_second": 6.853,
"step": 18000
},
{
"epoch": 4.9902966454117,
"eval_valid_target_loss": 0.06151015684008598,
"eval_valid_target_runtime": 6.5952,
"eval_valid_target_samples_per_second": 217.432,
"eval_valid_target_steps_per_second": 6.823,
"step": 18000
},
{
"epoch": 5.018020515663987,
"grad_norm": 0.13535688817501068,
"learning_rate": 4.971910551959332e-06,
"loss": 0.0654,
"step": 18100
},
{
"epoch": 5.045744385916274,
"grad_norm": 0.16001687943935394,
"learning_rate": 4.928363094369108e-06,
"loss": 0.0656,
"step": 18200
},
{
"epoch": 5.073468256168561,
"grad_norm": 0.1575719267129898,
"learning_rate": 4.88482107135423e-06,
"loss": 0.0641,
"step": 18300
},
{
"epoch": 5.101192126420848,
"grad_norm": 0.1607745736837387,
"learning_rate": 4.841287786133937e-06,
"loss": 0.0642,
"step": 18400
},
{
"epoch": 5.128915996673135,
"grad_norm": 0.13689269125461578,
"learning_rate": 4.797766541264592e-06,
"loss": 0.0646,
"step": 18500
},
{
"epoch": 5.128915996673135,
"eval_valid_loss": 0.05563423037528992,
"eval_valid_runtime": 6.4248,
"eval_valid_samples_per_second": 215.261,
"eval_valid_steps_per_second": 6.849,
"step": 18500
},
{
"epoch": 5.128915996673135,
"eval_valid_target_loss": 0.06068035215139389,
"eval_valid_target_runtime": 6.561,
"eval_valid_target_samples_per_second": 218.566,
"eval_valid_target_steps_per_second": 6.859,
"step": 18500
},
{
"epoch": 5.156639866925423,
"grad_norm": 0.13576319813728333,
"learning_rate": 4.754260638389145e-06,
"loss": 0.0641,
"step": 18600
},
{
"epoch": 5.18436373717771,
"grad_norm": 0.13574448227882385,
"learning_rate": 4.710773377986659e-06,
"loss": 0.0643,
"step": 18700
},
{
"epoch": 5.212087607429997,
"grad_norm": 0.11536768078804016,
"learning_rate": 4.667308059121928e-06,
"loss": 0.064,
"step": 18800
},
{
"epoch": 5.239811477682284,
"grad_norm": 0.1470881700515747,
"learning_rate": 4.623867979195196e-06,
"loss": 0.0637,
"step": 18900
},
{
"epoch": 5.2675353479345715,
"grad_norm": 0.13156047463417053,
"learning_rate": 4.580456433692017e-06,
"loss": 0.0635,
"step": 19000
},
{
"epoch": 5.2675353479345715,
"eval_valid_loss": 0.05473410338163376,
"eval_valid_runtime": 6.4623,
"eval_valid_samples_per_second": 214.012,
"eval_valid_steps_per_second": 6.809,
"step": 19000
},
{
"epoch": 5.2675353479345715,
"eval_valid_target_loss": 0.05973204970359802,
"eval_valid_target_runtime": 6.5636,
"eval_valid_target_samples_per_second": 218.477,
"eval_valid_target_steps_per_second": 6.856,
"step": 19000
},
{
"epoch": 5.2952592181868585,
"grad_norm": 0.132376030087471,
"learning_rate": 4.537076715933242e-06,
"loss": 0.0638,
"step": 19100
},
{
"epoch": 5.3229830884391465,
"grad_norm": 0.14191821217536926,
"learning_rate": 4.493732116825174e-06,
"loss": 0.064,
"step": 19200
},
{
"epoch": 5.3507069586914335,
"grad_norm": 0.1247839480638504,
"learning_rate": 4.45042592460993e-06,
"loss": 0.0627,
"step": 19300
},
{
"epoch": 5.378430828943721,
"grad_norm": 0.12980355322360992,
"learning_rate": 4.4071614246159596e-06,
"loss": 0.0632,
"step": 19400
},
{
"epoch": 5.406154699196008,
"grad_norm": 0.1391134262084961,
"learning_rate": 4.363941899008833e-06,
"loss": 0.0625,
"step": 19500
},
{
"epoch": 5.406154699196008,
"eval_valid_loss": 0.05415208637714386,
"eval_valid_runtime": 6.4065,
"eval_valid_samples_per_second": 215.873,
"eval_valid_steps_per_second": 6.868,
"step": 19500
},
{
"epoch": 5.406154699196008,
"eval_valid_target_loss": 0.05894719064235687,
"eval_valid_target_runtime": 6.569,
"eval_valid_target_samples_per_second": 218.299,
"eval_valid_target_steps_per_second": 6.85,
"step": 19500
},
{
"epoch": 5.433878569448295,
"grad_norm": 0.2045671045780182,
"learning_rate": 4.320770626542238e-06,
"loss": 0.0629,
"step": 19600
},
{
"epoch": 5.461602439700582,
"grad_norm": 0.1417771577835083,
"learning_rate": 4.277650882309238e-06,
"loss": 0.0625,
"step": 19700
},
{
"epoch": 5.48932630995287,
"grad_norm": 0.14284995198249817,
"learning_rate": 4.234585937493829e-06,
"loss": 0.0623,
"step": 19800
},
{
"epoch": 5.517050180205157,
"grad_norm": 0.1546027809381485,
"learning_rate": 4.1915790591227615e-06,
"loss": 0.0625,
"step": 19900
},
{
"epoch": 5.544774050457444,
"grad_norm": 0.1454819142818451,
"learning_rate": 4.148633509817715e-06,
"loss": 0.0613,
"step": 20000
},
{
"epoch": 5.544774050457444,
"eval_valid_loss": 0.05364985764026642,
"eval_valid_runtime": 6.436,
"eval_valid_samples_per_second": 214.885,
"eval_valid_steps_per_second": 6.837,
"step": 20000
},
{
"epoch": 5.544774050457444,
"eval_valid_target_loss": 0.05850011110305786,
"eval_valid_target_runtime": 6.5534,
"eval_valid_target_samples_per_second": 218.819,
"eval_valid_target_steps_per_second": 6.867,
"step": 20000
},
{
"epoch": 5.572497920709731,
"grad_norm": 0.12440012395381927,
"learning_rate": 4.105752547547764e-06,
"loss": 0.0613,
"step": 20100
},
{
"epoch": 5.600221790962018,
"grad_norm": 0.14089658856391907,
"learning_rate": 4.062939425382236e-06,
"loss": 0.0616,
"step": 20200
},
{
"epoch": 5.627945661214305,
"grad_norm": 0.24770374596118927,
"learning_rate": 4.020197391243922e-06,
"loss": 0.0621,
"step": 20300
},
{
"epoch": 5.655669531466593,
"grad_norm": 0.11835476011037827,
"learning_rate": 3.977529687662671e-06,
"loss": 0.0619,
"step": 20400
},
{
"epoch": 5.68339340171888,
"grad_norm": 0.12585273385047913,
"learning_rate": 3.93493955152941e-06,
"loss": 0.0612,
"step": 20500
},
{
"epoch": 5.68339340171888,
"eval_valid_loss": 0.05319705978035927,
"eval_valid_runtime": 6.4196,
"eval_valid_samples_per_second": 215.435,
"eval_valid_steps_per_second": 6.854,
"step": 20500
},
{
"epoch": 5.68339340171888,
"eval_valid_target_loss": 0.058061882853507996,
"eval_valid_target_runtime": 6.5894,
"eval_valid_target_samples_per_second": 217.622,
"eval_valid_target_steps_per_second": 6.829,
"step": 20500
},
{
"epoch": 5.711117271971167,
"grad_norm": 0.15103484690189362,
"learning_rate": 3.892430213850587e-06,
"loss": 0.0615,
"step": 20600
},
{
"epoch": 5.738841142223454,
"grad_norm": 0.1266421228647232,
"learning_rate": 3.850004899503051e-06,
"loss": 0.0613,
"step": 20700
},
{
"epoch": 5.766565012475741,
"grad_norm": 0.1100655049085617,
"learning_rate": 3.8076668269894045e-06,
"loss": 0.0606,
"step": 20800
},
{
"epoch": 5.794288882728029,
"grad_norm": 0.1395365446805954,
"learning_rate": 3.765419208193848e-06,
"loss": 0.0614,
"step": 20900
},
{
"epoch": 5.822012752980316,
"grad_norm": 0.12668344378471375,
"learning_rate": 3.723265248138506e-06,
"loss": 0.0614,
"step": 21000
},
{
"epoch": 5.822012752980316,
"eval_valid_loss": 0.052489351481199265,
"eval_valid_runtime": 6.4455,
"eval_valid_samples_per_second": 214.567,
"eval_valid_steps_per_second": 6.826,
"step": 21000
},
{
"epoch": 5.822012752980316,
"eval_valid_target_loss": 0.057213690131902695,
"eval_valid_target_runtime": 6.5546,
"eval_valid_target_samples_per_second": 218.777,
"eval_valid_target_steps_per_second": 6.865,
"step": 21000
},
{
"epoch": 5.849736623232603,
"grad_norm": 0.12728376686573029,
"learning_rate": 3.681208144740291e-06,
"loss": 0.0612,
"step": 21100
},
{
"epoch": 5.87746049348489,
"grad_norm": 0.14501620829105377,
"learning_rate": 3.6392510885682965e-06,
"loss": 0.0601,
"step": 21200
},
{
"epoch": 5.9051843637371775,
"grad_norm": 0.1082565188407898,
"learning_rate": 3.5973972626017594e-06,
"loss": 0.0608,
"step": 21300
},
{
"epoch": 5.9329082339894645,
"grad_norm": 0.14926603436470032,
"learning_rate": 3.5556498419885867e-06,
"loss": 0.0603,
"step": 21400
},
{
"epoch": 5.9606321042417525,
"grad_norm": 0.1263745278120041,
"learning_rate": 3.514011993804469e-06,
"loss": 0.0602,
"step": 21500
},
{
"epoch": 5.9606321042417525,
"eval_valid_loss": 0.05212084576487541,
"eval_valid_runtime": 6.439,
"eval_valid_samples_per_second": 214.785,
"eval_valid_steps_per_second": 6.833,
"step": 21500
},
{
"epoch": 5.9606321042417525,
"eval_valid_target_loss": 0.05688408389687538,
"eval_valid_target_runtime": 6.5822,
"eval_valid_target_samples_per_second": 217.862,
"eval_valid_target_steps_per_second": 6.837,
"step": 21500
},
{
"epoch": 5.98835597449404,
"grad_norm": 0.1368781179189682,
"learning_rate": 3.4724868768126384e-06,
"loss": 0.0604,
"step": 21600
},
{
"epoch": 6.016079844746327,
"grad_norm": 0.15087148547172546,
"learning_rate": 3.4310776412242195e-06,
"loss": 0.06,
"step": 21700
},
{
"epoch": 6.043803714998614,
"grad_norm": 0.11400382220745087,
"learning_rate": 3.3897874284592467e-06,
"loss": 0.0594,
"step": 21800
},
{
"epoch": 6.071527585250901,
"grad_norm": 0.1169167011976242,
"learning_rate": 3.348619370908361e-06,
"loss": 0.0598,
"step": 21900
},
{
"epoch": 6.099251455503188,
"grad_norm": 0.12172160297632217,
"learning_rate": 3.3075765916951576e-06,
"loss": 0.0599,
"step": 22000
},
{
"epoch": 6.099251455503188,
"eval_valid_loss": 0.05157113075256348,
"eval_valid_runtime": 6.4258,
"eval_valid_samples_per_second": 215.224,
"eval_valid_steps_per_second": 6.847,
"step": 22000
},
{
"epoch": 6.099251455503188,
"eval_valid_target_loss": 0.056347791105508804,
"eval_valid_target_runtime": 6.5915,
"eval_valid_target_samples_per_second": 217.554,
"eval_valid_target_steps_per_second": 6.827,
"step": 22000
},
{
"epoch": 6.126975325755476,
"grad_norm": 0.1324358880519867,
"learning_rate": 3.2666622044392765e-06,
"loss": 0.0591,
"step": 22100
},
{
"epoch": 6.154699196007763,
"grad_norm": 0.12708991765975952,
"learning_rate": 3.225879313020178e-06,
"loss": 0.0591,
"step": 22200
},
{
"epoch": 6.18242306626005,
"grad_norm": 0.11844506114721298,
"learning_rate": 3.18523101134169e-06,
"loss": 0.0592,
"step": 22300
},
{
"epoch": 6.210146936512337,
"grad_norm": 0.12888644635677338,
"learning_rate": 3.1447203830972827e-06,
"loss": 0.0597,
"step": 22400
},
{
"epoch": 6.237870806764624,
"grad_norm": 0.1485096514225006,
"learning_rate": 3.104350501536134e-06,
"loss": 0.0598,
"step": 22500
},
{
"epoch": 6.237870806764624,
"eval_valid_loss": 0.051265206187963486,
"eval_valid_runtime": 6.437,
"eval_valid_samples_per_second": 214.85,
"eval_valid_steps_per_second": 6.835,
"step": 22500
},
{
"epoch": 6.237870806764624,
"eval_valid_target_loss": 0.056084584444761276,
"eval_valid_target_runtime": 6.6,
"eval_valid_target_samples_per_second": 217.273,
"eval_valid_target_steps_per_second": 6.818,
"step": 22500
},
{
"epoch": 6.265594677016912,
"grad_norm": 0.11319620907306671,
"learning_rate": 3.064124429229992e-06,
"loss": 0.0581,
"step": 22600
},
{
"epoch": 6.293318547269199,
"grad_norm": 0.125896617770195,
"learning_rate": 3.0240452178408286e-06,
"loss": 0.0594,
"step": 22700
},
{
"epoch": 6.321042417521486,
"grad_norm": 0.13202796876430511,
"learning_rate": 2.9841159078893377e-06,
"loss": 0.0587,
"step": 22800
},
{
"epoch": 6.348766287773773,
"grad_norm": 0.12477891147136688,
"learning_rate": 2.944339528524278e-06,
"loss": 0.0582,
"step": 22900
},
{
"epoch": 6.37649015802606,
"grad_norm": 0.13174673914909363,
"learning_rate": 2.9047190972926597e-06,
"loss": 0.0585,
"step": 23000
},
{
"epoch": 6.37649015802606,
"eval_valid_loss": 0.05099370330572128,
"eval_valid_runtime": 6.4377,
"eval_valid_samples_per_second": 214.828,
"eval_valid_steps_per_second": 6.835,
"step": 23000
},
{
"epoch": 6.37649015802606,
"eval_valid_target_loss": 0.055660318583250046,
"eval_valid_target_runtime": 6.5668,
"eval_valid_target_samples_per_second": 218.37,
"eval_valid_target_steps_per_second": 6.853,
"step": 23000
},
{
"epoch": 6.404214028278347,
"grad_norm": 0.12851925194263458,
"learning_rate": 2.8652576199108395e-06,
"loss": 0.0586,
"step": 23100
},
{
"epoch": 6.431937898530635,
"grad_norm": 0.10676029324531555,
"learning_rate": 2.8259580900364825e-06,
"loss": 0.0584,
"step": 23200
},
{
"epoch": 6.459661768782922,
"grad_norm": 0.1461838185787201,
"learning_rate": 2.786823489041478e-06,
"loss": 0.0583,
"step": 23300
},
{
"epoch": 6.487385639035209,
"grad_norm": 0.12321025878190994,
"learning_rate": 2.747856785785743e-06,
"loss": 0.0579,
"step": 23400
},
{
"epoch": 6.515109509287496,
"grad_norm": 0.1209678128361702,
"learning_rate": 2.7090609363919986e-06,
"loss": 0.0581,
"step": 23500
},
{
"epoch": 6.515109509287496,
"eval_valid_loss": 0.050510190427303314,
"eval_valid_runtime": 6.447,
"eval_valid_samples_per_second": 214.517,
"eval_valid_steps_per_second": 6.825,
"step": 23500
},
{
"epoch": 6.515109509287496,
"eval_valid_target_loss": 0.0551883801817894,
"eval_valid_target_runtime": 6.5701,
"eval_valid_target_samples_per_second": 218.262,
"eval_valid_target_steps_per_second": 6.849,
"step": 23500
},
{
"epoch": 6.5428333795397835,
"grad_norm": 0.15566356480121613,
"learning_rate": 2.6704388840215277e-06,
"loss": 0.0578,
"step": 23600
},
{
"epoch": 6.570557249792071,
"grad_norm": 0.10754121840000153,
"learning_rate": 2.6319935586508814e-06,
"loss": 0.058,
"step": 23700
},
{
"epoch": 6.5982811200443585,
"grad_norm": 0.12134023010730743,
"learning_rate": 2.593727876849601e-06,
"loss": 0.0577,
"step": 23800
},
{
"epoch": 6.626004990296646,
"grad_norm": 0.12984460592269897,
"learning_rate": 2.555644741558979e-06,
"loss": 0.0575,
"step": 23900
},
{
"epoch": 6.653728860548933,
"grad_norm": 0.13557353615760803,
"learning_rate": 2.51774704187181e-06,
"loss": 0.0571,
"step": 24000
},
{
"epoch": 6.653728860548933,
"eval_valid_loss": 0.0503346286714077,
"eval_valid_runtime": 6.419,
"eval_valid_samples_per_second": 215.455,
"eval_valid_steps_per_second": 6.855,
"step": 24000
},
{
"epoch": 6.653728860548933,
"eval_valid_target_loss": 0.0548863522708416,
"eval_valid_target_runtime": 6.5823,
"eval_valid_target_samples_per_second": 217.857,
"eval_valid_target_steps_per_second": 6.837,
"step": 24000
},
{
"epoch": 6.68145273080122,
"grad_norm": 0.10979162156581879,
"learning_rate": 2.4800376528132297e-06,
"loss": 0.0576,
"step": 24100
},
{
"epoch": 6.709176601053507,
"grad_norm": 0.16127757728099823,
"learning_rate": 2.4425194351226082e-06,
"loss": 0.0579,
"step": 24200
},
{
"epoch": 6.736900471305795,
"grad_norm": 0.13306181132793427,
"learning_rate": 2.4051952350365194e-06,
"loss": 0.0572,
"step": 24300
},
{
"epoch": 6.764624341558082,
"grad_norm": 0.11353787779808044,
"learning_rate": 2.368067884072821e-06,
"loss": 0.0573,
"step": 24400
},
{
"epoch": 6.792348211810369,
"grad_norm": 0.10115820914506912,
"learning_rate": 2.331140198815849e-06,
"loss": 0.0574,
"step": 24500
},
{
"epoch": 6.792348211810369,
"eval_valid_loss": 0.049953412264585495,
"eval_valid_runtime": 6.4338,
"eval_valid_samples_per_second": 214.958,
"eval_valid_steps_per_second": 6.839,
"step": 24500
},
{
"epoch": 6.792348211810369,
"eval_valid_target_loss": 0.054579559713602066,
"eval_valid_target_runtime": 6.5694,
"eval_valid_target_samples_per_second": 218.283,
"eval_valid_target_steps_per_second": 6.85,
"step": 24500
},
{
"epoch": 6.820072082062656,
"grad_norm": 0.10899285972118378,
"learning_rate": 2.294414980702741e-06,
"loss": 0.0573,
"step": 24600
},
{
"epoch": 6.847795952314943,
"grad_norm": 0.1248159185051918,
"learning_rate": 2.257895015810913e-06,
"loss": 0.0568,
"step": 24700
},
{
"epoch": 6.87551982256723,
"grad_norm": 0.10761197656393051,
"learning_rate": 2.221583074646701e-06,
"loss": 0.0574,
"step": 24800
},
{
"epoch": 6.903243692819517,
"grad_norm": 0.13541601598262787,
"learning_rate": 2.1854819119351784e-06,
"loss": 0.0562,
"step": 24900
},
{
"epoch": 6.930967563071805,
"grad_norm": 0.10959000140428543,
"learning_rate": 2.1495942664111814e-06,
"loss": 0.0576,
"step": 25000
},
{
"epoch": 6.930967563071805,
"eval_valid_loss": 0.049802832305431366,
"eval_valid_runtime": 6.4091,
"eval_valid_samples_per_second": 215.786,
"eval_valid_steps_per_second": 6.865,
"step": 25000
},
{
"epoch": 6.930967563071805,
"eval_valid_target_loss": 0.05434631556272507,
"eval_valid_target_runtime": 6.5766,
"eval_valid_target_samples_per_second": 218.047,
"eval_valid_target_steps_per_second": 6.842,
"step": 25000
},
{
"epoch": 6.958691433324092,
"grad_norm": 0.11864270269870758,
"learning_rate": 2.113922860611532e-06,
"loss": 0.0571,
"step": 25100
},
{
"epoch": 6.986415303576379,
"grad_norm": 0.10493431985378265,
"learning_rate": 2.078470400668506e-06,
"loss": 0.0572,
"step": 25200
},
{
"epoch": 7.014139173828666,
"grad_norm": 0.10294145345687866,
"learning_rate": 2.0432395761045427e-06,
"loss": 0.0562,
"step": 25300
},
{
"epoch": 7.041863044080953,
"grad_norm": 0.11174608767032623,
"learning_rate": 2.008233059628193e-06,
"loss": 0.0562,
"step": 25400
},
{
"epoch": 7.069586914333241,
"grad_norm": 0.10171514004468918,
"learning_rate": 1.9734535069313753e-06,
"loss": 0.056,
"step": 25500
},
{
"epoch": 7.069586914333241,
"eval_valid_loss": 0.04948737472295761,
"eval_valid_runtime": 6.442,
"eval_valid_samples_per_second": 214.685,
"eval_valid_steps_per_second": 6.83,
"step": 25500
},
{
"epoch": 7.069586914333241,
"eval_valid_target_loss": 0.05410830304026604,
"eval_valid_target_runtime": 6.5896,
"eval_valid_target_samples_per_second": 217.617,
"eval_valid_target_steps_per_second": 6.829,
"step": 25500
},
{
"epoch": 7.097310784585528,
"grad_norm": 0.10731488466262817,
"learning_rate": 1.9389035564879104e-06,
"loss": 0.0569,
"step": 25600
},
{
"epoch": 7.125034654837815,
"grad_norm": 0.0954216718673706,
"learning_rate": 1.9045858293533399e-06,
"loss": 0.0566,
"step": 25700
},
{
"epoch": 7.1527585250901025,
"grad_norm": 0.11443454772233963,
"learning_rate": 1.8705029289661054e-06,
"loss": 0.057,
"step": 25800
},
{
"epoch": 7.1804823953423895,
"grad_norm": 0.10671606659889221,
"learning_rate": 1.8366574409500344e-06,
"loss": 0.0561,
"step": 25900
},
{
"epoch": 7.208206265594677,
"grad_norm": 0.1028604656457901,
"learning_rate": 1.8030519329181916e-06,
"loss": 0.0561,
"step": 26000
},
{
"epoch": 7.208206265594677,
"eval_valid_loss": 0.04931313917040825,
"eval_valid_runtime": 6.431,
"eval_valid_samples_per_second": 215.053,
"eval_valid_steps_per_second": 6.842,
"step": 26000
},
{
"epoch": 7.208206265594677,
"eval_valid_target_loss": 0.053888678550720215,
"eval_valid_target_runtime": 6.5712,
"eval_valid_target_samples_per_second": 218.225,
"eval_valid_target_steps_per_second": 6.848,
"step": 26000
},
{
"epoch": 7.2359301358469645,
"grad_norm": 0.11538730561733246,
"learning_rate": 1.7696889542780904e-06,
"loss": 0.0564,
"step": 26100
},
{
"epoch": 7.263654006099252,
"grad_norm": 0.10585539788007736,
"learning_rate": 1.7365710360382882e-06,
"loss": 0.0562,
"step": 26200
},
{
"epoch": 7.291377876351539,
"grad_norm": 0.09750411659479141,
"learning_rate": 1.7037006906163773e-06,
"loss": 0.0563,
"step": 26300
},
{
"epoch": 7.319101746603826,
"grad_norm": 0.10777630656957626,
"learning_rate": 1.6710804116483886e-06,
"loss": 0.0556,
"step": 26400
},
{
"epoch": 7.346825616856113,
"grad_norm": 0.13231071829795837,
"learning_rate": 1.6387126737996067e-06,
"loss": 0.0559,
"step": 26500
},
{
"epoch": 7.346825616856113,
"eval_valid_loss": 0.04909936338663101,
"eval_valid_runtime": 6.4292,
"eval_valid_samples_per_second": 215.112,
"eval_valid_steps_per_second": 6.844,
"step": 26500
},
{
"epoch": 7.346825616856113,
"eval_valid_target_loss": 0.05357712134718895,
"eval_valid_target_runtime": 6.5542,
"eval_valid_target_samples_per_second": 218.792,
"eval_valid_target_steps_per_second": 6.866,
"step": 26500
},
{
"epoch": 7.374549487108401,
"grad_norm": 0.10591776669025421,
"learning_rate": 1.6065999325768544e-06,
"loss": 0.0559,
"step": 26600
},
{
"epoch": 7.402273357360688,
"grad_norm": 0.11603645980358124,
"learning_rate": 1.5747446241421931e-06,
"loss": 0.0557,
"step": 26700
},
{
"epoch": 7.429997227612975,
"grad_norm": 0.09715123474597931,
"learning_rate": 1.5431491651281123e-06,
"loss": 0.0563,
"step": 26800
},
{
"epoch": 7.457721097865262,
"grad_norm": 0.10046205669641495,
"learning_rate": 1.511815952454208e-06,
"loss": 0.0556,
"step": 26900
},
{
"epoch": 7.485444968117549,
"grad_norm": 0.11805932968854904,
"learning_rate": 1.480747363145334e-06,
"loss": 0.0556,
"step": 27000
},
{
"epoch": 7.485444968117549,
"eval_valid_loss": 0.04887402430176735,
"eval_valid_runtime": 6.4098,
"eval_valid_samples_per_second": 215.763,
"eval_valid_steps_per_second": 6.864,
"step": 27000
},
{
"epoch": 7.485444968117549,
"eval_valid_target_loss": 0.05348382145166397,
"eval_valid_target_runtime": 6.5773,
"eval_valid_target_samples_per_second": 218.023,
"eval_valid_target_steps_per_second": 6.842,
"step": 27000
},
{
"epoch": 7.513168838369836,
"grad_norm": 0.1107444316148758,
"learning_rate": 1.4499457541512746e-06,
"loss": 0.0554,
"step": 27100
},
{
"epoch": 7.540892708622124,
"grad_norm": 0.10029349476099014,
"learning_rate": 1.4194134621679478e-06,
"loss": 0.0559,
"step": 27200
},
{
"epoch": 7.568616578874411,
"grad_norm": 0.09976372122764587,
"learning_rate": 1.3891528034601316e-06,
"loss": 0.0565,
"step": 27300
},
{
"epoch": 7.596340449126698,
"grad_norm": 0.10560230165719986,
"learning_rate": 1.3591660736857453e-06,
"loss": 0.0553,
"step": 27400
},
{
"epoch": 7.624064319378985,
"grad_norm": 0.09814602881669998,
"learning_rate": 1.329455547721697e-06,
"loss": 0.0552,
"step": 27500
},
{
"epoch": 7.624064319378985,
"eval_valid_loss": 0.04867083579301834,
"eval_valid_runtime": 6.4389,
"eval_valid_samples_per_second": 214.79,
"eval_valid_steps_per_second": 6.834,
"step": 27500
},
{
"epoch": 7.624064319378985,
"eval_valid_target_loss": 0.053231850266456604,
"eval_valid_target_runtime": 6.5692,
"eval_valid_target_samples_per_second": 218.292,
"eval_valid_target_steps_per_second": 6.85,
"step": 27500
},
{
"epoch": 7.651788189631272,
"grad_norm": 0.10253589600324631,
"learning_rate": 1.300023479491303e-06,
"loss": 0.0555,
"step": 27600
},
{
"epoch": 7.67951205988356,
"grad_norm": 0.10933282226324081,
"learning_rate": 1.2708721017933007e-06,
"loss": 0.0551,
"step": 27700
},
{
"epoch": 7.707235930135847,
"grad_norm": 0.11853484809398651,
"learning_rate": 1.2420036261324598e-06,
"loss": 0.056,
"step": 27800
},
{
"epoch": 7.734959800388134,
"grad_norm": 0.0992041826248169,
"learning_rate": 1.2134202425518139e-06,
"loss": 0.0547,
"step": 27900
},
{
"epoch": 7.762683670640421,
"grad_norm": 0.10824355483055115,
"learning_rate": 1.185124119466517e-06,
"loss": 0.0554,
"step": 28000
},
{
"epoch": 7.762683670640421,
"eval_valid_loss": 0.048471271991729736,
"eval_valid_runtime": 6.414,
"eval_valid_samples_per_second": 215.623,
"eval_valid_steps_per_second": 6.86,
"step": 28000
},
{
"epoch": 7.762683670640421,
"eval_valid_target_loss": 0.05302482470870018,
"eval_valid_target_runtime": 6.5682,
"eval_valid_target_samples_per_second": 218.326,
"eval_valid_target_steps_per_second": 6.851,
"step": 28000
},
{
"epoch": 7.7904075408927085,
"grad_norm": 0.09927680343389511,
"learning_rate": 1.1571174034993416e-06,
"loss": 0.0555,
"step": 28100
},
{
"epoch": 7.8181314111449955,
"grad_norm": 0.09600567072629929,
"learning_rate": 1.129402219317825e-06,
"loss": 0.0553,
"step": 28200
},
{
"epoch": 7.845855281397283,
"grad_norm": 0.11057105660438538,
"learning_rate": 1.1019806694730989e-06,
"loss": 0.0557,
"step": 28300
},
{
"epoch": 7.873579151649571,
"grad_norm": 0.10991726815700531,
"learning_rate": 1.074854834240368e-06,
"loss": 0.0553,
"step": 28400
},
{
"epoch": 7.901303021901858,
"grad_norm": 0.09168905019760132,
"learning_rate": 1.0480267714611048e-06,
"loss": 0.0551,
"step": 28500
},
{
"epoch": 7.901303021901858,
"eval_valid_loss": 0.04835043475031853,
"eval_valid_runtime": 6.4532,
"eval_valid_samples_per_second": 214.313,
"eval_valid_steps_per_second": 6.818,
"step": 28500
},
{
"epoch": 7.901303021901858,
"eval_valid_target_loss": 0.05293356999754906,
"eval_valid_target_runtime": 6.5812,
"eval_valid_target_samples_per_second": 217.894,
"eval_valid_target_steps_per_second": 6.838,
"step": 28500
},
{
"epoch": 7.929026892154145,
"grad_norm": 0.09465237706899643,
"learning_rate": 1.0214985163869378e-06,
"loss": 0.0556,
"step": 28600
},
{
"epoch": 7.956750762406432,
"grad_norm": 0.10842736065387726,
"learning_rate": 9.952720815252397e-07,
"loss": 0.0543,
"step": 28700
},
{
"epoch": 7.984474632658719,
"grad_norm": 0.09609558433294296,
"learning_rate": 9.693494564864648e-07,
"loss": 0.0554,
"step": 28800
},
{
"epoch": 8.012198502911007,
"grad_norm": 0.10819283127784729,
"learning_rate": 9.437326078332099e-07,
"loss": 0.0545,
"step": 28900
},
{
"epoch": 8.039922373163293,
"grad_norm": 0.09054001420736313,
"learning_rate": 9.18423478931016e-07,
"loss": 0.0554,
"step": 29000
},
{
"epoch": 8.039922373163293,
"eval_valid_loss": 0.04819526523351669,
"eval_valid_runtime": 6.4165,
"eval_valid_samples_per_second": 215.536,
"eval_valid_steps_per_second": 6.857,
"step": 29000
},
{
"epoch": 8.039922373163293,
"eval_valid_target_loss": 0.05275378376245499,
"eval_valid_target_runtime": 6.5635,
"eval_valid_target_samples_per_second": 218.482,
"eval_valid_target_steps_per_second": 6.856,
"step": 29000
},
{
"epoch": 8.067646243415581,
"grad_norm": 0.10373499244451523,
"learning_rate": 8.934239898009517e-07,
"loss": 0.0552,
"step": 29100
},
{
"epoch": 8.095370113667869,
"grad_norm": 0.09614498168230057,
"learning_rate": 8.687360369739473e-07,
"loss": 0.0545,
"step": 29200
},
{
"epoch": 8.123093983920155,
"grad_norm": 0.1014479324221611,
"learning_rate": 8.443614933469208e-07,
"loss": 0.0549,
"step": 29300
},
{
"epoch": 8.150817854172443,
"grad_norm": 0.08971751481294632,
"learning_rate": 8.203022080406952e-07,
"loss": 0.0546,
"step": 29400
},
{
"epoch": 8.17854172442473,
"grad_norm": 0.09659924358129501,
"learning_rate": 7.965600062597184e-07,
"loss": 0.0542,
"step": 29500
},
{
"epoch": 8.17854172442473,
"eval_valid_loss": 0.04812739044427872,
"eval_valid_runtime": 6.4674,
"eval_valid_samples_per_second": 213.843,
"eval_valid_steps_per_second": 6.803,
"step": 29500
},
{
"epoch": 8.17854172442473,
"eval_valid_target_loss": 0.05264822766184807,
"eval_valid_target_runtime": 6.5912,
"eval_valid_target_samples_per_second": 217.563,
"eval_valid_target_steps_per_second": 6.827,
"step": 29500
},
{
"epoch": 8.206265594677017,
"grad_norm": 0.1034499853849411,
"learning_rate": 7.731366891535969e-07,
"loss": 0.0548,
"step": 29600
},
{
"epoch": 8.233989464929303,
"grad_norm": 0.0934043675661087,
"learning_rate": 7.500340336804607e-07,
"loss": 0.0542,
"step": 29700
},
{
"epoch": 8.261713335181591,
"grad_norm": 0.09693789482116699,
"learning_rate": 7.272537924721467e-07,
"loss": 0.0553,
"step": 29800
},
{
"epoch": 8.28943720543388,
"grad_norm": 0.09552415460348129,
"learning_rate": 7.047976937012568e-07,
"loss": 0.0543,
"step": 29900
},
{
"epoch": 8.317161075686165,
"grad_norm": 0.0978178158402443,
"learning_rate": 6.826674409500389e-07,
"loss": 0.0548,
"step": 30000
},
{
"epoch": 8.317161075686165,
"eval_valid_loss": 0.04797354340553284,
"eval_valid_runtime": 6.442,
"eval_valid_samples_per_second": 214.683,
"eval_valid_steps_per_second": 6.83,
"step": 30000
},
{
"epoch": 8.317161075686165,
"eval_valid_target_loss": 0.052511684596538544,
"eval_valid_target_runtime": 6.5615,
"eval_valid_target_samples_per_second": 218.549,
"eval_valid_target_steps_per_second": 6.858,
"step": 30000
},
{
"epoch": 8.344884945938453,
"grad_norm": 0.09591928869485855,
"learning_rate": 6.608647130811502e-07,
"loss": 0.0543,
"step": 30100
},
{
"epoch": 8.37260881619074,
"grad_norm": 0.09678730368614197,
"learning_rate": 6.393911641103051e-07,
"loss": 0.0542,
"step": 30200
},
{
"epoch": 8.400332686443027,
"grad_norm": 0.10894029587507248,
"learning_rate": 6.182484230807845e-07,
"loss": 0.0542,
"step": 30300
},
{
"epoch": 8.428056556695315,
"grad_norm": 0.10065341740846634,
"learning_rate": 5.974380939398555e-07,
"loss": 0.0549,
"step": 30400
},
{
"epoch": 8.455780426947602,
"grad_norm": 0.11015477776527405,
"learning_rate": 5.769617554170959e-07,
"loss": 0.0544,
"step": 30500
},
{
"epoch": 8.455780426947602,
"eval_valid_loss": 0.04785359278321266,
"eval_valid_runtime": 6.4159,
"eval_valid_samples_per_second": 215.558,
"eval_valid_steps_per_second": 6.858,
"step": 30500
},
{
"epoch": 8.455780426947602,
"eval_valid_target_loss": 0.05238433927297592,
"eval_valid_target_runtime": 6.575,
"eval_valid_target_samples_per_second": 218.1,
"eval_valid_target_steps_per_second": 6.844,
"step": 30500
},
{
"epoch": 8.48350429719989,
"grad_norm": 0.10229642689228058,
"learning_rate": 5.568209609046238e-07,
"loss": 0.0542,
"step": 30600
},
{
"epoch": 8.511228167452176,
"grad_norm": 0.1019807681441307,
"learning_rate": 5.370172383392514e-07,
"loss": 0.0548,
"step": 30700
},
{
"epoch": 8.538952037704464,
"grad_norm": 0.1037830114364624,
"learning_rate": 5.175520900865754e-07,
"loss": 0.0538,
"step": 30800
},
{
"epoch": 8.56667590795675,
"grad_norm": 0.0952112227678299,
"learning_rate": 4.984269928270002e-07,
"loss": 0.0537,
"step": 30900
},
{
"epoch": 8.594399778209038,
"grad_norm": 0.09642232209444046,
"learning_rate": 4.796433974437148e-07,
"loss": 0.0533,
"step": 31000
},
{
"epoch": 8.594399778209038,
"eval_valid_loss": 0.04777803644537926,
"eval_valid_runtime": 6.4399,
"eval_valid_samples_per_second": 214.756,
"eval_valid_steps_per_second": 6.832,
"step": 31000
},
{
"epoch": 8.594399778209038,
"eval_valid_target_loss": 0.052354373037815094,
"eval_valid_target_runtime": 6.5668,
"eval_valid_target_samples_per_second": 218.371,
"eval_valid_target_steps_per_second": 6.853,
"step": 31000
},
{
"epoch": 8.622123648461326,
"grad_norm": 0.10211507230997086,
"learning_rate": 4.6120272891262365e-07,
"loss": 0.0544,
"step": 31100
},
{
"epoch": 8.649847518713612,
"grad_norm": 0.0912129357457161,
"learning_rate": 4.4310638619424363e-07,
"loss": 0.0536,
"step": 31200
},
{
"epoch": 8.6775713889659,
"grad_norm": 0.10558176785707474,
"learning_rate": 4.2535574212757667e-07,
"loss": 0.0542,
"step": 31300
},
{
"epoch": 8.705295259218186,
"grad_norm": 0.10381397604942322,
"learning_rate": 4.0795214332596145e-07,
"loss": 0.0547,
"step": 31400
},
{
"epoch": 8.733019129470474,
"grad_norm": 0.09383094310760498,
"learning_rate": 3.908969100749121e-07,
"loss": 0.055,
"step": 31500
},
{
"epoch": 8.733019129470474,
"eval_valid_loss": 0.047727905213832855,
"eval_valid_runtime": 6.4171,
"eval_valid_samples_per_second": 215.518,
"eval_valid_steps_per_second": 6.857,
"step": 31500
},
{
"epoch": 8.733019129470474,
"eval_valid_target_loss": 0.05224745720624924,
"eval_valid_target_runtime": 6.5727,
"eval_valid_target_samples_per_second": 218.174,
"eval_valid_target_steps_per_second": 6.846,
"step": 31500
},
{
"epoch": 8.760742999722762,
"grad_norm": 0.10438426584005356,
"learning_rate": 3.7419133623196825e-07,
"loss": 0.0541,
"step": 31600
},
{
"epoch": 8.788466869975048,
"grad_norm": 0.09324101358652115,
"learning_rate": 3.5783668912852453e-07,
"loss": 0.0537,
"step": 31700
},
{
"epoch": 8.816190740227336,
"grad_norm": 0.09235464036464691,
"learning_rate": 3.4183420947369873e-07,
"loss": 0.0544,
"step": 31800
},
{
"epoch": 8.843914610479622,
"grad_norm": 0.09870747476816177,
"learning_rate": 3.261851112602055e-07,
"loss": 0.0543,
"step": 31900
},
{
"epoch": 8.87163848073191,
"grad_norm": 0.10918495059013367,
"learning_rate": 3.108905816722546e-07,
"loss": 0.054,
"step": 32000
},
{
"epoch": 8.87163848073191,
"eval_valid_loss": 0.047707512974739075,
"eval_valid_runtime": 6.4362,
"eval_valid_samples_per_second": 214.879,
"eval_valid_steps_per_second": 6.836,
"step": 32000
},
{
"epoch": 8.87163848073191,
"eval_valid_target_loss": 0.05221306532621384,
"eval_valid_target_runtime": 6.5779,
"eval_valid_target_samples_per_second": 218.002,
"eval_valid_target_steps_per_second": 6.841,
"step": 32000
},
{
"epoch": 8.899362350984198,
"grad_norm": 0.09537260234355927,
"learning_rate": 2.9595178099549315e-07,
"loss": 0.054,
"step": 32100
},
{
"epoch": 8.927086221236484,
"grad_norm": 0.09188380092382431,
"learning_rate": 2.8136984252898515e-07,
"loss": 0.0542,
"step": 32200
},
{
"epoch": 8.954810091488772,
"grad_norm": 0.09919969737529755,
"learning_rate": 2.671458724992254e-07,
"loss": 0.0542,
"step": 32300
},
{
"epoch": 8.982533961741058,
"grad_norm": 0.09692647308111191,
"learning_rate": 2.532809499762312e-07,
"loss": 0.0544,
"step": 32400
},
{
"epoch": 9.010257831993346,
"grad_norm": 0.09277132153511047,
"learning_rate": 2.397761267916726e-07,
"loss": 0.0539,
"step": 32500
},
{
"epoch": 9.010257831993346,
"eval_valid_loss": 0.047637518495321274,
"eval_valid_runtime": 6.4471,
"eval_valid_samples_per_second": 214.516,
"eval_valid_steps_per_second": 6.825,
"step": 32500
},
{
"epoch": 9.010257831993346,
"eval_valid_target_loss": 0.052208978682756424,
"eval_valid_target_runtime": 6.5636,
"eval_valid_target_samples_per_second": 218.477,
"eval_valid_target_steps_per_second": 6.856,
"step": 32500
},
{
"epoch": 9.037981702245634,
"grad_norm": 0.09585940837860107,
"learning_rate": 2.2663242745908087e-07,
"loss": 0.0542,
"step": 32600
},
{
"epoch": 9.06570557249792,
"grad_norm": 0.09488432109355927,
"learning_rate": 2.138508490961244e-07,
"loss": 0.0533,
"step": 32700
},
{
"epoch": 9.093429442750208,
"grad_norm": 0.09499957412481308,
"learning_rate": 2.014323613489666e-07,
"loss": 0.0543,
"step": 32800
},
{
"epoch": 9.121153313002495,
"grad_norm": 0.09435317665338516,
"learning_rate": 1.8937790631870345e-07,
"loss": 0.0536,
"step": 32900
},
{
"epoch": 9.148877183254783,
"grad_norm": 0.10342779755592346,
"learning_rate": 1.7768839848989584e-07,
"loss": 0.0539,
"step": 33000
},
{
"epoch": 9.148877183254783,
"eval_valid_loss": 0.047598063945770264,
"eval_valid_runtime": 6.4315,
"eval_valid_samples_per_second": 215.037,
"eval_valid_steps_per_second": 6.841,
"step": 33000
},
{
"epoch": 9.148877183254783,
"eval_valid_target_loss": 0.05212317034602165,
"eval_valid_target_runtime": 6.5736,
"eval_valid_target_samples_per_second": 218.146,
"eval_valid_target_steps_per_second": 6.846,
"step": 33000
},
{
"epoch": 9.176601053507069,
"grad_norm": 0.09814909845590591,
"learning_rate": 1.6636472466118992e-07,
"loss": 0.0542,
"step": 33100
},
{
"epoch": 9.204324923759357,
"grad_norm": 0.09484022855758667,
"learning_rate": 1.5540774387804825e-07,
"loss": 0.0544,
"step": 33200
},
{
"epoch": 9.232048794011645,
"grad_norm": 0.07888332009315491,
"learning_rate": 1.448182873675752e-07,
"loss": 0.0539,
"step": 33300
},
{
"epoch": 9.25977266426393,
"grad_norm": 0.0964021384716034,
"learning_rate": 1.345971584754585e-07,
"loss": 0.0539,
"step": 33400
},
{
"epoch": 9.287496534516219,
"grad_norm": 0.10322096943855286,
"learning_rate": 1.2474513260502695e-07,
"loss": 0.0536,
"step": 33500
},
{
"epoch": 9.287496534516219,
"eval_valid_loss": 0.047564879059791565,
"eval_valid_runtime": 6.4358,
"eval_valid_samples_per_second": 214.89,
"eval_valid_steps_per_second": 6.837,
"step": 33500
},
{
"epoch": 9.287496534516219,
"eval_valid_target_loss": 0.05209695175290108,
"eval_valid_target_runtime": 6.5809,
"eval_valid_target_samples_per_second": 217.904,
"eval_valid_target_steps_per_second": 6.838,
"step": 33500
},
{
"epoch": 9.315220404768505,
"grad_norm": 0.10957927256822586,
"learning_rate": 1.1526295715842628e-07,
"loss": 0.0541,
"step": 33600
},
{
"epoch": 9.342944275020793,
"grad_norm": 0.09433583915233612,
"learning_rate": 1.0615135147991562e-07,
"loss": 0.0542,
"step": 33700
},
{
"epoch": 9.370668145273081,
"grad_norm": 0.09703412652015686,
"learning_rate": 9.741100680130122e-08,
"loss": 0.0535,
"step": 33800
},
{
"epoch": 9.398392015525367,
"grad_norm": 0.10180799663066864,
"learning_rate": 8.904258618949335e-08,
"loss": 0.054,
"step": 33900
},
{
"epoch": 9.426115885777655,
"grad_norm": 0.09336613118648529,
"learning_rate": 8.104672449620598e-08,
"loss": 0.0532,
"step": 34000
},
{
"epoch": 9.426115885777655,
"eval_valid_loss": 0.047556404024362564,
"eval_valid_runtime": 6.42,
"eval_valid_samples_per_second": 215.421,
"eval_valid_steps_per_second": 6.854,
"step": 34000
},
{
"epoch": 9.426115885777655,
"eval_valid_target_loss": 0.05208129063248634,
"eval_valid_target_runtime": 6.595,
"eval_valid_target_samples_per_second": 217.437,
"eval_valid_target_steps_per_second": 6.823,
"step": 34000
},
{
"epoch": 9.453839756029941,
"grad_norm": 0.0890408605337143,
"learning_rate": 7.342402830979589e-08,
"loss": 0.054,
"step": 34100
},
{
"epoch": 9.48156362628223,
"grad_norm": 0.09568461775779724,
"learning_rate": 6.617507590924332e-08,
"loss": 0.0535,
"step": 34200
},
{
"epoch": 9.509287496534515,
"grad_norm": 0.09256019443273544,
"learning_rate": 5.930041722028379e-08,
"loss": 0.054,
"step": 34300
},
{
"epoch": 9.537011366786803,
"grad_norm": 0.09314898401498795,
"learning_rate": 5.280057377368863e-08,
"loss": 0.0535,
"step": 34400
},
{
"epoch": 9.564735237039091,
"grad_norm": 0.10256827622652054,
"learning_rate": 4.667603866569892e-08,
"loss": 0.0537,
"step": 34500
},
{
"epoch": 9.564735237039091,
"eval_valid_loss": 0.047560639679431915,
"eval_valid_runtime": 6.4632,
"eval_valid_samples_per_second": 213.979,
"eval_valid_steps_per_second": 6.808,
"step": 34500
},
{
"epoch": 9.564735237039091,
"eval_valid_target_loss": 0.05206665024161339,
"eval_valid_target_runtime": 6.5886,
"eval_valid_target_samples_per_second": 217.649,
"eval_valid_target_steps_per_second": 6.83,
"step": 34500
},
{
"epoch": 9.592459107291377,
"grad_norm": 0.0861942321062088,
"learning_rate": 4.092727652062034e-08,
"loss": 0.0537,
"step": 34600
},
{
"epoch": 9.620182977543665,
"grad_norm": 0.09521106630563736,
"learning_rate": 3.555472345557365e-08,
"loss": 0.0535,
"step": 34700
},
{
"epoch": 9.647906847795952,
"grad_norm": 0.10885845869779587,
"learning_rate": 3.055878704741e-08,
"loss": 0.0542,
"step": 34800
},
{
"epoch": 9.67563071804824,
"grad_norm": 0.09145703911781311,
"learning_rate": 2.5939846301791804e-08,
"loss": 0.0541,
"step": 34900
},
{
"epoch": 9.703354588300527,
"grad_norm": 0.09051796793937683,
"learning_rate": 2.1698251624438503e-08,
"loss": 0.0544,
"step": 35000
},
{
"epoch": 9.703354588300527,
"eval_valid_loss": 0.04752533510327339,
"eval_valid_runtime": 6.4168,
"eval_valid_samples_per_second": 215.528,
"eval_valid_steps_per_second": 6.857,
"step": 35000
},
{
"epoch": 9.703354588300527,
"eval_valid_target_loss": 0.05207618325948715,
"eval_valid_target_runtime": 6.57,
"eval_valid_target_samples_per_second": 218.265,
"eval_valid_target_steps_per_second": 6.849,
"step": 35000
},
{
"epoch": 9.731078458552814,
"grad_norm": 0.0903056338429451,
"learning_rate": 1.7834324794546164e-08,
"loss": 0.0539,
"step": 35100
},
{
"epoch": 9.758802328805102,
"grad_norm": 0.0897304117679596,
"learning_rate": 1.434835894037423e-08,
"loss": 0.0539,
"step": 35200
},
{
"epoch": 9.786526199057388,
"grad_norm": 0.10058806836605072,
"learning_rate": 1.1240618517009416e-08,
"loss": 0.0542,
"step": 35300
},
{
"epoch": 9.814250069309676,
"grad_norm": 0.1056876927614212,
"learning_rate": 8.511339286303432e-09,
"loss": 0.0537,
"step": 35400
},
{
"epoch": 9.841973939561964,
"grad_norm": 0.08990786969661713,
"learning_rate": 6.1607282989856184e-09,
"loss": 0.0547,
"step": 35500
},
{
"epoch": 9.841973939561964,
"eval_valid_loss": 0.047528158873319626,
"eval_valid_runtime": 6.4412,
"eval_valid_samples_per_second": 214.712,
"eval_valid_steps_per_second": 6.831,
"step": 35500
},
{
"epoch": 9.841973939561964,
"eval_valid_target_loss": 0.05206017941236496,
"eval_valid_target_runtime": 6.5864,
"eval_valid_target_samples_per_second": 217.72,
"eval_valid_target_steps_per_second": 6.832,
"step": 35500
},
{
"epoch": 9.86969780981425,
"grad_norm": 0.08090436458587646,
"learning_rate": 4.188963878958841e-09,
"loss": 0.0536,
"step": 35600
},
{
"epoch": 9.897421680066538,
"grad_norm": 0.08319131284952164,
"learning_rate": 2.5961956097669827e-09,
"loss": 0.0541,
"step": 35700
},
{
"epoch": 9.925145550318824,
"grad_norm": 0.10666873306035995,
"learning_rate": 1.3825443232517999e-09,
"loss": 0.0541,
"step": 35800
},
{
"epoch": 9.952869420571112,
"grad_norm": 0.10748881101608276,
"learning_rate": 5.48102090381919e-10,
"loss": 0.0543,
"step": 35900
},
{
"epoch": 9.9805932908234,
"grad_norm": 0.10198221355676651,
"learning_rate": 9.293221427231214e-11,
"loss": 0.0533,
"step": 36000
},
{
"epoch": 9.9805932908234,
"eval_valid_loss": 0.04753027856349945,
"eval_valid_runtime": 6.4518,
"eval_valid_samples_per_second": 214.359,
"eval_valid_steps_per_second": 6.82,
"step": 36000
},
{
"epoch": 9.9805932908234,
"eval_valid_target_loss": 0.05205439031124115,
"eval_valid_target_runtime": 6.5698,
"eval_valid_target_samples_per_second": 218.272,
"eval_valid_target_steps_per_second": 6.85,
"step": 36000
}
],
"logging_steps": 100,
"max_steps": 36070,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.429394066302619e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}