qwen2-7b-agent-instruct / trainer_state.json
ai-modelscope
first commit
641dd6c
raw
history blame
40.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.99889339727038,
"eval_steps": 50,
"global_step": 677,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"acc": 0.85936797,
"epoch": 0.0014754703061600886,
"grad_norm": 7.874454151515785,
"learning_rate": 0.0,
"loss": 0.68658942,
"memory(GiB)": 24.89,
"step": 1,
"train_speed(iter/s)": 0.03037
},
{
"acc": 0.84321463,
"epoch": 0.0073773515308004425,
"grad_norm": 8.79654818500605,
"learning_rate": 7.628557760232497e-07,
"loss": 0.79017758,
"memory(GiB)": 31.87,
"step": 5,
"train_speed(iter/s)": 0.092709
},
{
"acc": 0.85256624,
"epoch": 0.014754703061600885,
"grad_norm": 8.005772072681205,
"learning_rate": 1.0913998759473501e-06,
"loss": 0.70760584,
"memory(GiB)": 33.75,
"step": 10,
"train_speed(iter/s)": 0.120868
},
{
"acc": 0.85825052,
"epoch": 0.022132054592401328,
"grad_norm": 4.861872738410458,
"learning_rate": 1.2835858542361333e-06,
"loss": 0.64002485,
"memory(GiB)": 33.01,
"step": 15,
"train_speed(iter/s)": 0.137764
},
{
"acc": 0.8677763,
"epoch": 0.02950940612320177,
"grad_norm": 2.624090927434735,
"learning_rate": 1.4199439758714505e-06,
"loss": 0.5428031,
"memory(GiB)": 34.84,
"step": 20,
"train_speed(iter/s)": 0.148523
},
{
"acc": 0.88262272,
"epoch": 0.03688675765400221,
"grad_norm": 2.2979293864903276,
"learning_rate": 1.5257115520464994e-06,
"loss": 0.45293074,
"memory(GiB)": 31.42,
"step": 25,
"train_speed(iter/s)": 0.152816
},
{
"acc": 0.88684368,
"epoch": 0.044264109184802655,
"grad_norm": 2.321279166108657,
"learning_rate": 1.6121299541602339e-06,
"loss": 0.44487882,
"memory(GiB)": 34.17,
"step": 30,
"train_speed(iter/s)": 0.158226
},
{
"acc": 0.88785019,
"epoch": 0.0516414607156031,
"grad_norm": 1.6462078924259171,
"learning_rate": 1.6851956720581583e-06,
"loss": 0.42431307,
"memory(GiB)": 33.89,
"step": 35,
"train_speed(iter/s)": 0.160915
},
{
"acc": 0.88771706,
"epoch": 0.05901881224640354,
"grad_norm": 2.0535907435541323,
"learning_rate": 1.7484880757955508e-06,
"loss": 0.41692309,
"memory(GiB)": 33.45,
"step": 40,
"train_speed(iter/s)": 0.162212
},
{
"acc": 0.89934006,
"epoch": 0.06639616377720399,
"grad_norm": 1.880024272875225,
"learning_rate": 1.8043159324490168e-06,
"loss": 0.37824535,
"memory(GiB)": 32.49,
"step": 45,
"train_speed(iter/s)": 0.164895
},
{
"acc": 0.89317064,
"epoch": 0.07377351530800443,
"grad_norm": 2.4862794709135483,
"learning_rate": 1.8542556519706e-06,
"loss": 0.39434323,
"memory(GiB)": 31.37,
"step": 50,
"train_speed(iter/s)": 0.166039
},
{
"epoch": 0.07377351530800443,
"eval_acc": 0.8897788969852836,
"eval_loss": 0.3586576581001282,
"eval_runtime": 9.1458,
"eval_samples_per_second": 23.836,
"eval_steps_per_second": 3.062,
"step": 50
},
{
"acc": 0.90738754,
"epoch": 0.08115086683880487,
"grad_norm": 1.818011862869067,
"learning_rate": 1.8994316234174147e-06,
"loss": 0.34018734,
"memory(GiB)": 43.99,
"step": 55,
"train_speed(iter/s)": 0.163069
},
{
"acc": 0.89877386,
"epoch": 0.08852821836960531,
"grad_norm": 2.769061395622785,
"learning_rate": 1.940674054084334e-06,
"loss": 0.3834722,
"memory(GiB)": 33.18,
"step": 60,
"train_speed(iter/s)": 0.163587
},
{
"acc": 0.89560518,
"epoch": 0.09590556990040576,
"grad_norm": 3.0254291124967776,
"learning_rate": 1.9786134125433064e-06,
"loss": 0.40774279,
"memory(GiB)": 36.96,
"step": 65,
"train_speed(iter/s)": 0.163438
},
{
"acc": 0.90745316,
"epoch": 0.1032829214312062,
"grad_norm": 1.9702664127406297,
"learning_rate": 1.998444790046656e-06,
"loss": 0.34646974,
"memory(GiB)": 33.91,
"step": 70,
"train_speed(iter/s)": 0.165839
},
{
"acc": 0.90453644,
"epoch": 0.11066027296200664,
"grad_norm": 1.956498769069037,
"learning_rate": 1.990668740279938e-06,
"loss": 0.34771657,
"memory(GiB)": 32.4,
"step": 75,
"train_speed(iter/s)": 0.166283
},
{
"acc": 0.90620461,
"epoch": 0.11803762449280708,
"grad_norm": 1.7929520466502804,
"learning_rate": 1.9828926905132194e-06,
"loss": 0.34979777,
"memory(GiB)": 32.69,
"step": 80,
"train_speed(iter/s)": 0.166045
},
{
"acc": 0.90826426,
"epoch": 0.12541497602360752,
"grad_norm": 2.255532399806791,
"learning_rate": 1.975116640746501e-06,
"loss": 0.34021211,
"memory(GiB)": 32.39,
"step": 85,
"train_speed(iter/s)": 0.16736
},
{
"acc": 0.90400352,
"epoch": 0.13279232755440798,
"grad_norm": 1.606426887028717,
"learning_rate": 1.9673405909797823e-06,
"loss": 0.3593976,
"memory(GiB)": 33.28,
"step": 90,
"train_speed(iter/s)": 0.166086
},
{
"acc": 0.90273075,
"epoch": 0.14016967908520842,
"grad_norm": 1.7550090784719037,
"learning_rate": 1.959564541213064e-06,
"loss": 0.34527693,
"memory(GiB)": 32.74,
"step": 95,
"train_speed(iter/s)": 0.167937
},
{
"acc": 0.90631161,
"epoch": 0.14754703061600885,
"grad_norm": 2.151177976553762,
"learning_rate": 1.9517884914463452e-06,
"loss": 0.34601164,
"memory(GiB)": 34.44,
"step": 100,
"train_speed(iter/s)": 0.167745
},
{
"epoch": 0.14754703061600885,
"eval_acc": 0.8985658665523646,
"eval_loss": 0.3217943012714386,
"eval_runtime": 9.0118,
"eval_samples_per_second": 24.19,
"eval_steps_per_second": 3.107,
"step": 100
},
{
"acc": 0.90445766,
"epoch": 0.1549243821468093,
"grad_norm": 2.0562867995030527,
"learning_rate": 1.9440124416796267e-06,
"loss": 0.34789481,
"memory(GiB)": 42.9,
"step": 105,
"train_speed(iter/s)": 0.164588
},
{
"acc": 0.90358963,
"epoch": 0.16230173367760975,
"grad_norm": 1.8705476431194374,
"learning_rate": 1.936236391912908e-06,
"loss": 0.34220786,
"memory(GiB)": 31.78,
"step": 110,
"train_speed(iter/s)": 0.165873
},
{
"acc": 0.9085845,
"epoch": 0.16967908520841019,
"grad_norm": 1.8278699994168497,
"learning_rate": 1.9284603421461896e-06,
"loss": 0.3233917,
"memory(GiB)": 31.86,
"step": 115,
"train_speed(iter/s)": 0.16598
},
{
"acc": 0.90997429,
"epoch": 0.17705643673921062,
"grad_norm": 1.945716912044592,
"learning_rate": 1.920684292379471e-06,
"loss": 0.34307232,
"memory(GiB)": 35.12,
"step": 120,
"train_speed(iter/s)": 0.166556
},
{
"acc": 0.91014824,
"epoch": 0.18443378827001106,
"grad_norm": 1.7135397704667659,
"learning_rate": 1.912908242612753e-06,
"loss": 0.32152495,
"memory(GiB)": 35.65,
"step": 125,
"train_speed(iter/s)": 0.167431
},
{
"acc": 0.9074892,
"epoch": 0.19181113980081152,
"grad_norm": 1.7116721779311537,
"learning_rate": 1.9051321928460342e-06,
"loss": 0.32937753,
"memory(GiB)": 33.19,
"step": 130,
"train_speed(iter/s)": 0.167152
},
{
"acc": 0.90999937,
"epoch": 0.19918849133161196,
"grad_norm": 1.6389355962957932,
"learning_rate": 1.8973561430793156e-06,
"loss": 0.33004179,
"memory(GiB)": 33.36,
"step": 135,
"train_speed(iter/s)": 0.168049
},
{
"acc": 0.9056819,
"epoch": 0.2065658428624124,
"grad_norm": 1.618401896535921,
"learning_rate": 1.889580093312597e-06,
"loss": 0.32887373,
"memory(GiB)": 31.72,
"step": 140,
"train_speed(iter/s)": 0.167987
},
{
"acc": 0.90799198,
"epoch": 0.21394319439321283,
"grad_norm": 2.0697336354422076,
"learning_rate": 1.8818040435458787e-06,
"loss": 0.33212447,
"memory(GiB)": 32.61,
"step": 145,
"train_speed(iter/s)": 0.168358
},
{
"acc": 0.89975605,
"epoch": 0.2213205459240133,
"grad_norm": 1.645561918074026,
"learning_rate": 1.8740279937791602e-06,
"loss": 0.35846872,
"memory(GiB)": 32.3,
"step": 150,
"train_speed(iter/s)": 0.169041
},
{
"epoch": 0.2213205459240133,
"eval_acc": 0.9009412058865552,
"eval_loss": 0.31137242913246155,
"eval_runtime": 8.9003,
"eval_samples_per_second": 24.494,
"eval_steps_per_second": 3.146,
"step": 150
},
{
"acc": 0.90751858,
"epoch": 0.22869789745481373,
"grad_norm": 1.717914687308357,
"learning_rate": 1.8662519440124416e-06,
"loss": 0.33635845,
"memory(GiB)": 43.6,
"step": 155,
"train_speed(iter/s)": 0.167082
},
{
"acc": 0.90450516,
"epoch": 0.23607524898561416,
"grad_norm": 1.6863266349964434,
"learning_rate": 1.858475894245723e-06,
"loss": 0.35405197,
"memory(GiB)": 33.81,
"step": 160,
"train_speed(iter/s)": 0.167855
},
{
"acc": 0.90395164,
"epoch": 0.2434526005164146,
"grad_norm": 2.1013428529714906,
"learning_rate": 1.8506998444790045e-06,
"loss": 0.34658258,
"memory(GiB)": 32.9,
"step": 165,
"train_speed(iter/s)": 0.167867
},
{
"acc": 0.91127558,
"epoch": 0.25082995204721503,
"grad_norm": 1.6631238092162342,
"learning_rate": 1.842923794712286e-06,
"loss": 0.32777104,
"memory(GiB)": 33.53,
"step": 170,
"train_speed(iter/s)": 0.168028
},
{
"acc": 0.90831413,
"epoch": 0.25820730357801547,
"grad_norm": 2.0857884493375756,
"learning_rate": 1.8351477449455676e-06,
"loss": 0.32164063,
"memory(GiB)": 32.03,
"step": 175,
"train_speed(iter/s)": 0.169138
},
{
"acc": 0.91539364,
"epoch": 0.26558465510881596,
"grad_norm": 2.0145344122511095,
"learning_rate": 1.827371695178849e-06,
"loss": 0.30975475,
"memory(GiB)": 34.31,
"step": 180,
"train_speed(iter/s)": 0.168973
},
{
"acc": 0.9064558,
"epoch": 0.2729620066396164,
"grad_norm": 1.6651879684580124,
"learning_rate": 1.8195956454121305e-06,
"loss": 0.3413609,
"memory(GiB)": 32.63,
"step": 185,
"train_speed(iter/s)": 0.169312
},
{
"acc": 0.90828686,
"epoch": 0.28033935817041683,
"grad_norm": 2.3469960245148056,
"learning_rate": 1.811819595645412e-06,
"loss": 0.32660947,
"memory(GiB)": 33.41,
"step": 190,
"train_speed(iter/s)": 0.169856
},
{
"acc": 0.91549397,
"epoch": 0.28771670970121727,
"grad_norm": 2.1806025367886117,
"learning_rate": 1.8040435458786937e-06,
"loss": 0.30616875,
"memory(GiB)": 36.24,
"step": 195,
"train_speed(iter/s)": 0.169761
},
{
"acc": 0.90924969,
"epoch": 0.2950940612320177,
"grad_norm": 1.5587292681869693,
"learning_rate": 1.7962674961119751e-06,
"loss": 0.32027857,
"memory(GiB)": 32.62,
"step": 200,
"train_speed(iter/s)": 0.170581
},
{
"epoch": 0.2950940612320177,
"eval_acc": 0.901896699528504,
"eval_loss": 0.3015853464603424,
"eval_runtime": 9.0231,
"eval_samples_per_second": 24.16,
"eval_steps_per_second": 3.103,
"step": 200
},
{
"acc": 0.91348085,
"epoch": 0.30247141276281814,
"grad_norm": 1.7818986098446097,
"learning_rate": 1.7884914463452566e-06,
"loss": 0.30208986,
"memory(GiB)": 44.06,
"step": 205,
"train_speed(iter/s)": 0.169194
},
{
"acc": 0.90921364,
"epoch": 0.3098487642936186,
"grad_norm": 4.02077354284952,
"learning_rate": 1.780715396578538e-06,
"loss": 0.31497798,
"memory(GiB)": 34.58,
"step": 210,
"train_speed(iter/s)": 0.169003
},
{
"acc": 0.91234264,
"epoch": 0.317226115824419,
"grad_norm": 1.856976113207096,
"learning_rate": 1.7729393468118195e-06,
"loss": 0.30694566,
"memory(GiB)": 33.8,
"step": 215,
"train_speed(iter/s)": 0.16984
},
{
"acc": 0.91051998,
"epoch": 0.3246034673552195,
"grad_norm": 1.7185168230569432,
"learning_rate": 1.765163297045101e-06,
"loss": 0.30961909,
"memory(GiB)": 32.79,
"step": 220,
"train_speed(iter/s)": 0.169666
},
{
"acc": 0.90716095,
"epoch": 0.33198081888601993,
"grad_norm": 1.340608010048739,
"learning_rate": 1.7573872472783826e-06,
"loss": 0.32777991,
"memory(GiB)": 32.43,
"step": 225,
"train_speed(iter/s)": 0.169965
},
{
"acc": 0.91547451,
"epoch": 0.33935817041682037,
"grad_norm": 1.6059763623857688,
"learning_rate": 1.749611197511664e-06,
"loss": 0.30423913,
"memory(GiB)": 34.95,
"step": 230,
"train_speed(iter/s)": 0.169935
},
{
"acc": 0.917132,
"epoch": 0.3467355219476208,
"grad_norm": 2.0390121908637644,
"learning_rate": 1.7418351477449455e-06,
"loss": 0.30788417,
"memory(GiB)": 34.18,
"step": 235,
"train_speed(iter/s)": 0.169583
},
{
"acc": 0.92253389,
"epoch": 0.35411287347842124,
"grad_norm": 1.7323441045370742,
"learning_rate": 1.734059097978227e-06,
"loss": 0.27823753,
"memory(GiB)": 31.85,
"step": 240,
"train_speed(iter/s)": 0.17024
},
{
"acc": 0.91325512,
"epoch": 0.3614902250092217,
"grad_norm": 1.6955182367729624,
"learning_rate": 1.7262830482115086e-06,
"loss": 0.31402481,
"memory(GiB)": 32.14,
"step": 245,
"train_speed(iter/s)": 0.169973
},
{
"acc": 0.91568565,
"epoch": 0.3688675765400221,
"grad_norm": 1.5212817841417117,
"learning_rate": 1.71850699844479e-06,
"loss": 0.29354782,
"memory(GiB)": 33.28,
"step": 250,
"train_speed(iter/s)": 0.169891
},
{
"epoch": 0.3688675765400221,
"eval_acc": 0.903888055436491,
"eval_loss": 0.2949393689632416,
"eval_runtime": 8.8569,
"eval_samples_per_second": 24.614,
"eval_steps_per_second": 3.161,
"step": 250
},
{
"acc": 0.91542091,
"epoch": 0.37624492807082255,
"grad_norm": 1.872512089057089,
"learning_rate": 1.7107309486780715e-06,
"loss": 0.29765024,
"memory(GiB)": 43.8,
"step": 255,
"train_speed(iter/s)": 0.169287
},
{
"acc": 0.90894642,
"epoch": 0.38362227960162304,
"grad_norm": 2.118992381164901,
"learning_rate": 1.702954898911353e-06,
"loss": 0.32009149,
"memory(GiB)": 33.0,
"step": 260,
"train_speed(iter/s)": 0.169108
},
{
"acc": 0.91895199,
"epoch": 0.3909996311324235,
"grad_norm": 1.8087446200238866,
"learning_rate": 1.6951788491446344e-06,
"loss": 0.28518291,
"memory(GiB)": 33.64,
"step": 265,
"train_speed(iter/s)": 0.169659
},
{
"acc": 0.91831837,
"epoch": 0.3983769826632239,
"grad_norm": 2.295227865477349,
"learning_rate": 1.6874027993779158e-06,
"loss": 0.29493954,
"memory(GiB)": 32.16,
"step": 270,
"train_speed(iter/s)": 0.16921
},
{
"acc": 0.91772842,
"epoch": 0.40575433419402435,
"grad_norm": 1.8335936104899577,
"learning_rate": 1.6796267496111975e-06,
"loss": 0.29295368,
"memory(GiB)": 32.48,
"step": 275,
"train_speed(iter/s)": 0.169211
},
{
"acc": 0.9184288,
"epoch": 0.4131316857248248,
"grad_norm": 1.9183997806679902,
"learning_rate": 1.671850699844479e-06,
"loss": 0.29449196,
"memory(GiB)": 32.65,
"step": 280,
"train_speed(iter/s)": 0.169821
},
{
"acc": 0.91275759,
"epoch": 0.4205090372556252,
"grad_norm": 1.5737005817463792,
"learning_rate": 1.6640746500777604e-06,
"loss": 0.30824404,
"memory(GiB)": 32.27,
"step": 285,
"train_speed(iter/s)": 0.169618
},
{
"acc": 0.91761837,
"epoch": 0.42788638878642565,
"grad_norm": 1.6411868652328097,
"learning_rate": 1.6562986003110419e-06,
"loss": 0.28589807,
"memory(GiB)": 33.9,
"step": 290,
"train_speed(iter/s)": 0.16978
},
{
"acc": 0.91096239,
"epoch": 0.4352637403172261,
"grad_norm": 1.4763719992796571,
"learning_rate": 1.6485225505443235e-06,
"loss": 0.31501875,
"memory(GiB)": 33.9,
"step": 295,
"train_speed(iter/s)": 0.170116
},
{
"acc": 0.92102461,
"epoch": 0.4426410918480266,
"grad_norm": 1.7038633862826587,
"learning_rate": 1.640746500777605e-06,
"loss": 0.28700156,
"memory(GiB)": 33.12,
"step": 300,
"train_speed(iter/s)": 0.16999
},
{
"epoch": 0.4426410918480266,
"eval_acc": 0.904986426632376,
"eval_loss": 0.28871360421180725,
"eval_runtime": 8.8172,
"eval_samples_per_second": 24.724,
"eval_steps_per_second": 3.176,
"step": 300
},
{
"acc": 0.9137413,
"epoch": 0.450018443378827,
"grad_norm": 1.5572757830459178,
"learning_rate": 1.6329704510108864e-06,
"loss": 0.3066596,
"memory(GiB)": 44.77,
"step": 305,
"train_speed(iter/s)": 0.169643
},
{
"acc": 0.92225361,
"epoch": 0.45739579490962745,
"grad_norm": 1.7973596806557957,
"learning_rate": 1.6251944012441679e-06,
"loss": 0.28060098,
"memory(GiB)": 34.38,
"step": 310,
"train_speed(iter/s)": 0.169469
},
{
"acc": 0.91542816,
"epoch": 0.4647731464404279,
"grad_norm": 1.7774091029439925,
"learning_rate": 1.6174183514774493e-06,
"loss": 0.29976537,
"memory(GiB)": 33.81,
"step": 315,
"train_speed(iter/s)": 0.169523
},
{
"acc": 0.91291943,
"epoch": 0.4721504979712283,
"grad_norm": 1.3755306649838441,
"learning_rate": 1.6096423017107308e-06,
"loss": 0.30613976,
"memory(GiB)": 33.81,
"step": 320,
"train_speed(iter/s)": 0.169769
},
{
"acc": 0.90916691,
"epoch": 0.47952784950202876,
"grad_norm": 1.9213831375809023,
"learning_rate": 1.6018662519440122e-06,
"loss": 0.32510529,
"memory(GiB)": 34.44,
"step": 325,
"train_speed(iter/s)": 0.169545
},
{
"acc": 0.91636696,
"epoch": 0.4869052010328292,
"grad_norm": 1.8837685149781478,
"learning_rate": 1.5940902021772939e-06,
"loss": 0.30537646,
"memory(GiB)": 31.2,
"step": 330,
"train_speed(iter/s)": 0.170038
},
{
"acc": 0.91307325,
"epoch": 0.4942825525636297,
"grad_norm": 1.8595782698159422,
"learning_rate": 1.5863141524105753e-06,
"loss": 0.30300996,
"memory(GiB)": 30.74,
"step": 335,
"train_speed(iter/s)": 0.169983
},
{
"acc": 0.91927223,
"epoch": 0.5016599040944301,
"grad_norm": 1.8693944311229003,
"learning_rate": 1.5785381026438568e-06,
"loss": 0.28294766,
"memory(GiB)": 31.5,
"step": 340,
"train_speed(iter/s)": 0.170169
},
{
"acc": 0.92018118,
"epoch": 0.5090372556252305,
"grad_norm": 1.6240951695142463,
"learning_rate": 1.5707620528771385e-06,
"loss": 0.27536349,
"memory(GiB)": 32.84,
"step": 345,
"train_speed(iter/s)": 0.170494
},
{
"acc": 0.91428967,
"epoch": 0.5164146071560309,
"grad_norm": 2.0654305075288653,
"learning_rate": 1.56298600311042e-06,
"loss": 0.30193062,
"memory(GiB)": 33.88,
"step": 350,
"train_speed(iter/s)": 0.170499
},
{
"epoch": 0.5164146071560309,
"eval_acc": 0.906031218745535,
"eval_loss": 0.2829771637916565,
"eval_runtime": 8.9252,
"eval_samples_per_second": 24.425,
"eval_steps_per_second": 3.137,
"step": 350
},
{
"acc": 0.92116051,
"epoch": 0.5237919586868315,
"grad_norm": 2.2709862324112136,
"learning_rate": 1.5552099533437014e-06,
"loss": 0.277144,
"memory(GiB)": 44.05,
"step": 355,
"train_speed(iter/s)": 0.169773
},
{
"acc": 0.90278854,
"epoch": 0.5311693102176319,
"grad_norm": 1.9738153042801483,
"learning_rate": 1.5474339035769828e-06,
"loss": 0.33822517,
"memory(GiB)": 31.78,
"step": 360,
"train_speed(iter/s)": 0.170163
},
{
"acc": 0.92497654,
"epoch": 0.5385466617484324,
"grad_norm": 1.2430005126419985,
"learning_rate": 1.5396578538102643e-06,
"loss": 0.26646669,
"memory(GiB)": 33.8,
"step": 365,
"train_speed(iter/s)": 0.16992
},
{
"acc": 0.91328669,
"epoch": 0.5459240132792328,
"grad_norm": 1.732568460701246,
"learning_rate": 1.5318818040435457e-06,
"loss": 0.30124869,
"memory(GiB)": 34.07,
"step": 370,
"train_speed(iter/s)": 0.170382
},
{
"acc": 0.91603355,
"epoch": 0.5533013648100332,
"grad_norm": 1.6627563648419381,
"learning_rate": 1.5241057542768272e-06,
"loss": 0.29759171,
"memory(GiB)": 32.61,
"step": 375,
"train_speed(iter/s)": 0.170197
},
{
"acc": 0.90871716,
"epoch": 0.5606787163408337,
"grad_norm": 2.1331488669107492,
"learning_rate": 1.5163297045101088e-06,
"loss": 0.33630853,
"memory(GiB)": 32.33,
"step": 380,
"train_speed(iter/s)": 0.17029
},
{
"acc": 0.90700073,
"epoch": 0.5680560678716341,
"grad_norm": 2.080763753555995,
"learning_rate": 1.5085536547433903e-06,
"loss": 0.325877,
"memory(GiB)": 32.95,
"step": 385,
"train_speed(iter/s)": 0.170474
},
{
"acc": 0.91835623,
"epoch": 0.5754334194024345,
"grad_norm": 1.5911495384236254,
"learning_rate": 1.500777604976672e-06,
"loss": 0.28332872,
"memory(GiB)": 31.78,
"step": 390,
"train_speed(iter/s)": 0.170283
},
{
"acc": 0.91712914,
"epoch": 0.582810770933235,
"grad_norm": 1.6237776507352246,
"learning_rate": 1.4930015552099534e-06,
"loss": 0.28782868,
"memory(GiB)": 33.13,
"step": 395,
"train_speed(iter/s)": 0.170424
},
{
"acc": 0.92452984,
"epoch": 0.5901881224640354,
"grad_norm": 1.9617693211652296,
"learning_rate": 1.4852255054432348e-06,
"loss": 0.25721183,
"memory(GiB)": 34.52,
"step": 400,
"train_speed(iter/s)": 0.170549
},
{
"epoch": 0.5901881224640354,
"eval_acc": 0.9067634662094585,
"eval_loss": 0.27780693769454956,
"eval_runtime": 8.9713,
"eval_samples_per_second": 24.3,
"eval_steps_per_second": 3.121,
"step": 400
},
{
"acc": 0.91402645,
"epoch": 0.5975654739948358,
"grad_norm": 1.6283342820719429,
"learning_rate": 1.4774494556765163e-06,
"loss": 0.29935551,
"memory(GiB)": 43.79,
"step": 405,
"train_speed(iter/s)": 0.169655
},
{
"acc": 0.91232147,
"epoch": 0.6049428255256363,
"grad_norm": 1.7979698219270268,
"learning_rate": 1.4696734059097977e-06,
"loss": 0.29618566,
"memory(GiB)": 34.75,
"step": 410,
"train_speed(iter/s)": 0.169867
},
{
"acc": 0.91495514,
"epoch": 0.6123201770564367,
"grad_norm": 1.400313093548897,
"learning_rate": 1.4618973561430792e-06,
"loss": 0.30076814,
"memory(GiB)": 33.36,
"step": 415,
"train_speed(iter/s)": 0.169686
},
{
"acc": 0.91793385,
"epoch": 0.6196975285872371,
"grad_norm": 1.5440217170439645,
"learning_rate": 1.4541213063763606e-06,
"loss": 0.27723732,
"memory(GiB)": 32.03,
"step": 420,
"train_speed(iter/s)": 0.169706
},
{
"acc": 0.92025652,
"epoch": 0.6270748801180376,
"grad_norm": 1.7171089334482643,
"learning_rate": 1.446345256609642e-06,
"loss": 0.28218346,
"memory(GiB)": 31.84,
"step": 425,
"train_speed(iter/s)": 0.169824
},
{
"acc": 0.91456184,
"epoch": 0.634452231648838,
"grad_norm": 1.7617810648771757,
"learning_rate": 1.4385692068429238e-06,
"loss": 0.30232787,
"memory(GiB)": 33.01,
"step": 430,
"train_speed(iter/s)": 0.169549
},
{
"acc": 0.91554451,
"epoch": 0.6418295831796386,
"grad_norm": 2.1102714988825966,
"learning_rate": 1.4307931570762052e-06,
"loss": 0.29879627,
"memory(GiB)": 33.18,
"step": 435,
"train_speed(iter/s)": 0.169677
},
{
"acc": 0.92126179,
"epoch": 0.649206934710439,
"grad_norm": 2.046949703950944,
"learning_rate": 1.4230171073094869e-06,
"loss": 0.27905126,
"memory(GiB)": 35.07,
"step": 440,
"train_speed(iter/s)": 0.169605
},
{
"acc": 0.90152893,
"epoch": 0.6565842862412394,
"grad_norm": 2.001971595085909,
"learning_rate": 1.4152410575427683e-06,
"loss": 0.34060516,
"memory(GiB)": 33.51,
"step": 445,
"train_speed(iter/s)": 0.169689
},
{
"acc": 0.91629639,
"epoch": 0.6639616377720399,
"grad_norm": 2.0397672790155528,
"learning_rate": 1.4074650077760498e-06,
"loss": 0.28595252,
"memory(GiB)": 34.12,
"step": 450,
"train_speed(iter/s)": 0.170047
},
{
"epoch": 0.6639616377720399,
"eval_acc": 0.9078082583226175,
"eval_loss": 0.2715848386287689,
"eval_runtime": 8.8964,
"eval_samples_per_second": 24.504,
"eval_steps_per_second": 3.147,
"step": 450
},
{
"acc": 0.92627125,
"epoch": 0.6713389893028403,
"grad_norm": 1.6378143906534044,
"learning_rate": 1.3996889580093312e-06,
"loss": 0.25918436,
"memory(GiB)": 43.88,
"step": 455,
"train_speed(iter/s)": 0.169369
},
{
"acc": 0.91979427,
"epoch": 0.6787163408336407,
"grad_norm": 1.7082862687854972,
"learning_rate": 1.3919129082426127e-06,
"loss": 0.27077117,
"memory(GiB)": 32.33,
"step": 460,
"train_speed(iter/s)": 0.169438
},
{
"acc": 0.91361713,
"epoch": 0.6860936923644412,
"grad_norm": 2.293000555161464,
"learning_rate": 1.3841368584758941e-06,
"loss": 0.30449131,
"memory(GiB)": 32.93,
"step": 465,
"train_speed(iter/s)": 0.169581
},
{
"acc": 0.91954422,
"epoch": 0.6934710438952416,
"grad_norm": 1.8478883729217541,
"learning_rate": 1.3763608087091756e-06,
"loss": 0.29147563,
"memory(GiB)": 32.32,
"step": 470,
"train_speed(iter/s)": 0.169425
},
{
"acc": 0.91925821,
"epoch": 0.700848395426042,
"grad_norm": 2.1771276083255833,
"learning_rate": 1.368584758942457e-06,
"loss": 0.27578421,
"memory(GiB)": 31.55,
"step": 475,
"train_speed(iter/s)": 0.169717
},
{
"acc": 0.91978226,
"epoch": 0.7082257469568425,
"grad_norm": 1.5525703471804124,
"learning_rate": 1.3608087091757387e-06,
"loss": 0.28457327,
"memory(GiB)": 34.35,
"step": 480,
"train_speed(iter/s)": 0.169473
},
{
"acc": 0.91358566,
"epoch": 0.7156030984876429,
"grad_norm": 1.6094545899681876,
"learning_rate": 1.3530326594090201e-06,
"loss": 0.29641771,
"memory(GiB)": 34.35,
"step": 485,
"train_speed(iter/s)": 0.169292
},
{
"acc": 0.9157114,
"epoch": 0.7229804500184434,
"grad_norm": 2.001462148706446,
"learning_rate": 1.3452566096423018e-06,
"loss": 0.30091541,
"memory(GiB)": 33.0,
"step": 490,
"train_speed(iter/s)": 0.169539
},
{
"acc": 0.9181448,
"epoch": 0.7303578015492438,
"grad_norm": 1.933852376850104,
"learning_rate": 1.3374805598755833e-06,
"loss": 0.28622799,
"memory(GiB)": 31.96,
"step": 495,
"train_speed(iter/s)": 0.169315
},
{
"acc": 0.91473122,
"epoch": 0.7377351530800442,
"grad_norm": 1.9036456322193762,
"learning_rate": 1.3297045101088647e-06,
"loss": 0.3094301,
"memory(GiB)": 31.84,
"step": 500,
"train_speed(iter/s)": 0.169482
},
{
"epoch": 0.7377351530800442,
"eval_acc": 0.9090048578368338,
"eval_loss": 0.2688305675983429,
"eval_runtime": 8.8274,
"eval_samples_per_second": 24.696,
"eval_steps_per_second": 3.172,
"step": 500
},
{
"acc": 0.91458435,
"epoch": 0.7451125046108447,
"grad_norm": 1.9335752594206985,
"learning_rate": 1.3219284603421462e-06,
"loss": 0.29494238,
"memory(GiB)": 43.4,
"step": 505,
"train_speed(iter/s)": 0.168821
},
{
"acc": 0.9221386,
"epoch": 0.7524898561416451,
"grad_norm": 1.8197097143608403,
"learning_rate": 1.3141524105754276e-06,
"loss": 0.2647439,
"memory(GiB)": 33.36,
"step": 510,
"train_speed(iter/s)": 0.168682
},
{
"acc": 0.92193203,
"epoch": 0.7598672076724456,
"grad_norm": 1.901554742963865,
"learning_rate": 1.306376360808709e-06,
"loss": 0.27191839,
"memory(GiB)": 30.47,
"step": 515,
"train_speed(iter/s)": 0.168924
},
{
"acc": 0.91413088,
"epoch": 0.7672445592032461,
"grad_norm": 2.0670792917636236,
"learning_rate": 1.2986003110419905e-06,
"loss": 0.296503,
"memory(GiB)": 32.43,
"step": 520,
"train_speed(iter/s)": 0.168732
},
{
"acc": 0.92014456,
"epoch": 0.7746219107340465,
"grad_norm": 1.3940992355499904,
"learning_rate": 1.290824261275272e-06,
"loss": 0.27345006,
"memory(GiB)": 31.88,
"step": 525,
"train_speed(iter/s)": 0.168564
},
{
"acc": 0.91787033,
"epoch": 0.781999262264847,
"grad_norm": 1.7528498159038246,
"learning_rate": 1.2830482115085536e-06,
"loss": 0.27718287,
"memory(GiB)": 32.83,
"step": 530,
"train_speed(iter/s)": 0.168633
},
{
"acc": 0.91950254,
"epoch": 0.7893766137956474,
"grad_norm": 1.6045395248629215,
"learning_rate": 1.275272161741835e-06,
"loss": 0.27553134,
"memory(GiB)": 30.99,
"step": 535,
"train_speed(iter/s)": 0.168504
},
{
"acc": 0.91442375,
"epoch": 0.7967539653264478,
"grad_norm": 2.0480557410695686,
"learning_rate": 1.2674961119751167e-06,
"loss": 0.29672928,
"memory(GiB)": 32.9,
"step": 540,
"train_speed(iter/s)": 0.168746
},
{
"acc": 0.91783228,
"epoch": 0.8041313168572483,
"grad_norm": 1.7063380836356228,
"learning_rate": 1.2597200622083982e-06,
"loss": 0.28551073,
"memory(GiB)": 32.64,
"step": 545,
"train_speed(iter/s)": 0.168632
},
{
"acc": 0.91965294,
"epoch": 0.8115086683880487,
"grad_norm": 1.8091430299196016,
"learning_rate": 1.2519440124416796e-06,
"loss": 0.28367462,
"memory(GiB)": 33.12,
"step": 550,
"train_speed(iter/s)": 0.168537
},
{
"epoch": 0.8115086683880487,
"eval_acc": 0.9094959994284898,
"eval_loss": 0.265609472990036,
"eval_runtime": 8.9354,
"eval_samples_per_second": 24.397,
"eval_steps_per_second": 3.134,
"step": 550
},
{
"acc": 0.91708422,
"epoch": 0.8188860199188491,
"grad_norm": 1.9338041082162762,
"learning_rate": 1.244167962674961e-06,
"loss": 0.30288501,
"memory(GiB)": 44.46,
"step": 555,
"train_speed(iter/s)": 0.168246
},
{
"acc": 0.91793032,
"epoch": 0.8262633714496496,
"grad_norm": 1.960186880981984,
"learning_rate": 1.2363919129082425e-06,
"loss": 0.29391913,
"memory(GiB)": 33.02,
"step": 560,
"train_speed(iter/s)": 0.168119
},
{
"acc": 0.92976294,
"epoch": 0.83364072298045,
"grad_norm": 1.7220525036525174,
"learning_rate": 1.228615863141524e-06,
"loss": 0.24753182,
"memory(GiB)": 32.77,
"step": 565,
"train_speed(iter/s)": 0.16819
},
{
"acc": 0.9202878,
"epoch": 0.8410180745112504,
"grad_norm": 1.9681280144249207,
"learning_rate": 1.2208398133748054e-06,
"loss": 0.27648234,
"memory(GiB)": 32.36,
"step": 570,
"train_speed(iter/s)": 0.168331
},
{
"acc": 0.91870079,
"epoch": 0.8483954260420509,
"grad_norm": 1.6402903494642216,
"learning_rate": 1.2130637636080869e-06,
"loss": 0.29140263,
"memory(GiB)": 35.18,
"step": 575,
"train_speed(iter/s)": 0.168255
},
{
"acc": 0.91364193,
"epoch": 0.8557727775728513,
"grad_norm": 2.146651599757078,
"learning_rate": 1.2052877138413686e-06,
"loss": 0.31224487,
"memory(GiB)": 37.43,
"step": 580,
"train_speed(iter/s)": 0.168463
},
{
"acc": 0.92091951,
"epoch": 0.8631501291036517,
"grad_norm": 2.110687395796676,
"learning_rate": 1.19751166407465e-06,
"loss": 0.27074888,
"memory(GiB)": 30.34,
"step": 585,
"train_speed(iter/s)": 0.16837
},
{
"acc": 0.92361298,
"epoch": 0.8705274806344522,
"grad_norm": 1.341809177582426,
"learning_rate": 1.1897356143079317e-06,
"loss": 0.26371779,
"memory(GiB)": 32.35,
"step": 590,
"train_speed(iter/s)": 0.168375
},
{
"acc": 0.92123985,
"epoch": 0.8779048321652527,
"grad_norm": 1.8270563745834436,
"learning_rate": 1.1819595645412131e-06,
"loss": 0.26702247,
"memory(GiB)": 34.77,
"step": 595,
"train_speed(iter/s)": 0.168532
},
{
"acc": 0.91653709,
"epoch": 0.8852821836960532,
"grad_norm": 1.6527432011832037,
"learning_rate": 1.1741835147744946e-06,
"loss": 0.29842911,
"memory(GiB)": 33.87,
"step": 600,
"train_speed(iter/s)": 0.168424
},
{
"epoch": 0.8852821836960532,
"eval_acc": 0.9105765109301329,
"eval_loss": 0.2623133361339569,
"eval_runtime": 8.7796,
"eval_samples_per_second": 24.83,
"eval_steps_per_second": 3.189,
"step": 600
},
{
"acc": 0.91810665,
"epoch": 0.8926595352268536,
"grad_norm": 1.3239706750197222,
"learning_rate": 1.166407465007776e-06,
"loss": 0.29543982,
"memory(GiB)": 43.63,
"step": 605,
"train_speed(iter/s)": 0.16811
},
{
"acc": 0.92373562,
"epoch": 0.900036886757654,
"grad_norm": 1.589090709862595,
"learning_rate": 1.1586314152410575e-06,
"loss": 0.27000737,
"memory(GiB)": 32.08,
"step": 610,
"train_speed(iter/s)": 0.168111
},
{
"acc": 0.92571859,
"epoch": 0.9074142382884545,
"grad_norm": 1.786690071917202,
"learning_rate": 1.150855365474339e-06,
"loss": 0.26558821,
"memory(GiB)": 34.26,
"step": 615,
"train_speed(iter/s)": 0.167944
},
{
"acc": 0.92350941,
"epoch": 0.9147915898192549,
"grad_norm": 1.4482760998007842,
"learning_rate": 1.1430793157076204e-06,
"loss": 0.27038224,
"memory(GiB)": 32.87,
"step": 620,
"train_speed(iter/s)": 0.168075
},
{
"acc": 0.92567997,
"epoch": 0.9221689413500553,
"grad_norm": 1.5651995631831526,
"learning_rate": 1.1353032659409018e-06,
"loss": 0.25891747,
"memory(GiB)": 32.63,
"step": 625,
"train_speed(iter/s)": 0.168015
},
{
"acc": 0.91823616,
"epoch": 0.9295462928808558,
"grad_norm": 1.4462434724962336,
"learning_rate": 1.1275272161741835e-06,
"loss": 0.2788033,
"memory(GiB)": 38.22,
"step": 630,
"train_speed(iter/s)": 0.167998
},
{
"acc": 0.92322083,
"epoch": 0.9369236444116562,
"grad_norm": 1.4194043988299254,
"learning_rate": 1.119751166407465e-06,
"loss": 0.26030297,
"memory(GiB)": 32.29,
"step": 635,
"train_speed(iter/s)": 0.168162
},
{
"acc": 0.92457771,
"epoch": 0.9443009959424566,
"grad_norm": 1.8304569462755849,
"learning_rate": 1.1119751166407466e-06,
"loss": 0.27183619,
"memory(GiB)": 35.33,
"step": 640,
"train_speed(iter/s)": 0.168086
},
{
"acc": 0.9201807,
"epoch": 0.9516783474732571,
"grad_norm": 1.6355541683467607,
"learning_rate": 1.104199066874028e-06,
"loss": 0.27730408,
"memory(GiB)": 31.4,
"step": 645,
"train_speed(iter/s)": 0.168284
},
{
"acc": 0.92337418,
"epoch": 0.9590556990040575,
"grad_norm": 1.6309155055635356,
"learning_rate": 1.0964230171073095e-06,
"loss": 0.25860276,
"memory(GiB)": 32.67,
"step": 650,
"train_speed(iter/s)": 0.168267
},
{
"epoch": 0.9590556990040575,
"eval_acc": 0.9113176882411773,
"eval_loss": 0.2569684386253357,
"eval_runtime": 8.8598,
"eval_samples_per_second": 24.605,
"eval_steps_per_second": 3.16,
"step": 650
},
{
"acc": 0.91919975,
"epoch": 0.966433050534858,
"grad_norm": 1.482378816274918,
"learning_rate": 1.088646967340591e-06,
"loss": 0.28527048,
"memory(GiB)": 45.59,
"step": 655,
"train_speed(iter/s)": 0.167772
},
{
"acc": 0.92037735,
"epoch": 0.9738104020656584,
"grad_norm": 2.2165369625767712,
"learning_rate": 1.0808709175738724e-06,
"loss": 0.28198528,
"memory(GiB)": 32.93,
"step": 660,
"train_speed(iter/s)": 0.16789
},
{
"acc": 0.92200727,
"epoch": 0.9811877535964588,
"grad_norm": 1.7151646172394919,
"learning_rate": 1.0730948678071539e-06,
"loss": 0.27098572,
"memory(GiB)": 33.1,
"step": 665,
"train_speed(iter/s)": 0.167862
},
{
"acc": 0.92197828,
"epoch": 0.9885651051272594,
"grad_norm": 2.076606131505725,
"learning_rate": 1.0653188180404353e-06,
"loss": 0.26747627,
"memory(GiB)": 34.45,
"step": 670,
"train_speed(iter/s)": 0.167945
},
{
"acc": 0.92063084,
"epoch": 0.9959424566580598,
"grad_norm": 1.7465662806523121,
"learning_rate": 1.0575427682737168e-06,
"loss": 0.27087922,
"memory(GiB)": 39.51,
"step": 675,
"train_speed(iter/s)": 0.167951
}
],
"logging_steps": 5,
"max_steps": 1354,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 66000591650816.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}