diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7270 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 4017, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007472445357743321, + "grad_norm": 7.656607564918146, + "learning_rate": 5.999999770634896e-06, + "loss": 0.880859375, + "memory(GiB)": 4.97, + "step": 1, + "train_speed(iter/s)": 0.036844 + }, + { + "epoch": 0.0037362226788716607, + "grad_norm": 1.3240812663129038, + "learning_rate": 5.999994265874156e-06, + "loss": 0.6390380859375, + "memory(GiB)": 5.26, + "step": 5, + "train_speed(iter/s)": 0.138065 + }, + { + "epoch": 0.007472445357743321, + "grad_norm": 0.7284755993919946, + "learning_rate": 5.999977063518543e-06, + "loss": 0.5041015625, + "memory(GiB)": 5.26, + "step": 10, + "train_speed(iter/s)": 0.213363 + }, + { + "epoch": 0.011208668036614982, + "grad_norm": 0.6580943903426335, + "learning_rate": 5.999948392998923e-06, + "loss": 0.40029296875, + "memory(GiB)": 5.26, + "step": 15, + "train_speed(iter/s)": 0.261004 + }, + { + "epoch": 0.014944890715486643, + "grad_norm": 0.7857583426949194, + "learning_rate": 5.999908254424895e-06, + "loss": 0.33583984375, + "memory(GiB)": 6.59, + "step": 20, + "train_speed(iter/s)": 0.293459 + }, + { + "epoch": 0.018681113394358302, + "grad_norm": 0.6568799760311678, + "learning_rate": 5.999856647949899e-06, + "loss": 0.29228515625, + "memory(GiB)": 6.59, + "step": 25, + "train_speed(iter/s)": 0.318359 + }, + { + "epoch": 0.022417336073229963, + "grad_norm": 0.6029155858817358, + "learning_rate": 5.999793573771213e-06, + "loss": 0.316943359375, + "memory(GiB)": 6.59, + "step": 30, + "train_speed(iter/s)": 0.337313 + }, + { + "epoch": 0.026153558752101624, + "grad_norm": 0.5753306542956114, + "learning_rate": 5.999719032129956e-06, + "loss": 0.2943359375, + "memory(GiB)": 6.59, + "step": 35, + "train_speed(iter/s)": 0.348738 + }, + { + "epoch": 0.029889781430973286, + "grad_norm": 0.4408689781290305, + "learning_rate": 5.999633023311079e-06, + "loss": 0.2607421875, + "memory(GiB)": 6.59, + "step": 40, + "train_speed(iter/s)": 0.361231 + }, + { + "epoch": 0.03362600410984495, + "grad_norm": 0.48969938792644213, + "learning_rate": 5.999535547643375e-06, + "loss": 0.274072265625, + "memory(GiB)": 6.59, + "step": 45, + "train_speed(iter/s)": 0.371707 + }, + { + "epoch": 0.037362226788716604, + "grad_norm": 0.44261950794528854, + "learning_rate": 5.999426605499469e-06, + "loss": 0.26669921875, + "memory(GiB)": 6.59, + "step": 50, + "train_speed(iter/s)": 0.379995 + }, + { + "epoch": 0.04109844946758827, + "grad_norm": 0.598637540279772, + "learning_rate": 5.999306197295818e-06, + "loss": 0.231982421875, + "memory(GiB)": 6.59, + "step": 55, + "train_speed(iter/s)": 0.387849 + }, + { + "epoch": 0.04483467214645993, + "grad_norm": 0.590172951597349, + "learning_rate": 5.999174323492712e-06, + "loss": 0.2291015625, + "memory(GiB)": 6.59, + "step": 60, + "train_speed(iter/s)": 0.394021 + }, + { + "epoch": 0.04857089482533159, + "grad_norm": 0.3946331437032985, + "learning_rate": 5.999030984594274e-06, + "loss": 0.2292236328125, + "memory(GiB)": 7.12, + "step": 65, + "train_speed(iter/s)": 0.398312 + }, + { + "epoch": 0.05230711750420325, + "grad_norm": 0.47975474179770955, + "learning_rate": 5.998876181148451e-06, + "loss": 0.244677734375, + "memory(GiB)": 7.12, + "step": 70, + "train_speed(iter/s)": 0.403159 + }, + { + "epoch": 0.05604334018307491, + "grad_norm": 0.42828328321417347, + "learning_rate": 5.99870991374702e-06, + "loss": 0.241357421875, + "memory(GiB)": 7.12, + "step": 75, + "train_speed(iter/s)": 0.40778 + }, + { + "epoch": 0.05977956286194657, + "grad_norm": 0.42785803136464096, + "learning_rate": 5.9985321830255785e-06, + "loss": 0.19462890625, + "memory(GiB)": 7.12, + "step": 80, + "train_speed(iter/s)": 0.411773 + }, + { + "epoch": 0.06351578554081823, + "grad_norm": 0.508984996055907, + "learning_rate": 5.998342989663546e-06, + "loss": 0.2152587890625, + "memory(GiB)": 7.12, + "step": 85, + "train_speed(iter/s)": 0.415122 + }, + { + "epoch": 0.0672520082196899, + "grad_norm": 0.40314104128835676, + "learning_rate": 5.998142334384162e-06, + "loss": 0.2130859375, + "memory(GiB)": 7.12, + "step": 90, + "train_speed(iter/s)": 0.417064 + }, + { + "epoch": 0.07098823089856156, + "grad_norm": 0.4958145558390914, + "learning_rate": 5.997930217954482e-06, + "loss": 0.20390625, + "memory(GiB)": 7.12, + "step": 95, + "train_speed(iter/s)": 0.419957 + }, + { + "epoch": 0.07472445357743321, + "grad_norm": 0.41222740097614996, + "learning_rate": 5.997706641185376e-06, + "loss": 0.2318359375, + "memory(GiB)": 7.68, + "step": 100, + "train_speed(iter/s)": 0.422692 + }, + { + "epoch": 0.07846067625630487, + "grad_norm": 0.3568824010450547, + "learning_rate": 5.997471604931518e-06, + "loss": 0.21181640625, + "memory(GiB)": 7.68, + "step": 105, + "train_speed(iter/s)": 0.425586 + }, + { + "epoch": 0.08219689893517654, + "grad_norm": 0.5279562949874639, + "learning_rate": 5.997225110091396e-06, + "loss": 0.2095947265625, + "memory(GiB)": 7.68, + "step": 110, + "train_speed(iter/s)": 0.428419 + }, + { + "epoch": 0.0859331216140482, + "grad_norm": 0.4919839298671231, + "learning_rate": 5.996967157607298e-06, + "loss": 0.187939453125, + "memory(GiB)": 7.68, + "step": 115, + "train_speed(iter/s)": 0.430818 + }, + { + "epoch": 0.08966934429291985, + "grad_norm": 0.3706866470661083, + "learning_rate": 5.99669774846531e-06, + "loss": 0.2244140625, + "memory(GiB)": 7.68, + "step": 120, + "train_speed(iter/s)": 0.432015 + }, + { + "epoch": 0.09340556697179152, + "grad_norm": 0.39636987044245997, + "learning_rate": 5.9964168836953194e-06, + "loss": 0.206689453125, + "memory(GiB)": 7.68, + "step": 125, + "train_speed(iter/s)": 0.434132 + }, + { + "epoch": 0.09714178965066318, + "grad_norm": 0.4441200958244795, + "learning_rate": 5.996124564371e-06, + "loss": 0.17958984375, + "memory(GiB)": 7.68, + "step": 130, + "train_speed(iter/s)": 0.435878 + }, + { + "epoch": 0.10087801232953485, + "grad_norm": 0.5703220339704642, + "learning_rate": 5.995820791609815e-06, + "loss": 0.1775390625, + "memory(GiB)": 7.68, + "step": 135, + "train_speed(iter/s)": 0.437848 + }, + { + "epoch": 0.1046142350084065, + "grad_norm": 0.4384590937574754, + "learning_rate": 5.995505566573013e-06, + "loss": 0.166064453125, + "memory(GiB)": 7.68, + "step": 140, + "train_speed(iter/s)": 0.438804 + }, + { + "epoch": 0.10835045768727816, + "grad_norm": 0.39708135180108495, + "learning_rate": 5.995178890465622e-06, + "loss": 0.1685302734375, + "memory(GiB)": 7.68, + "step": 145, + "train_speed(iter/s)": 0.440584 + }, + { + "epoch": 0.11208668036614983, + "grad_norm": 0.4525405723559605, + "learning_rate": 5.99484076453644e-06, + "loss": 0.19501953125, + "memory(GiB)": 7.68, + "step": 150, + "train_speed(iter/s)": 0.441918 + }, + { + "epoch": 0.11582290304502148, + "grad_norm": 0.285652037586189, + "learning_rate": 5.99449119007804e-06, + "loss": 0.1964111328125, + "memory(GiB)": 7.68, + "step": 155, + "train_speed(iter/s)": 0.442742 + }, + { + "epoch": 0.11955912572389314, + "grad_norm": 0.37436551218621555, + "learning_rate": 5.994130168426758e-06, + "loss": 0.17265625, + "memory(GiB)": 7.68, + "step": 160, + "train_speed(iter/s)": 0.444294 + }, + { + "epoch": 0.1232953484027648, + "grad_norm": 0.4319611112269015, + "learning_rate": 5.993757700962691e-06, + "loss": 0.1605712890625, + "memory(GiB)": 7.68, + "step": 165, + "train_speed(iter/s)": 0.445095 + }, + { + "epoch": 0.12703157108163646, + "grad_norm": 0.4679153709762584, + "learning_rate": 5.993373789109686e-06, + "loss": 0.165673828125, + "memory(GiB)": 7.68, + "step": 170, + "train_speed(iter/s)": 0.446127 + }, + { + "epoch": 0.13076779376050812, + "grad_norm": 0.371562107209469, + "learning_rate": 5.992978434335345e-06, + "loss": 0.2007080078125, + "memory(GiB)": 7.68, + "step": 175, + "train_speed(iter/s)": 0.447213 + }, + { + "epoch": 0.1345040164393798, + "grad_norm": 0.41362103389091964, + "learning_rate": 5.992571638151009e-06, + "loss": 0.189794921875, + "memory(GiB)": 7.68, + "step": 180, + "train_speed(iter/s)": 0.447752 + }, + { + "epoch": 0.13824023911825145, + "grad_norm": 0.44521680263908975, + "learning_rate": 5.992153402111759e-06, + "loss": 0.1886474609375, + "memory(GiB)": 7.68, + "step": 185, + "train_speed(iter/s)": 0.448523 + }, + { + "epoch": 0.14197646179712312, + "grad_norm": 0.3574382830191666, + "learning_rate": 5.991723727816408e-06, + "loss": 0.2037109375, + "memory(GiB)": 7.68, + "step": 190, + "train_speed(iter/s)": 0.449759 + }, + { + "epoch": 0.14571268447599478, + "grad_norm": 0.384417458292917, + "learning_rate": 5.991282616907493e-06, + "loss": 0.182666015625, + "memory(GiB)": 7.68, + "step": 195, + "train_speed(iter/s)": 0.450424 + }, + { + "epoch": 0.14944890715486642, + "grad_norm": 0.30564363786555343, + "learning_rate": 5.990830071071269e-06, + "loss": 0.1610107421875, + "memory(GiB)": 7.68, + "step": 200, + "train_speed(iter/s)": 0.45118 + }, + { + "epoch": 0.15318512983373808, + "grad_norm": 0.34594889167069637, + "learning_rate": 5.990366092037709e-06, + "loss": 0.1712890625, + "memory(GiB)": 7.68, + "step": 205, + "train_speed(iter/s)": 0.451796 + }, + { + "epoch": 0.15692135251260975, + "grad_norm": 0.26609760325798565, + "learning_rate": 5.9898906815804865e-06, + "loss": 0.1716552734375, + "memory(GiB)": 8.72, + "step": 210, + "train_speed(iter/s)": 0.451737 + }, + { + "epoch": 0.1606575751914814, + "grad_norm": 0.4326902726320289, + "learning_rate": 5.989403841516979e-06, + "loss": 0.1868408203125, + "memory(GiB)": 9.45, + "step": 215, + "train_speed(iter/s)": 0.452203 + }, + { + "epoch": 0.16439379787035308, + "grad_norm": 0.31305519468747833, + "learning_rate": 5.9889055737082535e-06, + "loss": 0.1808837890625, + "memory(GiB)": 9.45, + "step": 220, + "train_speed(iter/s)": 0.452465 + }, + { + "epoch": 0.16813002054922474, + "grad_norm": 0.337929954898332, + "learning_rate": 5.988395880059065e-06, + "loss": 0.1795166015625, + "memory(GiB)": 9.45, + "step": 225, + "train_speed(iter/s)": 0.452759 + }, + { + "epoch": 0.1718662432280964, + "grad_norm": 0.39047122531072104, + "learning_rate": 5.987874762517843e-06, + "loss": 0.169384765625, + "memory(GiB)": 9.45, + "step": 230, + "train_speed(iter/s)": 0.453624 + }, + { + "epoch": 0.17560246590696804, + "grad_norm": 0.29442955032080625, + "learning_rate": 5.987342223076692e-06, + "loss": 0.15751953125, + "memory(GiB)": 9.45, + "step": 235, + "train_speed(iter/s)": 0.453993 + }, + { + "epoch": 0.1793386885858397, + "grad_norm": 0.4050248335175831, + "learning_rate": 5.986798263771375e-06, + "loss": 0.1673095703125, + "memory(GiB)": 9.45, + "step": 240, + "train_speed(iter/s)": 0.454589 + }, + { + "epoch": 0.18307491126471137, + "grad_norm": 0.39189087307596043, + "learning_rate": 5.9862428866813155e-06, + "loss": 0.17457275390625, + "memory(GiB)": 9.45, + "step": 245, + "train_speed(iter/s)": 0.455097 + }, + { + "epoch": 0.18681113394358304, + "grad_norm": 0.2995268823777092, + "learning_rate": 5.985676093929579e-06, + "loss": 0.1733154296875, + "memory(GiB)": 9.45, + "step": 250, + "train_speed(iter/s)": 0.455559 + }, + { + "epoch": 0.1905473566224547, + "grad_norm": 0.35042188317088824, + "learning_rate": 5.985097887682876e-06, + "loss": 0.18154296875, + "memory(GiB)": 9.45, + "step": 255, + "train_speed(iter/s)": 0.456072 + }, + { + "epoch": 0.19428357930132636, + "grad_norm": 0.3402952343617486, + "learning_rate": 5.984508270151542e-06, + "loss": 0.1767578125, + "memory(GiB)": 9.45, + "step": 260, + "train_speed(iter/s)": 0.456723 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.2789400887911893, + "learning_rate": 5.983907243589537e-06, + "loss": 0.16141357421875, + "memory(GiB)": 9.45, + "step": 265, + "train_speed(iter/s)": 0.456772 + }, + { + "epoch": 0.2017560246590697, + "grad_norm": 0.33400251489865246, + "learning_rate": 5.983294810294439e-06, + "loss": 0.158544921875, + "memory(GiB)": 9.45, + "step": 270, + "train_speed(iter/s)": 0.457152 + }, + { + "epoch": 0.20549224733794133, + "grad_norm": 0.4225006545766808, + "learning_rate": 5.982670972607426e-06, + "loss": 0.1498046875, + "memory(GiB)": 9.45, + "step": 275, + "train_speed(iter/s)": 0.457743 + }, + { + "epoch": 0.209228470016813, + "grad_norm": 0.43474965051646863, + "learning_rate": 5.982035732913273e-06, + "loss": 0.1770263671875, + "memory(GiB)": 9.45, + "step": 280, + "train_speed(iter/s)": 0.457807 + }, + { + "epoch": 0.21296469269568466, + "grad_norm": 0.36173927443406817, + "learning_rate": 5.981389093640344e-06, + "loss": 0.1758056640625, + "memory(GiB)": 9.45, + "step": 285, + "train_speed(iter/s)": 0.458088 + }, + { + "epoch": 0.21670091537455632, + "grad_norm": 0.25308312315237813, + "learning_rate": 5.980731057260579e-06, + "loss": 0.173388671875, + "memory(GiB)": 9.45, + "step": 290, + "train_speed(iter/s)": 0.457498 + }, + { + "epoch": 0.220437138053428, + "grad_norm": 0.29470555914634394, + "learning_rate": 5.980061626289489e-06, + "loss": 0.15411376953125, + "memory(GiB)": 9.45, + "step": 295, + "train_speed(iter/s)": 0.457387 + }, + { + "epoch": 0.22417336073229965, + "grad_norm": 0.35624287307171026, + "learning_rate": 5.9793808032861385e-06, + "loss": 0.1614501953125, + "memory(GiB)": 9.45, + "step": 300, + "train_speed(iter/s)": 0.457895 + }, + { + "epoch": 0.22790958341117132, + "grad_norm": 0.2504855752959934, + "learning_rate": 5.9786885908531455e-06, + "loss": 0.15517578125, + "memory(GiB)": 9.45, + "step": 305, + "train_speed(iter/s)": 0.458265 + }, + { + "epoch": 0.23164580609004295, + "grad_norm": 0.33904923734016645, + "learning_rate": 5.977984991636665e-06, + "loss": 0.1745361328125, + "memory(GiB)": 9.45, + "step": 310, + "train_speed(iter/s)": 0.458658 + }, + { + "epoch": 0.23538202876891462, + "grad_norm": 0.3551555191841338, + "learning_rate": 5.977270008326383e-06, + "loss": 0.157275390625, + "memory(GiB)": 9.45, + "step": 315, + "train_speed(iter/s)": 0.459103 + }, + { + "epoch": 0.23911825144778628, + "grad_norm": 0.4587798002581139, + "learning_rate": 5.9765436436555e-06, + "loss": 0.1659423828125, + "memory(GiB)": 9.45, + "step": 320, + "train_speed(iter/s)": 0.459434 + }, + { + "epoch": 0.24285447412665795, + "grad_norm": 0.3505254508815674, + "learning_rate": 5.975805900400728e-06, + "loss": 0.1699951171875, + "memory(GiB)": 9.45, + "step": 325, + "train_speed(iter/s)": 0.459396 + }, + { + "epoch": 0.2465906968055296, + "grad_norm": 0.3234531871867349, + "learning_rate": 5.9750567813822766e-06, + "loss": 0.15689697265625, + "memory(GiB)": 9.45, + "step": 330, + "train_speed(iter/s)": 0.459815 + }, + { + "epoch": 0.2503269194844013, + "grad_norm": 0.2847235822528394, + "learning_rate": 5.974296289463838e-06, + "loss": 0.1782470703125, + "memory(GiB)": 9.45, + "step": 335, + "train_speed(iter/s)": 0.460005 + }, + { + "epoch": 0.2540631421632729, + "grad_norm": 0.19887321720781595, + "learning_rate": 5.973524427552586e-06, + "loss": 0.1454345703125, + "memory(GiB)": 9.45, + "step": 340, + "train_speed(iter/s)": 0.46045 + }, + { + "epoch": 0.2577993648421446, + "grad_norm": 0.35609582881164253, + "learning_rate": 5.972741198599155e-06, + "loss": 0.15576171875, + "memory(GiB)": 9.45, + "step": 345, + "train_speed(iter/s)": 0.460808 + }, + { + "epoch": 0.26153558752101624, + "grad_norm": 0.3260335257305967, + "learning_rate": 5.971946605597634e-06, + "loss": 0.1542236328125, + "memory(GiB)": 9.45, + "step": 350, + "train_speed(iter/s)": 0.461081 + }, + { + "epoch": 0.26527181019988794, + "grad_norm": 0.3000956082136632, + "learning_rate": 5.9711406515855535e-06, + "loss": 0.1672119140625, + "memory(GiB)": 9.45, + "step": 355, + "train_speed(iter/s)": 0.461632 + }, + { + "epoch": 0.2690080328787596, + "grad_norm": 0.5003356531721083, + "learning_rate": 5.970323339643875e-06, + "loss": 0.141943359375, + "memory(GiB)": 9.45, + "step": 360, + "train_speed(iter/s)": 0.46182 + }, + { + "epoch": 0.2727442555576312, + "grad_norm": 0.3898278569959764, + "learning_rate": 5.969494672896979e-06, + "loss": 0.1525146484375, + "memory(GiB)": 9.45, + "step": 365, + "train_speed(iter/s)": 0.461906 + }, + { + "epoch": 0.2764804782365029, + "grad_norm": 0.3453310818742678, + "learning_rate": 5.96865465451265e-06, + "loss": 0.178564453125, + "memory(GiB)": 9.45, + "step": 370, + "train_speed(iter/s)": 0.46223 + }, + { + "epoch": 0.28021670091537454, + "grad_norm": 0.38009861005791173, + "learning_rate": 5.9678032877020705e-06, + "loss": 0.1583251953125, + "memory(GiB)": 9.45, + "step": 375, + "train_speed(iter/s)": 0.46236 + }, + { + "epoch": 0.28395292359424623, + "grad_norm": 0.3337227144486021, + "learning_rate": 5.966940575719802e-06, + "loss": 0.164697265625, + "memory(GiB)": 9.45, + "step": 380, + "train_speed(iter/s)": 0.462583 + }, + { + "epoch": 0.28768914627311787, + "grad_norm": 0.34344615999699735, + "learning_rate": 5.966066521863778e-06, + "loss": 0.155126953125, + "memory(GiB)": 9.45, + "step": 385, + "train_speed(iter/s)": 0.462936 + }, + { + "epoch": 0.29142536895198956, + "grad_norm": 0.3782402092083932, + "learning_rate": 5.9651811294752885e-06, + "loss": 0.161767578125, + "memory(GiB)": 9.45, + "step": 390, + "train_speed(iter/s)": 0.463287 + }, + { + "epoch": 0.2951615916308612, + "grad_norm": 0.3820929493431576, + "learning_rate": 5.964284401938968e-06, + "loss": 0.1547119140625, + "memory(GiB)": 9.45, + "step": 395, + "train_speed(iter/s)": 0.463312 + }, + { + "epoch": 0.29889781430973283, + "grad_norm": 0.37254277787709306, + "learning_rate": 5.96337634268278e-06, + "loss": 0.1453125, + "memory(GiB)": 9.45, + "step": 400, + "train_speed(iter/s)": 0.463552 + }, + { + "epoch": 0.3026340369886045, + "grad_norm": 0.3771270351369902, + "learning_rate": 5.9624569551780115e-06, + "loss": 0.1693603515625, + "memory(GiB)": 9.45, + "step": 405, + "train_speed(iter/s)": 0.463665 + }, + { + "epoch": 0.30637025966747616, + "grad_norm": 0.3169810724128572, + "learning_rate": 5.961526242939251e-06, + "loss": 0.143310546875, + "memory(GiB)": 9.45, + "step": 410, + "train_speed(iter/s)": 0.463774 + }, + { + "epoch": 0.31010648234634786, + "grad_norm": 0.39276892682897285, + "learning_rate": 5.960584209524377e-06, + "loss": 0.12626953125, + "memory(GiB)": 9.45, + "step": 415, + "train_speed(iter/s)": 0.463772 + }, + { + "epoch": 0.3138427050252195, + "grad_norm": 0.30248041554648486, + "learning_rate": 5.95963085853455e-06, + "loss": 0.1291259765625, + "memory(GiB)": 9.45, + "step": 420, + "train_speed(iter/s)": 0.464062 + }, + { + "epoch": 0.3175789277040912, + "grad_norm": 0.31139734130517427, + "learning_rate": 5.958666193614194e-06, + "loss": 0.1403564453125, + "memory(GiB)": 9.45, + "step": 425, + "train_speed(iter/s)": 0.46431 + }, + { + "epoch": 0.3213151503829628, + "grad_norm": 0.29672071282145907, + "learning_rate": 5.95769021845098e-06, + "loss": 0.1619140625, + "memory(GiB)": 9.45, + "step": 430, + "train_speed(iter/s)": 0.464574 + }, + { + "epoch": 0.32505137306183446, + "grad_norm": 0.3245553447126267, + "learning_rate": 5.956702936775819e-06, + "loss": 0.149169921875, + "memory(GiB)": 9.45, + "step": 435, + "train_speed(iter/s)": 0.464656 + }, + { + "epoch": 0.32878759574070615, + "grad_norm": 0.37942479273965346, + "learning_rate": 5.955704352362843e-06, + "loss": 0.1540283203125, + "memory(GiB)": 9.45, + "step": 440, + "train_speed(iter/s)": 0.464866 + }, + { + "epoch": 0.3325238184195778, + "grad_norm": 0.4722961848658832, + "learning_rate": 5.954694469029391e-06, + "loss": 0.146875, + "memory(GiB)": 9.45, + "step": 445, + "train_speed(iter/s)": 0.46511 + }, + { + "epoch": 0.3362600410984495, + "grad_norm": 0.32208483256209325, + "learning_rate": 5.9536732906359936e-06, + "loss": 0.1362060546875, + "memory(GiB)": 9.45, + "step": 450, + "train_speed(iter/s)": 0.465444 + }, + { + "epoch": 0.3399962637773211, + "grad_norm": 0.39468565724302457, + "learning_rate": 5.952640821086362e-06, + "loss": 0.14046630859375, + "memory(GiB)": 9.45, + "step": 455, + "train_speed(iter/s)": 0.465502 + }, + { + "epoch": 0.3437324864561928, + "grad_norm": 0.2923449968980904, + "learning_rate": 5.951597064327371e-06, + "loss": 0.14259033203125, + "memory(GiB)": 9.45, + "step": 460, + "train_speed(iter/s)": 0.465768 + }, + { + "epoch": 0.34746870913506445, + "grad_norm": 0.2526312937320368, + "learning_rate": 5.95054202434904e-06, + "loss": 0.154150390625, + "memory(GiB)": 9.45, + "step": 465, + "train_speed(iter/s)": 0.465477 + }, + { + "epoch": 0.3512049318139361, + "grad_norm": 0.25397429668673016, + "learning_rate": 5.949475705184526e-06, + "loss": 0.145068359375, + "memory(GiB)": 9.45, + "step": 470, + "train_speed(iter/s)": 0.465793 + }, + { + "epoch": 0.3549411544928078, + "grad_norm": 0.2889099964297901, + "learning_rate": 5.948398110910099e-06, + "loss": 0.14326171875, + "memory(GiB)": 9.45, + "step": 475, + "train_speed(iter/s)": 0.465718 + }, + { + "epoch": 0.3586773771716794, + "grad_norm": 0.30650880945183995, + "learning_rate": 5.947309245645134e-06, + "loss": 0.17294921875, + "memory(GiB)": 9.45, + "step": 480, + "train_speed(iter/s)": 0.465738 + }, + { + "epoch": 0.3624135998505511, + "grad_norm": 0.23874814446464385, + "learning_rate": 5.946209113552092e-06, + "loss": 0.1577880859375, + "memory(GiB)": 9.45, + "step": 485, + "train_speed(iter/s)": 0.465905 + }, + { + "epoch": 0.36614982252942274, + "grad_norm": 0.26737529230375395, + "learning_rate": 5.945097718836503e-06, + "loss": 0.13236083984375, + "memory(GiB)": 9.45, + "step": 490, + "train_speed(iter/s)": 0.466159 + }, + { + "epoch": 0.36988604520829443, + "grad_norm": 0.34648783089300494, + "learning_rate": 5.9439750657469524e-06, + "loss": 0.166064453125, + "memory(GiB)": 9.45, + "step": 495, + "train_speed(iter/s)": 0.466248 + }, + { + "epoch": 0.37362226788716607, + "grad_norm": 0.3711374814276351, + "learning_rate": 5.942841158575061e-06, + "loss": 0.15181884765625, + "memory(GiB)": 9.45, + "step": 500, + "train_speed(iter/s)": 0.46631 + }, + { + "epoch": 0.37735849056603776, + "grad_norm": 0.26122017355859195, + "learning_rate": 5.941696001655475e-06, + "loss": 0.1420654296875, + "memory(GiB)": 9.45, + "step": 505, + "train_speed(iter/s)": 0.466356 + }, + { + "epoch": 0.3810947132449094, + "grad_norm": 0.30129945797313573, + "learning_rate": 5.940539599365843e-06, + "loss": 0.15704345703125, + "memory(GiB)": 9.45, + "step": 510, + "train_speed(iter/s)": 0.466088 + }, + { + "epoch": 0.38483093592378104, + "grad_norm": 0.27115019497623694, + "learning_rate": 5.939371956126803e-06, + "loss": 0.1350341796875, + "memory(GiB)": 9.45, + "step": 515, + "train_speed(iter/s)": 0.466144 + }, + { + "epoch": 0.38856715860265273, + "grad_norm": 0.3323988811121097, + "learning_rate": 5.938193076401964e-06, + "loss": 0.149072265625, + "memory(GiB)": 9.45, + "step": 520, + "train_speed(iter/s)": 0.466125 + }, + { + "epoch": 0.39230338128152437, + "grad_norm": 0.36151939711979136, + "learning_rate": 5.937002964697888e-06, + "loss": 0.13743896484375, + "memory(GiB)": 9.45, + "step": 525, + "train_speed(iter/s)": 0.466282 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.28979409508186516, + "learning_rate": 5.935801625564074e-06, + "loss": 0.15244140625, + "memory(GiB)": 9.45, + "step": 530, + "train_speed(iter/s)": 0.466375 + }, + { + "epoch": 0.3997758266392677, + "grad_norm": 0.3296511480431298, + "learning_rate": 5.934589063592946e-06, + "loss": 0.15579833984375, + "memory(GiB)": 9.45, + "step": 535, + "train_speed(iter/s)": 0.466122 + }, + { + "epoch": 0.4035120493181394, + "grad_norm": 0.20344435673525696, + "learning_rate": 5.933365283419823e-06, + "loss": 0.151953125, + "memory(GiB)": 9.45, + "step": 540, + "train_speed(iter/s)": 0.466272 + }, + { + "epoch": 0.407248271997011, + "grad_norm": 0.2633323321431179, + "learning_rate": 5.932130289722912e-06, + "loss": 0.15283203125, + "memory(GiB)": 9.45, + "step": 545, + "train_speed(iter/s)": 0.466106 + }, + { + "epoch": 0.41098449467588266, + "grad_norm": 0.3221514229815824, + "learning_rate": 5.9308840872232845e-06, + "loss": 0.16361083984375, + "memory(GiB)": 9.45, + "step": 550, + "train_speed(iter/s)": 0.466076 + }, + { + "epoch": 0.41472071735475435, + "grad_norm": 0.2957653300069589, + "learning_rate": 5.929626680684864e-06, + "loss": 0.1420654296875, + "memory(GiB)": 9.45, + "step": 555, + "train_speed(iter/s)": 0.466246 + }, + { + "epoch": 0.418456940033626, + "grad_norm": 0.27433097372254944, + "learning_rate": 5.928358074914402e-06, + "loss": 0.133544921875, + "memory(GiB)": 9.45, + "step": 560, + "train_speed(iter/s)": 0.466388 + }, + { + "epoch": 0.4221931627124977, + "grad_norm": 0.30811252206856754, + "learning_rate": 5.927078274761459e-06, + "loss": 0.13226318359375, + "memory(GiB)": 9.45, + "step": 565, + "train_speed(iter/s)": 0.46622 + }, + { + "epoch": 0.4259293853913693, + "grad_norm": 0.4343588364932629, + "learning_rate": 5.925787285118395e-06, + "loss": 0.132061767578125, + "memory(GiB)": 9.45, + "step": 570, + "train_speed(iter/s)": 0.466434 + }, + { + "epoch": 0.429665608070241, + "grad_norm": 0.28429865872011917, + "learning_rate": 5.9244851109203404e-06, + "loss": 0.1482177734375, + "memory(GiB)": 9.45, + "step": 575, + "train_speed(iter/s)": 0.466569 + }, + { + "epoch": 0.43340183074911265, + "grad_norm": 0.27884340279867387, + "learning_rate": 5.923171757145182e-06, + "loss": 0.14344482421875, + "memory(GiB)": 9.45, + "step": 580, + "train_speed(iter/s)": 0.46672 + }, + { + "epoch": 0.4371380534279843, + "grad_norm": 0.32622048115497765, + "learning_rate": 5.921847228813543e-06, + "loss": 0.146728515625, + "memory(GiB)": 9.45, + "step": 585, + "train_speed(iter/s)": 0.466879 + }, + { + "epoch": 0.440874276106856, + "grad_norm": 0.43240131198466947, + "learning_rate": 5.9205115309887666e-06, + "loss": 0.1595458984375, + "memory(GiB)": 9.45, + "step": 590, + "train_speed(iter/s)": 0.466944 + }, + { + "epoch": 0.4446104987857276, + "grad_norm": 0.31277968889979835, + "learning_rate": 5.919164668776891e-06, + "loss": 0.1449462890625, + "memory(GiB)": 9.45, + "step": 595, + "train_speed(iter/s)": 0.467155 + }, + { + "epoch": 0.4483467214645993, + "grad_norm": 0.2719906915702348, + "learning_rate": 5.917806647326636e-06, + "loss": 0.1359130859375, + "memory(GiB)": 9.45, + "step": 600, + "train_speed(iter/s)": 0.467399 + }, + { + "epoch": 0.45208294414347094, + "grad_norm": 0.2958357180656749, + "learning_rate": 5.9164374718293764e-06, + "loss": 0.1510498046875, + "memory(GiB)": 9.45, + "step": 605, + "train_speed(iter/s)": 0.467309 + }, + { + "epoch": 0.45581916682234264, + "grad_norm": 0.323840801916129, + "learning_rate": 5.91505714751913e-06, + "loss": 0.1556884765625, + "memory(GiB)": 9.45, + "step": 610, + "train_speed(iter/s)": 0.467461 + }, + { + "epoch": 0.4595553895012143, + "grad_norm": 0.25540153277044697, + "learning_rate": 5.913665679672533e-06, + "loss": 0.1478271484375, + "memory(GiB)": 9.45, + "step": 615, + "train_speed(iter/s)": 0.467614 + }, + { + "epoch": 0.4632916121800859, + "grad_norm": 0.3216539204738006, + "learning_rate": 5.912263073608819e-06, + "loss": 0.14404296875, + "memory(GiB)": 9.45, + "step": 620, + "train_speed(iter/s)": 0.46759 + }, + { + "epoch": 0.4670278348589576, + "grad_norm": 0.3564645954321089, + "learning_rate": 5.9108493346898014e-06, + "loss": 0.1556640625, + "memory(GiB)": 9.45, + "step": 625, + "train_speed(iter/s)": 0.467777 + }, + { + "epoch": 0.47076405753782924, + "grad_norm": 0.3234498866245867, + "learning_rate": 5.9094244683198514e-06, + "loss": 0.130474853515625, + "memory(GiB)": 9.45, + "step": 630, + "train_speed(iter/s)": 0.467657 + }, + { + "epoch": 0.47450028021670093, + "grad_norm": 0.27930245162799133, + "learning_rate": 5.907988479945878e-06, + "loss": 0.1467529296875, + "memory(GiB)": 9.45, + "step": 635, + "train_speed(iter/s)": 0.467501 + }, + { + "epoch": 0.47823650289557257, + "grad_norm": 0.2831117651967566, + "learning_rate": 5.906541375057305e-06, + "loss": 0.135107421875, + "memory(GiB)": 9.45, + "step": 640, + "train_speed(iter/s)": 0.467611 + }, + { + "epoch": 0.48197272557444426, + "grad_norm": 0.27115323391313917, + "learning_rate": 5.905083159186056e-06, + "loss": 0.128759765625, + "memory(GiB)": 9.45, + "step": 645, + "train_speed(iter/s)": 0.46749 + }, + { + "epoch": 0.4857089482533159, + "grad_norm": 0.32564068860731793, + "learning_rate": 5.903613837906525e-06, + "loss": 0.1319580078125, + "memory(GiB)": 9.45, + "step": 650, + "train_speed(iter/s)": 0.467686 + }, + { + "epoch": 0.48944517093218753, + "grad_norm": 0.2387399044673888, + "learning_rate": 5.902133416835561e-06, + "loss": 0.1338134765625, + "memory(GiB)": 9.45, + "step": 655, + "train_speed(iter/s)": 0.467441 + }, + { + "epoch": 0.4931813936110592, + "grad_norm": 0.24117814539801136, + "learning_rate": 5.900641901632444e-06, + "loss": 0.1324462890625, + "memory(GiB)": 9.45, + "step": 660, + "train_speed(iter/s)": 0.46725 + }, + { + "epoch": 0.49691761628993086, + "grad_norm": 0.2806056999325975, + "learning_rate": 5.899139297998865e-06, + "loss": 0.14583740234375, + "memory(GiB)": 9.45, + "step": 665, + "train_speed(iter/s)": 0.467303 + }, + { + "epoch": 0.5006538389688026, + "grad_norm": 0.3602595784462823, + "learning_rate": 5.897625611678904e-06, + "loss": 0.16168212890625, + "memory(GiB)": 9.45, + "step": 670, + "train_speed(iter/s)": 0.467465 + }, + { + "epoch": 0.5043900616476742, + "grad_norm": 0.2892704455949438, + "learning_rate": 5.896100848459004e-06, + "loss": 0.14654541015625, + "memory(GiB)": 9.45, + "step": 675, + "train_speed(iter/s)": 0.467417 + }, + { + "epoch": 0.5081262843265458, + "grad_norm": 0.30864116070274367, + "learning_rate": 5.894565014167955e-06, + "loss": 0.1387451171875, + "memory(GiB)": 9.45, + "step": 680, + "train_speed(iter/s)": 0.467388 + }, + { + "epoch": 0.5118625070054176, + "grad_norm": 0.23741861823114724, + "learning_rate": 5.89301811467687e-06, + "loss": 0.14443359375, + "memory(GiB)": 9.45, + "step": 685, + "train_speed(iter/s)": 0.467619 + }, + { + "epoch": 0.5155987296842892, + "grad_norm": 0.3704119854549676, + "learning_rate": 5.891460155899159e-06, + "loss": 0.1429931640625, + "memory(GiB)": 9.45, + "step": 690, + "train_speed(iter/s)": 0.467553 + }, + { + "epoch": 0.5193349523631609, + "grad_norm": 0.3371956586173727, + "learning_rate": 5.88989114379051e-06, + "loss": 0.122119140625, + "memory(GiB)": 9.45, + "step": 695, + "train_speed(iter/s)": 0.467568 + }, + { + "epoch": 0.5230711750420325, + "grad_norm": 0.23061580193263015, + "learning_rate": 5.888311084348865e-06, + "loss": 0.1429931640625, + "memory(GiB)": 9.45, + "step": 700, + "train_speed(iter/s)": 0.467617 + }, + { + "epoch": 0.5268073977209041, + "grad_norm": 0.2357495758457104, + "learning_rate": 5.886719983614396e-06, + "loss": 0.1326904296875, + "memory(GiB)": 9.45, + "step": 705, + "train_speed(iter/s)": 0.467672 + }, + { + "epoch": 0.5305436203997759, + "grad_norm": 0.20506003694806352, + "learning_rate": 5.885117847669485e-06, + "loss": 0.1441650390625, + "memory(GiB)": 9.45, + "step": 710, + "train_speed(iter/s)": 0.467709 + }, + { + "epoch": 0.5342798430786475, + "grad_norm": 0.3366909550119504, + "learning_rate": 5.883504682638699e-06, + "loss": 0.1407958984375, + "memory(GiB)": 9.45, + "step": 715, + "train_speed(iter/s)": 0.467852 + }, + { + "epoch": 0.5380160657575191, + "grad_norm": 0.2909847266005631, + "learning_rate": 5.881880494688763e-06, + "loss": 0.1455322265625, + "memory(GiB)": 9.45, + "step": 720, + "train_speed(iter/s)": 0.467893 + }, + { + "epoch": 0.5417522884363908, + "grad_norm": 0.2386052769018931, + "learning_rate": 5.880245290028545e-06, + "loss": 0.140478515625, + "memory(GiB)": 9.45, + "step": 725, + "train_speed(iter/s)": 0.467751 + }, + { + "epoch": 0.5454885111152624, + "grad_norm": 0.2645707847366404, + "learning_rate": 5.878599074909023e-06, + "loss": 0.1463134765625, + "memory(GiB)": 9.45, + "step": 730, + "train_speed(iter/s)": 0.467878 + }, + { + "epoch": 0.5492247337941342, + "grad_norm": 0.31563029908522805, + "learning_rate": 5.876941855623268e-06, + "loss": 0.1530029296875, + "memory(GiB)": 9.45, + "step": 735, + "train_speed(iter/s)": 0.467974 + }, + { + "epoch": 0.5529609564730058, + "grad_norm": 0.26319413448836815, + "learning_rate": 5.8752736385064145e-06, + "loss": 0.12587890625, + "memory(GiB)": 9.45, + "step": 740, + "train_speed(iter/s)": 0.467961 + }, + { + "epoch": 0.5566971791518774, + "grad_norm": 0.3858440978882179, + "learning_rate": 5.873594429935642e-06, + "loss": 0.1377197265625, + "memory(GiB)": 9.45, + "step": 745, + "train_speed(iter/s)": 0.468054 + }, + { + "epoch": 0.5604334018307491, + "grad_norm": 0.20276433188895907, + "learning_rate": 5.871904236330144e-06, + "loss": 0.12718505859375, + "memory(GiB)": 9.45, + "step": 750, + "train_speed(iter/s)": 0.468081 + }, + { + "epoch": 0.5641696245096208, + "grad_norm": 0.22243564217533868, + "learning_rate": 5.870203064151111e-06, + "loss": 0.1421630859375, + "memory(GiB)": 9.45, + "step": 755, + "train_speed(iter/s)": 0.468228 + }, + { + "epoch": 0.5679058471884925, + "grad_norm": 0.2924186985340597, + "learning_rate": 5.8684909199017e-06, + "loss": 0.145458984375, + "memory(GiB)": 9.45, + "step": 760, + "train_speed(iter/s)": 0.468279 + }, + { + "epoch": 0.5716420698673641, + "grad_norm": 0.22056169438669584, + "learning_rate": 5.866767810127009e-06, + "loss": 0.128564453125, + "memory(GiB)": 9.45, + "step": 765, + "train_speed(iter/s)": 0.468225 + }, + { + "epoch": 0.5753782925462357, + "grad_norm": 0.2740803532217515, + "learning_rate": 5.86503374141406e-06, + "loss": 0.1392822265625, + "memory(GiB)": 9.45, + "step": 770, + "train_speed(iter/s)": 0.468416 + }, + { + "epoch": 0.5791145152251074, + "grad_norm": 0.3606780255005757, + "learning_rate": 5.863288720391763e-06, + "loss": 0.155615234375, + "memory(GiB)": 9.45, + "step": 775, + "train_speed(iter/s)": 0.468411 + }, + { + "epoch": 0.5828507379039791, + "grad_norm": 0.21221894282841508, + "learning_rate": 5.861532753730898e-06, + "loss": 0.1374755859375, + "memory(GiB)": 9.45, + "step": 780, + "train_speed(iter/s)": 0.468088 + }, + { + "epoch": 0.5865869605828508, + "grad_norm": 0.2660755841560947, + "learning_rate": 5.859765848144089e-06, + "loss": 0.13995361328125, + "memory(GiB)": 9.45, + "step": 785, + "train_speed(iter/s)": 0.467999 + }, + { + "epoch": 0.5903231832617224, + "grad_norm": 0.22600558934152162, + "learning_rate": 5.857988010385774e-06, + "loss": 0.128515625, + "memory(GiB)": 9.45, + "step": 790, + "train_speed(iter/s)": 0.468097 + }, + { + "epoch": 0.594059405940594, + "grad_norm": 0.24882069836354315, + "learning_rate": 5.856199247252184e-06, + "loss": 0.1505126953125, + "memory(GiB)": 9.45, + "step": 795, + "train_speed(iter/s)": 0.468257 + }, + { + "epoch": 0.5977956286194657, + "grad_norm": 0.2541011112429318, + "learning_rate": 5.854399565581314e-06, + "loss": 0.13427734375, + "memory(GiB)": 10.57, + "step": 800, + "train_speed(iter/s)": 0.468265 + }, + { + "epoch": 0.6015318512983374, + "grad_norm": 0.2872004300607469, + "learning_rate": 5.8525889722528985e-06, + "loss": 0.1360595703125, + "memory(GiB)": 10.57, + "step": 805, + "train_speed(iter/s)": 0.468185 + }, + { + "epoch": 0.605268073977209, + "grad_norm": 0.323332001257219, + "learning_rate": 5.850767474188383e-06, + "loss": 0.1507080078125, + "memory(GiB)": 10.57, + "step": 810, + "train_speed(iter/s)": 0.468123 + }, + { + "epoch": 0.6090042966560807, + "grad_norm": 0.20912945195997415, + "learning_rate": 5.8489350783509025e-06, + "loss": 0.13023681640625, + "memory(GiB)": 10.57, + "step": 815, + "train_speed(iter/s)": 0.46818 + }, + { + "epoch": 0.6127405193349523, + "grad_norm": 0.2939854808777276, + "learning_rate": 5.847091791745247e-06, + "loss": 0.14840087890625, + "memory(GiB)": 10.57, + "step": 820, + "train_speed(iter/s)": 0.468357 + }, + { + "epoch": 0.6164767420138241, + "grad_norm": 0.24988955332399215, + "learning_rate": 5.8452376214178426e-06, + "loss": 0.12974853515625, + "memory(GiB)": 10.57, + "step": 825, + "train_speed(iter/s)": 0.468499 + }, + { + "epoch": 0.6202129646926957, + "grad_norm": 0.21781664947497836, + "learning_rate": 5.84337257445672e-06, + "loss": 0.1396484375, + "memory(GiB)": 10.57, + "step": 830, + "train_speed(iter/s)": 0.468441 + }, + { + "epoch": 0.6239491873715673, + "grad_norm": 0.25082480295038034, + "learning_rate": 5.841496657991487e-06, + "loss": 0.135546875, + "memory(GiB)": 10.57, + "step": 835, + "train_speed(iter/s)": 0.468446 + }, + { + "epoch": 0.627685410050439, + "grad_norm": 0.2686863733874229, + "learning_rate": 5.8396098791933055e-06, + "loss": 0.11251220703125, + "memory(GiB)": 10.57, + "step": 840, + "train_speed(iter/s)": 0.46852 + }, + { + "epoch": 0.6314216327293106, + "grad_norm": 0.2710369755059897, + "learning_rate": 5.837712245274861e-06, + "loss": 0.11365966796875, + "memory(GiB)": 10.57, + "step": 845, + "train_speed(iter/s)": 0.468573 + }, + { + "epoch": 0.6351578554081824, + "grad_norm": 0.34487595194525544, + "learning_rate": 5.835803763490333e-06, + "loss": 0.1312255859375, + "memory(GiB)": 10.57, + "step": 850, + "train_speed(iter/s)": 0.468679 + }, + { + "epoch": 0.638894078087054, + "grad_norm": 0.2557913641225529, + "learning_rate": 5.833884441135373e-06, + "loss": 0.150701904296875, + "memory(GiB)": 10.57, + "step": 855, + "train_speed(iter/s)": 0.468713 + }, + { + "epoch": 0.6426303007659256, + "grad_norm": 0.2492246452188681, + "learning_rate": 5.831954285547071e-06, + "loss": 0.1027587890625, + "memory(GiB)": 10.57, + "step": 860, + "train_speed(iter/s)": 0.468697 + }, + { + "epoch": 0.6463665234447973, + "grad_norm": 0.20962556058124304, + "learning_rate": 5.830013304103929e-06, + "loss": 0.13544921875, + "memory(GiB)": 10.57, + "step": 865, + "train_speed(iter/s)": 0.468784 + }, + { + "epoch": 0.6501027461236689, + "grad_norm": 0.26313981696050626, + "learning_rate": 5.828061504225837e-06, + "loss": 0.13037109375, + "memory(GiB)": 10.57, + "step": 870, + "train_speed(iter/s)": 0.468837 + }, + { + "epoch": 0.6538389688025407, + "grad_norm": 0.3459843916573515, + "learning_rate": 5.826098893374037e-06, + "loss": 0.1420654296875, + "memory(GiB)": 10.57, + "step": 875, + "train_speed(iter/s)": 0.468901 + }, + { + "epoch": 0.6575751914814123, + "grad_norm": 0.27792291827470583, + "learning_rate": 5.824125479051103e-06, + "loss": 0.13037109375, + "memory(GiB)": 10.57, + "step": 880, + "train_speed(iter/s)": 0.468895 + }, + { + "epoch": 0.6613114141602839, + "grad_norm": 0.2607823555214958, + "learning_rate": 5.8221412688009034e-06, + "loss": 0.135107421875, + "memory(GiB)": 10.57, + "step": 885, + "train_speed(iter/s)": 0.468908 + }, + { + "epoch": 0.6650476368391556, + "grad_norm": 0.2501794810871831, + "learning_rate": 5.820146270208581e-06, + "loss": 0.12391357421875, + "memory(GiB)": 10.57, + "step": 890, + "train_speed(iter/s)": 0.468941 + }, + { + "epoch": 0.6687838595180273, + "grad_norm": 0.2564710519025842, + "learning_rate": 5.8181404909005175e-06, + "loss": 0.14501953125, + "memory(GiB)": 10.57, + "step": 895, + "train_speed(iter/s)": 0.46911 + }, + { + "epoch": 0.672520082196899, + "grad_norm": 0.2715014489679807, + "learning_rate": 5.816123938544305e-06, + "loss": 0.132275390625, + "memory(GiB)": 10.57, + "step": 900, + "train_speed(iter/s)": 0.469243 + }, + { + "epoch": 0.6762563048757706, + "grad_norm": 0.321526690011715, + "learning_rate": 5.814096620848723e-06, + "loss": 0.14796142578125, + "memory(GiB)": 10.57, + "step": 905, + "train_speed(iter/s)": 0.469369 + }, + { + "epoch": 0.6799925275546422, + "grad_norm": 0.26737616140516984, + "learning_rate": 5.8120585455636975e-06, + "loss": 0.1335205078125, + "memory(GiB)": 10.57, + "step": 910, + "train_speed(iter/s)": 0.469536 + }, + { + "epoch": 0.6837287502335139, + "grad_norm": 0.23441075691954993, + "learning_rate": 5.8100097204802854e-06, + "loss": 0.13460693359375, + "memory(GiB)": 10.57, + "step": 915, + "train_speed(iter/s)": 0.469558 + }, + { + "epoch": 0.6874649729123856, + "grad_norm": 0.3004174037886124, + "learning_rate": 5.807950153430634e-06, + "loss": 0.13314208984375, + "memory(GiB)": 10.57, + "step": 920, + "train_speed(iter/s)": 0.469494 + }, + { + "epoch": 0.6912011955912573, + "grad_norm": 0.3511989299300596, + "learning_rate": 5.805879852287953e-06, + "loss": 0.11871337890625, + "memory(GiB)": 10.57, + "step": 925, + "train_speed(iter/s)": 0.469497 + }, + { + "epoch": 0.6949374182701289, + "grad_norm": 0.22941841038115351, + "learning_rate": 5.803798824966487e-06, + "loss": 0.12340087890625, + "memory(GiB)": 10.57, + "step": 930, + "train_speed(iter/s)": 0.469442 + }, + { + "epoch": 0.6986736409490005, + "grad_norm": 0.2296737881416939, + "learning_rate": 5.801707079421485e-06, + "loss": 0.115618896484375, + "memory(GiB)": 10.57, + "step": 935, + "train_speed(iter/s)": 0.469455 + }, + { + "epoch": 0.7024098636278722, + "grad_norm": 0.2832572168234479, + "learning_rate": 5.799604623649168e-06, + "loss": 0.1305908203125, + "memory(GiB)": 10.57, + "step": 940, + "train_speed(iter/s)": 0.46963 + }, + { + "epoch": 0.7061460863067439, + "grad_norm": 0.316216648821189, + "learning_rate": 5.7974914656867004e-06, + "loss": 0.123193359375, + "memory(GiB)": 10.57, + "step": 945, + "train_speed(iter/s)": 0.46966 + }, + { + "epoch": 0.7098823089856156, + "grad_norm": 0.26883224736976363, + "learning_rate": 5.795367613612158e-06, + "loss": 0.12900390625, + "memory(GiB)": 10.57, + "step": 950, + "train_speed(iter/s)": 0.469682 + }, + { + "epoch": 0.7136185316644872, + "grad_norm": 0.2965748829854584, + "learning_rate": 5.793233075544498e-06, + "loss": 0.11947021484375, + "memory(GiB)": 10.57, + "step": 955, + "train_speed(iter/s)": 0.469772 + }, + { + "epoch": 0.7173547543433588, + "grad_norm": 0.23063703167824398, + "learning_rate": 5.791087859643525e-06, + "loss": 0.15511474609375, + "memory(GiB)": 10.57, + "step": 960, + "train_speed(iter/s)": 0.469872 + }, + { + "epoch": 0.7210909770222306, + "grad_norm": 0.3034417815586922, + "learning_rate": 5.788931974109867e-06, + "loss": 0.1328369140625, + "memory(GiB)": 10.57, + "step": 965, + "train_speed(iter/s)": 0.469955 + }, + { + "epoch": 0.7248271997011022, + "grad_norm": 0.2606666501840904, + "learning_rate": 5.7867654271849355e-06, + "loss": 0.1348388671875, + "memory(GiB)": 10.57, + "step": 970, + "train_speed(iter/s)": 0.470006 + }, + { + "epoch": 0.7285634223799738, + "grad_norm": 0.26581107097992346, + "learning_rate": 5.7845882271508975e-06, + "loss": 0.133349609375, + "memory(GiB)": 10.57, + "step": 975, + "train_speed(iter/s)": 0.470064 + }, + { + "epoch": 0.7322996450588455, + "grad_norm": 0.3223256775646686, + "learning_rate": 5.7824003823306484e-06, + "loss": 0.13079833984375, + "memory(GiB)": 10.57, + "step": 980, + "train_speed(iter/s)": 0.469962 + }, + { + "epoch": 0.7360358677377171, + "grad_norm": 0.3011414890652826, + "learning_rate": 5.780201901087771e-06, + "loss": 0.1345947265625, + "memory(GiB)": 10.57, + "step": 985, + "train_speed(iter/s)": 0.470102 + }, + { + "epoch": 0.7397720904165889, + "grad_norm": 0.30876194543952196, + "learning_rate": 5.777992791826512e-06, + "loss": 0.14576416015625, + "memory(GiB)": 10.57, + "step": 990, + "train_speed(iter/s)": 0.470252 + }, + { + "epoch": 0.7435083130954605, + "grad_norm": 0.2584420531503668, + "learning_rate": 5.775773062991744e-06, + "loss": 0.1373291015625, + "memory(GiB)": 10.57, + "step": 995, + "train_speed(iter/s)": 0.470366 + }, + { + "epoch": 0.7472445357743321, + "grad_norm": 0.30020605961844676, + "learning_rate": 5.773542723068937e-06, + "loss": 0.1499267578125, + "memory(GiB)": 10.57, + "step": 1000, + "train_speed(iter/s)": 0.470476 + }, + { + "epoch": 0.7509807584532038, + "grad_norm": 0.3190463063150355, + "learning_rate": 5.771301780584126e-06, + "loss": 0.13701171875, + "memory(GiB)": 10.57, + "step": 1005, + "train_speed(iter/s)": 0.470413 + }, + { + "epoch": 0.7547169811320755, + "grad_norm": 0.27457688934495655, + "learning_rate": 5.769050244103872e-06, + "loss": 0.14490966796875, + "memory(GiB)": 10.57, + "step": 1010, + "train_speed(iter/s)": 0.470476 + }, + { + "epoch": 0.7584532038109472, + "grad_norm": 0.2676875855097265, + "learning_rate": 5.76678812223524e-06, + "loss": 0.1295654296875, + "memory(GiB)": 10.57, + "step": 1015, + "train_speed(iter/s)": 0.470519 + }, + { + "epoch": 0.7621894264898188, + "grad_norm": 0.35808053452995126, + "learning_rate": 5.764515423625757e-06, + "loss": 0.148779296875, + "memory(GiB)": 10.57, + "step": 1020, + "train_speed(iter/s)": 0.470638 + }, + { + "epoch": 0.7659256491686904, + "grad_norm": 0.2842904674611216, + "learning_rate": 5.762232156963381e-06, + "loss": 0.142138671875, + "memory(GiB)": 10.57, + "step": 1025, + "train_speed(iter/s)": 0.47066 + }, + { + "epoch": 0.7696618718475621, + "grad_norm": 0.24187187753830167, + "learning_rate": 5.759938330976473e-06, + "loss": 0.12486572265625, + "memory(GiB)": 10.57, + "step": 1030, + "train_speed(iter/s)": 0.470767 + }, + { + "epoch": 0.7733980945264338, + "grad_norm": 0.17249284757124964, + "learning_rate": 5.757633954433757e-06, + "loss": 0.13060302734375, + "memory(GiB)": 10.57, + "step": 1035, + "train_speed(iter/s)": 0.470832 + }, + { + "epoch": 0.7771343172053055, + "grad_norm": 0.2299915320848999, + "learning_rate": 5.755319036144289e-06, + "loss": 0.1218017578125, + "memory(GiB)": 10.57, + "step": 1040, + "train_speed(iter/s)": 0.470946 + }, + { + "epoch": 0.7808705398841771, + "grad_norm": 0.19120763093823928, + "learning_rate": 5.752993584957426e-06, + "loss": 0.11143798828125, + "memory(GiB)": 10.57, + "step": 1045, + "train_speed(iter/s)": 0.470952 + }, + { + "epoch": 0.7846067625630487, + "grad_norm": 0.23296209254061714, + "learning_rate": 5.750657609762787e-06, + "loss": 0.12412109375, + "memory(GiB)": 10.57, + "step": 1050, + "train_speed(iter/s)": 0.471065 + }, + { + "epoch": 0.7883429852419204, + "grad_norm": 0.23478728691916106, + "learning_rate": 5.74831111949022e-06, + "loss": 0.119873046875, + "memory(GiB)": 10.57, + "step": 1055, + "train_speed(iter/s)": 0.471177 + }, + { + "epoch": 0.7920792079207921, + "grad_norm": 0.3162727585321945, + "learning_rate": 5.745954123109776e-06, + "loss": 0.1431884765625, + "memory(GiB)": 10.57, + "step": 1060, + "train_speed(iter/s)": 0.471178 + }, + { + "epoch": 0.7958154305996638, + "grad_norm": 0.2554843268036968, + "learning_rate": 5.743586629631663e-06, + "loss": 0.13331298828125, + "memory(GiB)": 10.57, + "step": 1065, + "train_speed(iter/s)": 0.471177 + }, + { + "epoch": 0.7995516532785354, + "grad_norm": 0.26771118158424334, + "learning_rate": 5.741208648106216e-06, + "loss": 0.12591552734375, + "memory(GiB)": 10.57, + "step": 1070, + "train_speed(iter/s)": 0.471319 + }, + { + "epoch": 0.803287875957407, + "grad_norm": 0.22210847866208316, + "learning_rate": 5.7388201876238665e-06, + "loss": 0.13048095703125, + "memory(GiB)": 10.57, + "step": 1075, + "train_speed(iter/s)": 0.471098 + }, + { + "epoch": 0.8070240986362788, + "grad_norm": 0.2090698541733704, + "learning_rate": 5.736421257315105e-06, + "loss": 0.128466796875, + "memory(GiB)": 10.57, + "step": 1080, + "train_speed(iter/s)": 0.471054 + }, + { + "epoch": 0.8107603213151504, + "grad_norm": 0.25643857813943166, + "learning_rate": 5.734011866350441e-06, + "loss": 0.1285400390625, + "memory(GiB)": 10.57, + "step": 1085, + "train_speed(iter/s)": 0.471162 + }, + { + "epoch": 0.814496543994022, + "grad_norm": 0.21115359067598077, + "learning_rate": 5.731592023940377e-06, + "loss": 0.1183837890625, + "memory(GiB)": 10.57, + "step": 1090, + "train_speed(iter/s)": 0.471227 + }, + { + "epoch": 0.8182327666728937, + "grad_norm": 0.2712675993739738, + "learning_rate": 5.7291617393353644e-06, + "loss": 0.13204345703125, + "memory(GiB)": 10.57, + "step": 1095, + "train_speed(iter/s)": 0.471252 + }, + { + "epoch": 0.8219689893517653, + "grad_norm": 0.2084250099258058, + "learning_rate": 5.726721021825778e-06, + "loss": 0.11478271484375, + "memory(GiB)": 10.57, + "step": 1100, + "train_speed(iter/s)": 0.471379 + }, + { + "epoch": 0.8257052120306371, + "grad_norm": 0.2830458697450999, + "learning_rate": 5.724269880741871e-06, + "loss": 0.126416015625, + "memory(GiB)": 10.57, + "step": 1105, + "train_speed(iter/s)": 0.471405 + }, + { + "epoch": 0.8294414347095087, + "grad_norm": 0.3346887940418336, + "learning_rate": 5.721808325453744e-06, + "loss": 0.120458984375, + "memory(GiB)": 10.57, + "step": 1110, + "train_speed(iter/s)": 0.471343 + }, + { + "epoch": 0.8331776573883803, + "grad_norm": 0.36391607206986826, + "learning_rate": 5.719336365371309e-06, + "loss": 0.12493896484375, + "memory(GiB)": 10.57, + "step": 1115, + "train_speed(iter/s)": 0.471475 + }, + { + "epoch": 0.836913880067252, + "grad_norm": 0.24337137317648888, + "learning_rate": 5.716854009944253e-06, + "loss": 0.116259765625, + "memory(GiB)": 10.57, + "step": 1120, + "train_speed(iter/s)": 0.471472 + }, + { + "epoch": 0.8406501027461236, + "grad_norm": 0.26926385581419715, + "learning_rate": 5.714361268662001e-06, + "loss": 0.12049560546875, + "memory(GiB)": 10.57, + "step": 1125, + "train_speed(iter/s)": 0.471483 + }, + { + "epoch": 0.8443863254249954, + "grad_norm": 0.21579011337181153, + "learning_rate": 5.711858151053681e-06, + "loss": 0.13843994140625, + "memory(GiB)": 10.57, + "step": 1130, + "train_speed(iter/s)": 0.471564 + }, + { + "epoch": 0.848122548103867, + "grad_norm": 0.20161584440361865, + "learning_rate": 5.7093446666880895e-06, + "loss": 0.109716796875, + "memory(GiB)": 10.57, + "step": 1135, + "train_speed(iter/s)": 0.471594 + }, + { + "epoch": 0.8518587707827386, + "grad_norm": 0.188476182825962, + "learning_rate": 5.7068208251736475e-06, + "loss": 0.126171875, + "memory(GiB)": 10.57, + "step": 1140, + "train_speed(iter/s)": 0.471689 + }, + { + "epoch": 0.8555949934616103, + "grad_norm": 0.27427824113320737, + "learning_rate": 5.704286636158373e-06, + "loss": 0.12137451171875, + "memory(GiB)": 10.57, + "step": 1145, + "train_speed(iter/s)": 0.471734 + }, + { + "epoch": 0.859331216140482, + "grad_norm": 0.22763746167838253, + "learning_rate": 5.701742109329838e-06, + "loss": 0.13856201171875, + "memory(GiB)": 10.57, + "step": 1150, + "train_speed(iter/s)": 0.471726 + }, + { + "epoch": 0.8630674388193537, + "grad_norm": 0.17131970459498547, + "learning_rate": 5.6991872544151335e-06, + "loss": 0.14425048828125, + "memory(GiB)": 10.57, + "step": 1155, + "train_speed(iter/s)": 0.471788 + }, + { + "epoch": 0.8668036614982253, + "grad_norm": 0.23048712465348178, + "learning_rate": 5.696622081180834e-06, + "loss": 0.153955078125, + "memory(GiB)": 10.57, + "step": 1160, + "train_speed(iter/s)": 0.471828 + }, + { + "epoch": 0.8705398841770969, + "grad_norm": 0.2737430014057503, + "learning_rate": 5.694046599432956e-06, + "loss": 0.116259765625, + "memory(GiB)": 10.57, + "step": 1165, + "train_speed(iter/s)": 0.471811 + }, + { + "epoch": 0.8742761068559686, + "grad_norm": 0.23626021988375195, + "learning_rate": 5.691460819016923e-06, + "loss": 0.1245849609375, + "memory(GiB)": 10.57, + "step": 1170, + "train_speed(iter/s)": 0.471906 + }, + { + "epoch": 0.8780123295348403, + "grad_norm": 0.27390563050373423, + "learning_rate": 5.68886474981753e-06, + "loss": 0.12216796875, + "memory(GiB)": 10.57, + "step": 1175, + "train_speed(iter/s)": 0.471894 + }, + { + "epoch": 0.881748552213712, + "grad_norm": 0.3598824701234181, + "learning_rate": 5.686258401758901e-06, + "loss": 0.1288818359375, + "memory(GiB)": 10.57, + "step": 1180, + "train_speed(iter/s)": 0.471866 + }, + { + "epoch": 0.8854847748925836, + "grad_norm": 0.2803403042160743, + "learning_rate": 5.683641784804454e-06, + "loss": 0.119970703125, + "memory(GiB)": 10.57, + "step": 1185, + "train_speed(iter/s)": 0.471838 + }, + { + "epoch": 0.8892209975714552, + "grad_norm": 0.24011469363238191, + "learning_rate": 5.681014908956866e-06, + "loss": 0.12734375, + "memory(GiB)": 10.57, + "step": 1190, + "train_speed(iter/s)": 0.471876 + }, + { + "epoch": 0.8929572202503269, + "grad_norm": 0.23680884380834868, + "learning_rate": 5.6783777842580245e-06, + "loss": 0.131884765625, + "memory(GiB)": 10.57, + "step": 1195, + "train_speed(iter/s)": 0.471946 + }, + { + "epoch": 0.8966934429291986, + "grad_norm": 0.25067555757294774, + "learning_rate": 5.6757304207890006e-06, + "loss": 0.11749267578125, + "memory(GiB)": 10.57, + "step": 1200, + "train_speed(iter/s)": 0.471919 + }, + { + "epoch": 0.9004296656080703, + "grad_norm": 0.25663340180554484, + "learning_rate": 5.673072828670005e-06, + "loss": 0.1390380859375, + "memory(GiB)": 10.57, + "step": 1205, + "train_speed(iter/s)": 0.471961 + }, + { + "epoch": 0.9041658882869419, + "grad_norm": 0.34196712108358773, + "learning_rate": 5.670405018060349e-06, + "loss": 0.1314453125, + "memory(GiB)": 10.57, + "step": 1210, + "train_speed(iter/s)": 0.472017 + }, + { + "epoch": 0.9079021109658135, + "grad_norm": 0.25320961648503115, + "learning_rate": 5.667726999158408e-06, + "loss": 0.11199951171875, + "memory(GiB)": 10.57, + "step": 1215, + "train_speed(iter/s)": 0.472063 + }, + { + "epoch": 0.9116383336446853, + "grad_norm": 0.2895202800969726, + "learning_rate": 5.665038782201579e-06, + "loss": 0.11494140625, + "memory(GiB)": 10.57, + "step": 1220, + "train_speed(iter/s)": 0.472149 + }, + { + "epoch": 0.9153745563235569, + "grad_norm": 0.24353976208363304, + "learning_rate": 5.662340377466246e-06, + "loss": 0.13350830078125, + "memory(GiB)": 10.57, + "step": 1225, + "train_speed(iter/s)": 0.472205 + }, + { + "epoch": 0.9191107790024285, + "grad_norm": 0.21343931443362257, + "learning_rate": 5.659631795267736e-06, + "loss": 0.1358642578125, + "memory(GiB)": 10.57, + "step": 1230, + "train_speed(iter/s)": 0.472292 + }, + { + "epoch": 0.9228470016813002, + "grad_norm": 0.18836231763075187, + "learning_rate": 5.656913045960284e-06, + "loss": 0.1275634765625, + "memory(GiB)": 10.57, + "step": 1235, + "train_speed(iter/s)": 0.472331 + }, + { + "epoch": 0.9265832243601718, + "grad_norm": 0.4626722838861778, + "learning_rate": 5.65418413993699e-06, + "loss": 0.12288818359375, + "memory(GiB)": 10.57, + "step": 1240, + "train_speed(iter/s)": 0.472384 + }, + { + "epoch": 0.9303194470390436, + "grad_norm": 0.2791944611984056, + "learning_rate": 5.651445087629781e-06, + "loss": 0.12313232421875, + "memory(GiB)": 10.57, + "step": 1245, + "train_speed(iter/s)": 0.47236 + }, + { + "epoch": 0.9340556697179152, + "grad_norm": 0.2351927769190445, + "learning_rate": 5.648695899509373e-06, + "loss": 0.12640380859375, + "memory(GiB)": 10.57, + "step": 1250, + "train_speed(iter/s)": 0.472318 + }, + { + "epoch": 0.9377918923967868, + "grad_norm": 0.29167608891344404, + "learning_rate": 5.6459365860852225e-06, + "loss": 0.1332763671875, + "memory(GiB)": 10.57, + "step": 1255, + "train_speed(iter/s)": 0.472324 + }, + { + "epoch": 0.9415281150756585, + "grad_norm": 0.3389174699822604, + "learning_rate": 5.643167157905499e-06, + "loss": 0.1290771484375, + "memory(GiB)": 10.57, + "step": 1260, + "train_speed(iter/s)": 0.472422 + }, + { + "epoch": 0.9452643377545301, + "grad_norm": 0.19240685493137236, + "learning_rate": 5.640387625557036e-06, + "loss": 0.11680908203125, + "memory(GiB)": 10.57, + "step": 1265, + "train_speed(iter/s)": 0.472422 + }, + { + "epoch": 0.9490005604334019, + "grad_norm": 0.26444195491643885, + "learning_rate": 5.63759799966529e-06, + "loss": 0.139111328125, + "memory(GiB)": 10.57, + "step": 1270, + "train_speed(iter/s)": 0.4725 + }, + { + "epoch": 0.9527367831122735, + "grad_norm": 0.2630005422058253, + "learning_rate": 5.634798290894306e-06, + "loss": 0.1197265625, + "memory(GiB)": 10.57, + "step": 1275, + "train_speed(iter/s)": 0.472495 + }, + { + "epoch": 0.9564730057911451, + "grad_norm": 0.23145820253558871, + "learning_rate": 5.631988509946674e-06, + "loss": 0.1113037109375, + "memory(GiB)": 10.57, + "step": 1280, + "train_speed(iter/s)": 0.472494 + }, + { + "epoch": 0.9602092284700168, + "grad_norm": 0.2899148756938717, + "learning_rate": 5.629168667563484e-06, + "loss": 0.121484375, + "memory(GiB)": 10.57, + "step": 1285, + "train_speed(iter/s)": 0.472497 + }, + { + "epoch": 0.9639454511488885, + "grad_norm": 0.36548878879119173, + "learning_rate": 5.62633877452429e-06, + "loss": 0.12415771484375, + "memory(GiB)": 10.57, + "step": 1290, + "train_speed(iter/s)": 0.472442 + }, + { + "epoch": 0.9676816738277602, + "grad_norm": 0.25528341367700647, + "learning_rate": 5.623498841647067e-06, + "loss": 0.1307861328125, + "memory(GiB)": 10.57, + "step": 1295, + "train_speed(iter/s)": 0.472498 + }, + { + "epoch": 0.9714178965066318, + "grad_norm": 0.1948509154997499, + "learning_rate": 5.620648879788172e-06, + "loss": 0.122802734375, + "memory(GiB)": 10.57, + "step": 1300, + "train_speed(iter/s)": 0.47247 + }, + { + "epoch": 0.9751541191855034, + "grad_norm": 0.23395818708390523, + "learning_rate": 5.617788899842296e-06, + "loss": 0.1336181640625, + "memory(GiB)": 10.57, + "step": 1305, + "train_speed(iter/s)": 0.472478 + }, + { + "epoch": 0.9788903418643751, + "grad_norm": 0.22129751131979117, + "learning_rate": 5.61491891274243e-06, + "loss": 0.11290283203125, + "memory(GiB)": 10.57, + "step": 1310, + "train_speed(iter/s)": 0.47253 + }, + { + "epoch": 0.9826265645432468, + "grad_norm": 0.38768444008771463, + "learning_rate": 5.6120389294598185e-06, + "loss": 0.1374267578125, + "memory(GiB)": 10.57, + "step": 1315, + "train_speed(iter/s)": 0.472627 + }, + { + "epoch": 0.9863627872221185, + "grad_norm": 0.2634727672178905, + "learning_rate": 5.609148961003919e-06, + "loss": 0.10865478515625, + "memory(GiB)": 10.57, + "step": 1320, + "train_speed(iter/s)": 0.472642 + }, + { + "epoch": 0.9900990099009901, + "grad_norm": 0.2693250349909997, + "learning_rate": 5.606249018422361e-06, + "loss": 0.121435546875, + "memory(GiB)": 10.57, + "step": 1325, + "train_speed(iter/s)": 0.472693 + }, + { + "epoch": 0.9938352325798617, + "grad_norm": 0.3142833629244817, + "learning_rate": 5.603339112800902e-06, + "loss": 0.127587890625, + "memory(GiB)": 10.57, + "step": 1330, + "train_speed(iter/s)": 0.472723 + }, + { + "epoch": 0.9975714552587335, + "grad_norm": 0.260475289320075, + "learning_rate": 5.600419255263382e-06, + "loss": 0.12655029296875, + "memory(GiB)": 10.57, + "step": 1335, + "train_speed(iter/s)": 0.472744 + }, + { + "epoch": 1.0007472445357743, + "grad_norm": 0.1995569301298896, + "learning_rate": 5.5974894569716925e-06, + "loss": 0.11612548828125, + "memory(GiB)": 10.57, + "step": 1340, + "train_speed(iter/s)": 0.472443 + }, + { + "epoch": 1.004483467214646, + "grad_norm": 0.19232697366661053, + "learning_rate": 5.594549729125718e-06, + "loss": 0.09854736328125, + "memory(GiB)": 10.57, + "step": 1345, + "train_speed(iter/s)": 0.472505 + }, + { + "epoch": 1.0082196898935176, + "grad_norm": 0.33732827706175905, + "learning_rate": 5.591600082963308e-06, + "loss": 0.115234375, + "memory(GiB)": 10.57, + "step": 1350, + "train_speed(iter/s)": 0.472424 + }, + { + "epoch": 1.0119559125723894, + "grad_norm": 0.18176928497159583, + "learning_rate": 5.58864052976022e-06, + "loss": 0.10379638671875, + "memory(GiB)": 10.57, + "step": 1355, + "train_speed(iter/s)": 0.472477 + }, + { + "epoch": 1.015692135251261, + "grad_norm": 0.357592464764428, + "learning_rate": 5.585671080830091e-06, + "loss": 0.1013671875, + "memory(GiB)": 10.57, + "step": 1360, + "train_speed(iter/s)": 0.472527 + }, + { + "epoch": 1.0194283579301326, + "grad_norm": 0.2808114053275493, + "learning_rate": 5.5826917475243834e-06, + "loss": 0.097698974609375, + "memory(GiB)": 10.57, + "step": 1365, + "train_speed(iter/s)": 0.47262 + }, + { + "epoch": 1.0231645806090044, + "grad_norm": 0.17182129292938358, + "learning_rate": 5.579702541232344e-06, + "loss": 0.10177001953125, + "memory(GiB)": 10.57, + "step": 1370, + "train_speed(iter/s)": 0.472515 + }, + { + "epoch": 1.026900803287876, + "grad_norm": 0.36516118234394773, + "learning_rate": 5.576703473380963e-06, + "loss": 0.10394287109375, + "memory(GiB)": 10.57, + "step": 1375, + "train_speed(iter/s)": 0.472485 + }, + { + "epoch": 1.0306370259667477, + "grad_norm": 0.12994331570229598, + "learning_rate": 5.573694555434929e-06, + "loss": 0.09647216796875, + "memory(GiB)": 10.57, + "step": 1380, + "train_speed(iter/s)": 0.472423 + }, + { + "epoch": 1.0343732486456192, + "grad_norm": 0.2439500781039156, + "learning_rate": 5.570675798896584e-06, + "loss": 0.09913330078125, + "memory(GiB)": 10.57, + "step": 1385, + "train_speed(iter/s)": 0.472472 + }, + { + "epoch": 1.038109471324491, + "grad_norm": 0.24510427147093836, + "learning_rate": 5.567647215305884e-06, + "loss": 0.10660400390625, + "memory(GiB)": 10.57, + "step": 1390, + "train_speed(iter/s)": 0.472502 + }, + { + "epoch": 1.0418456940033627, + "grad_norm": 0.3224514432487436, + "learning_rate": 5.564608816240345e-06, + "loss": 0.1132080078125, + "memory(GiB)": 10.57, + "step": 1395, + "train_speed(iter/s)": 0.472569 + }, + { + "epoch": 1.0455819166822342, + "grad_norm": 0.23587230778852436, + "learning_rate": 5.56156061331501e-06, + "loss": 0.0916259765625, + "memory(GiB)": 10.57, + "step": 1400, + "train_speed(iter/s)": 0.472605 + }, + { + "epoch": 1.049318139361106, + "grad_norm": 0.22597459572374368, + "learning_rate": 5.5585026181823994e-06, + "loss": 0.10594482421875, + "memory(GiB)": 10.57, + "step": 1405, + "train_speed(iter/s)": 0.472556 + }, + { + "epoch": 1.0530543620399775, + "grad_norm": 0.19096269961906193, + "learning_rate": 5.555434842532465e-06, + "loss": 0.089910888671875, + "memory(GiB)": 10.57, + "step": 1410, + "train_speed(iter/s)": 0.472594 + }, + { + "epoch": 1.0567905847188492, + "grad_norm": 0.21611547990188876, + "learning_rate": 5.552357298092549e-06, + "loss": 0.10777587890625, + "memory(GiB)": 10.57, + "step": 1415, + "train_speed(iter/s)": 0.472614 + }, + { + "epoch": 1.060526807397721, + "grad_norm": 0.2651855509481471, + "learning_rate": 5.549269996627335e-06, + "loss": 0.104296875, + "memory(GiB)": 10.57, + "step": 1420, + "train_speed(iter/s)": 0.472711 + }, + { + "epoch": 1.0642630300765925, + "grad_norm": 0.2884021435709037, + "learning_rate": 5.546172949938806e-06, + "loss": 0.09815673828125, + "memory(GiB)": 10.57, + "step": 1425, + "train_speed(iter/s)": 0.472743 + }, + { + "epoch": 1.0679992527554643, + "grad_norm": 0.3272777127266579, + "learning_rate": 5.5430661698661995e-06, + "loss": 0.09771728515625, + "memory(GiB)": 10.57, + "step": 1430, + "train_speed(iter/s)": 0.472793 + }, + { + "epoch": 1.0717354754343358, + "grad_norm": 0.22908749879031715, + "learning_rate": 5.539949668285962e-06, + "loss": 0.11275634765625, + "memory(GiB)": 10.57, + "step": 1435, + "train_speed(iter/s)": 0.472759 + }, + { + "epoch": 1.0754716981132075, + "grad_norm": 0.20839037146203993, + "learning_rate": 5.5368234571117e-06, + "loss": 0.1127685546875, + "memory(GiB)": 10.57, + "step": 1440, + "train_speed(iter/s)": 0.472726 + }, + { + "epoch": 1.0792079207920793, + "grad_norm": 0.21433788637796058, + "learning_rate": 5.533687548294139e-06, + "loss": 0.102685546875, + "memory(GiB)": 10.57, + "step": 1445, + "train_speed(iter/s)": 0.472795 + }, + { + "epoch": 1.0829441434709508, + "grad_norm": 0.2194852609411041, + "learning_rate": 5.530541953821078e-06, + "loss": 0.1194580078125, + "memory(GiB)": 10.57, + "step": 1450, + "train_speed(iter/s)": 0.472644 + }, + { + "epoch": 1.0866803661498226, + "grad_norm": 0.2119142735733801, + "learning_rate": 5.5273866857173375e-06, + "loss": 0.09979248046875, + "memory(GiB)": 10.57, + "step": 1455, + "train_speed(iter/s)": 0.472692 + }, + { + "epoch": 1.090416588828694, + "grad_norm": 0.18271859704191354, + "learning_rate": 5.524221756044723e-06, + "loss": 0.10120849609375, + "memory(GiB)": 10.57, + "step": 1460, + "train_speed(iter/s)": 0.472786 + }, + { + "epoch": 1.0941528115075658, + "grad_norm": 0.1965749879154183, + "learning_rate": 5.521047176901968e-06, + "loss": 0.09178466796875, + "memory(GiB)": 10.57, + "step": 1465, + "train_speed(iter/s)": 0.472753 + }, + { + "epoch": 1.0978890341864376, + "grad_norm": 0.3529079661879815, + "learning_rate": 5.5178629604247e-06, + "loss": 0.099200439453125, + "memory(GiB)": 10.57, + "step": 1470, + "train_speed(iter/s)": 0.4728 + }, + { + "epoch": 1.101625256865309, + "grad_norm": 0.23509583771318013, + "learning_rate": 5.514669118785383e-06, + "loss": 0.10716552734375, + "memory(GiB)": 10.57, + "step": 1475, + "train_speed(iter/s)": 0.47282 + }, + { + "epoch": 1.1053614795441808, + "grad_norm": 0.22191044730282325, + "learning_rate": 5.511465664193278e-06, + "loss": 0.1013671875, + "memory(GiB)": 10.57, + "step": 1480, + "train_speed(iter/s)": 0.472823 + }, + { + "epoch": 1.1090977022230526, + "grad_norm": 0.2697952102289562, + "learning_rate": 5.50825260889439e-06, + "loss": 0.110107421875, + "memory(GiB)": 10.57, + "step": 1485, + "train_speed(iter/s)": 0.47279 + }, + { + "epoch": 1.1128339249019241, + "grad_norm": 0.34041111425327863, + "learning_rate": 5.505029965171431e-06, + "loss": 0.10975341796875, + "memory(GiB)": 10.57, + "step": 1490, + "train_speed(iter/s)": 0.472756 + }, + { + "epoch": 1.1165701475807959, + "grad_norm": 0.26883268648527414, + "learning_rate": 5.501797745343762e-06, + "loss": 0.09005126953125, + "memory(GiB)": 10.57, + "step": 1495, + "train_speed(iter/s)": 0.472749 + }, + { + "epoch": 1.1203063702596674, + "grad_norm": 0.2591995651189346, + "learning_rate": 5.498555961767353e-06, + "loss": 0.1026611328125, + "memory(GiB)": 10.57, + "step": 1500, + "train_speed(iter/s)": 0.472741 + }, + { + "epoch": 1.1240425929385391, + "grad_norm": 0.20112516368334774, + "learning_rate": 5.495304626834737e-06, + "loss": 0.10999755859375, + "memory(GiB)": 10.57, + "step": 1505, + "train_speed(iter/s)": 0.472822 + }, + { + "epoch": 1.127778815617411, + "grad_norm": 0.31243387047085314, + "learning_rate": 5.492043752974954e-06, + "loss": 0.1138671875, + "memory(GiB)": 10.57, + "step": 1510, + "train_speed(iter/s)": 0.472881 + }, + { + "epoch": 1.1315150382962824, + "grad_norm": 0.2885339464617385, + "learning_rate": 5.488773352653511e-06, + "loss": 0.103564453125, + "memory(GiB)": 10.57, + "step": 1515, + "train_speed(iter/s)": 0.47291 + }, + { + "epoch": 1.1352512609751542, + "grad_norm": 0.2769596342571021, + "learning_rate": 5.485493438372334e-06, + "loss": 0.11546630859375, + "memory(GiB)": 10.57, + "step": 1520, + "train_speed(iter/s)": 0.472812 + }, + { + "epoch": 1.1389874836540257, + "grad_norm": 0.3103063506129397, + "learning_rate": 5.482204022669716e-06, + "loss": 0.108837890625, + "memory(GiB)": 10.57, + "step": 1525, + "train_speed(iter/s)": 0.472794 + }, + { + "epoch": 1.1427237063328974, + "grad_norm": 0.2913972666378632, + "learning_rate": 5.478905118120274e-06, + "loss": 0.1053466796875, + "memory(GiB)": 10.57, + "step": 1530, + "train_speed(iter/s)": 0.472759 + }, + { + "epoch": 1.1464599290117692, + "grad_norm": 0.1805358057399443, + "learning_rate": 5.475596737334896e-06, + "loss": 0.10556640625, + "memory(GiB)": 10.57, + "step": 1535, + "train_speed(iter/s)": 0.472764 + }, + { + "epoch": 1.1501961516906407, + "grad_norm": 0.18027571971615952, + "learning_rate": 5.472278892960697e-06, + "loss": 0.10286865234375, + "memory(GiB)": 10.57, + "step": 1540, + "train_speed(iter/s)": 0.472798 + }, + { + "epoch": 1.1539323743695125, + "grad_norm": 0.26360451822838044, + "learning_rate": 5.468951597680969e-06, + "loss": 0.10518798828125, + "memory(GiB)": 10.57, + "step": 1545, + "train_speed(iter/s)": 0.472811 + }, + { + "epoch": 1.1576685970483842, + "grad_norm": 0.27825790440429315, + "learning_rate": 5.4656148642151315e-06, + "loss": 0.1068115234375, + "memory(GiB)": 10.57, + "step": 1550, + "train_speed(iter/s)": 0.472822 + }, + { + "epoch": 1.1614048197272557, + "grad_norm": 0.37841928985976586, + "learning_rate": 5.462268705318685e-06, + "loss": 0.105902099609375, + "memory(GiB)": 10.57, + "step": 1555, + "train_speed(iter/s)": 0.472796 + }, + { + "epoch": 1.1651410424061275, + "grad_norm": 0.20072772545318748, + "learning_rate": 5.458913133783158e-06, + "loss": 0.096240234375, + "memory(GiB)": 10.57, + "step": 1560, + "train_speed(iter/s)": 0.472849 + }, + { + "epoch": 1.168877265084999, + "grad_norm": 0.2782224873229787, + "learning_rate": 5.455548162436066e-06, + "loss": 0.10538330078125, + "memory(GiB)": 10.57, + "step": 1565, + "train_speed(iter/s)": 0.472866 + }, + { + "epoch": 1.1726134877638708, + "grad_norm": 0.2611062382021719, + "learning_rate": 5.4521738041408535e-06, + "loss": 0.102545166015625, + "memory(GiB)": 10.57, + "step": 1570, + "train_speed(iter/s)": 0.472925 + }, + { + "epoch": 1.1763497104427425, + "grad_norm": 0.36681796736657335, + "learning_rate": 5.448790071796851e-06, + "loss": 0.10877685546875, + "memory(GiB)": 10.57, + "step": 1575, + "train_speed(iter/s)": 0.472947 + }, + { + "epoch": 1.180085933121614, + "grad_norm": 0.31412178351944464, + "learning_rate": 5.445396978339223e-06, + "loss": 0.1108642578125, + "memory(GiB)": 10.57, + "step": 1580, + "train_speed(iter/s)": 0.472961 + }, + { + "epoch": 1.1838221558004858, + "grad_norm": 0.2945745909309181, + "learning_rate": 5.4419945367389204e-06, + "loss": 0.104638671875, + "memory(GiB)": 10.57, + "step": 1585, + "train_speed(iter/s)": 0.472997 + }, + { + "epoch": 1.1875583784793573, + "grad_norm": 0.2005694453013891, + "learning_rate": 5.438582760002628e-06, + "loss": 0.11466064453125, + "memory(GiB)": 10.57, + "step": 1590, + "train_speed(iter/s)": 0.472995 + }, + { + "epoch": 1.191294601158229, + "grad_norm": 0.21815942040257993, + "learning_rate": 5.4351616611727174e-06, + "loss": 0.09090576171875, + "memory(GiB)": 10.57, + "step": 1595, + "train_speed(iter/s)": 0.472983 + }, + { + "epoch": 1.1950308238371008, + "grad_norm": 0.3254149926280658, + "learning_rate": 5.431731253327197e-06, + "loss": 0.09832763671875, + "memory(GiB)": 10.57, + "step": 1600, + "train_speed(iter/s)": 0.472989 + }, + { + "epoch": 1.1987670465159723, + "grad_norm": 0.21539039093948628, + "learning_rate": 5.428291549579658e-06, + "loss": 0.0917236328125, + "memory(GiB)": 10.57, + "step": 1605, + "train_speed(iter/s)": 0.472999 + }, + { + "epoch": 1.202503269194844, + "grad_norm": 0.3980763574441828, + "learning_rate": 5.424842563079231e-06, + "loss": 0.1013427734375, + "memory(GiB)": 10.57, + "step": 1610, + "train_speed(iter/s)": 0.473002 + }, + { + "epoch": 1.2062394918737156, + "grad_norm": 0.2562644399270751, + "learning_rate": 5.421384307010532e-06, + "loss": 0.12611083984375, + "memory(GiB)": 10.57, + "step": 1615, + "train_speed(iter/s)": 0.473001 + }, + { + "epoch": 1.2099757145525873, + "grad_norm": 0.21063963603050906, + "learning_rate": 5.41791679459361e-06, + "loss": 0.09677734375, + "memory(GiB)": 10.57, + "step": 1620, + "train_speed(iter/s)": 0.473087 + }, + { + "epoch": 1.213711937231459, + "grad_norm": 0.26589295201735347, + "learning_rate": 5.4144400390839014e-06, + "loss": 0.10716552734375, + "memory(GiB)": 10.57, + "step": 1625, + "train_speed(iter/s)": 0.473137 + }, + { + "epoch": 1.2174481599103306, + "grad_norm": 0.3159674300444183, + "learning_rate": 5.410954053772174e-06, + "loss": 0.117822265625, + "memory(GiB)": 10.57, + "step": 1630, + "train_speed(iter/s)": 0.473161 + }, + { + "epoch": 1.2211843825892024, + "grad_norm": 0.3257909348870682, + "learning_rate": 5.407458851984481e-06, + "loss": 0.105908203125, + "memory(GiB)": 10.57, + "step": 1635, + "train_speed(iter/s)": 0.473064 + }, + { + "epoch": 1.224920605268074, + "grad_norm": 0.25594963311057084, + "learning_rate": 5.403954447082107e-06, + "loss": 0.1008544921875, + "memory(GiB)": 10.57, + "step": 1640, + "train_speed(iter/s)": 0.473138 + }, + { + "epoch": 1.2286568279469456, + "grad_norm": 0.27760936809640124, + "learning_rate": 5.400440852461517e-06, + "loss": 0.08446044921875, + "memory(GiB)": 10.57, + "step": 1645, + "train_speed(iter/s)": 0.473198 + }, + { + "epoch": 1.2323930506258174, + "grad_norm": 0.30926667434610317, + "learning_rate": 5.3969180815543075e-06, + "loss": 0.0973876953125, + "memory(GiB)": 10.57, + "step": 1650, + "train_speed(iter/s)": 0.473211 + }, + { + "epoch": 1.236129273304689, + "grad_norm": 0.22376369134309534, + "learning_rate": 5.393386147827153e-06, + "loss": 0.08917236328125, + "memory(GiB)": 10.57, + "step": 1655, + "train_speed(iter/s)": 0.473219 + }, + { + "epoch": 1.2398654959835607, + "grad_norm": 0.3060981242994768, + "learning_rate": 5.3898450647817534e-06, + "loss": 0.095660400390625, + "memory(GiB)": 10.57, + "step": 1660, + "train_speed(iter/s)": 0.47326 + }, + { + "epoch": 1.2436017186624322, + "grad_norm": 0.2824418483688286, + "learning_rate": 5.386294845954789e-06, + "loss": 0.093310546875, + "memory(GiB)": 10.57, + "step": 1665, + "train_speed(iter/s)": 0.473272 + }, + { + "epoch": 1.247337941341304, + "grad_norm": 0.36318507390627536, + "learning_rate": 5.382735504917859e-06, + "loss": 0.09969482421875, + "memory(GiB)": 10.57, + "step": 1670, + "train_speed(iter/s)": 0.473338 + }, + { + "epoch": 1.2510741640201757, + "grad_norm": 0.25998406554963555, + "learning_rate": 5.379167055277436e-06, + "loss": 0.0906982421875, + "memory(GiB)": 10.57, + "step": 1675, + "train_speed(iter/s)": 0.473289 + }, + { + "epoch": 1.2548103866990472, + "grad_norm": 0.3053060614623874, + "learning_rate": 5.3755895106748135e-06, + "loss": 0.1009033203125, + "memory(GiB)": 10.57, + "step": 1680, + "train_speed(iter/s)": 0.473278 + }, + { + "epoch": 1.258546609377919, + "grad_norm": 0.3304211891993834, + "learning_rate": 5.372002884786053e-06, + "loss": 0.080206298828125, + "memory(GiB)": 10.57, + "step": 1685, + "train_speed(iter/s)": 0.473247 + }, + { + "epoch": 1.2622828320567905, + "grad_norm": 0.3786132572419238, + "learning_rate": 5.368407191321929e-06, + "loss": 0.11483154296875, + "memory(GiB)": 10.57, + "step": 1690, + "train_speed(iter/s)": 0.473224 + }, + { + "epoch": 1.2660190547356622, + "grad_norm": 0.4098142898034233, + "learning_rate": 5.364802444027881e-06, + "loss": 0.11900634765625, + "memory(GiB)": 10.57, + "step": 1695, + "train_speed(iter/s)": 0.473255 + }, + { + "epoch": 1.269755277414534, + "grad_norm": 0.31832382239724993, + "learning_rate": 5.36118865668396e-06, + "loss": 0.100079345703125, + "memory(GiB)": 10.57, + "step": 1700, + "train_speed(iter/s)": 0.473256 + }, + { + "epoch": 1.2734915000934055, + "grad_norm": 0.21787448497633385, + "learning_rate": 5.357565843104772e-06, + "loss": 0.1089111328125, + "memory(GiB)": 10.57, + "step": 1705, + "train_speed(iter/s)": 0.473319 + }, + { + "epoch": 1.2772277227722773, + "grad_norm": 0.24048814888237727, + "learning_rate": 5.3539340171394315e-06, + "loss": 0.103173828125, + "memory(GiB)": 10.57, + "step": 1710, + "train_speed(iter/s)": 0.473382 + }, + { + "epoch": 1.2809639454511488, + "grad_norm": 0.2628088064912976, + "learning_rate": 5.350293192671502e-06, + "loss": 0.1017578125, + "memory(GiB)": 10.57, + "step": 1715, + "train_speed(iter/s)": 0.473392 + }, + { + "epoch": 1.2847001681300205, + "grad_norm": 0.19682320473371387, + "learning_rate": 5.3466433836189466e-06, + "loss": 0.10618896484375, + "memory(GiB)": 10.57, + "step": 1720, + "train_speed(iter/s)": 0.473367 + }, + { + "epoch": 1.2884363908088923, + "grad_norm": 0.31166282334428463, + "learning_rate": 5.342984603934075e-06, + "loss": 0.0931884765625, + "memory(GiB)": 10.57, + "step": 1725, + "train_speed(iter/s)": 0.473394 + }, + { + "epoch": 1.2921726134877638, + "grad_norm": 0.4426055463824898, + "learning_rate": 5.3393168676034925e-06, + "loss": 0.10029296875, + "memory(GiB)": 10.57, + "step": 1730, + "train_speed(iter/s)": 0.473469 + }, + { + "epoch": 1.2959088361666355, + "grad_norm": 0.19012257878940111, + "learning_rate": 5.335640188648036e-06, + "loss": 0.0994873046875, + "memory(GiB)": 10.57, + "step": 1735, + "train_speed(iter/s)": 0.473506 + }, + { + "epoch": 1.299645058845507, + "grad_norm": 0.2509436471905221, + "learning_rate": 5.3319545811227345e-06, + "loss": 0.10556640625, + "memory(GiB)": 10.57, + "step": 1740, + "train_speed(iter/s)": 0.47352 + }, + { + "epoch": 1.3033812815243788, + "grad_norm": 0.30945571438082825, + "learning_rate": 5.328260059116746e-06, + "loss": 0.10347900390625, + "memory(GiB)": 10.57, + "step": 1745, + "train_speed(iter/s)": 0.473584 + }, + { + "epoch": 1.3071175042032506, + "grad_norm": 0.2842323038315994, + "learning_rate": 5.324556636753305e-06, + "loss": 0.0927490234375, + "memory(GiB)": 10.57, + "step": 1750, + "train_speed(iter/s)": 0.473591 + }, + { + "epoch": 1.310853726882122, + "grad_norm": 0.22529076559497616, + "learning_rate": 5.320844328189674e-06, + "loss": 0.10736083984375, + "memory(GiB)": 10.57, + "step": 1755, + "train_speed(iter/s)": 0.473593 + }, + { + "epoch": 1.3145899495609938, + "grad_norm": 0.25966109665415044, + "learning_rate": 5.31712314761708e-06, + "loss": 0.09718017578125, + "memory(GiB)": 10.57, + "step": 1760, + "train_speed(iter/s)": 0.473634 + }, + { + "epoch": 1.3183261722398654, + "grad_norm": 0.2824285315852678, + "learning_rate": 5.31339310926067e-06, + "loss": 0.1147216796875, + "memory(GiB)": 10.57, + "step": 1765, + "train_speed(iter/s)": 0.473682 + }, + { + "epoch": 1.3220623949187371, + "grad_norm": 0.29212776422688475, + "learning_rate": 5.30965422737945e-06, + "loss": 0.106103515625, + "memory(GiB)": 10.57, + "step": 1770, + "train_speed(iter/s)": 0.473711 + }, + { + "epoch": 1.3257986175976089, + "grad_norm": 0.21770600045083738, + "learning_rate": 5.305906516266232e-06, + "loss": 0.09356689453125, + "memory(GiB)": 10.57, + "step": 1775, + "train_speed(iter/s)": 0.473749 + }, + { + "epoch": 1.3295348402764806, + "grad_norm": 0.22535805175359133, + "learning_rate": 5.302149990247581e-06, + "loss": 0.09854736328125, + "memory(GiB)": 10.57, + "step": 1780, + "train_speed(iter/s)": 0.47377 + }, + { + "epoch": 1.3332710629553521, + "grad_norm": 0.3731424208017629, + "learning_rate": 5.298384663683759e-06, + "loss": 0.10096435546875, + "memory(GiB)": 10.57, + "step": 1785, + "train_speed(iter/s)": 0.473814 + }, + { + "epoch": 1.3370072856342237, + "grad_norm": 0.19409382195361594, + "learning_rate": 5.29461055096867e-06, + "loss": 0.0933837890625, + "memory(GiB)": 10.57, + "step": 1790, + "train_speed(iter/s)": 0.473846 + }, + { + "epoch": 1.3407435083130954, + "grad_norm": 0.20858019331443553, + "learning_rate": 5.290827666529807e-06, + "loss": 0.09691162109375, + "memory(GiB)": 10.57, + "step": 1795, + "train_speed(iter/s)": 0.473812 + }, + { + "epoch": 1.3444797309919672, + "grad_norm": 0.21508957217260072, + "learning_rate": 5.287036024828191e-06, + "loss": 0.112396240234375, + "memory(GiB)": 10.57, + "step": 1800, + "train_speed(iter/s)": 0.473874 + }, + { + "epoch": 1.348215953670839, + "grad_norm": 0.21088809922179003, + "learning_rate": 5.283235640358326e-06, + "loss": 0.10013427734375, + "memory(GiB)": 10.57, + "step": 1805, + "train_speed(iter/s)": 0.473898 + }, + { + "epoch": 1.3519521763497104, + "grad_norm": 0.2980687891825392, + "learning_rate": 5.27942652764813e-06, + "loss": 0.12469482421875, + "memory(GiB)": 10.57, + "step": 1810, + "train_speed(iter/s)": 0.473908 + }, + { + "epoch": 1.3556883990285822, + "grad_norm": 0.26579488787728855, + "learning_rate": 5.275608701258893e-06, + "loss": 0.09619140625, + "memory(GiB)": 10.57, + "step": 1815, + "train_speed(iter/s)": 0.473922 + }, + { + "epoch": 1.3594246217074537, + "grad_norm": 0.18737292024034827, + "learning_rate": 5.271782175785213e-06, + "loss": 0.08944091796875, + "memory(GiB)": 10.57, + "step": 1820, + "train_speed(iter/s)": 0.473933 + }, + { + "epoch": 1.3631608443863255, + "grad_norm": 0.24782345412701354, + "learning_rate": 5.2679469658549425e-06, + "loss": 0.09827880859375, + "memory(GiB)": 10.57, + "step": 1825, + "train_speed(iter/s)": 0.473873 + }, + { + "epoch": 1.3668970670651972, + "grad_norm": 0.32532596436786243, + "learning_rate": 5.26410308612913e-06, + "loss": 0.09747314453125, + "memory(GiB)": 10.57, + "step": 1830, + "train_speed(iter/s)": 0.473915 + }, + { + "epoch": 1.3706332897440687, + "grad_norm": 0.31097616250716587, + "learning_rate": 5.2602505513019725e-06, + "loss": 0.1041748046875, + "memory(GiB)": 10.57, + "step": 1835, + "train_speed(iter/s)": 0.473886 + }, + { + "epoch": 1.3743695124229405, + "grad_norm": 0.3233980057122036, + "learning_rate": 5.256389376100747e-06, + "loss": 0.10128173828125, + "memory(GiB)": 10.57, + "step": 1840, + "train_speed(iter/s)": 0.473889 + }, + { + "epoch": 1.378105735101812, + "grad_norm": 0.2838217794938913, + "learning_rate": 5.252519575285765e-06, + "loss": 0.10989990234375, + "memory(GiB)": 10.57, + "step": 1845, + "train_speed(iter/s)": 0.473918 + }, + { + "epoch": 1.3818419577806837, + "grad_norm": 0.2857844265885774, + "learning_rate": 5.248641163650309e-06, + "loss": 0.101458740234375, + "memory(GiB)": 10.57, + "step": 1850, + "train_speed(iter/s)": 0.473944 + }, + { + "epoch": 1.3855781804595555, + "grad_norm": 0.3117055756844236, + "learning_rate": 5.244754156020577e-06, + "loss": 0.10926513671875, + "memory(GiB)": 10.57, + "step": 1855, + "train_speed(iter/s)": 0.473967 + }, + { + "epoch": 1.389314403138427, + "grad_norm": 0.1920114429204594, + "learning_rate": 5.240858567255634e-06, + "loss": 0.110009765625, + "memory(GiB)": 10.57, + "step": 1860, + "train_speed(iter/s)": 0.473962 + }, + { + "epoch": 1.3930506258172988, + "grad_norm": 0.3502090927498937, + "learning_rate": 5.236954412247341e-06, + "loss": 0.11763916015625, + "memory(GiB)": 10.57, + "step": 1865, + "train_speed(iter/s)": 0.473991 + }, + { + "epoch": 1.3967868484961703, + "grad_norm": 0.23316922643496588, + "learning_rate": 5.2330417059203095e-06, + "loss": 0.1151123046875, + "memory(GiB)": 10.57, + "step": 1870, + "train_speed(iter/s)": 0.474034 + }, + { + "epoch": 1.400523071175042, + "grad_norm": 0.2549951722054464, + "learning_rate": 5.22912046323184e-06, + "loss": 0.110504150390625, + "memory(GiB)": 10.57, + "step": 1875, + "train_speed(iter/s)": 0.474059 + }, + { + "epoch": 1.4042592938539138, + "grad_norm": 0.1708829919522614, + "learning_rate": 5.225190699171865e-06, + "loss": 0.08787841796875, + "memory(GiB)": 10.57, + "step": 1880, + "train_speed(iter/s)": 0.474029 + }, + { + "epoch": 1.4079955165327853, + "grad_norm": 0.27196811779503416, + "learning_rate": 5.221252428762893e-06, + "loss": 0.11351318359375, + "memory(GiB)": 10.57, + "step": 1885, + "train_speed(iter/s)": 0.474046 + }, + { + "epoch": 1.411731739211657, + "grad_norm": 0.23328619371671638, + "learning_rate": 5.217305667059948e-06, + "loss": 0.101446533203125, + "memory(GiB)": 10.57, + "step": 1890, + "train_speed(iter/s)": 0.474076 + }, + { + "epoch": 1.4154679618905286, + "grad_norm": 0.18762276770097455, + "learning_rate": 5.213350429150517e-06, + "loss": 0.10950927734375, + "memory(GiB)": 10.57, + "step": 1895, + "train_speed(iter/s)": 0.474023 + }, + { + "epoch": 1.4192041845694003, + "grad_norm": 0.26686273546353123, + "learning_rate": 5.209386730154487e-06, + "loss": 0.10045166015625, + "memory(GiB)": 10.57, + "step": 1900, + "train_speed(iter/s)": 0.474076 + }, + { + "epoch": 1.422940407248272, + "grad_norm": 0.3085786825020616, + "learning_rate": 5.205414585224091e-06, + "loss": 0.10711669921875, + "memory(GiB)": 10.57, + "step": 1905, + "train_speed(iter/s)": 0.474098 + }, + { + "epoch": 1.4266766299271436, + "grad_norm": 0.3905887360768796, + "learning_rate": 5.2014340095438476e-06, + "loss": 0.118505859375, + "memory(GiB)": 10.57, + "step": 1910, + "train_speed(iter/s)": 0.474116 + }, + { + "epoch": 1.4304128526060154, + "grad_norm": 0.2752084354347657, + "learning_rate": 5.197445018330506e-06, + "loss": 0.09713134765625, + "memory(GiB)": 10.57, + "step": 1915, + "train_speed(iter/s)": 0.47414 + }, + { + "epoch": 1.4341490752848869, + "grad_norm": 0.25638122340507086, + "learning_rate": 5.193447626832984e-06, + "loss": 0.1004638671875, + "memory(GiB)": 10.57, + "step": 1920, + "train_speed(iter/s)": 0.474127 + }, + { + "epoch": 1.4378852979637586, + "grad_norm": 0.3365573737926719, + "learning_rate": 5.189441850332312e-06, + "loss": 0.096502685546875, + "memory(GiB)": 10.57, + "step": 1925, + "train_speed(iter/s)": 0.474083 + }, + { + "epoch": 1.4416215206426304, + "grad_norm": 0.1924187499510245, + "learning_rate": 5.185427704141573e-06, + "loss": 0.124609375, + "memory(GiB)": 10.57, + "step": 1930, + "train_speed(iter/s)": 0.474111 + }, + { + "epoch": 1.445357743321502, + "grad_norm": 0.28660368393049557, + "learning_rate": 5.181405203605849e-06, + "loss": 0.10279541015625, + "memory(GiB)": 10.57, + "step": 1935, + "train_speed(iter/s)": 0.474107 + }, + { + "epoch": 1.4490939660003737, + "grad_norm": 0.26275748472823024, + "learning_rate": 5.177374364102156e-06, + "loss": 0.1211669921875, + "memory(GiB)": 10.57, + "step": 1940, + "train_speed(iter/s)": 0.474117 + }, + { + "epoch": 1.4528301886792452, + "grad_norm": 0.30473179680325724, + "learning_rate": 5.1733352010393855e-06, + "loss": 0.1116455078125, + "memory(GiB)": 10.57, + "step": 1945, + "train_speed(iter/s)": 0.474149 + }, + { + "epoch": 1.456566411358117, + "grad_norm": 0.2980857699329149, + "learning_rate": 5.169287729858254e-06, + "loss": 0.09521484375, + "memory(GiB)": 10.57, + "step": 1950, + "train_speed(iter/s)": 0.474117 + }, + { + "epoch": 1.4603026340369887, + "grad_norm": 0.3892418519621433, + "learning_rate": 5.165231966031231e-06, + "loss": 0.10706787109375, + "memory(GiB)": 10.57, + "step": 1955, + "train_speed(iter/s)": 0.474167 + }, + { + "epoch": 1.4640388567158602, + "grad_norm": 0.26876863290437225, + "learning_rate": 5.161167925062492e-06, + "loss": 0.0955810546875, + "memory(GiB)": 10.57, + "step": 1960, + "train_speed(iter/s)": 0.474231 + }, + { + "epoch": 1.467775079394732, + "grad_norm": 0.23766298983672868, + "learning_rate": 5.15709562248785e-06, + "loss": 0.1157470703125, + "memory(GiB)": 10.57, + "step": 1965, + "train_speed(iter/s)": 0.474264 + }, + { + "epoch": 1.4715113020736035, + "grad_norm": 0.2475077256620063, + "learning_rate": 5.153015073874704e-06, + "loss": 0.103997802734375, + "memory(GiB)": 10.57, + "step": 1970, + "train_speed(iter/s)": 0.474248 + }, + { + "epoch": 1.4752475247524752, + "grad_norm": 0.2529463798672503, + "learning_rate": 5.148926294821973e-06, + "loss": 0.09212646484375, + "memory(GiB)": 10.57, + "step": 1975, + "train_speed(iter/s)": 0.474282 + }, + { + "epoch": 1.478983747431347, + "grad_norm": 0.34121952234096015, + "learning_rate": 5.144829300960038e-06, + "loss": 0.09998779296875, + "memory(GiB)": 10.57, + "step": 1980, + "train_speed(iter/s)": 0.474279 + }, + { + "epoch": 1.4827199701102185, + "grad_norm": 0.26555171567768715, + "learning_rate": 5.140724107950687e-06, + "loss": 0.10701904296875, + "memory(GiB)": 10.57, + "step": 1985, + "train_speed(iter/s)": 0.474325 + }, + { + "epoch": 1.4864561927890902, + "grad_norm": 0.3012526382519, + "learning_rate": 5.136610731487047e-06, + "loss": 0.10223388671875, + "memory(GiB)": 10.57, + "step": 1990, + "train_speed(iter/s)": 0.474388 + }, + { + "epoch": 1.4901924154679618, + "grad_norm": 0.2585567492074306, + "learning_rate": 5.13248918729353e-06, + "loss": 0.110015869140625, + "memory(GiB)": 10.57, + "step": 1995, + "train_speed(iter/s)": 0.474458 + }, + { + "epoch": 1.4939286381468335, + "grad_norm": 0.21553275657329446, + "learning_rate": 5.128359491125772e-06, + "loss": 0.10537109375, + "memory(GiB)": 10.57, + "step": 2000, + "train_speed(iter/s)": 0.474436 + }, + { + "epoch": 1.4976648608257053, + "grad_norm": 0.23393892148099255, + "learning_rate": 5.1242216587705726e-06, + "loss": 0.09471435546875, + "memory(GiB)": 10.57, + "step": 2005, + "train_speed(iter/s)": 0.474455 + }, + { + "epoch": 1.501401083504577, + "grad_norm": 0.1982523301744199, + "learning_rate": 5.1200757060458305e-06, + "loss": 0.094744873046875, + "memory(GiB)": 10.57, + "step": 2010, + "train_speed(iter/s)": 0.47441 + }, + { + "epoch": 1.5051373061834485, + "grad_norm": 0.24897221547603635, + "learning_rate": 5.11592164880049e-06, + "loss": 0.094281005859375, + "memory(GiB)": 10.57, + "step": 2015, + "train_speed(iter/s)": 0.474368 + }, + { + "epoch": 1.50887352886232, + "grad_norm": 0.2524388493286587, + "learning_rate": 5.111759502914477e-06, + "loss": 0.10567626953125, + "memory(GiB)": 10.57, + "step": 2020, + "train_speed(iter/s)": 0.474413 + }, + { + "epoch": 1.5126097515411918, + "grad_norm": 0.2821918241104093, + "learning_rate": 5.107589284298635e-06, + "loss": 0.10643310546875, + "memory(GiB)": 10.57, + "step": 2025, + "train_speed(iter/s)": 0.474445 + }, + { + "epoch": 1.5163459742200636, + "grad_norm": 0.1949063316633063, + "learning_rate": 5.10341100889467e-06, + "loss": 0.10220947265625, + "memory(GiB)": 10.57, + "step": 2030, + "train_speed(iter/s)": 0.474452 + }, + { + "epoch": 1.5200821968989353, + "grad_norm": 0.2664640714650226, + "learning_rate": 5.0992246926750866e-06, + "loss": 0.1039306640625, + "memory(GiB)": 10.57, + "step": 2035, + "train_speed(iter/s)": 0.474496 + }, + { + "epoch": 1.5238184195778068, + "grad_norm": 0.15706118788240764, + "learning_rate": 5.095030351643129e-06, + "loss": 0.0922210693359375, + "memory(GiB)": 10.57, + "step": 2040, + "train_speed(iter/s)": 0.474486 + }, + { + "epoch": 1.5275546422566784, + "grad_norm": 0.2929327425758124, + "learning_rate": 5.090828001832715e-06, + "loss": 0.1028076171875, + "memory(GiB)": 10.57, + "step": 2045, + "train_speed(iter/s)": 0.474519 + }, + { + "epoch": 1.5312908649355501, + "grad_norm": 0.21976773396934837, + "learning_rate": 5.0866176593083805e-06, + "loss": 0.1067626953125, + "memory(GiB)": 10.57, + "step": 2050, + "train_speed(iter/s)": 0.474561 + }, + { + "epoch": 1.5350270876144219, + "grad_norm": 0.22682626802364397, + "learning_rate": 5.082399340165214e-06, + "loss": 0.10389404296875, + "memory(GiB)": 10.57, + "step": 2055, + "train_speed(iter/s)": 0.474594 + }, + { + "epoch": 1.5387633102932936, + "grad_norm": 0.2279293975450204, + "learning_rate": 5.0781730605287985e-06, + "loss": 0.102423095703125, + "memory(GiB)": 10.57, + "step": 2060, + "train_speed(iter/s)": 0.474651 + }, + { + "epoch": 1.5424995329721651, + "grad_norm": 0.21127637298228888, + "learning_rate": 5.073938836555145e-06, + "loss": 0.11668701171875, + "memory(GiB)": 10.57, + "step": 2065, + "train_speed(iter/s)": 0.474653 + }, + { + "epoch": 1.5462357556510367, + "grad_norm": 0.23883103143189194, + "learning_rate": 5.069696684430639e-06, + "loss": 0.10777587890625, + "memory(GiB)": 10.57, + "step": 2070, + "train_speed(iter/s)": 0.474569 + }, + { + "epoch": 1.5499719783299084, + "grad_norm": 0.19708822331757736, + "learning_rate": 5.065446620371966e-06, + "loss": 0.10965576171875, + "memory(GiB)": 10.57, + "step": 2075, + "train_speed(iter/s)": 0.474611 + }, + { + "epoch": 1.5537082010087802, + "grad_norm": 0.22428614901572544, + "learning_rate": 5.061188660626064e-06, + "loss": 0.08321533203125, + "memory(GiB)": 10.57, + "step": 2080, + "train_speed(iter/s)": 0.474592 + }, + { + "epoch": 1.557444423687652, + "grad_norm": 0.23095421524064055, + "learning_rate": 5.056922821470048e-06, + "loss": 0.1009521484375, + "memory(GiB)": 10.57, + "step": 2085, + "train_speed(iter/s)": 0.474622 + }, + { + "epoch": 1.5611806463665234, + "grad_norm": 0.2871546333696532, + "learning_rate": 5.052649119211159e-06, + "loss": 0.1187744140625, + "memory(GiB)": 10.57, + "step": 2090, + "train_speed(iter/s)": 0.474601 + }, + { + "epoch": 1.564916869045395, + "grad_norm": 0.21613086763978323, + "learning_rate": 5.048367570186694e-06, + "loss": 0.1031494140625, + "memory(GiB)": 10.57, + "step": 2095, + "train_speed(iter/s)": 0.474627 + }, + { + "epoch": 1.5686530917242667, + "grad_norm": 0.2485805730125251, + "learning_rate": 5.044078190763949e-06, + "loss": 0.09178466796875, + "memory(GiB)": 10.57, + "step": 2100, + "train_speed(iter/s)": 0.474608 + }, + { + "epoch": 1.5723893144031384, + "grad_norm": 0.2501433468360814, + "learning_rate": 5.039780997340148e-06, + "loss": 0.096502685546875, + "memory(GiB)": 10.57, + "step": 2105, + "train_speed(iter/s)": 0.474639 + }, + { + "epoch": 1.5761255370820102, + "grad_norm": 0.2625314288905634, + "learning_rate": 5.035476006342392e-06, + "loss": 0.12071533203125, + "memory(GiB)": 10.57, + "step": 2110, + "train_speed(iter/s)": 0.474666 + }, + { + "epoch": 1.5798617597608817, + "grad_norm": 0.25070127552544946, + "learning_rate": 5.031163234227587e-06, + "loss": 0.102880859375, + "memory(GiB)": 10.57, + "step": 2115, + "train_speed(iter/s)": 0.474731 + }, + { + "epoch": 1.5835979824397532, + "grad_norm": 0.2730775843332172, + "learning_rate": 5.026842697482386e-06, + "loss": 0.107745361328125, + "memory(GiB)": 10.57, + "step": 2120, + "train_speed(iter/s)": 0.47469 + }, + { + "epoch": 1.587334205118625, + "grad_norm": 0.3168533915295129, + "learning_rate": 5.022514412623122e-06, + "loss": 0.10606689453125, + "memory(GiB)": 10.57, + "step": 2125, + "train_speed(iter/s)": 0.474712 + }, + { + "epoch": 1.5910704277974967, + "grad_norm": 0.26414617810461144, + "learning_rate": 5.018178396195749e-06, + "loss": 0.114739990234375, + "memory(GiB)": 10.57, + "step": 2130, + "train_speed(iter/s)": 0.474667 + }, + { + "epoch": 1.5948066504763685, + "grad_norm": 0.2884403060168701, + "learning_rate": 5.013834664775775e-06, + "loss": 0.09578857421875, + "memory(GiB)": 10.57, + "step": 2135, + "train_speed(iter/s)": 0.474686 + }, + { + "epoch": 1.59854287315524, + "grad_norm": 0.17316814005290654, + "learning_rate": 5.009483234968204e-06, + "loss": 0.09461669921875, + "memory(GiB)": 10.57, + "step": 2140, + "train_speed(iter/s)": 0.474718 + }, + { + "epoch": 1.6022790958341118, + "grad_norm": 0.20180870823591296, + "learning_rate": 5.005124123407466e-06, + "loss": 0.1016357421875, + "memory(GiB)": 10.57, + "step": 2145, + "train_speed(iter/s)": 0.474763 + }, + { + "epoch": 1.6060153185129833, + "grad_norm": 0.28225684517263877, + "learning_rate": 5.0007573467573556e-06, + "loss": 0.0999755859375, + "memory(GiB)": 10.57, + "step": 2150, + "train_speed(iter/s)": 0.474781 + }, + { + "epoch": 1.609751541191855, + "grad_norm": 0.11744325613491245, + "learning_rate": 4.996382921710973e-06, + "loss": 0.088720703125, + "memory(GiB)": 10.57, + "step": 2155, + "train_speed(iter/s)": 0.474755 + }, + { + "epoch": 1.6134877638707268, + "grad_norm": 0.34760100976149216, + "learning_rate": 4.992000864990652e-06, + "loss": 0.112939453125, + "memory(GiB)": 10.57, + "step": 2160, + "train_speed(iter/s)": 0.474772 + }, + { + "epoch": 1.6172239865495983, + "grad_norm": 0.22604747445071158, + "learning_rate": 4.987611193347903e-06, + "loss": 0.089892578125, + "memory(GiB)": 10.57, + "step": 2165, + "train_speed(iter/s)": 0.474717 + }, + { + "epoch": 1.62096020922847, + "grad_norm": 0.28280682170193416, + "learning_rate": 4.983213923563347e-06, + "loss": 0.0989990234375, + "memory(GiB)": 10.57, + "step": 2170, + "train_speed(iter/s)": 0.474738 + }, + { + "epoch": 1.6246964319073416, + "grad_norm": 0.22814666006274306, + "learning_rate": 4.978809072446648e-06, + "loss": 0.0938232421875, + "memory(GiB)": 10.57, + "step": 2175, + "train_speed(iter/s)": 0.474723 + }, + { + "epoch": 1.6284326545862133, + "grad_norm": 0.26304826342931886, + "learning_rate": 4.974396656836454e-06, + "loss": 0.09578857421875, + "memory(GiB)": 10.57, + "step": 2180, + "train_speed(iter/s)": 0.474661 + }, + { + "epoch": 1.632168877265085, + "grad_norm": 0.3174530542273234, + "learning_rate": 4.969976693600328e-06, + "loss": 0.08758544921875, + "memory(GiB)": 10.57, + "step": 2185, + "train_speed(iter/s)": 0.474686 + }, + { + "epoch": 1.6359050999439566, + "grad_norm": 0.2533342016854265, + "learning_rate": 4.965549199634688e-06, + "loss": 0.095849609375, + "memory(GiB)": 10.57, + "step": 2190, + "train_speed(iter/s)": 0.474707 + }, + { + "epoch": 1.6396413226228284, + "grad_norm": 0.2795419703573222, + "learning_rate": 4.96111419186474e-06, + "loss": 0.09959716796875, + "memory(GiB)": 10.57, + "step": 2195, + "train_speed(iter/s)": 0.474746 + }, + { + "epoch": 1.6433775453016999, + "grad_norm": 0.2244253656669392, + "learning_rate": 4.95667168724441e-06, + "loss": 0.103564453125, + "memory(GiB)": 10.57, + "step": 2200, + "train_speed(iter/s)": 0.474702 + }, + { + "epoch": 1.6471137679805716, + "grad_norm": 0.2568324687784542, + "learning_rate": 4.952221702756288e-06, + "loss": 0.1037445068359375, + "memory(GiB)": 10.57, + "step": 2205, + "train_speed(iter/s)": 0.474722 + }, + { + "epoch": 1.6508499906594434, + "grad_norm": 0.3956651516840788, + "learning_rate": 4.947764255411551e-06, + "loss": 0.11588134765625, + "memory(GiB)": 10.57, + "step": 2210, + "train_speed(iter/s)": 0.474738 + }, + { + "epoch": 1.6545862133383151, + "grad_norm": 0.20985100077876295, + "learning_rate": 4.943299362249912e-06, + "loss": 0.099951171875, + "memory(GiB)": 10.57, + "step": 2215, + "train_speed(iter/s)": 0.474773 + }, + { + "epoch": 1.6583224360171867, + "grad_norm": 0.1962140667346041, + "learning_rate": 4.9388270403395415e-06, + "loss": 0.10343017578125, + "memory(GiB)": 10.57, + "step": 2220, + "train_speed(iter/s)": 0.474776 + }, + { + "epoch": 1.6620586586960582, + "grad_norm": 0.22503137462618433, + "learning_rate": 4.934347306777012e-06, + "loss": 0.1007568359375, + "memory(GiB)": 10.57, + "step": 2225, + "train_speed(iter/s)": 0.474752 + }, + { + "epoch": 1.66579488137493, + "grad_norm": 0.22195673002837232, + "learning_rate": 4.929860178687226e-06, + "loss": 0.091131591796875, + "memory(GiB)": 10.57, + "step": 2230, + "train_speed(iter/s)": 0.474771 + }, + { + "epoch": 1.6695311040538017, + "grad_norm": 0.3168855098173885, + "learning_rate": 4.9253656732233564e-06, + "loss": 0.11160888671875, + "memory(GiB)": 10.57, + "step": 2235, + "train_speed(iter/s)": 0.474768 + }, + { + "epoch": 1.6732673267326734, + "grad_norm": 0.1738888875381385, + "learning_rate": 4.920863807566776e-06, + "loss": 0.0958465576171875, + "memory(GiB)": 10.57, + "step": 2240, + "train_speed(iter/s)": 0.474725 + }, + { + "epoch": 1.677003549411545, + "grad_norm": 0.2552273932950652, + "learning_rate": 4.9163545989269944e-06, + "loss": 0.09219970703125, + "memory(GiB)": 10.57, + "step": 2245, + "train_speed(iter/s)": 0.474729 + }, + { + "epoch": 1.6807397720904165, + "grad_norm": 0.3060989271500881, + "learning_rate": 4.9118380645415905e-06, + "loss": 0.100439453125, + "memory(GiB)": 10.57, + "step": 2250, + "train_speed(iter/s)": 0.474737 + }, + { + "epoch": 1.6844759947692882, + "grad_norm": 0.2949704093412238, + "learning_rate": 4.907314221676149e-06, + "loss": 0.102716064453125, + "memory(GiB)": 10.57, + "step": 2255, + "train_speed(iter/s)": 0.474753 + }, + { + "epoch": 1.68821221744816, + "grad_norm": 0.28246484565713104, + "learning_rate": 4.902783087624195e-06, + "loss": 0.104339599609375, + "memory(GiB)": 10.57, + "step": 2260, + "train_speed(iter/s)": 0.474772 + }, + { + "epoch": 1.6919484401270317, + "grad_norm": 0.2912739109964812, + "learning_rate": 4.89824467970712e-06, + "loss": 0.09698486328125, + "memory(GiB)": 10.57, + "step": 2265, + "train_speed(iter/s)": 0.474766 + }, + { + "epoch": 1.6956846628059032, + "grad_norm": 0.20297905907906486, + "learning_rate": 4.8936990152741276e-06, + "loss": 0.10142822265625, + "memory(GiB)": 10.57, + "step": 2270, + "train_speed(iter/s)": 0.474788 + }, + { + "epoch": 1.6994208854847748, + "grad_norm": 0.27675872548007086, + "learning_rate": 4.88914611170216e-06, + "loss": 0.11038818359375, + "memory(GiB)": 10.57, + "step": 2275, + "train_speed(iter/s)": 0.474807 + }, + { + "epoch": 1.7031571081636465, + "grad_norm": 0.26312724669069576, + "learning_rate": 4.88458598639583e-06, + "loss": 0.10172119140625, + "memory(GiB)": 10.57, + "step": 2280, + "train_speed(iter/s)": 0.474842 + }, + { + "epoch": 1.7068933308425183, + "grad_norm": 0.2905331610134025, + "learning_rate": 4.880018656787359e-06, + "loss": 0.09381103515625, + "memory(GiB)": 10.57, + "step": 2285, + "train_speed(iter/s)": 0.474842 + }, + { + "epoch": 1.71062955352139, + "grad_norm": 0.34444149002078045, + "learning_rate": 4.8754441403365105e-06, + "loss": 0.1239501953125, + "memory(GiB)": 10.57, + "step": 2290, + "train_speed(iter/s)": 0.47486 + }, + { + "epoch": 1.7143657762002615, + "grad_norm": 0.2738462078711773, + "learning_rate": 4.8708624545305185e-06, + "loss": 0.0885498046875, + "memory(GiB)": 10.57, + "step": 2295, + "train_speed(iter/s)": 0.474827 + }, + { + "epoch": 1.718101998879133, + "grad_norm": 0.28959854575833754, + "learning_rate": 4.866273616884027e-06, + "loss": 0.11025390625, + "memory(GiB)": 10.57, + "step": 2300, + "train_speed(iter/s)": 0.474849 + }, + { + "epoch": 1.7218382215580048, + "grad_norm": 0.20588142938995796, + "learning_rate": 4.861677644939015e-06, + "loss": 0.08424072265625, + "memory(GiB)": 10.57, + "step": 2305, + "train_speed(iter/s)": 0.474856 + }, + { + "epoch": 1.7255744442368766, + "grad_norm": 0.3354441601677246, + "learning_rate": 4.857074556264738e-06, + "loss": 0.1094970703125, + "memory(GiB)": 10.57, + "step": 2310, + "train_speed(iter/s)": 0.474867 + }, + { + "epoch": 1.7293106669157483, + "grad_norm": 0.20426806575301326, + "learning_rate": 4.852464368457656e-06, + "loss": 0.10550537109375, + "memory(GiB)": 10.57, + "step": 2315, + "train_speed(iter/s)": 0.474874 + }, + { + "epoch": 1.7330468895946198, + "grad_norm": 0.23904264143395532, + "learning_rate": 4.8478470991413675e-06, + "loss": 0.086602783203125, + "memory(GiB)": 10.57, + "step": 2320, + "train_speed(iter/s)": 0.474876 + }, + { + "epoch": 1.7367831122734914, + "grad_norm": 0.22442760094437317, + "learning_rate": 4.84322276596654e-06, + "loss": 0.10830078125, + "memory(GiB)": 10.57, + "step": 2325, + "train_speed(iter/s)": 0.4749 + }, + { + "epoch": 1.740519334952363, + "grad_norm": 0.22627089113762092, + "learning_rate": 4.838591386610846e-06, + "loss": 0.0934814453125, + "memory(GiB)": 10.57, + "step": 2330, + "train_speed(iter/s)": 0.474923 + }, + { + "epoch": 1.7442555576312349, + "grad_norm": 0.212873273345035, + "learning_rate": 4.833952978778896e-06, + "loss": 0.10042724609375, + "memory(GiB)": 10.57, + "step": 2335, + "train_speed(iter/s)": 0.474953 + }, + { + "epoch": 1.7479917803101066, + "grad_norm": 0.310168401865503, + "learning_rate": 4.829307560202164e-06, + "loss": 0.090283203125, + "memory(GiB)": 10.57, + "step": 2340, + "train_speed(iter/s)": 0.47497 + }, + { + "epoch": 1.7517280029889781, + "grad_norm": 0.25363080821630596, + "learning_rate": 4.824655148638925e-06, + "loss": 0.09075927734375, + "memory(GiB)": 10.57, + "step": 2345, + "train_speed(iter/s)": 0.474997 + }, + { + "epoch": 1.7554642256678497, + "grad_norm": 0.2287201903267125, + "learning_rate": 4.81999576187419e-06, + "loss": 0.122119140625, + "memory(GiB)": 10.57, + "step": 2350, + "train_speed(iter/s)": 0.474997 + }, + { + "epoch": 1.7592004483467214, + "grad_norm": 0.360999021305386, + "learning_rate": 4.815329417719632e-06, + "loss": 0.11300048828125, + "memory(GiB)": 10.57, + "step": 2355, + "train_speed(iter/s)": 0.474979 + }, + { + "epoch": 1.7629366710255931, + "grad_norm": 0.2535783044832626, + "learning_rate": 4.810656134013522e-06, + "loss": 0.108135986328125, + "memory(GiB)": 10.57, + "step": 2360, + "train_speed(iter/s)": 0.474978 + }, + { + "epoch": 1.766672893704465, + "grad_norm": 0.32574474831453987, + "learning_rate": 4.805975928620656e-06, + "loss": 0.10255126953125, + "memory(GiB)": 10.57, + "step": 2365, + "train_speed(iter/s)": 0.47493 + }, + { + "epoch": 1.7704091163833364, + "grad_norm": 0.19234656846328618, + "learning_rate": 4.801288819432292e-06, + "loss": 0.10970458984375, + "memory(GiB)": 10.57, + "step": 2370, + "train_speed(iter/s)": 0.474954 + }, + { + "epoch": 1.774145339062208, + "grad_norm": 0.2139672272846014, + "learning_rate": 4.79659482436608e-06, + "loss": 0.09434814453125, + "memory(GiB)": 10.57, + "step": 2375, + "train_speed(iter/s)": 0.474927 + }, + { + "epoch": 1.7778815617410797, + "grad_norm": 0.2978805049656468, + "learning_rate": 4.791893961365992e-06, + "loss": 0.11248779296875, + "memory(GiB)": 10.57, + "step": 2380, + "train_speed(iter/s)": 0.474937 + }, + { + "epoch": 1.7816177844199514, + "grad_norm": 0.20130959752649452, + "learning_rate": 4.787186248402255e-06, + "loss": 0.0978759765625, + "memory(GiB)": 10.57, + "step": 2385, + "train_speed(iter/s)": 0.474949 + }, + { + "epoch": 1.7853540070988232, + "grad_norm": 0.29180997165297434, + "learning_rate": 4.782471703471281e-06, + "loss": 0.112115478515625, + "memory(GiB)": 10.57, + "step": 2390, + "train_speed(iter/s)": 0.475004 + }, + { + "epoch": 1.7890902297776947, + "grad_norm": 0.35716522757327235, + "learning_rate": 4.777750344595599e-06, + "loss": 0.111859130859375, + "memory(GiB)": 10.57, + "step": 2395, + "train_speed(iter/s)": 0.475038 + }, + { + "epoch": 1.7928264524565665, + "grad_norm": 0.20213639606383335, + "learning_rate": 4.773022189823787e-06, + "loss": 0.09229736328125, + "memory(GiB)": 10.57, + "step": 2400, + "train_speed(iter/s)": 0.475057 + }, + { + "epoch": 1.796562675135438, + "grad_norm": 0.2865105053142085, + "learning_rate": 4.768287257230401e-06, + "loss": 0.097021484375, + "memory(GiB)": 10.57, + "step": 2405, + "train_speed(iter/s)": 0.475109 + }, + { + "epoch": 1.8002988978143097, + "grad_norm": 0.21308993463861362, + "learning_rate": 4.763545564915908e-06, + "loss": 0.0991943359375, + "memory(GiB)": 10.57, + "step": 2410, + "train_speed(iter/s)": 0.475081 + }, + { + "epoch": 1.8040351204931815, + "grad_norm": 0.23525035418815923, + "learning_rate": 4.758797131006613e-06, + "loss": 0.0963623046875, + "memory(GiB)": 10.57, + "step": 2415, + "train_speed(iter/s)": 0.475099 + }, + { + "epoch": 1.807771343172053, + "grad_norm": 0.21883109136220677, + "learning_rate": 4.754041973654596e-06, + "loss": 0.092449951171875, + "memory(GiB)": 10.57, + "step": 2420, + "train_speed(iter/s)": 0.475037 + }, + { + "epoch": 1.8115075658509248, + "grad_norm": 0.3077520982362397, + "learning_rate": 4.749280111037637e-06, + "loss": 0.113623046875, + "memory(GiB)": 10.57, + "step": 2425, + "train_speed(iter/s)": 0.475075 + }, + { + "epoch": 1.8152437885297963, + "grad_norm": 0.32425955991836447, + "learning_rate": 4.7445115613591496e-06, + "loss": 0.09962158203125, + "memory(GiB)": 10.57, + "step": 2430, + "train_speed(iter/s)": 0.475116 + }, + { + "epoch": 1.818980011208668, + "grad_norm": 0.32297534935048733, + "learning_rate": 4.739736342848108e-06, + "loss": 0.09112548828125, + "memory(GiB)": 10.57, + "step": 2435, + "train_speed(iter/s)": 0.475123 + }, + { + "epoch": 1.8227162338875398, + "grad_norm": 0.21046232051363747, + "learning_rate": 4.734954473758984e-06, + "loss": 0.08634033203125, + "memory(GiB)": 10.57, + "step": 2440, + "train_speed(iter/s)": 0.47511 + }, + { + "epoch": 1.8264524565664113, + "grad_norm": 0.1757652117500697, + "learning_rate": 4.730165972371668e-06, + "loss": 0.1082275390625, + "memory(GiB)": 10.57, + "step": 2445, + "train_speed(iter/s)": 0.475149 + }, + { + "epoch": 1.830188679245283, + "grad_norm": 0.25911116090794284, + "learning_rate": 4.725370856991408e-06, + "loss": 0.1029541015625, + "memory(GiB)": 10.57, + "step": 2450, + "train_speed(iter/s)": 0.475184 + }, + { + "epoch": 1.8339249019241546, + "grad_norm": 0.34390479485101666, + "learning_rate": 4.720569145948732e-06, + "loss": 0.11917724609375, + "memory(GiB)": 10.57, + "step": 2455, + "train_speed(iter/s)": 0.475229 + }, + { + "epoch": 1.8376611246030263, + "grad_norm": 0.2682881042332428, + "learning_rate": 4.715760857599386e-06, + "loss": 0.09146728515625, + "memory(GiB)": 10.57, + "step": 2460, + "train_speed(iter/s)": 0.475248 + }, + { + "epoch": 1.841397347281898, + "grad_norm": 0.19430110744207282, + "learning_rate": 4.710946010324257e-06, + "loss": 0.10311279296875, + "memory(GiB)": 10.57, + "step": 2465, + "train_speed(iter/s)": 0.475206 + }, + { + "epoch": 1.8451335699607698, + "grad_norm": 0.27883436818284973, + "learning_rate": 4.706124622529303e-06, + "loss": 0.10494384765625, + "memory(GiB)": 10.57, + "step": 2470, + "train_speed(iter/s)": 0.475183 + }, + { + "epoch": 1.8488697926396414, + "grad_norm": 0.31596787268028487, + "learning_rate": 4.7012967126454875e-06, + "loss": 0.08948974609375, + "memory(GiB)": 10.57, + "step": 2475, + "train_speed(iter/s)": 0.47521 + }, + { + "epoch": 1.8526060153185129, + "grad_norm": 0.31069646386041977, + "learning_rate": 4.696462299128708e-06, + "loss": 0.08408203125, + "memory(GiB)": 10.57, + "step": 2480, + "train_speed(iter/s)": 0.475194 + }, + { + "epoch": 1.8563422379973846, + "grad_norm": 0.2061030284127865, + "learning_rate": 4.691621400459718e-06, + "loss": 0.09312744140625, + "memory(GiB)": 10.57, + "step": 2485, + "train_speed(iter/s)": 0.475183 + }, + { + "epoch": 1.8600784606762564, + "grad_norm": 0.2927277785286754, + "learning_rate": 4.686774035144067e-06, + "loss": 0.104736328125, + "memory(GiB)": 10.57, + "step": 2490, + "train_speed(iter/s)": 0.475219 + }, + { + "epoch": 1.8638146833551281, + "grad_norm": 0.27419348046623093, + "learning_rate": 4.681920221712026e-06, + "loss": 0.10330810546875, + "memory(GiB)": 10.57, + "step": 2495, + "train_speed(iter/s)": 0.475193 + }, + { + "epoch": 1.8675509060339996, + "grad_norm": 0.2618512568544601, + "learning_rate": 4.67705997871851e-06, + "loss": 0.09486083984375, + "memory(GiB)": 10.57, + "step": 2500, + "train_speed(iter/s)": 0.475193 + }, + { + "epoch": 1.8712871287128712, + "grad_norm": 0.2616692317535369, + "learning_rate": 4.6721933247430155e-06, + "loss": 0.10108642578125, + "memory(GiB)": 10.57, + "step": 2505, + "train_speed(iter/s)": 0.475234 + }, + { + "epoch": 1.875023351391743, + "grad_norm": 0.37832147071618105, + "learning_rate": 4.667320278389548e-06, + "loss": 0.094085693359375, + "memory(GiB)": 10.57, + "step": 2510, + "train_speed(iter/s)": 0.475221 + }, + { + "epoch": 1.8787595740706147, + "grad_norm": 0.24687088782500174, + "learning_rate": 4.662440858286548e-06, + "loss": 0.09676513671875, + "memory(GiB)": 10.57, + "step": 2515, + "train_speed(iter/s)": 0.475216 + }, + { + "epoch": 1.8824957967494864, + "grad_norm": 0.234016616688346, + "learning_rate": 4.657555083086823e-06, + "loss": 0.10130615234375, + "memory(GiB)": 10.57, + "step": 2520, + "train_speed(iter/s)": 0.475251 + }, + { + "epoch": 1.886232019428358, + "grad_norm": 0.238817474808307, + "learning_rate": 4.65266297146747e-06, + "loss": 0.097900390625, + "memory(GiB)": 10.57, + "step": 2525, + "train_speed(iter/s)": 0.475255 + }, + { + "epoch": 1.8899682421072295, + "grad_norm": 0.207645191573174, + "learning_rate": 4.647764542129812e-06, + "loss": 0.091064453125, + "memory(GiB)": 10.57, + "step": 2530, + "train_speed(iter/s)": 0.475271 + }, + { + "epoch": 1.8937044647861012, + "grad_norm": 0.38113365892750667, + "learning_rate": 4.642859813799324e-06, + "loss": 0.118853759765625, + "memory(GiB)": 10.57, + "step": 2535, + "train_speed(iter/s)": 0.475293 + }, + { + "epoch": 1.897440687464973, + "grad_norm": 0.19816679538437149, + "learning_rate": 4.637948805225559e-06, + "loss": 0.08568115234375, + "memory(GiB)": 10.57, + "step": 2540, + "train_speed(iter/s)": 0.475228 + }, + { + "epoch": 1.9011769101438447, + "grad_norm": 0.23604249041392467, + "learning_rate": 4.633031535182075e-06, + "loss": 0.11710205078125, + "memory(GiB)": 10.57, + "step": 2545, + "train_speed(iter/s)": 0.47526 + }, + { + "epoch": 1.9049131328227162, + "grad_norm": 0.24670385102759632, + "learning_rate": 4.6281080224663716e-06, + "loss": 0.087890625, + "memory(GiB)": 10.57, + "step": 2550, + "train_speed(iter/s)": 0.475273 + }, + { + "epoch": 1.9086493555015878, + "grad_norm": 0.2847144171201072, + "learning_rate": 4.62317828589981e-06, + "loss": 0.104248046875, + "memory(GiB)": 10.57, + "step": 2555, + "train_speed(iter/s)": 0.475257 + }, + { + "epoch": 1.9123855781804595, + "grad_norm": 0.3178684000074, + "learning_rate": 4.618242344327542e-06, + "loss": 0.0997802734375, + "memory(GiB)": 10.57, + "step": 2560, + "train_speed(iter/s)": 0.475268 + }, + { + "epoch": 1.9161218008593313, + "grad_norm": 0.2554865843964831, + "learning_rate": 4.613300216618441e-06, + "loss": 0.097015380859375, + "memory(GiB)": 10.57, + "step": 2565, + "train_speed(iter/s)": 0.475299 + }, + { + "epoch": 1.919858023538203, + "grad_norm": 0.2965767135219661, + "learning_rate": 4.608351921665029e-06, + "loss": 0.10614013671875, + "memory(GiB)": 10.57, + "step": 2570, + "train_speed(iter/s)": 0.475332 + }, + { + "epoch": 1.9235942462170745, + "grad_norm": 0.4039822442089598, + "learning_rate": 4.603397478383403e-06, + "loss": 0.10904541015625, + "memory(GiB)": 10.57, + "step": 2575, + "train_speed(iter/s)": 0.475287 + }, + { + "epoch": 1.927330468895946, + "grad_norm": 0.25628472854278145, + "learning_rate": 4.5984369057131656e-06, + "loss": 0.0983642578125, + "memory(GiB)": 10.57, + "step": 2580, + "train_speed(iter/s)": 0.475305 + }, + { + "epoch": 1.9310666915748178, + "grad_norm": 0.2779068338896975, + "learning_rate": 4.5934702226173455e-06, + "loss": 0.098095703125, + "memory(GiB)": 10.57, + "step": 2585, + "train_speed(iter/s)": 0.475271 + }, + { + "epoch": 1.9348029142536896, + "grad_norm": 0.281249239607163, + "learning_rate": 4.588497448082336e-06, + "loss": 0.129345703125, + "memory(GiB)": 10.57, + "step": 2590, + "train_speed(iter/s)": 0.475263 + }, + { + "epoch": 1.9385391369325613, + "grad_norm": 0.18136865279150907, + "learning_rate": 4.583518601117812e-06, + "loss": 0.089013671875, + "memory(GiB)": 10.57, + "step": 2595, + "train_speed(iter/s)": 0.47529 + }, + { + "epoch": 1.9422753596114328, + "grad_norm": 0.3240659543460739, + "learning_rate": 4.578533700756666e-06, + "loss": 0.11053466796875, + "memory(GiB)": 10.57, + "step": 2600, + "train_speed(iter/s)": 0.475327 + }, + { + "epoch": 1.9460115822903044, + "grad_norm": 0.19903277137682823, + "learning_rate": 4.573542766054926e-06, + "loss": 0.1120361328125, + "memory(GiB)": 10.57, + "step": 2605, + "train_speed(iter/s)": 0.475344 + }, + { + "epoch": 1.949747804969176, + "grad_norm": 0.24138123028972722, + "learning_rate": 4.568545816091691e-06, + "loss": 0.08602294921875, + "memory(GiB)": 10.57, + "step": 2610, + "train_speed(iter/s)": 0.475337 + }, + { + "epoch": 1.9534840276480478, + "grad_norm": 0.28322280343269146, + "learning_rate": 4.563542869969055e-06, + "loss": 0.08720703125, + "memory(GiB)": 10.57, + "step": 2615, + "train_speed(iter/s)": 0.475316 + }, + { + "epoch": 1.9572202503269196, + "grad_norm": 0.249240836739657, + "learning_rate": 4.558533946812034e-06, + "loss": 0.093548583984375, + "memory(GiB)": 10.57, + "step": 2620, + "train_speed(iter/s)": 0.475334 + }, + { + "epoch": 1.9609564730057911, + "grad_norm": 0.26762802652785167, + "learning_rate": 4.55351906576849e-06, + "loss": 0.08345947265625, + "memory(GiB)": 10.57, + "step": 2625, + "train_speed(iter/s)": 0.475361 + }, + { + "epoch": 1.9646926956846626, + "grad_norm": 0.22273584638151617, + "learning_rate": 4.548498246009062e-06, + "loss": 0.10457763671875, + "memory(GiB)": 10.57, + "step": 2630, + "train_speed(iter/s)": 0.475364 + }, + { + "epoch": 1.9684289183635344, + "grad_norm": 0.3304879364377937, + "learning_rate": 4.543471506727094e-06, + "loss": 0.1021240234375, + "memory(GiB)": 10.57, + "step": 2635, + "train_speed(iter/s)": 0.475354 + }, + { + "epoch": 1.9721651410424061, + "grad_norm": 0.29863906262334294, + "learning_rate": 4.538438867138554e-06, + "loss": 0.10843505859375, + "memory(GiB)": 10.57, + "step": 2640, + "train_speed(iter/s)": 0.475332 + }, + { + "epoch": 1.975901363721278, + "grad_norm": 0.2714963446386557, + "learning_rate": 4.533400346481969e-06, + "loss": 0.097955322265625, + "memory(GiB)": 10.57, + "step": 2645, + "train_speed(iter/s)": 0.475329 + }, + { + "epoch": 1.9796375864001494, + "grad_norm": 0.3336618360843215, + "learning_rate": 4.528355964018347e-06, + "loss": 0.09144287109375, + "memory(GiB)": 10.57, + "step": 2650, + "train_speed(iter/s)": 0.475305 + }, + { + "epoch": 1.983373809079021, + "grad_norm": 0.2980584550792422, + "learning_rate": 4.523305739031104e-06, + "loss": 0.0895965576171875, + "memory(GiB)": 10.57, + "step": 2655, + "train_speed(iter/s)": 0.475329 + }, + { + "epoch": 1.9871100317578927, + "grad_norm": 0.2720629310615164, + "learning_rate": 4.518249690825988e-06, + "loss": 0.1112548828125, + "memory(GiB)": 10.57, + "step": 2660, + "train_speed(iter/s)": 0.475346 + }, + { + "epoch": 1.9908462544367644, + "grad_norm": 0.3546253789825318, + "learning_rate": 4.5131878387310135e-06, + "loss": 0.12337646484375, + "memory(GiB)": 10.57, + "step": 2665, + "train_speed(iter/s)": 0.475357 + }, + { + "epoch": 1.9945824771156362, + "grad_norm": 0.28424801518849385, + "learning_rate": 4.508120202096376e-06, + "loss": 0.109814453125, + "memory(GiB)": 10.57, + "step": 2670, + "train_speed(iter/s)": 0.475325 + }, + { + "epoch": 1.9983186997945077, + "grad_norm": 0.21729292843099146, + "learning_rate": 4.5030468002943874e-06, + "loss": 0.0903076171875, + "memory(GiB)": 10.57, + "step": 2675, + "train_speed(iter/s)": 0.475307 + }, + { + "epoch": 2.0014944890715487, + "grad_norm": 0.2345088903539215, + "learning_rate": 4.497967652719397e-06, + "loss": 0.08399658203125, + "memory(GiB)": 10.57, + "step": 2680, + "train_speed(iter/s)": 0.475181 + }, + { + "epoch": 2.0052307117504204, + "grad_norm": 0.15184847590072537, + "learning_rate": 4.492882778787718e-06, + "loss": 0.07313232421875, + "memory(GiB)": 10.57, + "step": 2685, + "train_speed(iter/s)": 0.475183 + }, + { + "epoch": 2.008966934429292, + "grad_norm": 0.19979306629529392, + "learning_rate": 4.487792197937558e-06, + "loss": 0.0822509765625, + "memory(GiB)": 10.57, + "step": 2690, + "train_speed(iter/s)": 0.475199 + }, + { + "epoch": 2.0127031571081635, + "grad_norm": 0.2757404700372733, + "learning_rate": 4.482695929628936e-06, + "loss": 0.083453369140625, + "memory(GiB)": 10.57, + "step": 2695, + "train_speed(iter/s)": 0.475215 + }, + { + "epoch": 2.0164393797870352, + "grad_norm": 0.2560396040817178, + "learning_rate": 4.477593993343614e-06, + "loss": 0.0873291015625, + "memory(GiB)": 10.57, + "step": 2700, + "train_speed(iter/s)": 0.475205 + }, + { + "epoch": 2.020175602465907, + "grad_norm": 0.26086772363802274, + "learning_rate": 4.472486408585022e-06, + "loss": 0.084521484375, + "memory(GiB)": 10.57, + "step": 2705, + "train_speed(iter/s)": 0.475236 + }, + { + "epoch": 2.0239118251447787, + "grad_norm": 0.2694766103158065, + "learning_rate": 4.467373194878183e-06, + "loss": 0.0845458984375, + "memory(GiB)": 10.57, + "step": 2710, + "train_speed(iter/s)": 0.475254 + }, + { + "epoch": 2.0276480478236505, + "grad_norm": 0.36339657819849375, + "learning_rate": 4.462254371769637e-06, + "loss": 0.08817138671875, + "memory(GiB)": 10.57, + "step": 2715, + "train_speed(iter/s)": 0.475273 + }, + { + "epoch": 2.031384270502522, + "grad_norm": 0.1574529728668933, + "learning_rate": 4.457129958827369e-06, + "loss": 0.07781982421875, + "memory(GiB)": 10.57, + "step": 2720, + "train_speed(iter/s)": 0.475304 + }, + { + "epoch": 2.0351204931813935, + "grad_norm": 0.2776966602079697, + "learning_rate": 4.451999975640731e-06, + "loss": 0.079388427734375, + "memory(GiB)": 10.57, + "step": 2725, + "train_speed(iter/s)": 0.47531 + }, + { + "epoch": 2.0388567158602653, + "grad_norm": 0.1584963725015156, + "learning_rate": 4.446864441820368e-06, + "loss": 0.09000244140625, + "memory(GiB)": 10.57, + "step": 2730, + "train_speed(iter/s)": 0.475323 + }, + { + "epoch": 2.042592938539137, + "grad_norm": 0.22327421262837086, + "learning_rate": 4.441723376998147e-06, + "loss": 0.0762939453125, + "memory(GiB)": 10.57, + "step": 2735, + "train_speed(iter/s)": 0.475273 + }, + { + "epoch": 2.046329161218009, + "grad_norm": 0.33057687684074827, + "learning_rate": 4.436576800827074e-06, + "loss": 0.06875762939453126, + "memory(GiB)": 10.57, + "step": 2740, + "train_speed(iter/s)": 0.475284 + }, + { + "epoch": 2.05006538389688, + "grad_norm": 0.29923227392853685, + "learning_rate": 4.431424732981228e-06, + "loss": 0.06706466674804687, + "memory(GiB)": 10.57, + "step": 2745, + "train_speed(iter/s)": 0.475292 + }, + { + "epoch": 2.053801606575752, + "grad_norm": 0.4030927309740962, + "learning_rate": 4.426267193155678e-06, + "loss": 0.075927734375, + "memory(GiB)": 10.57, + "step": 2750, + "train_speed(iter/s)": 0.475316 + }, + { + "epoch": 2.0575378292546236, + "grad_norm": 0.37117244198948085, + "learning_rate": 4.4211042010664135e-06, + "loss": 0.07960205078125, + "memory(GiB)": 10.57, + "step": 2755, + "train_speed(iter/s)": 0.475314 + }, + { + "epoch": 2.0612740519334953, + "grad_norm": 0.31391095462008983, + "learning_rate": 4.415935776450264e-06, + "loss": 0.09554443359375, + "memory(GiB)": 10.57, + "step": 2760, + "train_speed(iter/s)": 0.475317 + }, + { + "epoch": 2.065010274612367, + "grad_norm": 0.17975702587106152, + "learning_rate": 4.410761939064827e-06, + "loss": 0.07388916015625, + "memory(GiB)": 10.57, + "step": 2765, + "train_speed(iter/s)": 0.475337 + }, + { + "epoch": 2.0687464972912384, + "grad_norm": 0.3396889402601098, + "learning_rate": 4.405582708688395e-06, + "loss": 0.084979248046875, + "memory(GiB)": 10.57, + "step": 2770, + "train_speed(iter/s)": 0.475352 + }, + { + "epoch": 2.07248271997011, + "grad_norm": 0.24563175886180283, + "learning_rate": 4.400398105119872e-06, + "loss": 0.08388671875, + "memory(GiB)": 10.57, + "step": 2775, + "train_speed(iter/s)": 0.475388 + }, + { + "epoch": 2.076218942648982, + "grad_norm": 0.2558763668832394, + "learning_rate": 4.395208148178704e-06, + "loss": 0.0897216796875, + "memory(GiB)": 10.57, + "step": 2780, + "train_speed(iter/s)": 0.475404 + }, + { + "epoch": 2.0799551653278536, + "grad_norm": 0.3548268406619161, + "learning_rate": 4.390012857704802e-06, + "loss": 0.08565673828125, + "memory(GiB)": 10.57, + "step": 2785, + "train_speed(iter/s)": 0.4754 + }, + { + "epoch": 2.0836913880067254, + "grad_norm": 0.326064743718348, + "learning_rate": 4.384812253558467e-06, + "loss": 0.08856201171875, + "memory(GiB)": 10.57, + "step": 2790, + "train_speed(iter/s)": 0.47541 + }, + { + "epoch": 2.0874276106855967, + "grad_norm": 0.3250783826701612, + "learning_rate": 4.37960635562031e-06, + "loss": 0.083563232421875, + "memory(GiB)": 10.57, + "step": 2795, + "train_speed(iter/s)": 0.475407 + }, + { + "epoch": 2.0911638333644684, + "grad_norm": 0.1928343549830928, + "learning_rate": 4.3743951837911804e-06, + "loss": 0.0770751953125, + "memory(GiB)": 10.57, + "step": 2800, + "train_speed(iter/s)": 0.475418 + }, + { + "epoch": 2.09490005604334, + "grad_norm": 0.3314940438350291, + "learning_rate": 4.3691787579920886e-06, + "loss": 0.0668182373046875, + "memory(GiB)": 10.57, + "step": 2805, + "train_speed(iter/s)": 0.475443 + }, + { + "epoch": 2.098636278722212, + "grad_norm": 0.25557946764887945, + "learning_rate": 4.363957098164129e-06, + "loss": 0.09249267578125, + "memory(GiB)": 10.57, + "step": 2810, + "train_speed(iter/s)": 0.475472 + }, + { + "epoch": 2.1023725014010837, + "grad_norm": 0.2834236723948582, + "learning_rate": 4.358730224268404e-06, + "loss": 0.076348876953125, + "memory(GiB)": 10.57, + "step": 2815, + "train_speed(iter/s)": 0.475457 + }, + { + "epoch": 2.106108724079955, + "grad_norm": 0.17913726646319922, + "learning_rate": 4.353498156285951e-06, + "loss": 0.0684478759765625, + "memory(GiB)": 10.57, + "step": 2820, + "train_speed(iter/s)": 0.475474 + }, + { + "epoch": 2.1098449467588267, + "grad_norm": 0.39181628904806004, + "learning_rate": 4.3482609142176585e-06, + "loss": 0.08323974609375, + "memory(GiB)": 10.57, + "step": 2825, + "train_speed(iter/s)": 0.475472 + }, + { + "epoch": 2.1135811694376985, + "grad_norm": 0.3689042584118628, + "learning_rate": 4.343018518084197e-06, + "loss": 0.08089599609375, + "memory(GiB)": 10.57, + "step": 2830, + "train_speed(iter/s)": 0.475507 + }, + { + "epoch": 2.11731739211657, + "grad_norm": 0.30944635697905426, + "learning_rate": 4.337770987925941e-06, + "loss": 0.074566650390625, + "memory(GiB)": 10.57, + "step": 2835, + "train_speed(iter/s)": 0.475485 + }, + { + "epoch": 2.121053614795442, + "grad_norm": 0.20965343005966453, + "learning_rate": 4.332518343802886e-06, + "loss": 0.0746063232421875, + "memory(GiB)": 10.57, + "step": 2840, + "train_speed(iter/s)": 0.475453 + }, + { + "epoch": 2.1247898374743133, + "grad_norm": 0.24055286896299563, + "learning_rate": 4.327260605794583e-06, + "loss": 0.0832275390625, + "memory(GiB)": 10.57, + "step": 2845, + "train_speed(iter/s)": 0.475488 + }, + { + "epoch": 2.128526060153185, + "grad_norm": 0.30278392143378924, + "learning_rate": 4.321997794000053e-06, + "loss": 0.09150390625, + "memory(GiB)": 10.57, + "step": 2850, + "train_speed(iter/s)": 0.475506 + }, + { + "epoch": 2.1322622828320568, + "grad_norm": 0.3357493665071166, + "learning_rate": 4.316729928537712e-06, + "loss": 0.077264404296875, + "memory(GiB)": 10.57, + "step": 2855, + "train_speed(iter/s)": 0.475505 + }, + { + "epoch": 2.1359985055109285, + "grad_norm": 0.28839246476160085, + "learning_rate": 4.311457029545295e-06, + "loss": 0.07557373046875, + "memory(GiB)": 10.57, + "step": 2860, + "train_speed(iter/s)": 0.475494 + }, + { + "epoch": 2.1397347281898003, + "grad_norm": 0.3587645451871882, + "learning_rate": 4.30617911717978e-06, + "loss": 0.08240966796875, + "memory(GiB)": 10.57, + "step": 2865, + "train_speed(iter/s)": 0.475522 + }, + { + "epoch": 2.1434709508686716, + "grad_norm": 0.21348435074986552, + "learning_rate": 4.3008962116173105e-06, + "loss": 0.06397705078125, + "memory(GiB)": 10.57, + "step": 2870, + "train_speed(iter/s)": 0.47546 + }, + { + "epoch": 2.1472071735475433, + "grad_norm": 0.24044644726569717, + "learning_rate": 4.295608333053115e-06, + "loss": 0.0892333984375, + "memory(GiB)": 10.57, + "step": 2875, + "train_speed(iter/s)": 0.475493 + }, + { + "epoch": 2.150943396226415, + "grad_norm": 0.271844882428932, + "learning_rate": 4.290315501701436e-06, + "loss": 0.07017822265625, + "memory(GiB)": 10.57, + "step": 2880, + "train_speed(iter/s)": 0.475506 + }, + { + "epoch": 2.154679618905287, + "grad_norm": 0.32275562789715756, + "learning_rate": 4.285017737795447e-06, + "loss": 0.094970703125, + "memory(GiB)": 10.57, + "step": 2885, + "train_speed(iter/s)": 0.475543 + }, + { + "epoch": 2.1584158415841586, + "grad_norm": 0.19204227011392838, + "learning_rate": 4.279715061587176e-06, + "loss": 0.082275390625, + "memory(GiB)": 10.57, + "step": 2890, + "train_speed(iter/s)": 0.475515 + }, + { + "epoch": 2.1621520642630303, + "grad_norm": 0.3187374981569435, + "learning_rate": 4.274407493347435e-06, + "loss": 0.073956298828125, + "memory(GiB)": 10.57, + "step": 2895, + "train_speed(iter/s)": 0.475498 + }, + { + "epoch": 2.1658882869419016, + "grad_norm": 0.31518550432451825, + "learning_rate": 4.26909505336573e-06, + "loss": 0.08779296875, + "memory(GiB)": 10.57, + "step": 2900, + "train_speed(iter/s)": 0.475501 + }, + { + "epoch": 2.1696245096207734, + "grad_norm": 0.25742777623976215, + "learning_rate": 4.2637777619501955e-06, + "loss": 0.068133544921875, + "memory(GiB)": 10.57, + "step": 2905, + "train_speed(iter/s)": 0.475526 + }, + { + "epoch": 2.173360732299645, + "grad_norm": 0.327461904975564, + "learning_rate": 4.258455639427512e-06, + "loss": 0.07855224609375, + "memory(GiB)": 10.57, + "step": 2910, + "train_speed(iter/s)": 0.475516 + }, + { + "epoch": 2.177096954978517, + "grad_norm": 0.2947045587842032, + "learning_rate": 4.253128706142823e-06, + "loss": 0.078759765625, + "memory(GiB)": 10.57, + "step": 2915, + "train_speed(iter/s)": 0.475556 + }, + { + "epoch": 2.180833177657388, + "grad_norm": 0.24106474434323896, + "learning_rate": 4.2477969824596675e-06, + "loss": 0.0806396484375, + "memory(GiB)": 10.57, + "step": 2920, + "train_speed(iter/s)": 0.475576 + }, + { + "epoch": 2.18456940033626, + "grad_norm": 0.35498053988232225, + "learning_rate": 4.2424604887598956e-06, + "loss": 0.08232421875, + "memory(GiB)": 10.57, + "step": 2925, + "train_speed(iter/s)": 0.475536 + }, + { + "epoch": 2.1883056230151317, + "grad_norm": 0.30444021185040904, + "learning_rate": 4.237119245443591e-06, + "loss": 0.08363037109375, + "memory(GiB)": 10.57, + "step": 2930, + "train_speed(iter/s)": 0.475537 + }, + { + "epoch": 2.1920418456940034, + "grad_norm": 0.2844894921351017, + "learning_rate": 4.231773272928995e-06, + "loss": 0.0828857421875, + "memory(GiB)": 10.57, + "step": 2935, + "train_speed(iter/s)": 0.475519 + }, + { + "epoch": 2.195778068372875, + "grad_norm": 0.3680515586014792, + "learning_rate": 4.226422591652426e-06, + "loss": 0.0849609375, + "memory(GiB)": 10.57, + "step": 2940, + "train_speed(iter/s)": 0.475527 + }, + { + "epoch": 2.199514291051747, + "grad_norm": 0.3347584264458827, + "learning_rate": 4.221067222068204e-06, + "loss": 0.07615966796875, + "memory(GiB)": 10.57, + "step": 2945, + "train_speed(iter/s)": 0.475506 + }, + { + "epoch": 2.203250513730618, + "grad_norm": 0.24357214909557126, + "learning_rate": 4.215707184648571e-06, + "loss": 0.071929931640625, + "memory(GiB)": 10.57, + "step": 2950, + "train_speed(iter/s)": 0.475535 + }, + { + "epoch": 2.20698673640949, + "grad_norm": 0.2969870033632324, + "learning_rate": 4.2103424998836166e-06, + "loss": 0.0795166015625, + "memory(GiB)": 10.57, + "step": 2955, + "train_speed(iter/s)": 0.475539 + }, + { + "epoch": 2.2107229590883617, + "grad_norm": 0.2597821857641748, + "learning_rate": 4.204973188281187e-06, + "loss": 0.078076171875, + "memory(GiB)": 10.57, + "step": 2960, + "train_speed(iter/s)": 0.475554 + }, + { + "epoch": 2.2144591817672334, + "grad_norm": 0.345560787249567, + "learning_rate": 4.199599270366825e-06, + "loss": 0.085748291015625, + "memory(GiB)": 10.57, + "step": 2965, + "train_speed(iter/s)": 0.47555 + }, + { + "epoch": 2.218195404446105, + "grad_norm": 0.30970032428526245, + "learning_rate": 4.1942207666836765e-06, + "loss": 0.082818603515625, + "memory(GiB)": 10.57, + "step": 2970, + "train_speed(iter/s)": 0.475506 + }, + { + "epoch": 2.2219316271249765, + "grad_norm": 0.3183590391694136, + "learning_rate": 4.188837697792421e-06, + "loss": 0.0791748046875, + "memory(GiB)": 10.57, + "step": 2975, + "train_speed(iter/s)": 0.475502 + }, + { + "epoch": 2.2256678498038482, + "grad_norm": 0.40743149107649224, + "learning_rate": 4.183450084271186e-06, + "loss": 0.085736083984375, + "memory(GiB)": 10.57, + "step": 2980, + "train_speed(iter/s)": 0.475528 + }, + { + "epoch": 2.22940407248272, + "grad_norm": 0.36574069885687205, + "learning_rate": 4.178057946715476e-06, + "loss": 0.08839111328125, + "memory(GiB)": 10.57, + "step": 2985, + "train_speed(iter/s)": 0.475523 + }, + { + "epoch": 2.2331402951615917, + "grad_norm": 0.29949255358893273, + "learning_rate": 4.172661305738086e-06, + "loss": 0.076226806640625, + "memory(GiB)": 10.57, + "step": 2990, + "train_speed(iter/s)": 0.4755 + }, + { + "epoch": 2.2368765178404635, + "grad_norm": 0.2645783347146312, + "learning_rate": 4.167260181969031e-06, + "loss": 0.0787109375, + "memory(GiB)": 10.57, + "step": 2995, + "train_speed(iter/s)": 0.475505 + }, + { + "epoch": 2.240612740519335, + "grad_norm": 0.35590583986728974, + "learning_rate": 4.161854596055458e-06, + "loss": 0.082958984375, + "memory(GiB)": 10.57, + "step": 3000, + "train_speed(iter/s)": 0.475522 + }, + { + "epoch": 2.2443489631982065, + "grad_norm": 0.2855462271704881, + "learning_rate": 4.156444568661574e-06, + "loss": 0.0782135009765625, + "memory(GiB)": 10.57, + "step": 3005, + "train_speed(iter/s)": 0.475556 + }, + { + "epoch": 2.2480851858770783, + "grad_norm": 0.23189643301309532, + "learning_rate": 4.151030120468563e-06, + "loss": 0.08284912109375, + "memory(GiB)": 10.57, + "step": 3010, + "train_speed(iter/s)": 0.475525 + }, + { + "epoch": 2.25182140855595, + "grad_norm": 0.2823549603550444, + "learning_rate": 4.145611272174513e-06, + "loss": 0.1001220703125, + "memory(GiB)": 10.57, + "step": 3015, + "train_speed(iter/s)": 0.47551 + }, + { + "epoch": 2.255557631234822, + "grad_norm": 0.3123102072825862, + "learning_rate": 4.140188044494328e-06, + "loss": 0.0789306640625, + "memory(GiB)": 10.57, + "step": 3020, + "train_speed(iter/s)": 0.475473 + }, + { + "epoch": 2.259293853913693, + "grad_norm": 0.34390166190396304, + "learning_rate": 4.134760458159652e-06, + "loss": 0.088250732421875, + "memory(GiB)": 10.57, + "step": 3025, + "train_speed(iter/s)": 0.475474 + }, + { + "epoch": 2.263030076592565, + "grad_norm": 0.4471657878322189, + "learning_rate": 4.1293285339187975e-06, + "loss": 0.08520050048828125, + "memory(GiB)": 10.57, + "step": 3030, + "train_speed(iter/s)": 0.475502 + }, + { + "epoch": 2.2667662992714366, + "grad_norm": 0.29627009892222517, + "learning_rate": 4.123892292536655e-06, + "loss": 0.0954498291015625, + "memory(GiB)": 10.57, + "step": 3035, + "train_speed(iter/s)": 0.475527 + }, + { + "epoch": 2.2705025219503083, + "grad_norm": 0.2103142983370086, + "learning_rate": 4.118451754794616e-06, + "loss": 0.079296875, + "memory(GiB)": 10.57, + "step": 3040, + "train_speed(iter/s)": 0.47555 + }, + { + "epoch": 2.27423874462918, + "grad_norm": 0.29094874204231086, + "learning_rate": 4.113006941490504e-06, + "loss": 0.07890625, + "memory(GiB)": 10.57, + "step": 3045, + "train_speed(iter/s)": 0.475543 + }, + { + "epoch": 2.2779749673080514, + "grad_norm": 0.2944500502582637, + "learning_rate": 4.1075578734384796e-06, + "loss": 0.07510986328125, + "memory(GiB)": 10.57, + "step": 3050, + "train_speed(iter/s)": 0.475485 + }, + { + "epoch": 2.281711189986923, + "grad_norm": 0.247526345569416, + "learning_rate": 4.1021045714689715e-06, + "loss": 0.062725830078125, + "memory(GiB)": 10.57, + "step": 3055, + "train_speed(iter/s)": 0.4755 + }, + { + "epoch": 2.285447412665795, + "grad_norm": 0.2223509166017715, + "learning_rate": 4.096647056428591e-06, + "loss": 0.08511962890625, + "memory(GiB)": 10.57, + "step": 3060, + "train_speed(iter/s)": 0.475511 + }, + { + "epoch": 2.2891836353446666, + "grad_norm": 0.40394852915768165, + "learning_rate": 4.0911853491800606e-06, + "loss": 0.078338623046875, + "memory(GiB)": 10.57, + "step": 3065, + "train_speed(iter/s)": 0.475523 + }, + { + "epoch": 2.2929198580235384, + "grad_norm": 0.3262435355040092, + "learning_rate": 4.085719470602121e-06, + "loss": 0.085260009765625, + "memory(GiB)": 10.57, + "step": 3070, + "train_speed(iter/s)": 0.475506 + }, + { + "epoch": 2.2966560807024097, + "grad_norm": 0.30731468388186667, + "learning_rate": 4.080249441589465e-06, + "loss": 0.081439208984375, + "memory(GiB)": 10.57, + "step": 3075, + "train_speed(iter/s)": 0.47553 + }, + { + "epoch": 2.3003923033812814, + "grad_norm": 0.2619319232654712, + "learning_rate": 4.074775283052647e-06, + "loss": 0.07823486328125, + "memory(GiB)": 10.57, + "step": 3080, + "train_speed(iter/s)": 0.475536 + }, + { + "epoch": 2.304128526060153, + "grad_norm": 0.28997697963247854, + "learning_rate": 4.069297015918012e-06, + "loss": 0.080047607421875, + "memory(GiB)": 10.57, + "step": 3085, + "train_speed(iter/s)": 0.475543 + }, + { + "epoch": 2.307864748739025, + "grad_norm": 0.3041055152853103, + "learning_rate": 4.063814661127607e-06, + "loss": 0.085015869140625, + "memory(GiB)": 10.57, + "step": 3090, + "train_speed(iter/s)": 0.475538 + }, + { + "epoch": 2.3116009714178967, + "grad_norm": 0.28074738714998865, + "learning_rate": 4.058328239639108e-06, + "loss": 0.0771240234375, + "memory(GiB)": 10.57, + "step": 3095, + "train_speed(iter/s)": 0.475537 + }, + { + "epoch": 2.3153371940967684, + "grad_norm": 0.2742208472612064, + "learning_rate": 4.052837772425735e-06, + "loss": 0.071533203125, + "memory(GiB)": 10.57, + "step": 3100, + "train_speed(iter/s)": 0.475526 + }, + { + "epoch": 2.3190734167756397, + "grad_norm": 0.2738394747920133, + "learning_rate": 4.0473432804761745e-06, + "loss": 0.074151611328125, + "memory(GiB)": 10.57, + "step": 3105, + "train_speed(iter/s)": 0.475533 + }, + { + "epoch": 2.3228096394545115, + "grad_norm": 0.3325363093754662, + "learning_rate": 4.0418447847945e-06, + "loss": 0.07762451171875, + "memory(GiB)": 10.57, + "step": 3110, + "train_speed(iter/s)": 0.475573 + }, + { + "epoch": 2.326545862133383, + "grad_norm": 0.29208910041820724, + "learning_rate": 4.036342306400087e-06, + "loss": 0.08729248046875, + "memory(GiB)": 10.57, + "step": 3115, + "train_speed(iter/s)": 0.475557 + }, + { + "epoch": 2.330282084812255, + "grad_norm": 0.2986291580987787, + "learning_rate": 4.03083586632754e-06, + "loss": 0.070965576171875, + "memory(GiB)": 10.57, + "step": 3120, + "train_speed(iter/s)": 0.475591 + }, + { + "epoch": 2.3340183074911263, + "grad_norm": 0.2715172245264193, + "learning_rate": 4.025325485626604e-06, + "loss": 0.07711181640625, + "memory(GiB)": 10.57, + "step": 3125, + "train_speed(iter/s)": 0.475607 + }, + { + "epoch": 2.337754530169998, + "grad_norm": 0.28383527690267557, + "learning_rate": 4.01981118536209e-06, + "loss": 0.073974609375, + "memory(GiB)": 10.57, + "step": 3130, + "train_speed(iter/s)": 0.475608 + }, + { + "epoch": 2.3414907528488698, + "grad_norm": 0.4294056030563819, + "learning_rate": 4.014292986613795e-06, + "loss": 0.09591064453125, + "memory(GiB)": 10.57, + "step": 3135, + "train_speed(iter/s)": 0.475616 + }, + { + "epoch": 2.3452269755277415, + "grad_norm": 0.324672085272647, + "learning_rate": 4.008770910476415e-06, + "loss": 0.073956298828125, + "memory(GiB)": 10.57, + "step": 3140, + "train_speed(iter/s)": 0.475642 + }, + { + "epoch": 2.3489631982066133, + "grad_norm": 0.33039068217728207, + "learning_rate": 4.003244978059466e-06, + "loss": 0.082257080078125, + "memory(GiB)": 10.57, + "step": 3145, + "train_speed(iter/s)": 0.475644 + }, + { + "epoch": 2.352699420885485, + "grad_norm": 0.25727097167399077, + "learning_rate": 3.997715210487215e-06, + "loss": 0.078131103515625, + "memory(GiB)": 10.57, + "step": 3150, + "train_speed(iter/s)": 0.475682 + }, + { + "epoch": 2.3564356435643563, + "grad_norm": 0.3005461408551253, + "learning_rate": 3.992181628898582e-06, + "loss": 0.0718292236328125, + "memory(GiB)": 10.57, + "step": 3155, + "train_speed(iter/s)": 0.475677 + }, + { + "epoch": 2.360171866243228, + "grad_norm": 0.21717097651290396, + "learning_rate": 3.986644254447067e-06, + "loss": 0.084930419921875, + "memory(GiB)": 10.57, + "step": 3160, + "train_speed(iter/s)": 0.475668 + }, + { + "epoch": 2.3639080889221, + "grad_norm": 0.2740183483391346, + "learning_rate": 3.981103108300674e-06, + "loss": 0.08662109375, + "memory(GiB)": 10.57, + "step": 3165, + "train_speed(iter/s)": 0.475671 + }, + { + "epoch": 2.3676443116009716, + "grad_norm": 0.23952800833281973, + "learning_rate": 3.975558211641822e-06, + "loss": 0.085614013671875, + "memory(GiB)": 10.57, + "step": 3170, + "train_speed(iter/s)": 0.475681 + }, + { + "epoch": 2.371380534279843, + "grad_norm": 0.20740773834062282, + "learning_rate": 3.970009585667267e-06, + "loss": 0.0666015625, + "memory(GiB)": 10.57, + "step": 3175, + "train_speed(iter/s)": 0.475702 + }, + { + "epoch": 2.3751167569587146, + "grad_norm": 0.3093587039146876, + "learning_rate": 3.964457251588023e-06, + "loss": 0.07269287109375, + "memory(GiB)": 10.57, + "step": 3180, + "train_speed(iter/s)": 0.475703 + }, + { + "epoch": 2.3788529796375864, + "grad_norm": 0.3535470284455733, + "learning_rate": 3.958901230629277e-06, + "loss": 0.0844482421875, + "memory(GiB)": 10.57, + "step": 3185, + "train_speed(iter/s)": 0.475708 + }, + { + "epoch": 2.382589202316458, + "grad_norm": 0.3279555931100402, + "learning_rate": 3.953341544030311e-06, + "loss": 0.08740234375, + "memory(GiB)": 10.57, + "step": 3190, + "train_speed(iter/s)": 0.475712 + }, + { + "epoch": 2.38632542499533, + "grad_norm": 0.37799827875806785, + "learning_rate": 3.947778213044423e-06, + "loss": 0.06464996337890624, + "memory(GiB)": 10.57, + "step": 3195, + "train_speed(iter/s)": 0.475685 + }, + { + "epoch": 2.3900616476742016, + "grad_norm": 0.21175755993638834, + "learning_rate": 3.942211258938837e-06, + "loss": 0.079998779296875, + "memory(GiB)": 10.57, + "step": 3200, + "train_speed(iter/s)": 0.475655 + }, + { + "epoch": 2.393797870353073, + "grad_norm": 0.3983514672863944, + "learning_rate": 3.936640702994629e-06, + "loss": 0.07978515625, + "memory(GiB)": 10.57, + "step": 3205, + "train_speed(iter/s)": 0.475627 + }, + { + "epoch": 2.3975340930319446, + "grad_norm": 0.3407681935903124, + "learning_rate": 3.931066566506648e-06, + "loss": 0.08079833984375, + "memory(GiB)": 10.57, + "step": 3210, + "train_speed(iter/s)": 0.475614 + }, + { + "epoch": 2.4012703157108164, + "grad_norm": 0.1829141400287362, + "learning_rate": 3.925488870783426e-06, + "loss": 0.08177490234375, + "memory(GiB)": 10.57, + "step": 3215, + "train_speed(iter/s)": 0.475612 + }, + { + "epoch": 2.405006538389688, + "grad_norm": 0.24647777146358466, + "learning_rate": 3.919907637147102e-06, + "loss": 0.081903076171875, + "memory(GiB)": 10.57, + "step": 3220, + "train_speed(iter/s)": 0.475609 + }, + { + "epoch": 2.4087427610685594, + "grad_norm": 0.38090689812957224, + "learning_rate": 3.914322886933341e-06, + "loss": 0.064569091796875, + "memory(GiB)": 10.57, + "step": 3225, + "train_speed(iter/s)": 0.475619 + }, + { + "epoch": 2.412478983747431, + "grad_norm": 0.2666319657744909, + "learning_rate": 3.908734641491248e-06, + "loss": 0.077764892578125, + "memory(GiB)": 10.57, + "step": 3230, + "train_speed(iter/s)": 0.475645 + }, + { + "epoch": 2.416215206426303, + "grad_norm": 0.22804209432893346, + "learning_rate": 3.903142922183294e-06, + "loss": 0.070025634765625, + "memory(GiB)": 10.57, + "step": 3235, + "train_speed(iter/s)": 0.475584 + }, + { + "epoch": 2.4199514291051747, + "grad_norm": 0.23685896651720773, + "learning_rate": 3.897547750385226e-06, + "loss": 0.0831634521484375, + "memory(GiB)": 10.57, + "step": 3240, + "train_speed(iter/s)": 0.475578 + }, + { + "epoch": 2.4236876517840464, + "grad_norm": 0.2355129405846085, + "learning_rate": 3.891949147485989e-06, + "loss": 0.077679443359375, + "memory(GiB)": 10.57, + "step": 3245, + "train_speed(iter/s)": 0.47556 + }, + { + "epoch": 2.427423874462918, + "grad_norm": 0.38970162877110276, + "learning_rate": 3.886347134887647e-06, + "loss": 0.0797607421875, + "memory(GiB)": 10.57, + "step": 3250, + "train_speed(iter/s)": 0.475557 + }, + { + "epoch": 2.4311600971417895, + "grad_norm": 0.2697647074819102, + "learning_rate": 3.8807417340052964e-06, + "loss": 0.0737060546875, + "memory(GiB)": 10.57, + "step": 3255, + "train_speed(iter/s)": 0.475577 + }, + { + "epoch": 2.4348963198206612, + "grad_norm": 0.19920837434880515, + "learning_rate": 3.875132966266987e-06, + "loss": 0.0791748046875, + "memory(GiB)": 10.57, + "step": 3260, + "train_speed(iter/s)": 0.475596 + }, + { + "epoch": 2.438632542499533, + "grad_norm": 0.22217603367413016, + "learning_rate": 3.869520853113637e-06, + "loss": 0.07099609375, + "memory(GiB)": 10.57, + "step": 3265, + "train_speed(iter/s)": 0.475601 + }, + { + "epoch": 2.4423687651784047, + "grad_norm": 0.310354028282849, + "learning_rate": 3.863905415998958e-06, + "loss": 0.075830078125, + "memory(GiB)": 10.57, + "step": 3270, + "train_speed(iter/s)": 0.475595 + }, + { + "epoch": 2.4461049878572765, + "grad_norm": 0.2904199442330529, + "learning_rate": 3.858286676389363e-06, + "loss": 0.07169189453125, + "memory(GiB)": 10.57, + "step": 3275, + "train_speed(iter/s)": 0.475577 + }, + { + "epoch": 2.449841210536148, + "grad_norm": 0.2671154417988313, + "learning_rate": 3.852664655763891e-06, + "loss": 0.0576446533203125, + "memory(GiB)": 10.57, + "step": 3280, + "train_speed(iter/s)": 0.475573 + }, + { + "epoch": 2.4535774332150195, + "grad_norm": 0.2117803221633462, + "learning_rate": 3.8470393756141285e-06, + "loss": 0.070208740234375, + "memory(GiB)": 10.57, + "step": 3285, + "train_speed(iter/s)": 0.475569 + }, + { + "epoch": 2.4573136558938913, + "grad_norm": 0.28365805075568284, + "learning_rate": 3.8414108574441155e-06, + "loss": 0.07728271484375, + "memory(GiB)": 10.57, + "step": 3290, + "train_speed(iter/s)": 0.475604 + }, + { + "epoch": 2.461049878572763, + "grad_norm": 0.26559512910109384, + "learning_rate": 3.835779122770274e-06, + "loss": 0.07513427734375, + "memory(GiB)": 10.57, + "step": 3295, + "train_speed(iter/s)": 0.475628 + }, + { + "epoch": 2.4647861012516348, + "grad_norm": 0.31583700464598574, + "learning_rate": 3.830144193121321e-06, + "loss": 0.0657806396484375, + "memory(GiB)": 10.57, + "step": 3300, + "train_speed(iter/s)": 0.475643 + }, + { + "epoch": 2.468522323930506, + "grad_norm": 0.2884092438790019, + "learning_rate": 3.824506090038185e-06, + "loss": 0.091070556640625, + "memory(GiB)": 10.57, + "step": 3305, + "train_speed(iter/s)": 0.475667 + }, + { + "epoch": 2.472258546609378, + "grad_norm": 0.3977319977360202, + "learning_rate": 3.818864835073931e-06, + "loss": 0.0851806640625, + "memory(GiB)": 10.57, + "step": 3310, + "train_speed(iter/s)": 0.475693 + }, + { + "epoch": 2.4759947692882496, + "grad_norm": 0.3494999636811868, + "learning_rate": 3.813220449793667e-06, + "loss": 0.064434814453125, + "memory(GiB)": 10.57, + "step": 3315, + "train_speed(iter/s)": 0.475688 + }, + { + "epoch": 2.4797309919671213, + "grad_norm": 0.17667298355698585, + "learning_rate": 3.8075729557744706e-06, + "loss": 0.06602783203125, + "memory(GiB)": 10.57, + "step": 3320, + "train_speed(iter/s)": 0.475718 + }, + { + "epoch": 2.483467214645993, + "grad_norm": 0.2847260138841454, + "learning_rate": 3.8019223746053037e-06, + "loss": 0.0813232421875, + "memory(GiB)": 10.57, + "step": 3325, + "train_speed(iter/s)": 0.47572 + }, + { + "epoch": 2.4872034373248644, + "grad_norm": 0.3276391701017016, + "learning_rate": 3.7962687278869266e-06, + "loss": 0.084173583984375, + "memory(GiB)": 10.57, + "step": 3330, + "train_speed(iter/s)": 0.47573 + }, + { + "epoch": 2.490939660003736, + "grad_norm": 0.20750116064295474, + "learning_rate": 3.7906120372318237e-06, + "loss": 0.055908203125, + "memory(GiB)": 10.57, + "step": 3335, + "train_speed(iter/s)": 0.475771 + }, + { + "epoch": 2.494675882682608, + "grad_norm": 0.21852160072540378, + "learning_rate": 3.784952324264109e-06, + "loss": 0.075030517578125, + "memory(GiB)": 10.57, + "step": 3340, + "train_speed(iter/s)": 0.475804 + }, + { + "epoch": 2.4984121053614796, + "grad_norm": 0.24279228051631654, + "learning_rate": 3.779289610619455e-06, + "loss": 0.07666015625, + "memory(GiB)": 10.57, + "step": 3345, + "train_speed(iter/s)": 0.475805 + }, + { + "epoch": 2.5021483280403514, + "grad_norm": 0.2904472098375547, + "learning_rate": 3.773623917945004e-06, + "loss": 0.092840576171875, + "memory(GiB)": 10.57, + "step": 3350, + "train_speed(iter/s)": 0.475809 + }, + { + "epoch": 2.505884550719223, + "grad_norm": 0.3311881989863495, + "learning_rate": 3.7679552678992854e-06, + "loss": 0.07431640625, + "memory(GiB)": 10.57, + "step": 3355, + "train_speed(iter/s)": 0.475802 + }, + { + "epoch": 2.5096207733980944, + "grad_norm": 0.347020365516737, + "learning_rate": 3.7622836821521346e-06, + "loss": 0.083404541015625, + "memory(GiB)": 10.57, + "step": 3360, + "train_speed(iter/s)": 0.475755 + }, + { + "epoch": 2.513356996076966, + "grad_norm": 0.30218078744076704, + "learning_rate": 3.7566091823846082e-06, + "loss": 0.080633544921875, + "memory(GiB)": 10.57, + "step": 3365, + "train_speed(iter/s)": 0.475751 + }, + { + "epoch": 2.517093218755838, + "grad_norm": 0.19250830743626035, + "learning_rate": 3.750931790288904e-06, + "loss": 0.070989990234375, + "memory(GiB)": 10.57, + "step": 3370, + "train_speed(iter/s)": 0.475766 + }, + { + "epoch": 2.5208294414347097, + "grad_norm": 0.3140116665074889, + "learning_rate": 3.745251527568276e-06, + "loss": 0.08988037109375, + "memory(GiB)": 10.57, + "step": 3375, + "train_speed(iter/s)": 0.475765 + }, + { + "epoch": 2.524565664113581, + "grad_norm": 0.27965921080609724, + "learning_rate": 3.7395684159369515e-06, + "loss": 0.0727783203125, + "memory(GiB)": 10.57, + "step": 3380, + "train_speed(iter/s)": 0.475783 + }, + { + "epoch": 2.5283018867924527, + "grad_norm": 0.2825039712001602, + "learning_rate": 3.733882477120049e-06, + "loss": 0.07235107421875, + "memory(GiB)": 10.57, + "step": 3385, + "train_speed(iter/s)": 0.475777 + }, + { + "epoch": 2.5320381094713245, + "grad_norm": 0.2817704189737431, + "learning_rate": 3.7281937328534927e-06, + "loss": 0.07215576171875, + "memory(GiB)": 10.57, + "step": 3390, + "train_speed(iter/s)": 0.475785 + }, + { + "epoch": 2.535774332150196, + "grad_norm": 0.2984895644961484, + "learning_rate": 3.7225022048839364e-06, + "loss": 0.07979736328125, + "memory(GiB)": 10.57, + "step": 3395, + "train_speed(iter/s)": 0.475804 + }, + { + "epoch": 2.539510554829068, + "grad_norm": 0.4297688864469516, + "learning_rate": 3.716807914968669e-06, + "loss": 0.0768310546875, + "memory(GiB)": 10.57, + "step": 3400, + "train_speed(iter/s)": 0.475802 + }, + { + "epoch": 2.5432467775079397, + "grad_norm": 0.2540092842763994, + "learning_rate": 3.7111108848755407e-06, + "loss": 0.080731201171875, + "memory(GiB)": 10.57, + "step": 3405, + "train_speed(iter/s)": 0.475804 + }, + { + "epoch": 2.546983000186811, + "grad_norm": 0.218855865695132, + "learning_rate": 3.705411136382877e-06, + "loss": 0.07509765625, + "memory(GiB)": 10.57, + "step": 3410, + "train_speed(iter/s)": 0.475824 + }, + { + "epoch": 2.5507192228656828, + "grad_norm": 0.31386617014735185, + "learning_rate": 3.6997086912793953e-06, + "loss": 0.08365478515625, + "memory(GiB)": 10.57, + "step": 3415, + "train_speed(iter/s)": 0.475796 + }, + { + "epoch": 2.5544554455445545, + "grad_norm": 0.2888393651203557, + "learning_rate": 3.69400357136412e-06, + "loss": 0.08245849609375, + "memory(GiB)": 10.57, + "step": 3420, + "train_speed(iter/s)": 0.475804 + }, + { + "epoch": 2.5581916682234263, + "grad_norm": 0.518767980813791, + "learning_rate": 3.6882957984463014e-06, + "loss": 0.084869384765625, + "memory(GiB)": 10.57, + "step": 3425, + "train_speed(iter/s)": 0.475798 + }, + { + "epoch": 2.5619278909022976, + "grad_norm": 0.24055934018386763, + "learning_rate": 3.6825853943453326e-06, + "loss": 0.07509765625, + "memory(GiB)": 10.57, + "step": 3430, + "train_speed(iter/s)": 0.475815 + }, + { + "epoch": 2.5656641135811693, + "grad_norm": 0.11607703015154515, + "learning_rate": 3.6768723808906624e-06, + "loss": 0.0733642578125, + "memory(GiB)": 10.57, + "step": 3435, + "train_speed(iter/s)": 0.475839 + }, + { + "epoch": 2.569400336260041, + "grad_norm": 0.2621128311109813, + "learning_rate": 3.6711567799217177e-06, + "loss": 0.07127685546875, + "memory(GiB)": 10.57, + "step": 3440, + "train_speed(iter/s)": 0.475869 + }, + { + "epoch": 2.573136558938913, + "grad_norm": 0.4650255643831401, + "learning_rate": 3.6654386132878153e-06, + "loss": 0.07940673828125, + "memory(GiB)": 10.57, + "step": 3445, + "train_speed(iter/s)": 0.475873 + }, + { + "epoch": 2.5768727816177845, + "grad_norm": 0.3724024885268326, + "learning_rate": 3.659717902848079e-06, + "loss": 0.07889404296875, + "memory(GiB)": 10.57, + "step": 3450, + "train_speed(iter/s)": 0.475871 + }, + { + "epoch": 2.5806090042966563, + "grad_norm": 0.23714008480261214, + "learning_rate": 3.653994670471358e-06, + "loss": 0.062042236328125, + "memory(GiB)": 10.57, + "step": 3455, + "train_speed(iter/s)": 0.475898 + }, + { + "epoch": 2.5843452269755276, + "grad_norm": 0.38138493209988716, + "learning_rate": 3.6482689380361434e-06, + "loss": 0.078564453125, + "memory(GiB)": 10.57, + "step": 3460, + "train_speed(iter/s)": 0.47589 + }, + { + "epoch": 2.5880814496543993, + "grad_norm": 0.2790205903786827, + "learning_rate": 3.6425407274304794e-06, + "loss": 0.07850341796875, + "memory(GiB)": 10.57, + "step": 3465, + "train_speed(iter/s)": 0.475897 + }, + { + "epoch": 2.591817672333271, + "grad_norm": 0.28268894066227623, + "learning_rate": 3.6368100605518895e-06, + "loss": 0.080084228515625, + "memory(GiB)": 10.57, + "step": 3470, + "train_speed(iter/s)": 0.4759 + }, + { + "epoch": 2.595553895012143, + "grad_norm": 0.40313615278345716, + "learning_rate": 3.631076959307282e-06, + "loss": 0.085107421875, + "memory(GiB)": 10.57, + "step": 3475, + "train_speed(iter/s)": 0.475908 + }, + { + "epoch": 2.599290117691014, + "grad_norm": 0.2734351199877751, + "learning_rate": 3.625341445612872e-06, + "loss": 0.084490966796875, + "memory(GiB)": 10.57, + "step": 3480, + "train_speed(iter/s)": 0.475939 + }, + { + "epoch": 2.603026340369886, + "grad_norm": 0.24165164144941384, + "learning_rate": 3.6196035413941004e-06, + "loss": 0.075732421875, + "memory(GiB)": 10.57, + "step": 3485, + "train_speed(iter/s)": 0.475926 + }, + { + "epoch": 2.6067625630487576, + "grad_norm": 0.22587276792049774, + "learning_rate": 3.6138632685855416e-06, + "loss": 0.06920166015625, + "memory(GiB)": 10.57, + "step": 3490, + "train_speed(iter/s)": 0.47595 + }, + { + "epoch": 2.6104987857276294, + "grad_norm": 0.26274757578605296, + "learning_rate": 3.608120649130827e-06, + "loss": 0.06964111328125, + "memory(GiB)": 10.57, + "step": 3495, + "train_speed(iter/s)": 0.475958 + }, + { + "epoch": 2.614235008406501, + "grad_norm": 0.2791749381588521, + "learning_rate": 3.602375704982559e-06, + "loss": 0.082159423828125, + "memory(GiB)": 10.57, + "step": 3500, + "train_speed(iter/s)": 0.475942 + }, + { + "epoch": 2.617971231085373, + "grad_norm": 0.19097386934636804, + "learning_rate": 3.5966284581022256e-06, + "loss": 0.071124267578125, + "memory(GiB)": 10.57, + "step": 3505, + "train_speed(iter/s)": 0.475946 + }, + { + "epoch": 2.621707453764244, + "grad_norm": 0.30489359623246215, + "learning_rate": 3.5908789304601187e-06, + "loss": 0.0773193359375, + "memory(GiB)": 10.57, + "step": 3510, + "train_speed(iter/s)": 0.475924 + }, + { + "epoch": 2.625443676443116, + "grad_norm": 0.3251670210353117, + "learning_rate": 3.585127144035247e-06, + "loss": 0.0652557373046875, + "memory(GiB)": 10.57, + "step": 3515, + "train_speed(iter/s)": 0.475915 + }, + { + "epoch": 2.6291798991219877, + "grad_norm": 0.47973710424124294, + "learning_rate": 3.579373120815257e-06, + "loss": 0.0652099609375, + "memory(GiB)": 10.57, + "step": 3520, + "train_speed(iter/s)": 0.47582 + }, + { + "epoch": 2.6329161218008594, + "grad_norm": 0.251813320258894, + "learning_rate": 3.5736168827963423e-06, + "loss": 0.0735595703125, + "memory(GiB)": 10.57, + "step": 3525, + "train_speed(iter/s)": 0.475822 + }, + { + "epoch": 2.6366523444797307, + "grad_norm": 0.16642948523661447, + "learning_rate": 3.567858451983167e-06, + "loss": 0.0711456298828125, + "memory(GiB)": 10.57, + "step": 3530, + "train_speed(iter/s)": 0.475776 + }, + { + "epoch": 2.6403885671586025, + "grad_norm": 0.2232206082433094, + "learning_rate": 3.562097850388775e-06, + "loss": 0.08082275390625, + "memory(GiB)": 10.57, + "step": 3535, + "train_speed(iter/s)": 0.475792 + }, + { + "epoch": 2.6441247898374742, + "grad_norm": 0.29955499401855273, + "learning_rate": 3.5563351000345077e-06, + "loss": 0.06729736328125, + "memory(GiB)": 10.57, + "step": 3540, + "train_speed(iter/s)": 0.475806 + }, + { + "epoch": 2.647861012516346, + "grad_norm": 0.3399121760483779, + "learning_rate": 3.5505702229499243e-06, + "loss": 0.0638671875, + "memory(GiB)": 10.57, + "step": 3545, + "train_speed(iter/s)": 0.475786 + }, + { + "epoch": 2.6515972351952177, + "grad_norm": 0.24813478944145864, + "learning_rate": 3.5448032411727123e-06, + "loss": 0.073760986328125, + "memory(GiB)": 10.57, + "step": 3550, + "train_speed(iter/s)": 0.475775 + }, + { + "epoch": 2.6553334578740895, + "grad_norm": 0.20754012538401892, + "learning_rate": 3.539034176748602e-06, + "loss": 0.069378662109375, + "memory(GiB)": 10.57, + "step": 3555, + "train_speed(iter/s)": 0.475759 + }, + { + "epoch": 2.6590696805529612, + "grad_norm": 0.3300071479044449, + "learning_rate": 3.53326305173129e-06, + "loss": 0.0831787109375, + "memory(GiB)": 10.57, + "step": 3560, + "train_speed(iter/s)": 0.475747 + }, + { + "epoch": 2.6628059032318325, + "grad_norm": 0.2418845408277716, + "learning_rate": 3.5274898881823466e-06, + "loss": 0.0650390625, + "memory(GiB)": 10.57, + "step": 3565, + "train_speed(iter/s)": 0.475754 + }, + { + "epoch": 2.6665421259107043, + "grad_norm": 0.191875325205025, + "learning_rate": 3.5217147081711363e-06, + "loss": 0.07650146484375, + "memory(GiB)": 10.57, + "step": 3570, + "train_speed(iter/s)": 0.475774 + }, + { + "epoch": 2.670278348589576, + "grad_norm": 0.2918403056701858, + "learning_rate": 3.515937533774732e-06, + "loss": 0.0787841796875, + "memory(GiB)": 10.57, + "step": 3575, + "train_speed(iter/s)": 0.475801 + }, + { + "epoch": 2.6740145712684473, + "grad_norm": 0.2103497141365804, + "learning_rate": 3.51015838707783e-06, + "loss": 0.083331298828125, + "memory(GiB)": 10.57, + "step": 3580, + "train_speed(iter/s)": 0.475836 + }, + { + "epoch": 2.677750793947319, + "grad_norm": 0.15535646417219773, + "learning_rate": 3.504377290172666e-06, + "loss": 0.0805419921875, + "memory(GiB)": 10.57, + "step": 3585, + "train_speed(iter/s)": 0.475811 + }, + { + "epoch": 2.681487016626191, + "grad_norm": 0.2156487636541889, + "learning_rate": 3.498594265158933e-06, + "loss": 0.0731689453125, + "memory(GiB)": 10.57, + "step": 3590, + "train_speed(iter/s)": 0.47582 + }, + { + "epoch": 2.6852232393050626, + "grad_norm": 0.31756593216849865, + "learning_rate": 3.4928093341436915e-06, + "loss": 0.08016357421875, + "memory(GiB)": 10.57, + "step": 3595, + "train_speed(iter/s)": 0.475826 + }, + { + "epoch": 2.6889594619839343, + "grad_norm": 0.17993011176812954, + "learning_rate": 3.4870225192412908e-06, + "loss": 0.068292236328125, + "memory(GiB)": 10.57, + "step": 3600, + "train_speed(iter/s)": 0.475817 + }, + { + "epoch": 2.692695684662806, + "grad_norm": 0.2563812995989066, + "learning_rate": 3.4812338425732808e-06, + "loss": 0.09036865234375, + "memory(GiB)": 10.57, + "step": 3605, + "train_speed(iter/s)": 0.475841 + }, + { + "epoch": 2.696431907341678, + "grad_norm": 0.21729858304510458, + "learning_rate": 3.4754433262683286e-06, + "loss": 0.070880126953125, + "memory(GiB)": 10.57, + "step": 3610, + "train_speed(iter/s)": 0.475864 + }, + { + "epoch": 2.700168130020549, + "grad_norm": 0.4448881083896266, + "learning_rate": 3.4696509924621324e-06, + "loss": 0.090478515625, + "memory(GiB)": 10.57, + "step": 3615, + "train_speed(iter/s)": 0.475831 + }, + { + "epoch": 2.703904352699421, + "grad_norm": 0.29692075196588846, + "learning_rate": 3.463856863297341e-06, + "loss": 0.078076171875, + "memory(GiB)": 10.57, + "step": 3620, + "train_speed(iter/s)": 0.475848 + }, + { + "epoch": 2.7076405753782926, + "grad_norm": 0.31954279997414836, + "learning_rate": 3.4580609609234648e-06, + "loss": 0.07919921875, + "memory(GiB)": 10.57, + "step": 3625, + "train_speed(iter/s)": 0.475834 + }, + { + "epoch": 2.7113767980571644, + "grad_norm": 0.1723702450513143, + "learning_rate": 3.4522633074967915e-06, + "loss": 0.074517822265625, + "memory(GiB)": 10.57, + "step": 3630, + "train_speed(iter/s)": 0.475811 + }, + { + "epoch": 2.7151130207360357, + "grad_norm": 0.22262320422842827, + "learning_rate": 3.4464639251803052e-06, + "loss": 0.070367431640625, + "memory(GiB)": 10.57, + "step": 3635, + "train_speed(iter/s)": 0.475826 + }, + { + "epoch": 2.7188492434149074, + "grad_norm": 0.28450955603049155, + "learning_rate": 3.4406628361435986e-06, + "loss": 0.08800048828125, + "memory(GiB)": 10.57, + "step": 3640, + "train_speed(iter/s)": 0.475849 + }, + { + "epoch": 2.722585466093779, + "grad_norm": 0.3537764688990701, + "learning_rate": 3.4348600625627853e-06, + "loss": 0.08115081787109375, + "memory(GiB)": 10.57, + "step": 3645, + "train_speed(iter/s)": 0.475856 + }, + { + "epoch": 2.726321688772651, + "grad_norm": 0.2717562915869466, + "learning_rate": 3.4290556266204255e-06, + "loss": 0.06995849609375, + "memory(GiB)": 10.57, + "step": 3650, + "train_speed(iter/s)": 0.475855 + }, + { + "epoch": 2.7300579114515227, + "grad_norm": 0.22750796325018738, + "learning_rate": 3.4232495505054263e-06, + "loss": 0.071771240234375, + "memory(GiB)": 10.57, + "step": 3655, + "train_speed(iter/s)": 0.475875 + }, + { + "epoch": 2.7337941341303944, + "grad_norm": 0.15412260555395027, + "learning_rate": 3.4174418564129683e-06, + "loss": 0.07366943359375, + "memory(GiB)": 10.57, + "step": 3660, + "train_speed(iter/s)": 0.475851 + }, + { + "epoch": 2.7375303568092657, + "grad_norm": 0.22006647714355373, + "learning_rate": 3.4116325665444205e-06, + "loss": 0.07138671875, + "memory(GiB)": 10.57, + "step": 3665, + "train_speed(iter/s)": 0.475871 + }, + { + "epoch": 2.7412665794881375, + "grad_norm": 0.42373302378912014, + "learning_rate": 3.405821703107247e-06, + "loss": 0.081640625, + "memory(GiB)": 10.57, + "step": 3670, + "train_speed(iter/s)": 0.475866 + }, + { + "epoch": 2.745002802167009, + "grad_norm": 0.25034251347665165, + "learning_rate": 3.4000092883149293e-06, + "loss": 0.07459716796875, + "memory(GiB)": 10.57, + "step": 3675, + "train_speed(iter/s)": 0.475862 + }, + { + "epoch": 2.748739024845881, + "grad_norm": 0.26815460719783096, + "learning_rate": 3.3941953443868794e-06, + "loss": 0.0758056640625, + "memory(GiB)": 10.57, + "step": 3680, + "train_speed(iter/s)": 0.475869 + }, + { + "epoch": 2.7524752475247523, + "grad_norm": 0.3488626865913072, + "learning_rate": 3.388379893548356e-06, + "loss": 0.076416015625, + "memory(GiB)": 10.57, + "step": 3685, + "train_speed(iter/s)": 0.475889 + }, + { + "epoch": 2.756211470203624, + "grad_norm": 0.2927879301365204, + "learning_rate": 3.382562958030375e-06, + "loss": 0.072265625, + "memory(GiB)": 10.57, + "step": 3690, + "train_speed(iter/s)": 0.475894 + }, + { + "epoch": 2.7599476928824958, + "grad_norm": 0.39819039701808595, + "learning_rate": 3.376744560069631e-06, + "loss": 0.0801025390625, + "memory(GiB)": 10.57, + "step": 3695, + "train_speed(iter/s)": 0.475889 + }, + { + "epoch": 2.7636839155613675, + "grad_norm": 0.27836721809953646, + "learning_rate": 3.370924721908408e-06, + "loss": 0.081817626953125, + "memory(GiB)": 10.57, + "step": 3700, + "train_speed(iter/s)": 0.475851 + }, + { + "epoch": 2.7674201382402392, + "grad_norm": 0.3159510466408062, + "learning_rate": 3.3651034657944944e-06, + "loss": 0.09007568359375, + "memory(GiB)": 10.57, + "step": 3705, + "train_speed(iter/s)": 0.475839 + }, + { + "epoch": 2.771156360919111, + "grad_norm": 0.2482343530491869, + "learning_rate": 3.3592808139811034e-06, + "loss": 0.08701171875, + "memory(GiB)": 10.57, + "step": 3710, + "train_speed(iter/s)": 0.475854 + }, + { + "epoch": 2.7748925835979823, + "grad_norm": 0.2212717362508163, + "learning_rate": 3.353456788726778e-06, + "loss": 0.089019775390625, + "memory(GiB)": 10.57, + "step": 3715, + "train_speed(iter/s)": 0.475852 + }, + { + "epoch": 2.778628806276854, + "grad_norm": 0.3180240539309867, + "learning_rate": 3.347631412295314e-06, + "loss": 0.078448486328125, + "memory(GiB)": 10.57, + "step": 3720, + "train_speed(iter/s)": 0.475768 + }, + { + "epoch": 2.782365028955726, + "grad_norm": 0.19694686614220888, + "learning_rate": 3.341804706955673e-06, + "loss": 0.071771240234375, + "memory(GiB)": 10.57, + "step": 3725, + "train_speed(iter/s)": 0.475772 + }, + { + "epoch": 2.7861012516345975, + "grad_norm": 0.27207148460273645, + "learning_rate": 3.335976694981898e-06, + "loss": 0.071990966796875, + "memory(GiB)": 10.57, + "step": 3730, + "train_speed(iter/s)": 0.475786 + }, + { + "epoch": 2.789837474313469, + "grad_norm": 0.2784440972147361, + "learning_rate": 3.3301473986530204e-06, + "loss": 0.08033447265625, + "memory(GiB)": 10.57, + "step": 3735, + "train_speed(iter/s)": 0.475803 + }, + { + "epoch": 2.7935736969923406, + "grad_norm": 0.384630172157372, + "learning_rate": 3.3243168402529903e-06, + "loss": 0.07603759765625, + "memory(GiB)": 10.57, + "step": 3740, + "train_speed(iter/s)": 0.475835 + }, + { + "epoch": 2.7973099196712123, + "grad_norm": 0.3015764425828606, + "learning_rate": 3.318485042070576e-06, + "loss": 0.070220947265625, + "memory(GiB)": 10.57, + "step": 3745, + "train_speed(iter/s)": 0.475867 + }, + { + "epoch": 2.801046142350084, + "grad_norm": 0.33638080331152426, + "learning_rate": 3.3126520263992883e-06, + "loss": 0.078277587890625, + "memory(GiB)": 10.57, + "step": 3750, + "train_speed(iter/s)": 0.475859 + }, + { + "epoch": 2.804782365028956, + "grad_norm": 0.2624352148398618, + "learning_rate": 3.306817815537291e-06, + "loss": 0.0696044921875, + "memory(GiB)": 10.57, + "step": 3755, + "train_speed(iter/s)": 0.47588 + }, + { + "epoch": 2.8085185877078276, + "grad_norm": 0.27781369223511193, + "learning_rate": 3.3009824317873164e-06, + "loss": 0.058050537109375, + "memory(GiB)": 10.57, + "step": 3760, + "train_speed(iter/s)": 0.475896 + }, + { + "epoch": 2.812254810386699, + "grad_norm": 0.1340015202269091, + "learning_rate": 3.2951458974565808e-06, + "loss": 0.08018798828125, + "memory(GiB)": 10.57, + "step": 3765, + "train_speed(iter/s)": 0.475889 + }, + { + "epoch": 2.8159910330655706, + "grad_norm": 0.20980509524344693, + "learning_rate": 3.2893082348567e-06, + "loss": 0.069110107421875, + "memory(GiB)": 10.57, + "step": 3770, + "train_speed(iter/s)": 0.475909 + }, + { + "epoch": 2.8197272557444424, + "grad_norm": 0.2501876137757298, + "learning_rate": 3.2834694663036016e-06, + "loss": 0.07905120849609375, + "memory(GiB)": 10.57, + "step": 3775, + "train_speed(iter/s)": 0.475912 + }, + { + "epoch": 2.823463478423314, + "grad_norm": 0.23719398237463618, + "learning_rate": 3.2776296141174405e-06, + "loss": 0.07977294921875, + "memory(GiB)": 10.57, + "step": 3780, + "train_speed(iter/s)": 0.475894 + }, + { + "epoch": 2.8271997011021854, + "grad_norm": 0.2019112294748815, + "learning_rate": 3.271788700622517e-06, + "loss": 0.067169189453125, + "memory(GiB)": 10.57, + "step": 3785, + "train_speed(iter/s)": 0.475868 + }, + { + "epoch": 2.830935923781057, + "grad_norm": 0.2569708345412305, + "learning_rate": 3.265946748147185e-06, + "loss": 0.08135986328125, + "memory(GiB)": 10.57, + "step": 3790, + "train_speed(iter/s)": 0.475837 + }, + { + "epoch": 2.834672146459929, + "grad_norm": 0.3039925539901921, + "learning_rate": 3.2601037790237713e-06, + "loss": 0.0752685546875, + "memory(GiB)": 10.57, + "step": 3795, + "train_speed(iter/s)": 0.475822 + }, + { + "epoch": 2.8384083691388007, + "grad_norm": 0.31263171782477395, + "learning_rate": 3.2542598155884905e-06, + "loss": 0.079345703125, + "memory(GiB)": 10.57, + "step": 3800, + "train_speed(iter/s)": 0.475843 + }, + { + "epoch": 2.8421445918176724, + "grad_norm": 0.3194717938532269, + "learning_rate": 3.2484148801813564e-06, + "loss": 0.0697540283203125, + "memory(GiB)": 10.57, + "step": 3805, + "train_speed(iter/s)": 0.475787 + }, + { + "epoch": 2.845880814496544, + "grad_norm": 0.2414957673117366, + "learning_rate": 3.242568995146099e-06, + "loss": 0.079833984375, + "memory(GiB)": 10.57, + "step": 3810, + "train_speed(iter/s)": 0.475812 + }, + { + "epoch": 2.849617037175416, + "grad_norm": 0.29766797126278466, + "learning_rate": 3.2367221828300797e-06, + "loss": 0.07156982421875, + "memory(GiB)": 10.57, + "step": 3815, + "train_speed(iter/s)": 0.475822 + }, + { + "epoch": 2.8533532598542872, + "grad_norm": 0.33562960678102366, + "learning_rate": 3.2308744655842023e-06, + "loss": 0.07691650390625, + "memory(GiB)": 10.57, + "step": 3820, + "train_speed(iter/s)": 0.475844 + }, + { + "epoch": 2.857089482533159, + "grad_norm": 0.23249083043588517, + "learning_rate": 3.2250258657628317e-06, + "loss": 0.0674591064453125, + "memory(GiB)": 10.57, + "step": 3825, + "train_speed(iter/s)": 0.475864 + }, + { + "epoch": 2.8608257052120307, + "grad_norm": 0.3868842372829782, + "learning_rate": 3.2191764057237057e-06, + "loss": 0.0788818359375, + "memory(GiB)": 10.57, + "step": 3830, + "train_speed(iter/s)": 0.475867 + }, + { + "epoch": 2.864561927890902, + "grad_norm": 0.2721643307415772, + "learning_rate": 3.2133261078278516e-06, + "loss": 0.076806640625, + "memory(GiB)": 10.57, + "step": 3835, + "train_speed(iter/s)": 0.475878 + }, + { + "epoch": 2.8682981505697738, + "grad_norm": 0.2509409981744641, + "learning_rate": 3.207474994439499e-06, + "loss": 0.07947998046875, + "memory(GiB)": 10.57, + "step": 3840, + "train_speed(iter/s)": 0.475893 + }, + { + "epoch": 2.8720343732486455, + "grad_norm": 0.2985391643876752, + "learning_rate": 3.2016230879259938e-06, + "loss": 0.08131103515625, + "memory(GiB)": 10.57, + "step": 3845, + "train_speed(iter/s)": 0.475879 + }, + { + "epoch": 2.8757705959275173, + "grad_norm": 0.34684741561716165, + "learning_rate": 3.195770410657717e-06, + "loss": 0.082269287109375, + "memory(GiB)": 10.57, + "step": 3850, + "train_speed(iter/s)": 0.475847 + }, + { + "epoch": 2.879506818606389, + "grad_norm": 0.23479279469344572, + "learning_rate": 3.189916985007991e-06, + "loss": 0.09420166015625, + "memory(GiB)": 10.57, + "step": 3855, + "train_speed(iter/s)": 0.475813 + }, + { + "epoch": 2.8832430412852608, + "grad_norm": 0.3907742470555341, + "learning_rate": 3.184062833353005e-06, + "loss": 0.07618408203125, + "memory(GiB)": 10.57, + "step": 3860, + "train_speed(iter/s)": 0.475799 + }, + { + "epoch": 2.8869792639641325, + "grad_norm": 0.19372123225177681, + "learning_rate": 3.178207978071719e-06, + "loss": 0.079144287109375, + "memory(GiB)": 10.57, + "step": 3865, + "train_speed(iter/s)": 0.475828 + }, + { + "epoch": 2.890715486643004, + "grad_norm": 0.3425380929045749, + "learning_rate": 3.1723524415457845e-06, + "loss": 0.085382080078125, + "memory(GiB)": 10.57, + "step": 3870, + "train_speed(iter/s)": 0.475816 + }, + { + "epoch": 2.8944517093218756, + "grad_norm": 0.3609396149048238, + "learning_rate": 3.166496246159457e-06, + "loss": 0.070849609375, + "memory(GiB)": 10.57, + "step": 3875, + "train_speed(iter/s)": 0.475828 + }, + { + "epoch": 2.8981879320007473, + "grad_norm": 0.20183491005738083, + "learning_rate": 3.160639414299511e-06, + "loss": 0.074884033203125, + "memory(GiB)": 10.57, + "step": 3880, + "train_speed(iter/s)": 0.475821 + }, + { + "epoch": 2.901924154679619, + "grad_norm": 0.2467148593569697, + "learning_rate": 3.154781968355153e-06, + "loss": 0.063775634765625, + "memory(GiB)": 10.57, + "step": 3885, + "train_speed(iter/s)": 0.475846 + }, + { + "epoch": 2.9056603773584904, + "grad_norm": 0.2637999747733018, + "learning_rate": 3.148923930717939e-06, + "loss": 0.0755615234375, + "memory(GiB)": 10.57, + "step": 3890, + "train_speed(iter/s)": 0.475849 + }, + { + "epoch": 2.909396600037362, + "grad_norm": 0.25527787190645407, + "learning_rate": 3.143065323781685e-06, + "loss": 0.06624755859375, + "memory(GiB)": 10.57, + "step": 3895, + "train_speed(iter/s)": 0.475865 + }, + { + "epoch": 2.913132822716234, + "grad_norm": 0.30417828277097125, + "learning_rate": 3.137206169942384e-06, + "loss": 0.073992919921875, + "memory(GiB)": 10.57, + "step": 3900, + "train_speed(iter/s)": 0.475832 + }, + { + "epoch": 2.9168690453951056, + "grad_norm": 0.2346109435926227, + "learning_rate": 3.131346491598119e-06, + "loss": 0.07637939453125, + "memory(GiB)": 10.57, + "step": 3905, + "train_speed(iter/s)": 0.47584 + }, + { + "epoch": 2.9206052680739774, + "grad_norm": 0.2353613119236764, + "learning_rate": 3.1254863111489804e-06, + "loss": 0.081158447265625, + "memory(GiB)": 10.57, + "step": 3910, + "train_speed(iter/s)": 0.475845 + }, + { + "epoch": 2.924341490752849, + "grad_norm": 0.3558838314274693, + "learning_rate": 3.119625650996974e-06, + "loss": 0.076300048828125, + "memory(GiB)": 10.57, + "step": 3915, + "train_speed(iter/s)": 0.475836 + }, + { + "epoch": 2.9280777134317204, + "grad_norm": 0.27354688251249265, + "learning_rate": 3.1137645335459434e-06, + "loss": 0.073907470703125, + "memory(GiB)": 10.57, + "step": 3920, + "train_speed(iter/s)": 0.475809 + }, + { + "epoch": 2.931813936110592, + "grad_norm": 0.3327608490083812, + "learning_rate": 3.107902981201478e-06, + "loss": 0.07683868408203125, + "memory(GiB)": 10.57, + "step": 3925, + "train_speed(iter/s)": 0.475779 + }, + { + "epoch": 2.935550158789464, + "grad_norm": 0.3747363988689518, + "learning_rate": 3.1020410163708304e-06, + "loss": 0.074114990234375, + "memory(GiB)": 10.57, + "step": 3930, + "train_speed(iter/s)": 0.475764 + }, + { + "epoch": 2.9392863814683357, + "grad_norm": 0.18606776814447884, + "learning_rate": 3.0961786614628308e-06, + "loss": 0.073858642578125, + "memory(GiB)": 10.57, + "step": 3935, + "train_speed(iter/s)": 0.475783 + }, + { + "epoch": 2.943022604147207, + "grad_norm": 0.22753548240298943, + "learning_rate": 3.0903159388877984e-06, + "loss": 0.07952880859375, + "memory(GiB)": 10.57, + "step": 3940, + "train_speed(iter/s)": 0.475798 + }, + { + "epoch": 2.9467588268260787, + "grad_norm": 0.2665097861133451, + "learning_rate": 3.0844528710574603e-06, + "loss": 0.08333740234375, + "memory(GiB)": 10.57, + "step": 3945, + "train_speed(iter/s)": 0.475797 + }, + { + "epoch": 2.9504950495049505, + "grad_norm": 0.17698058114731188, + "learning_rate": 3.0785894803848617e-06, + "loss": 0.069122314453125, + "memory(GiB)": 10.57, + "step": 3950, + "train_speed(iter/s)": 0.475778 + }, + { + "epoch": 2.954231272183822, + "grad_norm": 0.3104099022805613, + "learning_rate": 3.072725789284282e-06, + "loss": 0.062646484375, + "memory(GiB)": 10.57, + "step": 3955, + "train_speed(iter/s)": 0.475745 + }, + { + "epoch": 2.957967494862694, + "grad_norm": 0.20315634133154128, + "learning_rate": 3.0668618201711517e-06, + "loss": 0.08089599609375, + "memory(GiB)": 10.57, + "step": 3960, + "train_speed(iter/s)": 0.475758 + }, + { + "epoch": 2.9617037175415657, + "grad_norm": 0.25055279371623723, + "learning_rate": 3.0609975954619585e-06, + "loss": 0.070599365234375, + "memory(GiB)": 10.57, + "step": 3965, + "train_speed(iter/s)": 0.475775 + }, + { + "epoch": 2.965439940220437, + "grad_norm": 0.27358700735815494, + "learning_rate": 3.0551331375741753e-06, + "loss": 0.079913330078125, + "memory(GiB)": 10.57, + "step": 3970, + "train_speed(iter/s)": 0.475795 + }, + { + "epoch": 2.9691761628993087, + "grad_norm": 0.2701014272775072, + "learning_rate": 3.0492684689261587e-06, + "loss": 0.069427490234375, + "memory(GiB)": 10.57, + "step": 3975, + "train_speed(iter/s)": 0.475767 + }, + { + "epoch": 2.9729123855781805, + "grad_norm": 0.26839228857427083, + "learning_rate": 3.0434036119370734e-06, + "loss": 0.07572021484375, + "memory(GiB)": 10.57, + "step": 3980, + "train_speed(iter/s)": 0.475785 + }, + { + "epoch": 2.9766486082570522, + "grad_norm": 0.22716855276596393, + "learning_rate": 3.037538589026808e-06, + "loss": 0.08402099609375, + "memory(GiB)": 10.57, + "step": 3985, + "train_speed(iter/s)": 0.475805 + }, + { + "epoch": 2.9803848309359235, + "grad_norm": 0.2867732902522501, + "learning_rate": 3.03167342261588e-06, + "loss": 0.06982421875, + "memory(GiB)": 10.57, + "step": 3990, + "train_speed(iter/s)": 0.475802 + }, + { + "epoch": 2.9841210536147953, + "grad_norm": 0.1859176777096869, + "learning_rate": 3.0258081351253565e-06, + "loss": 0.073046875, + "memory(GiB)": 10.57, + "step": 3995, + "train_speed(iter/s)": 0.475829 + }, + { + "epoch": 2.987857276293667, + "grad_norm": 0.28880199434249176, + "learning_rate": 3.019942748976771e-06, + "loss": 0.092022705078125, + "memory(GiB)": 10.57, + "step": 4000, + "train_speed(iter/s)": 0.475846 + }, + { + "epoch": 2.991593498972539, + "grad_norm": 0.33890813145931753, + "learning_rate": 3.0140772865920308e-06, + "loss": 0.076885986328125, + "memory(GiB)": 10.57, + "step": 4005, + "train_speed(iter/s)": 0.475863 + }, + { + "epoch": 2.9953297216514105, + "grad_norm": 0.237266786675584, + "learning_rate": 3.0082117703933345e-06, + "loss": 0.088226318359375, + "memory(GiB)": 10.57, + "step": 4010, + "train_speed(iter/s)": 0.475845 + }, + { + "epoch": 2.9990659443302823, + "grad_norm": 0.2422362583040606, + "learning_rate": 3.002346222803089e-06, + "loss": 0.0780517578125, + "memory(GiB)": 10.57, + "step": 4015, + "train_speed(iter/s)": 0.475845 + } + ], + "logging_steps": 5, + "max_steps": 8034, + "num_input_tokens_seen": 0, + "num_train_epochs": 6, + "save_steps": 1339, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 502166754164736.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}