gemma-2b-ultrachat-sft / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9984289080911233,
"eval_steps": 500,
"global_step": 2862,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 24.950808312659266,
"learning_rate": 6.968641114982578e-08,
"loss": 2.031,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 15.612463250366455,
"learning_rate": 3.4843205574912896e-07,
"loss": 1.9905,
"step": 5
},
{
"epoch": 0.01,
"grad_norm": 16.67989238928508,
"learning_rate": 6.968641114982579e-07,
"loss": 1.9679,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 13.97263384789853,
"learning_rate": 1.045296167247387e-06,
"loss": 1.958,
"step": 15
},
{
"epoch": 0.02,
"grad_norm": 12.658444689479861,
"learning_rate": 1.3937282229965158e-06,
"loss": 1.932,
"step": 20
},
{
"epoch": 0.03,
"grad_norm": 9.287049000818321,
"learning_rate": 1.742160278745645e-06,
"loss": 1.8444,
"step": 25
},
{
"epoch": 0.03,
"grad_norm": 10.56045665593996,
"learning_rate": 2.090592334494774e-06,
"loss": 1.7719,
"step": 30
},
{
"epoch": 0.04,
"grad_norm": 5.747482976874127,
"learning_rate": 2.4390243902439027e-06,
"loss": 1.7056,
"step": 35
},
{
"epoch": 0.04,
"grad_norm": 9.413255697398755,
"learning_rate": 2.7874564459930316e-06,
"loss": 1.6451,
"step": 40
},
{
"epoch": 0.05,
"grad_norm": 4.434325086760698,
"learning_rate": 3.13588850174216e-06,
"loss": 1.6296,
"step": 45
},
{
"epoch": 0.05,
"grad_norm": 3.445926768023501,
"learning_rate": 3.48432055749129e-06,
"loss": 1.5796,
"step": 50
},
{
"epoch": 0.06,
"grad_norm": 3.178041261009526,
"learning_rate": 3.832752613240418e-06,
"loss": 1.5351,
"step": 55
},
{
"epoch": 0.06,
"grad_norm": 4.334033916945232,
"learning_rate": 4.181184668989548e-06,
"loss": 1.5303,
"step": 60
},
{
"epoch": 0.07,
"grad_norm": 3.2535280041113004,
"learning_rate": 4.529616724738676e-06,
"loss": 1.5094,
"step": 65
},
{
"epoch": 0.07,
"grad_norm": 2.252573016083858,
"learning_rate": 4.8780487804878055e-06,
"loss": 1.476,
"step": 70
},
{
"epoch": 0.08,
"grad_norm": 1.7368897202196425,
"learning_rate": 5.226480836236935e-06,
"loss": 1.468,
"step": 75
},
{
"epoch": 0.08,
"grad_norm": 4.713037916280227,
"learning_rate": 5.574912891986063e-06,
"loss": 1.4478,
"step": 80
},
{
"epoch": 0.09,
"grad_norm": 2.0371577380954746,
"learning_rate": 5.923344947735193e-06,
"loss": 1.4509,
"step": 85
},
{
"epoch": 0.09,
"grad_norm": 1.9202336443172359,
"learning_rate": 6.27177700348432e-06,
"loss": 1.4359,
"step": 90
},
{
"epoch": 0.1,
"grad_norm": 2.089843032349266,
"learning_rate": 6.62020905923345e-06,
"loss": 1.4352,
"step": 95
},
{
"epoch": 0.1,
"grad_norm": 1.8298050620081034,
"learning_rate": 6.96864111498258e-06,
"loss": 1.4157,
"step": 100
},
{
"epoch": 0.11,
"grad_norm": 4.1178966229203455,
"learning_rate": 7.317073170731707e-06,
"loss": 1.4312,
"step": 105
},
{
"epoch": 0.12,
"grad_norm": 2.365694961937579,
"learning_rate": 7.665505226480837e-06,
"loss": 1.4002,
"step": 110
},
{
"epoch": 0.12,
"grad_norm": 2.064701878483497,
"learning_rate": 8.013937282229966e-06,
"loss": 1.4006,
"step": 115
},
{
"epoch": 0.13,
"grad_norm": 2.0740194756934627,
"learning_rate": 8.362369337979095e-06,
"loss": 1.3766,
"step": 120
},
{
"epoch": 0.13,
"grad_norm": 1.1110382607010576,
"learning_rate": 8.710801393728223e-06,
"loss": 1.3956,
"step": 125
},
{
"epoch": 0.14,
"grad_norm": 1.2726316388320422,
"learning_rate": 9.059233449477352e-06,
"loss": 1.3658,
"step": 130
},
{
"epoch": 0.14,
"grad_norm": 1.1731484868005952,
"learning_rate": 9.407665505226482e-06,
"loss": 1.3903,
"step": 135
},
{
"epoch": 0.15,
"grad_norm": 1.3544859981372896,
"learning_rate": 9.756097560975611e-06,
"loss": 1.3857,
"step": 140
},
{
"epoch": 0.15,
"grad_norm": 1.3881786053179046,
"learning_rate": 1.0104529616724739e-05,
"loss": 1.3647,
"step": 145
},
{
"epoch": 0.16,
"grad_norm": 1.57802449965969,
"learning_rate": 1.045296167247387e-05,
"loss": 1.3763,
"step": 150
},
{
"epoch": 0.16,
"grad_norm": 9.996731917567597,
"learning_rate": 1.0801393728222997e-05,
"loss": 1.3755,
"step": 155
},
{
"epoch": 0.17,
"grad_norm": 2.0373656181612914,
"learning_rate": 1.1149825783972127e-05,
"loss": 1.3241,
"step": 160
},
{
"epoch": 0.17,
"grad_norm": 1.2816937013772902,
"learning_rate": 1.1498257839721256e-05,
"loss": 1.3442,
"step": 165
},
{
"epoch": 0.18,
"grad_norm": 1.6836553557453815,
"learning_rate": 1.1846689895470385e-05,
"loss": 1.3246,
"step": 170
},
{
"epoch": 0.18,
"grad_norm": 1.4941600150894023,
"learning_rate": 1.2195121951219513e-05,
"loss": 1.3388,
"step": 175
},
{
"epoch": 0.19,
"grad_norm": 1.0375455465844554,
"learning_rate": 1.254355400696864e-05,
"loss": 1.3483,
"step": 180
},
{
"epoch": 0.19,
"grad_norm": 1.027207168323913,
"learning_rate": 1.2891986062717772e-05,
"loss": 1.3326,
"step": 185
},
{
"epoch": 0.2,
"grad_norm": 1.3344220787049235,
"learning_rate": 1.32404181184669e-05,
"loss": 1.3291,
"step": 190
},
{
"epoch": 0.2,
"grad_norm": 1.2196412425493108,
"learning_rate": 1.3588850174216028e-05,
"loss": 1.3339,
"step": 195
},
{
"epoch": 0.21,
"grad_norm": 0.9929061272984637,
"learning_rate": 1.393728222996516e-05,
"loss": 1.3294,
"step": 200
},
{
"epoch": 0.21,
"grad_norm": 0.8966398019407287,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.3202,
"step": 205
},
{
"epoch": 0.22,
"grad_norm": 2.2234096160749663,
"learning_rate": 1.4634146341463415e-05,
"loss": 1.3247,
"step": 210
},
{
"epoch": 0.23,
"grad_norm": 1.2508239639898415,
"learning_rate": 1.4982578397212544e-05,
"loss": 1.3003,
"step": 215
},
{
"epoch": 0.23,
"grad_norm": 1.198416205396815,
"learning_rate": 1.5331010452961673e-05,
"loss": 1.2997,
"step": 220
},
{
"epoch": 0.24,
"grad_norm": 2.0863526606845215,
"learning_rate": 1.5679442508710803e-05,
"loss": 1.2975,
"step": 225
},
{
"epoch": 0.24,
"grad_norm": 1.6327610820644576,
"learning_rate": 1.6027874564459932e-05,
"loss": 1.2896,
"step": 230
},
{
"epoch": 0.25,
"grad_norm": 1.8969443348798178,
"learning_rate": 1.637630662020906e-05,
"loss": 1.3019,
"step": 235
},
{
"epoch": 0.25,
"grad_norm": 1.9110344126396037,
"learning_rate": 1.672473867595819e-05,
"loss": 1.3042,
"step": 240
},
{
"epoch": 0.26,
"grad_norm": 1.5325811093043817,
"learning_rate": 1.7073170731707317e-05,
"loss": 1.2944,
"step": 245
},
{
"epoch": 0.26,
"grad_norm": 1.6541987992128948,
"learning_rate": 1.7421602787456446e-05,
"loss": 1.2676,
"step": 250
},
{
"epoch": 0.27,
"grad_norm": 1.8715760795102667,
"learning_rate": 1.7770034843205575e-05,
"loss": 1.2885,
"step": 255
},
{
"epoch": 0.27,
"grad_norm": 1.4753005664124763,
"learning_rate": 1.8118466898954705e-05,
"loss": 1.2889,
"step": 260
},
{
"epoch": 0.28,
"grad_norm": 0.7499194131822957,
"learning_rate": 1.8466898954703834e-05,
"loss": 1.2861,
"step": 265
},
{
"epoch": 0.28,
"grad_norm": 1.067947246812113,
"learning_rate": 1.8815331010452963e-05,
"loss": 1.2841,
"step": 270
},
{
"epoch": 0.29,
"grad_norm": 1.1594703789451222,
"learning_rate": 1.9163763066202093e-05,
"loss": 1.2715,
"step": 275
},
{
"epoch": 0.29,
"grad_norm": 1.0804382019345462,
"learning_rate": 1.9512195121951222e-05,
"loss": 1.2726,
"step": 280
},
{
"epoch": 0.3,
"grad_norm": 1.577609147721894,
"learning_rate": 1.9860627177700348e-05,
"loss": 1.2917,
"step": 285
},
{
"epoch": 0.3,
"grad_norm": 1.7946400445447737,
"learning_rate": 1.9999933018123898e-05,
"loss": 1.2763,
"step": 290
},
{
"epoch": 0.31,
"grad_norm": 1.6377092495455956,
"learning_rate": 1.999952368768613e-05,
"loss": 1.2953,
"step": 295
},
{
"epoch": 0.31,
"grad_norm": 0.9954069210827227,
"learning_rate": 1.9998742254177562e-05,
"loss": 1.2821,
"step": 300
},
{
"epoch": 0.32,
"grad_norm": 2.164611199461128,
"learning_rate": 1.9997588746676955e-05,
"loss": 1.2656,
"step": 305
},
{
"epoch": 0.32,
"grad_norm": 0.7921046271806242,
"learning_rate": 1.9996063208108723e-05,
"loss": 1.3008,
"step": 310
},
{
"epoch": 0.33,
"grad_norm": 1.8804572644575333,
"learning_rate": 1.999416569524133e-05,
"loss": 1.2724,
"step": 315
},
{
"epoch": 0.34,
"grad_norm": 1.3969123144992355,
"learning_rate": 1.9991896278685176e-05,
"loss": 1.2611,
"step": 320
},
{
"epoch": 0.34,
"grad_norm": 1.1862393265946558,
"learning_rate": 1.998925504288997e-05,
"loss": 1.2766,
"step": 325
},
{
"epoch": 0.35,
"grad_norm": 1.4892893389513397,
"learning_rate": 1.9986242086141584e-05,
"loss": 1.27,
"step": 330
},
{
"epoch": 0.35,
"grad_norm": 0.9876595822720282,
"learning_rate": 1.9982857520558413e-05,
"loss": 1.2625,
"step": 335
},
{
"epoch": 0.36,
"grad_norm": 0.9713854238419963,
"learning_rate": 1.9979101472087175e-05,
"loss": 1.2679,
"step": 340
},
{
"epoch": 0.36,
"grad_norm": 0.7625563417654819,
"learning_rate": 1.997497408049824e-05,
"loss": 1.2629,
"step": 345
},
{
"epoch": 0.37,
"grad_norm": 0.5967473191664531,
"learning_rate": 1.9970475499380444e-05,
"loss": 1.2808,
"step": 350
},
{
"epoch": 0.37,
"grad_norm": 0.6033710670101882,
"learning_rate": 1.9965605896135336e-05,
"loss": 1.2652,
"step": 355
},
{
"epoch": 0.38,
"grad_norm": 0.7515300392928511,
"learning_rate": 1.996036545197098e-05,
"loss": 1.2507,
"step": 360
},
{
"epoch": 0.38,
"grad_norm": 0.739465275385991,
"learning_rate": 1.9954754361895204e-05,
"loss": 1.2672,
"step": 365
},
{
"epoch": 0.39,
"grad_norm": 0.7658502604757624,
"learning_rate": 1.994877283470834e-05,
"loss": 1.2642,
"step": 370
},
{
"epoch": 0.39,
"grad_norm": 0.777867445359302,
"learning_rate": 1.994242109299545e-05,
"loss": 1.2503,
"step": 375
},
{
"epoch": 0.4,
"grad_norm": 1.067352419351627,
"learning_rate": 1.993569937311805e-05,
"loss": 1.2568,
"step": 380
},
{
"epoch": 0.4,
"grad_norm": 1.0432065502009356,
"learning_rate": 1.992860792520532e-05,
"loss": 1.2724,
"step": 385
},
{
"epoch": 0.41,
"grad_norm": 1.14590404279004,
"learning_rate": 1.9921147013144782e-05,
"loss": 1.2631,
"step": 390
},
{
"epoch": 0.41,
"grad_norm": 0.9221067228236387,
"learning_rate": 1.9913316914572483e-05,
"loss": 1.259,
"step": 395
},
{
"epoch": 0.42,
"grad_norm": 1.31523632154192,
"learning_rate": 1.9905117920862684e-05,
"loss": 1.2552,
"step": 400
},
{
"epoch": 0.42,
"grad_norm": 1.4960646467520413,
"learning_rate": 1.9896550337116984e-05,
"loss": 1.2483,
"step": 405
},
{
"epoch": 0.43,
"grad_norm": 0.9152573123249131,
"learning_rate": 1.988761448215299e-05,
"loss": 1.2604,
"step": 410
},
{
"epoch": 0.43,
"grad_norm": 0.8964737908165725,
"learning_rate": 1.9878310688492452e-05,
"loss": 1.2685,
"step": 415
},
{
"epoch": 0.44,
"grad_norm": 0.73050943790973,
"learning_rate": 1.986863930234888e-05,
"loss": 1.2442,
"step": 420
},
{
"epoch": 0.45,
"grad_norm": 1.0321668548308196,
"learning_rate": 1.985860068361466e-05,
"loss": 1.2743,
"step": 425
},
{
"epoch": 0.45,
"grad_norm": 0.7632454386025007,
"learning_rate": 1.9848195205847672e-05,
"loss": 1.2509,
"step": 430
},
{
"epoch": 0.46,
"grad_norm": 0.6247074764509415,
"learning_rate": 1.9837423256257388e-05,
"loss": 1.2495,
"step": 435
},
{
"epoch": 0.46,
"grad_norm": 1.0065873063510602,
"learning_rate": 1.9826285235690447e-05,
"loss": 1.2427,
"step": 440
},
{
"epoch": 0.47,
"grad_norm": 0.8936528115587639,
"learning_rate": 1.9814781558615755e-05,
"loss": 1.2667,
"step": 445
},
{
"epoch": 0.47,
"grad_norm": 0.8349464387013416,
"learning_rate": 1.9802912653109063e-05,
"loss": 1.2476,
"step": 450
},
{
"epoch": 0.48,
"grad_norm": 0.7812930126145669,
"learning_rate": 1.9790678960837028e-05,
"loss": 1.2628,
"step": 455
},
{
"epoch": 0.48,
"grad_norm": 1.0392145708087202,
"learning_rate": 1.977808093704077e-05,
"loss": 1.2265,
"step": 460
},
{
"epoch": 0.49,
"grad_norm": 1.288414125838311,
"learning_rate": 1.9765119050518963e-05,
"loss": 1.2523,
"step": 465
},
{
"epoch": 0.49,
"grad_norm": 0.7997618939203232,
"learning_rate": 1.9751793783610353e-05,
"loss": 1.2379,
"step": 470
},
{
"epoch": 0.5,
"grad_norm": 1.6375053013306426,
"learning_rate": 1.9738105632175837e-05,
"loss": 1.2486,
"step": 475
},
{
"epoch": 0.5,
"grad_norm": 0.7684769418259976,
"learning_rate": 1.972405510557999e-05,
"loss": 1.2521,
"step": 480
},
{
"epoch": 0.51,
"grad_norm": 1.057893765512863,
"learning_rate": 1.970964272667213e-05,
"loss": 1.2346,
"step": 485
},
{
"epoch": 0.51,
"grad_norm": 1.0469682428421143,
"learning_rate": 1.969486903176684e-05,
"loss": 1.2378,
"step": 490
},
{
"epoch": 0.52,
"grad_norm": 1.2898633170628728,
"learning_rate": 1.967973457062404e-05,
"loss": 1.2871,
"step": 495
},
{
"epoch": 0.52,
"grad_norm": 0.8847269096128985,
"learning_rate": 1.9664239906428494e-05,
"loss": 1.2577,
"step": 500
},
{
"epoch": 0.53,
"grad_norm": 0.7907229291793376,
"learning_rate": 1.9648385615768882e-05,
"loss": 1.2511,
"step": 505
},
{
"epoch": 0.53,
"grad_norm": 0.7308812942640984,
"learning_rate": 1.9632172288616328e-05,
"loss": 1.2453,
"step": 510
},
{
"epoch": 0.54,
"grad_norm": 0.6721784803540968,
"learning_rate": 1.961560052830245e-05,
"loss": 1.2589,
"step": 515
},
{
"epoch": 0.54,
"grad_norm": 0.6836345678248982,
"learning_rate": 1.959867095149691e-05,
"loss": 1.257,
"step": 520
},
{
"epoch": 0.55,
"grad_norm": 0.6585644601195803,
"learning_rate": 1.9581384188184475e-05,
"loss": 1.2176,
"step": 525
},
{
"epoch": 0.56,
"grad_norm": 0.936869795179473,
"learning_rate": 1.9563740881641548e-05,
"loss": 1.2517,
"step": 530
},
{
"epoch": 0.56,
"grad_norm": 0.6213634019839556,
"learning_rate": 1.9545741688412256e-05,
"loss": 1.2678,
"step": 535
},
{
"epoch": 0.57,
"grad_norm": 0.6590705626309112,
"learning_rate": 1.9527387278284008e-05,
"loss": 1.2326,
"step": 540
},
{
"epoch": 0.57,
"grad_norm": 0.6175703752934739,
"learning_rate": 1.950867833426258e-05,
"loss": 1.2416,
"step": 545
},
{
"epoch": 0.58,
"grad_norm": 1.8069255393803412,
"learning_rate": 1.9489615552546685e-05,
"loss": 1.2215,
"step": 550
},
{
"epoch": 0.58,
"grad_norm": 0.7882020657987857,
"learning_rate": 1.9470199642502062e-05,
"loss": 1.2495,
"step": 555
},
{
"epoch": 0.59,
"grad_norm": 1.9320081851604725,
"learning_rate": 1.945043132663511e-05,
"loss": 1.2328,
"step": 560
},
{
"epoch": 0.59,
"grad_norm": 0.7845993467402427,
"learning_rate": 1.9430311340565967e-05,
"loss": 1.2473,
"step": 565
},
{
"epoch": 0.6,
"grad_norm": 0.9310707703874664,
"learning_rate": 1.9409840433001153e-05,
"loss": 1.2589,
"step": 570
},
{
"epoch": 0.6,
"grad_norm": 1.10631072604195,
"learning_rate": 1.9389019365705718e-05,
"loss": 1.238,
"step": 575
},
{
"epoch": 0.61,
"grad_norm": 0.7697801068360803,
"learning_rate": 1.936784891347486e-05,
"loss": 1.2413,
"step": 580
},
{
"epoch": 0.61,
"grad_norm": 1.2984446895687671,
"learning_rate": 1.9346329864105144e-05,
"loss": 1.2499,
"step": 585
},
{
"epoch": 0.62,
"grad_norm": 1.8472130639489917,
"learning_rate": 1.932446301836514e-05,
"loss": 1.2423,
"step": 590
},
{
"epoch": 0.62,
"grad_norm": 0.9193276135943657,
"learning_rate": 1.9302249189965655e-05,
"loss": 1.2591,
"step": 595
},
{
"epoch": 0.63,
"grad_norm": 1.4664005044938184,
"learning_rate": 1.9279689205529432e-05,
"loss": 1.2273,
"step": 600
},
{
"epoch": 0.63,
"grad_norm": 0.9158089380324357,
"learning_rate": 1.925678390456041e-05,
"loss": 1.2154,
"step": 605
},
{
"epoch": 0.64,
"grad_norm": 0.5830344635891589,
"learning_rate": 1.9233534139412473e-05,
"loss": 1.2301,
"step": 610
},
{
"epoch": 0.64,
"grad_norm": 1.2706713858805934,
"learning_rate": 1.920994077525773e-05,
"loss": 1.2333,
"step": 615
},
{
"epoch": 0.65,
"grad_norm": 0.642710454215131,
"learning_rate": 1.9186004690054316e-05,
"loss": 1.2472,
"step": 620
},
{
"epoch": 0.65,
"grad_norm": 1.2367251668301513,
"learning_rate": 1.9161726774513748e-05,
"loss": 1.2512,
"step": 625
},
{
"epoch": 0.66,
"grad_norm": 0.8543741303434413,
"learning_rate": 1.9137107932067746e-05,
"loss": 1.2398,
"step": 630
},
{
"epoch": 0.67,
"grad_norm": 0.9142215981163747,
"learning_rate": 1.9112149078834634e-05,
"loss": 1.2566,
"step": 635
},
{
"epoch": 0.67,
"grad_norm": 0.6091661762562299,
"learning_rate": 1.9086851143585242e-05,
"loss": 1.245,
"step": 640
},
{
"epoch": 0.68,
"grad_norm": 0.7625900298996494,
"learning_rate": 1.9061215067708355e-05,
"loss": 1.2375,
"step": 645
},
{
"epoch": 0.68,
"grad_norm": 0.9238321967697791,
"learning_rate": 1.9035241805175655e-05,
"loss": 1.2236,
"step": 650
},
{
"epoch": 0.69,
"grad_norm": 1.0512213583671923,
"learning_rate": 1.9008932322506264e-05,
"loss": 1.2406,
"step": 655
},
{
"epoch": 0.69,
"grad_norm": 0.7893664043123937,
"learning_rate": 1.898228759873074e-05,
"loss": 1.2322,
"step": 660
},
{
"epoch": 0.7,
"grad_norm": 0.5965831318877641,
"learning_rate": 1.8955308625354664e-05,
"loss": 1.2456,
"step": 665
},
{
"epoch": 0.7,
"grad_norm": 1.3822725764119563,
"learning_rate": 1.8927996406321737e-05,
"loss": 1.2239,
"step": 670
},
{
"epoch": 0.71,
"grad_norm": 0.8215545396938848,
"learning_rate": 1.8900351957976434e-05,
"loss": 1.2449,
"step": 675
},
{
"epoch": 0.71,
"grad_norm": 0.7816009056609501,
"learning_rate": 1.887237630902615e-05,
"loss": 1.2496,
"step": 680
},
{
"epoch": 0.72,
"grad_norm": 1.3604997432266277,
"learning_rate": 1.8844070500502972e-05,
"loss": 1.2343,
"step": 685
},
{
"epoch": 0.72,
"grad_norm": 1.3298037023453548,
"learning_rate": 1.8815435585724898e-05,
"loss": 1.2363,
"step": 690
},
{
"epoch": 0.73,
"grad_norm": 1.04923784928965,
"learning_rate": 1.8786472630256647e-05,
"loss": 1.2288,
"step": 695
},
{
"epoch": 0.73,
"grad_norm": 0.7034522701108372,
"learning_rate": 1.8757182711870028e-05,
"loss": 1.2205,
"step": 700
},
{
"epoch": 0.74,
"grad_norm": 0.6278091982076448,
"learning_rate": 1.8727566920503806e-05,
"loss": 1.2312,
"step": 705
},
{
"epoch": 0.74,
"grad_norm": 0.7164311663882154,
"learning_rate": 1.8697626358223172e-05,
"loss": 1.2231,
"step": 710
},
{
"epoch": 0.75,
"grad_norm": 0.6464193718074598,
"learning_rate": 1.866736213917871e-05,
"loss": 1.2357,
"step": 715
},
{
"epoch": 0.75,
"grad_norm": 0.8102264151566746,
"learning_rate": 1.8636775389564943e-05,
"loss": 1.2017,
"step": 720
},
{
"epoch": 0.76,
"grad_norm": 0.8857254660267397,
"learning_rate": 1.8605867247578434e-05,
"loss": 1.2332,
"step": 725
},
{
"epoch": 0.76,
"grad_norm": 0.9502103192657813,
"learning_rate": 1.857463886337542e-05,
"loss": 1.2305,
"step": 730
},
{
"epoch": 0.77,
"grad_norm": 0.6457110058300561,
"learning_rate": 1.8543091399029013e-05,
"loss": 1.2216,
"step": 735
},
{
"epoch": 0.78,
"grad_norm": 1.0174358190419057,
"learning_rate": 1.8511226028485973e-05,
"loss": 1.2296,
"step": 740
},
{
"epoch": 0.78,
"grad_norm": 1.0288623039557885,
"learning_rate": 1.8479043937522996e-05,
"loss": 1.2348,
"step": 745
},
{
"epoch": 0.79,
"grad_norm": 0.8034866796324897,
"learning_rate": 1.844654632370262e-05,
"loss": 1.2091,
"step": 750
},
{
"epoch": 0.79,
"grad_norm": 1.2257005540474746,
"learning_rate": 1.8413734396328626e-05,
"loss": 1.2357,
"step": 755
},
{
"epoch": 0.8,
"grad_norm": 0.7827093843154933,
"learning_rate": 1.8380609376401072e-05,
"loss": 1.2217,
"step": 760
},
{
"epoch": 0.8,
"grad_norm": 1.3984501095744437,
"learning_rate": 1.8347172496570825e-05,
"loss": 1.2156,
"step": 765
},
{
"epoch": 0.81,
"grad_norm": 1.2318804988172873,
"learning_rate": 1.8313425001093724e-05,
"loss": 1.2315,
"step": 770
},
{
"epoch": 0.81,
"grad_norm": 0.5759845281822537,
"learning_rate": 1.827936814578426e-05,
"loss": 1.2207,
"step": 775
},
{
"epoch": 0.82,
"grad_norm": 1.74490989219296,
"learning_rate": 1.824500319796883e-05,
"loss": 1.2386,
"step": 780
},
{
"epoch": 0.82,
"grad_norm": 2.036500418910802,
"learning_rate": 1.8210331436438607e-05,
"loss": 1.2175,
"step": 785
},
{
"epoch": 0.83,
"grad_norm": 1.2675789977465821,
"learning_rate": 1.817535415140195e-05,
"loss": 1.2198,
"step": 790
},
{
"epoch": 0.83,
"grad_norm": 0.7631690800915077,
"learning_rate": 1.8140072644436357e-05,
"loss": 1.2089,
"step": 795
},
{
"epoch": 0.84,
"grad_norm": 1.5373247435706394,
"learning_rate": 1.8104488228440083e-05,
"loss": 1.2074,
"step": 800
},
{
"epoch": 0.84,
"grad_norm": 1.3737051209424804,
"learning_rate": 1.8068602227583242e-05,
"loss": 1.2177,
"step": 805
},
{
"epoch": 0.85,
"grad_norm": 2.464051941040213,
"learning_rate": 1.8032415977258552e-05,
"loss": 1.2247,
"step": 810
},
{
"epoch": 0.85,
"grad_norm": 0.6609436548261699,
"learning_rate": 1.7995930824031632e-05,
"loss": 1.1946,
"step": 815
},
{
"epoch": 0.86,
"grad_norm": 0.6754674228365908,
"learning_rate": 1.795914812559092e-05,
"loss": 1.223,
"step": 820
},
{
"epoch": 0.86,
"grad_norm": 0.8912232516784413,
"learning_rate": 1.7922069250697105e-05,
"loss": 1.2396,
"step": 825
},
{
"epoch": 0.87,
"grad_norm": 1.0770577124307328,
"learning_rate": 1.7884695579132233e-05,
"loss": 1.2334,
"step": 830
},
{
"epoch": 0.87,
"grad_norm": 0.8066099472994738,
"learning_rate": 1.784702850164834e-05,
"loss": 1.2204,
"step": 835
},
{
"epoch": 0.88,
"grad_norm": 1.1142550073662545,
"learning_rate": 1.780906941991571e-05,
"loss": 1.231,
"step": 840
},
{
"epoch": 0.89,
"grad_norm": 0.7942742374403763,
"learning_rate": 1.7770819746470717e-05,
"loss": 1.2017,
"step": 845
},
{
"epoch": 0.89,
"grad_norm": 0.787461799289639,
"learning_rate": 1.773228090466324e-05,
"loss": 1.2249,
"step": 850
},
{
"epoch": 0.9,
"grad_norm": 0.9956475479777713,
"learning_rate": 1.769345432860374e-05,
"loss": 1.235,
"step": 855
},
{
"epoch": 0.9,
"grad_norm": 0.7685563421512083,
"learning_rate": 1.765434146310984e-05,
"loss": 1.2602,
"step": 860
},
{
"epoch": 0.91,
"grad_norm": 0.6093551096266576,
"learning_rate": 1.7614943763652614e-05,
"loss": 1.2202,
"step": 865
},
{
"epoch": 0.91,
"grad_norm": 1.1950800849269592,
"learning_rate": 1.7575262696302378e-05,
"loss": 1.2265,
"step": 870
},
{
"epoch": 0.92,
"grad_norm": 0.9774950505729677,
"learning_rate": 1.753529973767417e-05,
"loss": 1.212,
"step": 875
},
{
"epoch": 0.92,
"grad_norm": 2.1611938644889155,
"learning_rate": 1.7495056374872785e-05,
"loss": 1.2,
"step": 880
},
{
"epoch": 0.93,
"grad_norm": 0.7850546983377475,
"learning_rate": 1.7454534105437438e-05,
"loss": 1.227,
"step": 885
},
{
"epoch": 0.93,
"grad_norm": 1.2824362159766542,
"learning_rate": 1.741373443728605e-05,
"loss": 1.2184,
"step": 890
},
{
"epoch": 0.94,
"grad_norm": 0.8737871043379588,
"learning_rate": 1.737265888865911e-05,
"loss": 1.2163,
"step": 895
},
{
"epoch": 0.94,
"grad_norm": 1.0683670456497376,
"learning_rate": 1.73313089880632e-05,
"loss": 1.2082,
"step": 900
},
{
"epoch": 0.95,
"grad_norm": 0.9499829077056968,
"learning_rate": 1.7289686274214116e-05,
"loss": 1.2198,
"step": 905
},
{
"epoch": 0.95,
"grad_norm": 1.2620917350201721,
"learning_rate": 1.7247792295979593e-05,
"loss": 1.2186,
"step": 910
},
{
"epoch": 0.96,
"grad_norm": 0.7587707348526181,
"learning_rate": 1.720562861232168e-05,
"loss": 1.2228,
"step": 915
},
{
"epoch": 0.96,
"grad_norm": 0.7379732881547334,
"learning_rate": 1.716319679223873e-05,
"loss": 1.2196,
"step": 920
},
{
"epoch": 0.97,
"grad_norm": 2.0351908728013695,
"learning_rate": 1.7120498414707e-05,
"loss": 1.1984,
"step": 925
},
{
"epoch": 0.97,
"grad_norm": 0.7892087152273225,
"learning_rate": 1.7077535068621916e-05,
"loss": 1.2281,
"step": 930
},
{
"epoch": 0.98,
"grad_norm": 1.9153171096738413,
"learning_rate": 1.703430835273893e-05,
"loss": 1.2223,
"step": 935
},
{
"epoch": 0.98,
"grad_norm": 1.3820986225530358,
"learning_rate": 1.6990819875614033e-05,
"loss": 1.2389,
"step": 940
},
{
"epoch": 0.99,
"grad_norm": 0.5571713036569522,
"learning_rate": 1.6947071255543894e-05,
"loss": 1.196,
"step": 945
},
{
"epoch": 1.0,
"grad_norm": 1.8672448004164541,
"learning_rate": 1.6903064120505638e-05,
"loss": 1.2073,
"step": 950
},
{
"epoch": 1.0,
"eval_loss": 1.2288095951080322,
"eval_runtime": 242.1709,
"eval_samples_per_second": 55.832,
"eval_steps_per_second": 3.493,
"step": 954
},
{
"epoch": 1.0,
"grad_norm": 2.437727075512732,
"learning_rate": 1.6858800108096277e-05,
"loss": 1.1522,
"step": 955
},
{
"epoch": 1.01,
"grad_norm": 1.383204022339626,
"learning_rate": 1.681428086547176e-05,
"loss": 1.1836,
"step": 960
},
{
"epoch": 1.01,
"grad_norm": 0.9192357678332601,
"learning_rate": 1.676950804928569e-05,
"loss": 1.1673,
"step": 965
},
{
"epoch": 1.02,
"grad_norm": 0.7323573565551772,
"learning_rate": 1.672448332562766e-05,
"loss": 1.1882,
"step": 970
},
{
"epoch": 1.02,
"grad_norm": 0.9192997873212712,
"learning_rate": 1.667920836996127e-05,
"loss": 1.1723,
"step": 975
},
{
"epoch": 1.03,
"grad_norm": 1.1595532253612875,
"learning_rate": 1.663368486706177e-05,
"loss": 1.1965,
"step": 980
},
{
"epoch": 1.03,
"grad_norm": 0.8359822590501053,
"learning_rate": 1.6587914510953366e-05,
"loss": 1.1629,
"step": 985
},
{
"epoch": 1.04,
"grad_norm": 0.6905062230023404,
"learning_rate": 1.65418990048462e-05,
"loss": 1.1622,
"step": 990
},
{
"epoch": 1.04,
"grad_norm": 1.0001015892785858,
"learning_rate": 1.6495640061072933e-05,
"loss": 1.1783,
"step": 995
},
{
"epoch": 1.05,
"grad_norm": 0.8577002381177762,
"learning_rate": 1.644913940102507e-05,
"loss": 1.1713,
"step": 1000
},
{
"epoch": 1.05,
"grad_norm": 1.4943867044106558,
"learning_rate": 1.640239875508887e-05,
"loss": 1.1913,
"step": 1005
},
{
"epoch": 1.06,
"grad_norm": 0.917302733076574,
"learning_rate": 1.6355419862580963e-05,
"loss": 1.1834,
"step": 1010
},
{
"epoch": 1.06,
"grad_norm": 0.8468483810836,
"learning_rate": 1.6308204471683638e-05,
"loss": 1.1805,
"step": 1015
},
{
"epoch": 1.07,
"grad_norm": 0.7346509543245626,
"learning_rate": 1.626075433937977e-05,
"loss": 1.1948,
"step": 1020
},
{
"epoch": 1.07,
"grad_norm": 0.8203677664126644,
"learning_rate": 1.6213071231387463e-05,
"loss": 1.1958,
"step": 1025
},
{
"epoch": 1.08,
"grad_norm": 0.9113536180257384,
"learning_rate": 1.616515692209432e-05,
"loss": 1.1723,
"step": 1030
},
{
"epoch": 1.08,
"grad_norm": 0.7547776726819061,
"learning_rate": 1.6117013194491434e-05,
"loss": 1.1693,
"step": 1035
},
{
"epoch": 1.09,
"grad_norm": 0.8032908719199294,
"learning_rate": 1.606864184010702e-05,
"loss": 1.1746,
"step": 1040
},
{
"epoch": 1.1,
"grad_norm": 0.5655675059882953,
"learning_rate": 1.6020044658939767e-05,
"loss": 1.1844,
"step": 1045
},
{
"epoch": 1.1,
"grad_norm": 1.0261861231837262,
"learning_rate": 1.5971223459391853e-05,
"loss": 1.1651,
"step": 1050
},
{
"epoch": 1.11,
"grad_norm": 0.971543601309903,
"learning_rate": 1.5922180058201623e-05,
"loss": 1.1675,
"step": 1055
},
{
"epoch": 1.11,
"grad_norm": 0.9019717504295954,
"learning_rate": 1.587291628037604e-05,
"loss": 1.1997,
"step": 1060
},
{
"epoch": 1.12,
"grad_norm": 1.4985822747916373,
"learning_rate": 1.582343395912271e-05,
"loss": 1.1903,
"step": 1065
},
{
"epoch": 1.12,
"grad_norm": 0.9262639968389074,
"learning_rate": 1.577373493578171e-05,
"loss": 1.1725,
"step": 1070
},
{
"epoch": 1.13,
"grad_norm": 0.7391674175901184,
"learning_rate": 1.5723821059757057e-05,
"loss": 1.1888,
"step": 1075
},
{
"epoch": 1.13,
"grad_norm": 0.6531304452656148,
"learning_rate": 1.5673694188447865e-05,
"loss": 1.1949,
"step": 1080
},
{
"epoch": 1.14,
"grad_norm": 0.5788877878136035,
"learning_rate": 1.5623356187179265e-05,
"loss": 1.192,
"step": 1085
},
{
"epoch": 1.14,
"grad_norm": 0.6407560960718433,
"learning_rate": 1.557280892913296e-05,
"loss": 1.1828,
"step": 1090
},
{
"epoch": 1.15,
"grad_norm": 0.9437156762106859,
"learning_rate": 1.5522054295277534e-05,
"loss": 1.1826,
"step": 1095
},
{
"epoch": 1.15,
"grad_norm": 0.7128866928102491,
"learning_rate": 1.5471094174298464e-05,
"loss": 1.1829,
"step": 1100
},
{
"epoch": 1.16,
"grad_norm": 0.6495335009567871,
"learning_rate": 1.5419930462527823e-05,
"loss": 1.1826,
"step": 1105
},
{
"epoch": 1.16,
"grad_norm": 0.6753191649242204,
"learning_rate": 1.5368565063873723e-05,
"loss": 1.1814,
"step": 1110
},
{
"epoch": 1.17,
"grad_norm": 0.7242669026430926,
"learning_rate": 1.5316999889749466e-05,
"loss": 1.1745,
"step": 1115
},
{
"epoch": 1.17,
"grad_norm": 0.6277195801598839,
"learning_rate": 1.5265236859002406e-05,
"loss": 1.1877,
"step": 1120
},
{
"epoch": 1.18,
"grad_norm": 0.6449483182356984,
"learning_rate": 1.521327789784257e-05,
"loss": 1.1841,
"step": 1125
},
{
"epoch": 1.18,
"grad_norm": 0.71144532450671,
"learning_rate": 1.5161124939770946e-05,
"loss": 1.175,
"step": 1130
},
{
"epoch": 1.19,
"grad_norm": 0.5956313826510653,
"learning_rate": 1.5108779925507562e-05,
"loss": 1.1741,
"step": 1135
},
{
"epoch": 1.19,
"grad_norm": 0.9169314446206946,
"learning_rate": 1.5056244802919251e-05,
"loss": 1.1776,
"step": 1140
},
{
"epoch": 1.2,
"grad_norm": 0.6225512957951,
"learning_rate": 1.500352152694717e-05,
"loss": 1.1816,
"step": 1145
},
{
"epoch": 1.21,
"grad_norm": 0.5654432853309879,
"learning_rate": 1.4950612059534061e-05,
"loss": 1.1934,
"step": 1150
},
{
"epoch": 1.21,
"grad_norm": 0.8247438376281386,
"learning_rate": 1.4897518369551236e-05,
"loss": 1.1565,
"step": 1155
},
{
"epoch": 1.22,
"grad_norm": 1.0093264873842502,
"learning_rate": 1.4844242432725307e-05,
"loss": 1.184,
"step": 1160
},
{
"epoch": 1.22,
"grad_norm": 0.6532381360941817,
"learning_rate": 1.4790786231564672e-05,
"loss": 1.1661,
"step": 1165
},
{
"epoch": 1.23,
"grad_norm": 0.7692200243834251,
"learning_rate": 1.473715175528574e-05,
"loss": 1.1943,
"step": 1170
},
{
"epoch": 1.23,
"grad_norm": 1.1055969644472505,
"learning_rate": 1.4683340999738908e-05,
"loss": 1.2005,
"step": 1175
},
{
"epoch": 1.24,
"grad_norm": 0.8048696995837356,
"learning_rate": 1.4629355967334297e-05,
"loss": 1.1719,
"step": 1180
},
{
"epoch": 1.24,
"grad_norm": 0.889717365336604,
"learning_rate": 1.457519866696722e-05,
"loss": 1.1682,
"step": 1185
},
{
"epoch": 1.25,
"grad_norm": 0.6148804884198862,
"learning_rate": 1.4520871113943447e-05,
"loss": 1.1764,
"step": 1190
},
{
"epoch": 1.25,
"grad_norm": 0.7923025886639391,
"learning_rate": 1.4466375329904208e-05,
"loss": 1.18,
"step": 1195
},
{
"epoch": 1.26,
"grad_norm": 0.6685953200016346,
"learning_rate": 1.4411713342750942e-05,
"loss": 1.1787,
"step": 1200
},
{
"epoch": 1.26,
"grad_norm": 1.6833872807771428,
"learning_rate": 1.4356887186569872e-05,
"loss": 1.1911,
"step": 1205
},
{
"epoch": 1.27,
"grad_norm": 0.595915875174286,
"learning_rate": 1.4301898901556279e-05,
"loss": 1.193,
"step": 1210
},
{
"epoch": 1.27,
"grad_norm": 2.4195391281265075,
"learning_rate": 1.4246750533938603e-05,
"loss": 1.179,
"step": 1215
},
{
"epoch": 1.28,
"grad_norm": 2.666183841968522,
"learning_rate": 1.4191444135902277e-05,
"loss": 1.2062,
"step": 1220
},
{
"epoch": 1.28,
"grad_norm": 1.5699673804148295,
"learning_rate": 1.4135981765513391e-05,
"loss": 1.1653,
"step": 1225
},
{
"epoch": 1.29,
"grad_norm": 1.30078615012255,
"learning_rate": 1.4080365486642081e-05,
"loss": 1.1787,
"step": 1230
},
{
"epoch": 1.29,
"grad_norm": 0.7312415642370438,
"learning_rate": 1.402459736888574e-05,
"loss": 1.1973,
"step": 1235
},
{
"epoch": 1.3,
"grad_norm": 0.9239652236265656,
"learning_rate": 1.3968679487492001e-05,
"loss": 1.1721,
"step": 1240
},
{
"epoch": 1.3,
"grad_norm": 0.9752780176018039,
"learning_rate": 1.3912613923281517e-05,
"loss": 1.1751,
"step": 1245
},
{
"epoch": 1.31,
"grad_norm": 1.8567198773718243,
"learning_rate": 1.385640276257052e-05,
"loss": 1.1826,
"step": 1250
},
{
"epoch": 1.32,
"grad_norm": 1.7764524004410311,
"learning_rate": 1.3800048097093193e-05,
"loss": 1.176,
"step": 1255
},
{
"epoch": 1.32,
"grad_norm": 1.6137325394086428,
"learning_rate": 1.374355202392383e-05,
"loss": 1.1598,
"step": 1260
},
{
"epoch": 1.33,
"grad_norm": 0.9905926410538433,
"learning_rate": 1.3686916645398802e-05,
"loss": 1.1713,
"step": 1265
},
{
"epoch": 1.33,
"grad_norm": 0.6252897153910507,
"learning_rate": 1.3630144069038319e-05,
"loss": 1.185,
"step": 1270
},
{
"epoch": 1.34,
"grad_norm": 0.8419531357033517,
"learning_rate": 1.3573236407468002e-05,
"loss": 1.1847,
"step": 1275
},
{
"epoch": 1.34,
"grad_norm": 0.7143224147173993,
"learning_rate": 1.3516195778340287e-05,
"loss": 1.1763,
"step": 1280
},
{
"epoch": 1.35,
"grad_norm": 0.8166891680997591,
"learning_rate": 1.3459024304255597e-05,
"loss": 1.1928,
"step": 1285
},
{
"epoch": 1.35,
"grad_norm": 0.7498465670185521,
"learning_rate": 1.3401724112683376e-05,
"loss": 1.1841,
"step": 1290
},
{
"epoch": 1.36,
"grad_norm": 0.8820226956971534,
"learning_rate": 1.334429733588291e-05,
"loss": 1.1865,
"step": 1295
},
{
"epoch": 1.36,
"grad_norm": 1.0005003720785055,
"learning_rate": 1.328674611082398e-05,
"loss": 1.1808,
"step": 1300
},
{
"epoch": 1.37,
"grad_norm": 0.8103098028876912,
"learning_rate": 1.322907257910736e-05,
"loss": 1.1737,
"step": 1305
},
{
"epoch": 1.37,
"grad_norm": 0.5753071458336478,
"learning_rate": 1.3171278886885092e-05,
"loss": 1.1721,
"step": 1310
},
{
"epoch": 1.38,
"grad_norm": 0.7810412721647562,
"learning_rate": 1.311336718478065e-05,
"loss": 1.1595,
"step": 1315
},
{
"epoch": 1.38,
"grad_norm": 0.8916952697239972,
"learning_rate": 1.3055339627808898e-05,
"loss": 1.1673,
"step": 1320
},
{
"epoch": 1.39,
"grad_norm": 0.8734488156436931,
"learning_rate": 1.2997198375295905e-05,
"loss": 1.1384,
"step": 1325
},
{
"epoch": 1.39,
"grad_norm": 0.6465452485316637,
"learning_rate": 1.293894559079858e-05,
"loss": 1.1685,
"step": 1330
},
{
"epoch": 1.4,
"grad_norm": 0.7638110746281211,
"learning_rate": 1.288058344202417e-05,
"loss": 1.18,
"step": 1335
},
{
"epoch": 1.4,
"grad_norm": 0.8254252496032165,
"learning_rate": 1.2822114100749606e-05,
"loss": 1.1658,
"step": 1340
},
{
"epoch": 1.41,
"grad_norm": 1.0644071307325733,
"learning_rate": 1.2763539742740656e-05,
"loss": 1.1777,
"step": 1345
},
{
"epoch": 1.41,
"grad_norm": 0.6737323414071268,
"learning_rate": 1.2704862547670999e-05,
"loss": 1.1759,
"step": 1350
},
{
"epoch": 1.42,
"grad_norm": 0.8963594388253979,
"learning_rate": 1.2646084699041077e-05,
"loss": 1.1673,
"step": 1355
},
{
"epoch": 1.43,
"grad_norm": 0.8429027666417599,
"learning_rate": 1.2587208384096874e-05,
"loss": 1.1823,
"step": 1360
},
{
"epoch": 1.43,
"grad_norm": 1.0943368368233308,
"learning_rate": 1.2528235793748497e-05,
"loss": 1.1567,
"step": 1365
},
{
"epoch": 1.44,
"grad_norm": 0.7878376571416128,
"learning_rate": 1.246916912248868e-05,
"loss": 1.1792,
"step": 1370
},
{
"epoch": 1.44,
"grad_norm": 1.1231733102691281,
"learning_rate": 1.2410010568311081e-05,
"loss": 1.1832,
"step": 1375
},
{
"epoch": 1.45,
"grad_norm": 1.2962290932079525,
"learning_rate": 1.2350762332628527e-05,
"loss": 1.1749,
"step": 1380
},
{
"epoch": 1.45,
"grad_norm": 0.8841590537545198,
"learning_rate": 1.2291426620191083e-05,
"loss": 1.1643,
"step": 1385
},
{
"epoch": 1.46,
"grad_norm": 0.6312641161855239,
"learning_rate": 1.2232005639003993e-05,
"loss": 1.1712,
"step": 1390
},
{
"epoch": 1.46,
"grad_norm": 0.5880454553402258,
"learning_rate": 1.2172501600245533e-05,
"loss": 1.1612,
"step": 1395
},
{
"epoch": 1.47,
"grad_norm": 1.426056294856292,
"learning_rate": 1.211291671818473e-05,
"loss": 1.1781,
"step": 1400
},
{
"epoch": 1.47,
"grad_norm": 1.6259734065919687,
"learning_rate": 1.2053253210098954e-05,
"loss": 1.1937,
"step": 1405
},
{
"epoch": 1.48,
"grad_norm": 0.8115709432523085,
"learning_rate": 1.1993513296191415e-05,
"loss": 1.1685,
"step": 1410
},
{
"epoch": 1.48,
"grad_norm": 1.090441878398841,
"learning_rate": 1.1933699199508537e-05,
"loss": 1.165,
"step": 1415
},
{
"epoch": 1.49,
"grad_norm": 0.7799184051624892,
"learning_rate": 1.187381314585725e-05,
"loss": 1.1934,
"step": 1420
},
{
"epoch": 1.49,
"grad_norm": 0.789575779962715,
"learning_rate": 1.1813857363722137e-05,
"loss": 1.1792,
"step": 1425
},
{
"epoch": 1.5,
"grad_norm": 0.6748544000484231,
"learning_rate": 1.1753834084182534e-05,
"loss": 1.1729,
"step": 1430
},
{
"epoch": 1.5,
"grad_norm": 0.6128387524725831,
"learning_rate": 1.16937455408295e-05,
"loss": 1.1677,
"step": 1435
},
{
"epoch": 1.51,
"grad_norm": 0.6843706926684864,
"learning_rate": 1.163359396968268e-05,
"loss": 1.1649,
"step": 1440
},
{
"epoch": 1.51,
"grad_norm": 0.7664951509251977,
"learning_rate": 1.1573381609107128e-05,
"loss": 1.1591,
"step": 1445
},
{
"epoch": 1.52,
"grad_norm": 0.7013422721030406,
"learning_rate": 1.1513110699729997e-05,
"loss": 1.1637,
"step": 1450
},
{
"epoch": 1.52,
"grad_norm": 0.6124885601184967,
"learning_rate": 1.1452783484357158e-05,
"loss": 1.1717,
"step": 1455
},
{
"epoch": 1.53,
"grad_norm": 0.6585541054145577,
"learning_rate": 1.139240220788975e-05,
"loss": 1.1914,
"step": 1460
},
{
"epoch": 1.54,
"grad_norm": 0.5239454717735947,
"learning_rate": 1.1331969117240632e-05,
"loss": 1.1943,
"step": 1465
},
{
"epoch": 1.54,
"grad_norm": 0.6348870524297613,
"learning_rate": 1.1271486461250782e-05,
"loss": 1.1783,
"step": 1470
},
{
"epoch": 1.55,
"grad_norm": 1.090240560802293,
"learning_rate": 1.1210956490605604e-05,
"loss": 1.1556,
"step": 1475
},
{
"epoch": 1.55,
"grad_norm": 0.8143044809228668,
"learning_rate": 1.1150381457751184e-05,
"loss": 1.169,
"step": 1480
},
{
"epoch": 1.56,
"grad_norm": 1.0216449105750403,
"learning_rate": 1.108976361681046e-05,
"loss": 1.1814,
"step": 1485
},
{
"epoch": 1.56,
"grad_norm": 0.604523212778367,
"learning_rate": 1.1029105223499348e-05,
"loss": 1.1756,
"step": 1490
},
{
"epoch": 1.57,
"grad_norm": 0.7180871019095254,
"learning_rate": 1.096840853504281e-05,
"loss": 1.1783,
"step": 1495
},
{
"epoch": 1.57,
"grad_norm": 0.6503319657691394,
"learning_rate": 1.0907675810090836e-05,
"loss": 1.188,
"step": 1500
},
{
"epoch": 1.58,
"grad_norm": 0.7372905093890288,
"learning_rate": 1.0846909308634426e-05,
"loss": 1.1672,
"step": 1505
},
{
"epoch": 1.58,
"grad_norm": 0.8031354500807256,
"learning_rate": 1.0786111291921462e-05,
"loss": 1.1719,
"step": 1510
},
{
"epoch": 1.59,
"grad_norm": 0.5688198082488224,
"learning_rate": 1.0725284022372575e-05,
"loss": 1.1681,
"step": 1515
},
{
"epoch": 1.59,
"grad_norm": 0.7380172507844508,
"learning_rate": 1.0664429763496964e-05,
"loss": 1.1606,
"step": 1520
},
{
"epoch": 1.6,
"grad_norm": 1.1076622734272275,
"learning_rate": 1.0603550779808143e-05,
"loss": 1.1828,
"step": 1525
},
{
"epoch": 1.6,
"grad_norm": 0.8235583138487149,
"learning_rate": 1.0542649336739704e-05,
"loss": 1.1716,
"step": 1530
},
{
"epoch": 1.61,
"grad_norm": 0.6316859317208665,
"learning_rate": 1.048172770056098e-05,
"loss": 1.1548,
"step": 1535
},
{
"epoch": 1.61,
"grad_norm": 1.1769292378330949,
"learning_rate": 1.0420788138292751e-05,
"loss": 1.1938,
"step": 1540
},
{
"epoch": 1.62,
"grad_norm": 0.8313556604567502,
"learning_rate": 1.035983291762285e-05,
"loss": 1.1567,
"step": 1545
},
{
"epoch": 1.62,
"grad_norm": 0.9229639294556211,
"learning_rate": 1.0298864306821797e-05,
"loss": 1.1692,
"step": 1550
},
{
"epoch": 1.63,
"grad_norm": 0.6958637526255228,
"learning_rate": 1.023788457465839e-05,
"loss": 1.1753,
"step": 1555
},
{
"epoch": 1.63,
"grad_norm": 1.423393298325319,
"learning_rate": 1.0176895990315267e-05,
"loss": 1.1696,
"step": 1560
},
{
"epoch": 1.64,
"grad_norm": 1.3876922211974263,
"learning_rate": 1.0115900823304486e-05,
"loss": 1.1749,
"step": 1565
},
{
"epoch": 1.65,
"grad_norm": 0.9416334312921356,
"learning_rate": 1.005490134338305e-05,
"loss": 1.1685,
"step": 1570
},
{
"epoch": 1.65,
"grad_norm": 0.5740299734599242,
"learning_rate": 9.993899820468454e-06,
"loss": 1.1564,
"step": 1575
},
{
"epoch": 1.66,
"grad_norm": 0.7091361792604712,
"learning_rate": 9.932898524554225e-06,
"loss": 1.1565,
"step": 1580
},
{
"epoch": 1.66,
"grad_norm": 0.5374693466514453,
"learning_rate": 9.871899725625438e-06,
"loss": 1.187,
"step": 1585
},
{
"epoch": 1.67,
"grad_norm": 0.7681591496256306,
"learning_rate": 9.810905693574248e-06,
"loss": 1.1688,
"step": 1590
},
{
"epoch": 1.67,
"grad_norm": 1.203587813415243,
"learning_rate": 9.74991869811543e-06,
"loss": 1.1831,
"step": 1595
},
{
"epoch": 1.68,
"grad_norm": 0.6687000750633519,
"learning_rate": 9.68894100870191e-06,
"loss": 1.1853,
"step": 1600
},
{
"epoch": 1.68,
"grad_norm": 0.9509797549709357,
"learning_rate": 9.627974894440315e-06,
"loss": 1.1589,
"step": 1605
},
{
"epoch": 1.69,
"grad_norm": 1.1175268207785156,
"learning_rate": 9.567022624006538e-06,
"loss": 1.1665,
"step": 1610
},
{
"epoch": 1.69,
"grad_norm": 0.5239676322203335,
"learning_rate": 9.50608646556131e-06,
"loss": 1.179,
"step": 1615
},
{
"epoch": 1.7,
"grad_norm": 1.102036187195103,
"learning_rate": 9.445168686665814e-06,
"loss": 1.1569,
"step": 1620
},
{
"epoch": 1.7,
"grad_norm": 0.9946521310612318,
"learning_rate": 9.38427155419728e-06,
"loss": 1.1824,
"step": 1625
},
{
"epoch": 1.71,
"grad_norm": 0.6323716236426264,
"learning_rate": 9.323397334264646e-06,
"loss": 1.1771,
"step": 1630
},
{
"epoch": 1.71,
"grad_norm": 1.1356317631006072,
"learning_rate": 9.262548292124224e-06,
"loss": 1.1773,
"step": 1635
},
{
"epoch": 1.72,
"grad_norm": 0.6221433694055266,
"learning_rate": 9.201726692095405e-06,
"loss": 1.174,
"step": 1640
},
{
"epoch": 1.72,
"grad_norm": 1.0336456992038663,
"learning_rate": 9.140934797476418e-06,
"loss": 1.1607,
"step": 1645
},
{
"epoch": 1.73,
"grad_norm": 0.5833610110725996,
"learning_rate": 9.080174870460075e-06,
"loss": 1.1597,
"step": 1650
},
{
"epoch": 1.73,
"grad_norm": 0.5504241757602442,
"learning_rate": 9.01944917204961e-06,
"loss": 1.1557,
"step": 1655
},
{
"epoch": 1.74,
"grad_norm": 0.5513897786814727,
"learning_rate": 8.958759961974548e-06,
"loss": 1.1759,
"step": 1660
},
{
"epoch": 1.74,
"grad_norm": 0.6761529083201228,
"learning_rate": 8.898109498606595e-06,
"loss": 1.1644,
"step": 1665
},
{
"epoch": 1.75,
"grad_norm": 0.510298654348033,
"learning_rate": 8.837500038875624e-06,
"loss": 1.1743,
"step": 1670
},
{
"epoch": 1.76,
"grad_norm": 0.6635295343261142,
"learning_rate": 8.776933838185669e-06,
"loss": 1.1796,
"step": 1675
},
{
"epoch": 1.76,
"grad_norm": 0.5621485277936616,
"learning_rate": 8.716413150331008e-06,
"loss": 1.1661,
"step": 1680
},
{
"epoch": 1.77,
"grad_norm": 0.6727200783293344,
"learning_rate": 8.655940227412289e-06,
"loss": 1.1611,
"step": 1685
},
{
"epoch": 1.77,
"grad_norm": 0.6996297230281375,
"learning_rate": 8.595517319752728e-06,
"loss": 1.201,
"step": 1690
},
{
"epoch": 1.78,
"grad_norm": 0.6530153092369445,
"learning_rate": 8.535146675814376e-06,
"loss": 1.1697,
"step": 1695
},
{
"epoch": 1.78,
"grad_norm": 0.8296987665536919,
"learning_rate": 8.474830542114435e-06,
"loss": 1.1682,
"step": 1700
},
{
"epoch": 1.79,
"grad_norm": 0.5776648028392474,
"learning_rate": 8.41457116314167e-06,
"loss": 1.1522,
"step": 1705
},
{
"epoch": 1.79,
"grad_norm": 0.5654344393900492,
"learning_rate": 8.354370781272877e-06,
"loss": 1.1663,
"step": 1710
},
{
"epoch": 1.8,
"grad_norm": 0.7184505095246639,
"learning_rate": 8.294231636689465e-06,
"loss": 1.1749,
"step": 1715
},
{
"epoch": 1.8,
"grad_norm": 0.6433111512147223,
"learning_rate": 8.234155967294062e-06,
"loss": 1.1502,
"step": 1720
},
{
"epoch": 1.81,
"grad_norm": 1.1499746704740308,
"learning_rate": 8.174146008627252e-06,
"loss": 1.1572,
"step": 1725
},
{
"epoch": 1.81,
"grad_norm": 1.5033353170719848,
"learning_rate": 8.114203993784395e-06,
"loss": 1.1729,
"step": 1730
},
{
"epoch": 1.82,
"grad_norm": 0.5430907023905489,
"learning_rate": 8.05433215333251e-06,
"loss": 1.1818,
"step": 1735
},
{
"epoch": 1.82,
"grad_norm": 1.120155366638794,
"learning_rate": 7.99453271522729e-06,
"loss": 1.1806,
"step": 1740
},
{
"epoch": 1.83,
"grad_norm": 0.6834215748747892,
"learning_rate": 7.934807904730182e-06,
"loss": 1.1877,
"step": 1745
},
{
"epoch": 1.83,
"grad_norm": 0.6698489940313849,
"learning_rate": 7.875159944325582e-06,
"loss": 1.1614,
"step": 1750
},
{
"epoch": 1.84,
"grad_norm": 0.6463681773869416,
"learning_rate": 7.81559105363814e-06,
"loss": 1.1523,
"step": 1755
},
{
"epoch": 1.84,
"grad_norm": 0.6229728598122368,
"learning_rate": 7.75610344935015e-06,
"loss": 1.1738,
"step": 1760
},
{
"epoch": 1.85,
"grad_norm": 0.7129040965822541,
"learning_rate": 7.696699345119078e-06,
"loss": 1.1659,
"step": 1765
},
{
"epoch": 1.85,
"grad_norm": 0.820666793110945,
"learning_rate": 7.637380951495175e-06,
"loss": 1.1676,
"step": 1770
},
{
"epoch": 1.86,
"grad_norm": 1.0697417131815192,
"learning_rate": 7.578150475839221e-06,
"loss": 1.1502,
"step": 1775
},
{
"epoch": 1.87,
"grad_norm": 0.736350496476206,
"learning_rate": 7.519010122240389e-06,
"loss": 1.1594,
"step": 1780
},
{
"epoch": 1.87,
"grad_norm": 0.5237499283236696,
"learning_rate": 7.459962091434214e-06,
"loss": 1.1683,
"step": 1785
},
{
"epoch": 1.88,
"grad_norm": 0.5655444143867623,
"learning_rate": 7.401008580720725e-06,
"loss": 1.1583,
"step": 1790
},
{
"epoch": 1.88,
"grad_norm": 0.5707432866254527,
"learning_rate": 7.342151783882647e-06,
"loss": 1.1562,
"step": 1795
},
{
"epoch": 1.89,
"grad_norm": 0.7849566159320196,
"learning_rate": 7.283393891103787e-06,
"loss": 1.1834,
"step": 1800
},
{
"epoch": 1.89,
"grad_norm": 0.5891941886864047,
"learning_rate": 7.224737088887523e-06,
"loss": 1.1854,
"step": 1805
},
{
"epoch": 1.9,
"grad_norm": 0.7232811486904074,
"learning_rate": 7.166183559975442e-06,
"loss": 1.1829,
"step": 1810
},
{
"epoch": 1.9,
"grad_norm": 0.5522278764215799,
"learning_rate": 7.107735483266122e-06,
"loss": 1.1607,
"step": 1815
},
{
"epoch": 1.91,
"grad_norm": 0.6672547623917537,
"learning_rate": 7.049395033734045e-06,
"loss": 1.1929,
"step": 1820
},
{
"epoch": 1.91,
"grad_norm": 0.6276933889391908,
"learning_rate": 6.991164382348657e-06,
"loss": 1.1612,
"step": 1825
},
{
"epoch": 1.92,
"grad_norm": 0.5067309776821597,
"learning_rate": 6.933045695993583e-06,
"loss": 1.1747,
"step": 1830
},
{
"epoch": 1.92,
"grad_norm": 0.5809711787697803,
"learning_rate": 6.875041137386011e-06,
"loss": 1.1854,
"step": 1835
},
{
"epoch": 1.93,
"grad_norm": 0.6578066740151318,
"learning_rate": 6.8171528649961885e-06,
"loss": 1.1899,
"step": 1840
},
{
"epoch": 1.93,
"grad_norm": 0.5311339794807645,
"learning_rate": 6.759383032967106e-06,
"loss": 1.1556,
"step": 1845
},
{
"epoch": 1.94,
"grad_norm": 0.8242375492371855,
"learning_rate": 6.701733791034353e-06,
"loss": 1.17,
"step": 1850
},
{
"epoch": 1.94,
"grad_norm": 0.4995109952563037,
"learning_rate": 6.644207284446099e-06,
"loss": 1.1649,
"step": 1855
},
{
"epoch": 1.95,
"grad_norm": 0.6215638849366457,
"learning_rate": 6.586805653883292e-06,
"loss": 1.1661,
"step": 1860
},
{
"epoch": 1.95,
"grad_norm": 0.8913758463182494,
"learning_rate": 6.529531035379969e-06,
"loss": 1.1726,
"step": 1865
},
{
"epoch": 1.96,
"grad_norm": 0.6635746323440206,
"learning_rate": 6.472385560243788e-06,
"loss": 1.194,
"step": 1870
},
{
"epoch": 1.96,
"grad_norm": 0.5883239994771402,
"learning_rate": 6.4153713549767184e-06,
"loss": 1.1614,
"step": 1875
},
{
"epoch": 1.97,
"grad_norm": 0.6170833224316274,
"learning_rate": 6.358490541195899e-06,
"loss": 1.1631,
"step": 1880
},
{
"epoch": 1.98,
"grad_norm": 0.6020456873462202,
"learning_rate": 6.301745235554695e-06,
"loss": 1.1531,
"step": 1885
},
{
"epoch": 1.98,
"grad_norm": 0.5385380342656197,
"learning_rate": 6.245137549663938e-06,
"loss": 1.1737,
"step": 1890
},
{
"epoch": 1.99,
"grad_norm": 0.4991012602339736,
"learning_rate": 6.188669590013336e-06,
"loss": 1.1586,
"step": 1895
},
{
"epoch": 1.99,
"grad_norm": 0.6599426549894339,
"learning_rate": 6.132343457893093e-06,
"loss": 1.1706,
"step": 1900
},
{
"epoch": 2.0,
"grad_norm": 0.7674322359131748,
"learning_rate": 6.076161249315715e-06,
"loss": 1.1523,
"step": 1905
},
{
"epoch": 2.0,
"eval_loss": 1.2080984115600586,
"eval_runtime": 242.64,
"eval_samples_per_second": 55.725,
"eval_steps_per_second": 3.487,
"step": 1908
},
{
"epoch": 2.0,
"grad_norm": 0.6108377736438585,
"learning_rate": 6.020125054938024e-06,
"loss": 1.172,
"step": 1910
},
{
"epoch": 2.01,
"grad_norm": 0.6355792058389351,
"learning_rate": 5.964236959983337e-06,
"loss": 1.1208,
"step": 1915
},
{
"epoch": 2.01,
"grad_norm": 0.879239535625017,
"learning_rate": 5.9084990441638905e-06,
"loss": 1.115,
"step": 1920
},
{
"epoch": 2.02,
"grad_norm": 0.5084102868781382,
"learning_rate": 5.852913381603439e-06,
"loss": 1.1317,
"step": 1925
},
{
"epoch": 2.02,
"grad_norm": 0.6540246598987741,
"learning_rate": 5.797482040760074e-06,
"loss": 1.1313,
"step": 1930
},
{
"epoch": 2.03,
"grad_norm": 0.5504097794011897,
"learning_rate": 5.742207084349274e-06,
"loss": 1.1425,
"step": 1935
},
{
"epoch": 2.03,
"grad_norm": 0.5729704981717381,
"learning_rate": 5.687090569267102e-06,
"loss": 1.1083,
"step": 1940
},
{
"epoch": 2.04,
"grad_norm": 0.5859997831020897,
"learning_rate": 5.632134546513706e-06,
"loss": 1.112,
"step": 1945
},
{
"epoch": 2.04,
"grad_norm": 0.5010327625190149,
"learning_rate": 5.577341061116971e-06,
"loss": 1.1271,
"step": 1950
},
{
"epoch": 2.05,
"grad_norm": 0.5443268320423013,
"learning_rate": 5.52271215205644e-06,
"loss": 1.1367,
"step": 1955
},
{
"epoch": 2.05,
"grad_norm": 0.6044785266663183,
"learning_rate": 5.468249852187418e-06,
"loss": 1.1458,
"step": 1960
},
{
"epoch": 2.06,
"grad_norm": 0.5276461764121819,
"learning_rate": 5.413956188165341e-06,
"loss": 1.128,
"step": 1965
},
{
"epoch": 2.06,
"grad_norm": 0.5048766724880872,
"learning_rate": 5.359833180370353e-06,
"loss": 1.1122,
"step": 1970
},
{
"epoch": 2.07,
"grad_norm": 0.6267033939873857,
"learning_rate": 5.305882842832119e-06,
"loss": 1.1293,
"step": 1975
},
{
"epoch": 2.07,
"grad_norm": 0.6614222318335767,
"learning_rate": 5.2521071831549e-06,
"loss": 1.1601,
"step": 1980
},
{
"epoch": 2.08,
"grad_norm": 0.5076709648516241,
"learning_rate": 5.1985082024428155e-06,
"loss": 1.1198,
"step": 1985
},
{
"epoch": 2.09,
"grad_norm": 0.6230140336196388,
"learning_rate": 5.145087895225402e-06,
"loss": 1.1501,
"step": 1990
},
{
"epoch": 2.09,
"grad_norm": 0.931744789227786,
"learning_rate": 5.091848249383379e-06,
"loss": 1.1256,
"step": 1995
},
{
"epoch": 2.1,
"grad_norm": 1.0808597918500182,
"learning_rate": 5.038791246074677e-06,
"loss": 1.1333,
"step": 2000
},
{
"epoch": 2.1,
"grad_norm": 1.1272890459984872,
"learning_rate": 4.985918859660732e-06,
"loss": 1.1151,
"step": 2005
},
{
"epoch": 2.11,
"grad_norm": 0.8907484355351619,
"learning_rate": 4.933233057632989e-06,
"loss": 1.1291,
"step": 2010
},
{
"epoch": 2.11,
"grad_norm": 0.6868418315412614,
"learning_rate": 4.880735800539703e-06,
"loss": 1.1173,
"step": 2015
},
{
"epoch": 2.12,
"grad_norm": 0.6603790321835555,
"learning_rate": 4.828429041912981e-06,
"loss": 1.1086,
"step": 2020
},
{
"epoch": 2.12,
"grad_norm": 0.6097972777999011,
"learning_rate": 4.77631472819608e-06,
"loss": 1.1375,
"step": 2025
},
{
"epoch": 2.13,
"grad_norm": 0.6811294046642341,
"learning_rate": 4.724394798670997e-06,
"loss": 1.1305,
"step": 2030
},
{
"epoch": 2.13,
"grad_norm": 0.5920286096312233,
"learning_rate": 4.672671185386273e-06,
"loss": 1.1426,
"step": 2035
},
{
"epoch": 2.14,
"grad_norm": 0.5311324267932382,
"learning_rate": 4.621145813085117e-06,
"loss": 1.1341,
"step": 2040
},
{
"epoch": 2.14,
"grad_norm": 0.6669660304061911,
"learning_rate": 4.569820599133789e-06,
"loss": 1.1311,
"step": 2045
},
{
"epoch": 2.15,
"grad_norm": 0.6342449368188828,
"learning_rate": 4.518697453450229e-06,
"loss": 1.1318,
"step": 2050
},
{
"epoch": 2.15,
"grad_norm": 0.5980979854288464,
"learning_rate": 4.467778278432997e-06,
"loss": 1.114,
"step": 2055
},
{
"epoch": 2.16,
"grad_norm": 0.7291034384771369,
"learning_rate": 4.4170649688904896e-06,
"loss": 1.1238,
"step": 2060
},
{
"epoch": 2.16,
"grad_norm": 0.5599808785726302,
"learning_rate": 4.366559411970413e-06,
"loss": 1.1252,
"step": 2065
},
{
"epoch": 2.17,
"grad_norm": 0.5641041640576039,
"learning_rate": 4.316263487089567e-06,
"loss": 1.1421,
"step": 2070
},
{
"epoch": 2.17,
"grad_norm": 0.5682156948565344,
"learning_rate": 4.2661790658639055e-06,
"loss": 1.1377,
"step": 2075
},
{
"epoch": 2.18,
"grad_norm": 0.5824756644983848,
"learning_rate": 4.216308012038903e-06,
"loss": 1.1352,
"step": 2080
},
{
"epoch": 2.18,
"grad_norm": 0.5426891122639489,
"learning_rate": 4.166652181420177e-06,
"loss": 1.1225,
"step": 2085
},
{
"epoch": 2.19,
"grad_norm": 0.8420072847752859,
"learning_rate": 4.117213421804445e-06,
"loss": 1.1163,
"step": 2090
},
{
"epoch": 2.2,
"grad_norm": 0.5817440149929336,
"learning_rate": 4.067993572910759e-06,
"loss": 1.1137,
"step": 2095
},
{
"epoch": 2.2,
"grad_norm": 0.5674357426562558,
"learning_rate": 4.01899446631206e-06,
"loss": 1.1306,
"step": 2100
},
{
"epoch": 2.21,
"grad_norm": 0.5385862302848274,
"learning_rate": 3.9702179253669925e-06,
"loss": 1.1262,
"step": 2105
},
{
"epoch": 2.21,
"grad_norm": 0.6659835785229048,
"learning_rate": 3.921665765152079e-06,
"loss": 1.1257,
"step": 2110
},
{
"epoch": 2.22,
"grad_norm": 0.7278111818799208,
"learning_rate": 3.87333979239417e-06,
"loss": 1.1158,
"step": 2115
},
{
"epoch": 2.22,
"grad_norm": 0.6234519865995276,
"learning_rate": 3.825241805403201e-06,
"loss": 1.1246,
"step": 2120
},
{
"epoch": 2.23,
"grad_norm": 0.5083706887918839,
"learning_rate": 3.777373594005298e-06,
"loss": 1.1221,
"step": 2125
},
{
"epoch": 2.23,
"grad_norm": 0.5099991707335331,
"learning_rate": 3.729736939476147e-06,
"loss": 1.1462,
"step": 2130
},
{
"epoch": 2.24,
"grad_norm": 0.5977976653156017,
"learning_rate": 3.6823336144747248e-06,
"loss": 1.143,
"step": 2135
},
{
"epoch": 2.24,
"grad_norm": 0.7383812403362637,
"learning_rate": 3.6351653829773315e-06,
"loss": 1.126,
"step": 2140
},
{
"epoch": 2.25,
"grad_norm": 0.5942576317191711,
"learning_rate": 3.5882340002119466e-06,
"loss": 1.1326,
"step": 2145
},
{
"epoch": 2.25,
"grad_norm": 0.5565794632031579,
"learning_rate": 3.541541212592924e-06,
"loss": 1.1334,
"step": 2150
},
{
"epoch": 2.26,
"grad_norm": 0.49181110359174474,
"learning_rate": 3.495088757655989e-06,
"loss": 1.1466,
"step": 2155
},
{
"epoch": 2.26,
"grad_norm": 0.5555605529718436,
"learning_rate": 3.4488783639935875e-06,
"loss": 1.1301,
"step": 2160
},
{
"epoch": 2.27,
"grad_norm": 0.5448046312610463,
"learning_rate": 3.402911751190565e-06,
"loss": 1.1365,
"step": 2165
},
{
"epoch": 2.27,
"grad_norm": 0.6314458157350088,
"learning_rate": 3.3571906297601697e-06,
"loss": 1.1209,
"step": 2170
},
{
"epoch": 2.28,
"grad_norm": 0.6212971379518492,
"learning_rate": 3.3117167010804142e-06,
"loss": 1.1227,
"step": 2175
},
{
"epoch": 2.28,
"grad_norm": 0.5342655049092973,
"learning_rate": 3.2664916573307483e-06,
"loss": 1.1234,
"step": 2180
},
{
"epoch": 2.29,
"grad_norm": 0.6542876578152053,
"learning_rate": 3.2215171814290924e-06,
"loss": 1.1127,
"step": 2185
},
{
"epoch": 2.29,
"grad_norm": 0.5361704587765225,
"learning_rate": 3.176794946969227e-06,
"loss": 1.1228,
"step": 2190
},
{
"epoch": 2.3,
"grad_norm": 0.4958500428125291,
"learning_rate": 3.1323266181584967e-06,
"loss": 1.1377,
"step": 2195
},
{
"epoch": 2.31,
"grad_norm": 0.5446841164025293,
"learning_rate": 3.088113849755885e-06,
"loss": 1.1242,
"step": 2200
},
{
"epoch": 2.31,
"grad_norm": 0.5430688386297582,
"learning_rate": 3.0441582870104537e-06,
"loss": 1.1266,
"step": 2205
},
{
"epoch": 2.32,
"grad_norm": 0.5254359242007764,
"learning_rate": 3.000461565600096e-06,
"loss": 1.1149,
"step": 2210
},
{
"epoch": 2.32,
"grad_norm": 0.5402763846114558,
"learning_rate": 2.9570253115706802e-06,
"loss": 1.1271,
"step": 2215
},
{
"epoch": 2.33,
"grad_norm": 0.6222999681398729,
"learning_rate": 2.9138511412755553e-06,
"loss": 1.1305,
"step": 2220
},
{
"epoch": 2.33,
"grad_norm": 1.134165507110343,
"learning_rate": 2.8709406613153757e-06,
"loss": 1.1235,
"step": 2225
},
{
"epoch": 2.34,
"grad_norm": 0.5175878284379065,
"learning_rate": 2.8282954684783337e-06,
"loss": 1.1353,
"step": 2230
},
{
"epoch": 2.34,
"grad_norm": 0.5005769123013962,
"learning_rate": 2.785917149680738e-06,
"loss": 1.1171,
"step": 2235
},
{
"epoch": 2.35,
"grad_norm": 0.5485234561749593,
"learning_rate": 2.7438072819079553e-06,
"loss": 1.111,
"step": 2240
},
{
"epoch": 2.35,
"grad_norm": 0.5180512042054937,
"learning_rate": 2.70196743215574e-06,
"loss": 1.1332,
"step": 2245
},
{
"epoch": 2.36,
"grad_norm": 0.5318572208372209,
"learning_rate": 2.660399157371907e-06,
"loss": 1.1405,
"step": 2250
},
{
"epoch": 2.36,
"grad_norm": 0.5236645040470878,
"learning_rate": 2.619104004398403e-06,
"loss": 1.116,
"step": 2255
},
{
"epoch": 2.37,
"grad_norm": 0.505778509055168,
"learning_rate": 2.5780835099137446e-06,
"loss": 1.1258,
"step": 2260
},
{
"epoch": 2.37,
"grad_norm": 0.5240908498566061,
"learning_rate": 2.5373392003758333e-06,
"loss": 1.126,
"step": 2265
},
{
"epoch": 2.38,
"grad_norm": 0.5018783209220711,
"learning_rate": 2.4968725919651614e-06,
"loss": 1.1192,
"step": 2270
},
{
"epoch": 2.38,
"grad_norm": 0.502218350959747,
"learning_rate": 2.4566851905283774e-06,
"loss": 1.13,
"step": 2275
},
{
"epoch": 2.39,
"grad_norm": 0.5568603717029933,
"learning_rate": 2.4167784915222592e-06,
"loss": 1.131,
"step": 2280
},
{
"epoch": 2.39,
"grad_norm": 0.5715894479029213,
"learning_rate": 2.3771539799580645e-06,
"loss": 1.1334,
"step": 2285
},
{
"epoch": 2.4,
"grad_norm": 0.5267964480808124,
"learning_rate": 2.337813130346267e-06,
"loss": 1.1267,
"step": 2290
},
{
"epoch": 2.4,
"grad_norm": 0.5212598608716227,
"learning_rate": 2.2987574066416963e-06,
"loss": 1.1106,
"step": 2295
},
{
"epoch": 2.41,
"grad_norm": 0.5625446019216419,
"learning_rate": 2.2599882621890467e-06,
"loss": 1.1403,
"step": 2300
},
{
"epoch": 2.42,
"grad_norm": 0.5040192141998451,
"learning_rate": 2.2215071396688058e-06,
"loss": 1.1029,
"step": 2305
},
{
"epoch": 2.42,
"grad_norm": 0.5122958595236623,
"learning_rate": 2.1833154710435657e-06,
"loss": 1.1241,
"step": 2310
},
{
"epoch": 2.43,
"grad_norm": 0.491900310035707,
"learning_rate": 2.1454146775047334e-06,
"loss": 1.1552,
"step": 2315
},
{
"epoch": 2.43,
"grad_norm": 0.5519547637040755,
"learning_rate": 2.1078061694196584e-06,
"loss": 1.1247,
"step": 2320
},
{
"epoch": 2.44,
"grad_norm": 0.5670375507717159,
"learning_rate": 2.070491346279131e-06,
"loss": 1.1281,
"step": 2325
},
{
"epoch": 2.44,
"grad_norm": 0.7387779425602033,
"learning_rate": 2.033471596645318e-06,
"loss": 1.1342,
"step": 2330
},
{
"epoch": 2.45,
"grad_norm": 0.5119585269280015,
"learning_rate": 1.9967482981000896e-06,
"loss": 1.1354,
"step": 2335
},
{
"epoch": 2.45,
"grad_norm": 0.7484987761106697,
"learning_rate": 1.9603228171937505e-06,
"loss": 1.1253,
"step": 2340
},
{
"epoch": 2.46,
"grad_norm": 0.5680645954314844,
"learning_rate": 1.9241965093941906e-06,
"loss": 1.1139,
"step": 2345
},
{
"epoch": 2.46,
"grad_norm": 0.5242001099591352,
"learning_rate": 1.8883707190364552e-06,
"loss": 1.1313,
"step": 2350
},
{
"epoch": 2.47,
"grad_norm": 0.5579618363338916,
"learning_rate": 1.8528467792727023e-06,
"loss": 1.1163,
"step": 2355
},
{
"epoch": 2.47,
"grad_norm": 0.6243087656360263,
"learning_rate": 1.8176260120225985e-06,
"loss": 1.1176,
"step": 2360
},
{
"epoch": 2.48,
"grad_norm": 1.7068933761542626,
"learning_rate": 1.7827097279241446e-06,
"loss": 1.1219,
"step": 2365
},
{
"epoch": 2.48,
"grad_norm": 0.4886930431779373,
"learning_rate": 1.7480992262848773e-06,
"loss": 1.1142,
"step": 2370
},
{
"epoch": 2.49,
"grad_norm": 0.4786373057596891,
"learning_rate": 1.713795795033537e-06,
"loss": 1.1197,
"step": 2375
},
{
"epoch": 2.49,
"grad_norm": 0.6067331789622677,
"learning_rate": 1.6798007106721349e-06,
"loss": 1.1473,
"step": 2380
},
{
"epoch": 2.5,
"grad_norm": 0.47349202299217047,
"learning_rate": 1.6461152382284528e-06,
"loss": 1.13,
"step": 2385
},
{
"epoch": 2.5,
"grad_norm": 0.50666743547298,
"learning_rate": 1.6127406312089755e-06,
"loss": 1.1227,
"step": 2390
},
{
"epoch": 2.51,
"grad_norm": 0.5780759831825091,
"learning_rate": 1.5796781315522302e-06,
"loss": 1.1263,
"step": 2395
},
{
"epoch": 2.51,
"grad_norm": 0.5145283808530754,
"learning_rate": 1.546928969582584e-06,
"loss": 1.1282,
"step": 2400
},
{
"epoch": 2.52,
"grad_norm": 0.5187563761871239,
"learning_rate": 1.5144943639644582e-06,
"loss": 1.1114,
"step": 2405
},
{
"epoch": 2.53,
"grad_norm": 0.6730796328654831,
"learning_rate": 1.4823755216569747e-06,
"loss": 1.1359,
"step": 2410
},
{
"epoch": 2.53,
"grad_norm": 0.5077287898071522,
"learning_rate": 1.4505736378690504e-06,
"loss": 1.1191,
"step": 2415
},
{
"epoch": 2.54,
"grad_norm": 0.4869055108528289,
"learning_rate": 1.4190898960149146e-06,
"loss": 1.1269,
"step": 2420
},
{
"epoch": 2.54,
"grad_norm": 0.5487100234080864,
"learning_rate": 1.3879254676700715e-06,
"loss": 1.119,
"step": 2425
},
{
"epoch": 2.55,
"grad_norm": 0.5619141478397307,
"learning_rate": 1.357081512527708e-06,
"loss": 1.1256,
"step": 2430
},
{
"epoch": 2.55,
"grad_norm": 0.5295178016960835,
"learning_rate": 1.3265591783555343e-06,
"loss": 1.1184,
"step": 2435
},
{
"epoch": 2.56,
"grad_norm": 0.4826005918628002,
"learning_rate": 1.296359600953081e-06,
"loss": 1.1281,
"step": 2440
},
{
"epoch": 2.56,
"grad_norm": 0.5234934481608914,
"learning_rate": 1.2664839041094224e-06,
"loss": 1.1186,
"step": 2445
},
{
"epoch": 2.57,
"grad_norm": 0.5672765758503912,
"learning_rate": 1.2369331995613664e-06,
"loss": 1.1419,
"step": 2450
},
{
"epoch": 2.57,
"grad_norm": 0.5153361493064631,
"learning_rate": 1.20770858695208e-06,
"loss": 1.121,
"step": 2455
},
{
"epoch": 2.58,
"grad_norm": 0.4700812866873563,
"learning_rate": 1.1788111537901703e-06,
"loss": 1.1344,
"step": 2460
},
{
"epoch": 2.58,
"grad_norm": 0.47907430351988983,
"learning_rate": 1.150241975409222e-06,
"loss": 1.1343,
"step": 2465
},
{
"epoch": 2.59,
"grad_norm": 0.7173981450403498,
"learning_rate": 1.1220021149277739e-06,
"loss": 1.1208,
"step": 2470
},
{
"epoch": 2.59,
"grad_norm": 0.48336071065694813,
"learning_rate": 1.0940926232097549e-06,
"loss": 1.1252,
"step": 2475
},
{
"epoch": 2.6,
"grad_norm": 0.4907311025391313,
"learning_rate": 1.0665145388253973e-06,
"loss": 1.1177,
"step": 2480
},
{
"epoch": 2.6,
"grad_norm": 0.5349221336758597,
"learning_rate": 1.0392688880125657e-06,
"loss": 1.1288,
"step": 2485
},
{
"epoch": 2.61,
"grad_norm": 0.49385829217633803,
"learning_rate": 1.012356684638589e-06,
"loss": 1.1264,
"step": 2490
},
{
"epoch": 2.61,
"grad_norm": 0.4667902902659354,
"learning_rate": 9.857789301625176e-07,
"loss": 1.1212,
"step": 2495
},
{
"epoch": 2.62,
"grad_norm": 0.5158817967481687,
"learning_rate": 9.595366135978657e-07,
"loss": 1.1092,
"step": 2500
},
{
"epoch": 2.62,
"grad_norm": 1.5250186054627204,
"learning_rate": 9.336307114758014e-07,
"loss": 1.1144,
"step": 2505
},
{
"epoch": 2.63,
"grad_norm": 0.5339217405564394,
"learning_rate": 9.080621878088203e-07,
"loss": 1.1345,
"step": 2510
},
{
"epoch": 2.63,
"grad_norm": 0.5253157865355497,
"learning_rate": 8.828319940548557e-07,
"loss": 1.1201,
"step": 2515
},
{
"epoch": 2.64,
"grad_norm": 0.6125121884479827,
"learning_rate": 8.579410690818857e-07,
"loss": 1.1423,
"step": 2520
},
{
"epoch": 2.65,
"grad_norm": 0.478023260769249,
"learning_rate": 8.333903391329878e-07,
"loss": 1.1208,
"step": 2525
},
{
"epoch": 2.65,
"grad_norm": 0.4801399490168033,
"learning_rate": 8.091807177918776e-07,
"loss": 1.1575,
"step": 2530
},
{
"epoch": 2.66,
"grad_norm": 0.5214182987941468,
"learning_rate": 7.853131059489139e-07,
"loss": 1.1334,
"step": 2535
},
{
"epoch": 2.66,
"grad_norm": 0.5249516533101624,
"learning_rate": 7.617883917675639e-07,
"loss": 1.1064,
"step": 2540
},
{
"epoch": 2.67,
"grad_norm": 0.4852415218297866,
"learning_rate": 7.38607450651364e-07,
"loss": 1.1231,
"step": 2545
},
{
"epoch": 2.67,
"grad_norm": 0.48251940456449577,
"learning_rate": 7.15771145211337e-07,
"loss": 1.1161,
"step": 2550
},
{
"epoch": 2.68,
"grad_norm": 0.5085684403729371,
"learning_rate": 6.932803252338971e-07,
"loss": 1.13,
"step": 2555
},
{
"epoch": 2.68,
"grad_norm": 0.4890404954235663,
"learning_rate": 6.711358276492296e-07,
"loss": 1.1456,
"step": 2560
},
{
"epoch": 2.69,
"grad_norm": 0.5361102319060816,
"learning_rate": 6.493384765001376e-07,
"loss": 1.1338,
"step": 2565
},
{
"epoch": 2.69,
"grad_norm": 0.517878808787806,
"learning_rate": 6.278890829113859e-07,
"loss": 1.1251,
"step": 2570
},
{
"epoch": 2.7,
"grad_norm": 0.5065553109073893,
"learning_rate": 6.067884450595151e-07,
"loss": 1.12,
"step": 2575
},
{
"epoch": 2.7,
"grad_norm": 0.5028673282413361,
"learning_rate": 5.86037348143137e-07,
"loss": 1.1245,
"step": 2580
},
{
"epoch": 2.71,
"grad_norm": 0.4870963227277234,
"learning_rate": 5.656365643537242e-07,
"loss": 1.1077,
"step": 2585
},
{
"epoch": 2.71,
"grad_norm": 0.5111521346962293,
"learning_rate": 5.455868528468633e-07,
"loss": 1.1519,
"step": 2590
},
{
"epoch": 2.72,
"grad_norm": 0.5388194414428982,
"learning_rate": 5.258889597140159e-07,
"loss": 1.1184,
"step": 2595
},
{
"epoch": 2.72,
"grad_norm": 0.48700880681105724,
"learning_rate": 5.065436179547434e-07,
"loss": 1.1097,
"step": 2600
},
{
"epoch": 2.73,
"grad_norm": 0.4728387494141863,
"learning_rate": 4.875515474494474e-07,
"loss": 1.1353,
"step": 2605
},
{
"epoch": 2.73,
"grad_norm": 0.5199495120809019,
"learning_rate": 4.6891345493256355e-07,
"loss": 1.1165,
"step": 2610
},
{
"epoch": 2.74,
"grad_norm": 0.4814614293781753,
"learning_rate": 4.506300339662717e-07,
"loss": 1.1253,
"step": 2615
},
{
"epoch": 2.74,
"grad_norm": 0.5173576971059289,
"learning_rate": 4.327019649146891e-07,
"loss": 1.1304,
"step": 2620
},
{
"epoch": 2.75,
"grad_norm": 0.6397808529091451,
"learning_rate": 4.1512991491854835e-07,
"loss": 1.1322,
"step": 2625
},
{
"epoch": 2.76,
"grad_norm": 0.4725181894860057,
"learning_rate": 3.979145378703675e-07,
"loss": 1.1346,
"step": 2630
},
{
"epoch": 2.76,
"grad_norm": 0.5458978880301589,
"learning_rate": 3.8105647439013016e-07,
"loss": 1.137,
"step": 2635
},
{
"epoch": 2.77,
"grad_norm": 0.48302580764517844,
"learning_rate": 3.64556351801435e-07,
"loss": 1.1257,
"step": 2640
},
{
"epoch": 2.77,
"grad_norm": 0.4962326101580355,
"learning_rate": 3.484147841081542e-07,
"loss": 1.142,
"step": 2645
},
{
"epoch": 2.78,
"grad_norm": 0.4585051697085661,
"learning_rate": 3.3263237197158957e-07,
"loss": 1.1434,
"step": 2650
},
{
"epoch": 2.78,
"grad_norm": 0.5099548271081592,
"learning_rate": 3.1720970268811823e-07,
"loss": 1.13,
"step": 2655
},
{
"epoch": 2.79,
"grad_norm": 0.4703307632982891,
"learning_rate": 3.0214735016733444e-07,
"loss": 1.1107,
"step": 2660
},
{
"epoch": 2.79,
"grad_norm": 0.5451589072669626,
"learning_rate": 2.8744587491069897e-07,
"loss": 1.1368,
"step": 2665
},
{
"epoch": 2.8,
"grad_norm": 0.4661620281760464,
"learning_rate": 2.7310582399067807e-07,
"loss": 1.1299,
"step": 2670
},
{
"epoch": 2.8,
"grad_norm": 0.4700376493841961,
"learning_rate": 2.5912773103038635e-07,
"loss": 1.1351,
"step": 2675
},
{
"epoch": 2.81,
"grad_norm": 0.5281567728088438,
"learning_rate": 2.455121161837337e-07,
"loss": 1.1348,
"step": 2680
},
{
"epoch": 2.81,
"grad_norm": 0.5203565754462774,
"learning_rate": 2.3225948611605985e-07,
"loss": 1.1178,
"step": 2685
},
{
"epoch": 2.82,
"grad_norm": 0.7533691953849389,
"learning_rate": 2.193703339852904e-07,
"loss": 1.1343,
"step": 2690
},
{
"epoch": 2.82,
"grad_norm": 0.5456152859357938,
"learning_rate": 2.068451394235793e-07,
"loss": 1.1344,
"step": 2695
},
{
"epoch": 2.83,
"grad_norm": 0.471951733281154,
"learning_rate": 1.9468436851946104e-07,
"loss": 1.1183,
"step": 2700
},
{
"epoch": 2.83,
"grad_norm": 0.5054028637411992,
"learning_rate": 1.828884738005121e-07,
"loss": 1.1171,
"step": 2705
},
{
"epoch": 2.84,
"grad_norm": 0.4973211550697257,
"learning_rate": 1.714578942165057e-07,
"loss": 1.1452,
"step": 2710
},
{
"epoch": 2.84,
"grad_norm": 0.4776119270184997,
"learning_rate": 1.603930551230759e-07,
"loss": 1.1234,
"step": 2715
},
{
"epoch": 2.85,
"grad_norm": 0.4678420409776623,
"learning_rate": 1.496943682658958e-07,
"loss": 1.1254,
"step": 2720
},
{
"epoch": 2.85,
"grad_norm": 0.48059962717119603,
"learning_rate": 1.3936223176535202e-07,
"loss": 1.1235,
"step": 2725
},
{
"epoch": 2.86,
"grad_norm": 0.4580176747899339,
"learning_rate": 1.293970301017311e-07,
"loss": 1.1254,
"step": 2730
},
{
"epoch": 2.87,
"grad_norm": 0.4792836332815102,
"learning_rate": 1.197991341009086e-07,
"loss": 1.1214,
"step": 2735
},
{
"epoch": 2.87,
"grad_norm": 0.8388017413294173,
"learning_rate": 1.105689009205535e-07,
"loss": 1.1222,
"step": 2740
},
{
"epoch": 2.88,
"grad_norm": 0.4707921501124877,
"learning_rate": 1.0170667403683665e-07,
"loss": 1.1323,
"step": 2745
},
{
"epoch": 2.88,
"grad_norm": 0.7024386364310734,
"learning_rate": 9.321278323165206e-08,
"loss": 1.1294,
"step": 2750
},
{
"epoch": 2.89,
"grad_norm": 0.5794998801288503,
"learning_rate": 8.508754458033896e-08,
"loss": 1.1284,
"step": 2755
},
{
"epoch": 2.89,
"grad_norm": 0.5210033701753554,
"learning_rate": 7.733126043992233e-08,
"loss": 1.1255,
"step": 2760
},
{
"epoch": 2.9,
"grad_norm": 0.4534516086493498,
"learning_rate": 6.994421943786744e-08,
"loss": 1.1255,
"step": 2765
},
{
"epoch": 2.9,
"grad_norm": 0.5040846527641326,
"learning_rate": 6.292669646132953e-08,
"loss": 1.1196,
"step": 2770
},
{
"epoch": 2.91,
"grad_norm": 0.4776172273958398,
"learning_rate": 5.627895264693206e-08,
"loss": 1.1261,
"step": 2775
},
{
"epoch": 2.91,
"grad_norm": 0.6159306629296952,
"learning_rate": 5.000123537104884e-08,
"loss": 1.123,
"step": 2780
},
{
"epoch": 2.92,
"grad_norm": 0.5082477677931655,
"learning_rate": 4.409377824059147e-08,
"loss": 1.1135,
"step": 2785
},
{
"epoch": 2.92,
"grad_norm": 0.5064357141343069,
"learning_rate": 3.8556801084326244e-08,
"loss": 1.1313,
"step": 2790
},
{
"epoch": 2.93,
"grad_norm": 0.5262632528102094,
"learning_rate": 3.339050994468629e-08,
"loss": 1.1177,
"step": 2795
},
{
"epoch": 2.93,
"grad_norm": 0.46586388144714996,
"learning_rate": 2.8595097070108768e-08,
"loss": 1.1286,
"step": 2800
},
{
"epoch": 2.94,
"grad_norm": 0.4632808157554121,
"learning_rate": 2.417074090788063e-08,
"loss": 1.1403,
"step": 2805
},
{
"epoch": 2.94,
"grad_norm": 0.4779820067851786,
"learning_rate": 2.0117606097492803e-08,
"loss": 1.1225,
"step": 2810
},
{
"epoch": 2.95,
"grad_norm": 0.48020094179715184,
"learning_rate": 1.6435843464522873e-08,
"loss": 1.1228,
"step": 2815
},
{
"epoch": 2.95,
"grad_norm": 0.46922994755027636,
"learning_rate": 1.312559001501179e-08,
"loss": 1.1242,
"step": 2820
},
{
"epoch": 2.96,
"grad_norm": 0.561473699487665,
"learning_rate": 1.018696893037685e-08,
"loss": 1.1238,
"step": 2825
},
{
"epoch": 2.96,
"grad_norm": 0.6935294385086034,
"learning_rate": 7.620089562817568e-09,
"loss": 1.127,
"step": 2830
},
{
"epoch": 2.97,
"grad_norm": 0.5009987624085611,
"learning_rate": 5.425047431254493e-09,
"loss": 1.1349,
"step": 2835
},
{
"epoch": 2.98,
"grad_norm": 0.4758170109030934,
"learning_rate": 3.6019242177698368e-09,
"loss": 1.1115,
"step": 2840
},
{
"epoch": 2.98,
"grad_norm": 0.47357473014321283,
"learning_rate": 2.1507877645687846e-09,
"loss": 1.1206,
"step": 2845
},
{
"epoch": 2.99,
"grad_norm": 0.45459743840166006,
"learning_rate": 1.0716920714570755e-09,
"loss": 1.13,
"step": 2850
},
{
"epoch": 2.99,
"grad_norm": 0.470224105606156,
"learning_rate": 3.646772938292742e-10,
"loss": 1.1261,
"step": 2855
},
{
"epoch": 3.0,
"grad_norm": 0.49294459710268396,
"learning_rate": 2.976974117552267e-11,
"loss": 1.1174,
"step": 2860
},
{
"epoch": 3.0,
"eval_loss": 1.2072049379348755,
"eval_runtime": 242.3055,
"eval_samples_per_second": 55.801,
"eval_steps_per_second": 3.491,
"step": 2862
},
{
"epoch": 3.0,
"step": 2862,
"total_flos": 341168042803200.0,
"train_loss": 0.7671343510089264,
"train_runtime": 16733.4448,
"train_samples_per_second": 21.908,
"train_steps_per_second": 0.171
}
],
"logging_steps": 5,
"max_steps": 2862,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 341168042803200.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}