{ "best_metric": 0.6505705118179321, "best_model_checkpoint": "test-hasy-6/checkpoint-24886", "epoch": 50.0, "eval_steps": 100, "global_step": 27050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 3.252760887145996, "learning_rate": 1.999260628465804e-05, "loss": 3.1209, "step": 10 }, { "epoch": 0.04, "grad_norm": 4.324207305908203, "learning_rate": 1.9985212569316085e-05, "loss": 3.1439, "step": 20 }, { "epoch": 0.06, "grad_norm": 4.484713554382324, "learning_rate": 1.9977818853974124e-05, "loss": 3.0244, "step": 30 }, { "epoch": 0.07, "grad_norm": 3.3624324798583984, "learning_rate": 1.9970425138632164e-05, "loss": 2.981, "step": 40 }, { "epoch": 0.09, "grad_norm": 4.335402488708496, "learning_rate": 1.9963031423290204e-05, "loss": 2.9373, "step": 50 }, { "epoch": 0.11, "grad_norm": 4.105679988861084, "learning_rate": 1.9955637707948247e-05, "loss": 2.9558, "step": 60 }, { "epoch": 0.13, "grad_norm": 3.997988700866699, "learning_rate": 1.9948243992606287e-05, "loss": 2.9417, "step": 70 }, { "epoch": 0.15, "grad_norm": 3.570570468902588, "learning_rate": 1.9940850277264327e-05, "loss": 3.1505, "step": 80 }, { "epoch": 0.17, "grad_norm": 4.583168983459473, "learning_rate": 1.993345656192237e-05, "loss": 3.2608, "step": 90 }, { "epoch": 0.18, "grad_norm": 3.3935163021087646, "learning_rate": 1.9926062846580406e-05, "loss": 3.0807, "step": 100 }, { "epoch": 0.2, "grad_norm": 3.6814966201782227, "learning_rate": 1.991866913123845e-05, "loss": 3.0088, "step": 110 }, { "epoch": 0.22, "grad_norm": 3.9908082485198975, "learning_rate": 1.991127541589649e-05, "loss": 3.2089, "step": 120 }, { "epoch": 0.24, "grad_norm": 4.034529209136963, "learning_rate": 1.990388170055453e-05, "loss": 2.998, "step": 130 }, { "epoch": 0.26, "grad_norm": 4.857136249542236, "learning_rate": 1.9896487985212572e-05, "loss": 2.9093, "step": 140 }, { "epoch": 0.28, "grad_norm": 3.0363869667053223, "learning_rate": 1.9889094269870612e-05, "loss": 2.9522, "step": 150 }, { "epoch": 0.3, "grad_norm": 5.306853294372559, "learning_rate": 1.9881700554528652e-05, "loss": 3.2575, "step": 160 }, { "epoch": 0.31, "grad_norm": 4.796173572540283, "learning_rate": 1.987430683918669e-05, "loss": 3.3054, "step": 170 }, { "epoch": 0.33, "grad_norm": 3.989049196243286, "learning_rate": 1.9866913123844735e-05, "loss": 3.0162, "step": 180 }, { "epoch": 0.35, "grad_norm": 3.9188003540039062, "learning_rate": 1.9859519408502774e-05, "loss": 2.9124, "step": 190 }, { "epoch": 0.37, "grad_norm": 4.185227394104004, "learning_rate": 1.9852125693160814e-05, "loss": 3.0018, "step": 200 }, { "epoch": 0.39, "grad_norm": 7.368288993835449, "learning_rate": 1.9844731977818857e-05, "loss": 3.2482, "step": 210 }, { "epoch": 0.41, "grad_norm": 5.437251091003418, "learning_rate": 1.9837338262476897e-05, "loss": 2.8915, "step": 220 }, { "epoch": 0.43, "grad_norm": 4.302529811859131, "learning_rate": 1.9829944547134937e-05, "loss": 3.1141, "step": 230 }, { "epoch": 0.44, "grad_norm": 4.926127910614014, "learning_rate": 1.9822550831792977e-05, "loss": 2.9946, "step": 240 }, { "epoch": 0.46, "grad_norm": 5.317417144775391, "learning_rate": 1.9815157116451017e-05, "loss": 3.036, "step": 250 }, { "epoch": 0.48, "grad_norm": 3.326779842376709, "learning_rate": 1.980776340110906e-05, "loss": 2.8822, "step": 260 }, { "epoch": 0.5, "grad_norm": 3.6894800662994385, "learning_rate": 1.98003696857671e-05, "loss": 3.0709, "step": 270 }, { "epoch": 0.52, "grad_norm": 5.024881362915039, "learning_rate": 1.9792975970425143e-05, "loss": 2.994, "step": 280 }, { "epoch": 0.54, "grad_norm": 5.87685489654541, "learning_rate": 1.978558225508318e-05, "loss": 3.0313, "step": 290 }, { "epoch": 0.55, "grad_norm": 4.148473739624023, "learning_rate": 1.9778188539741222e-05, "loss": 2.824, "step": 300 }, { "epoch": 0.57, "grad_norm": 3.380990743637085, "learning_rate": 1.9770794824399262e-05, "loss": 2.8581, "step": 310 }, { "epoch": 0.59, "grad_norm": 5.89368200302124, "learning_rate": 1.9763401109057302e-05, "loss": 2.8344, "step": 320 }, { "epoch": 0.61, "grad_norm": 4.146866321563721, "learning_rate": 1.9756007393715345e-05, "loss": 2.7313, "step": 330 }, { "epoch": 0.63, "grad_norm": 4.897014617919922, "learning_rate": 1.9748613678373385e-05, "loss": 2.9126, "step": 340 }, { "epoch": 0.65, "grad_norm": 4.273492336273193, "learning_rate": 1.9741219963031425e-05, "loss": 2.9078, "step": 350 }, { "epoch": 0.67, "grad_norm": 3.1492631435394287, "learning_rate": 1.9733826247689464e-05, "loss": 2.8082, "step": 360 }, { "epoch": 0.68, "grad_norm": 4.791119575500488, "learning_rate": 1.9726432532347508e-05, "loss": 3.0234, "step": 370 }, { "epoch": 0.7, "grad_norm": 5.639841079711914, "learning_rate": 1.9719038817005547e-05, "loss": 2.8484, "step": 380 }, { "epoch": 0.72, "grad_norm": 5.128878593444824, "learning_rate": 1.9711645101663587e-05, "loss": 2.9479, "step": 390 }, { "epoch": 0.74, "grad_norm": 5.027972221374512, "learning_rate": 1.970425138632163e-05, "loss": 2.7186, "step": 400 }, { "epoch": 0.76, "grad_norm": 4.988519668579102, "learning_rate": 1.969685767097967e-05, "loss": 2.7704, "step": 410 }, { "epoch": 0.78, "grad_norm": 4.468504905700684, "learning_rate": 1.968946395563771e-05, "loss": 2.8062, "step": 420 }, { "epoch": 0.79, "grad_norm": 3.9717366695404053, "learning_rate": 1.968207024029575e-05, "loss": 3.1325, "step": 430 }, { "epoch": 0.81, "grad_norm": 3.64151930809021, "learning_rate": 1.967467652495379e-05, "loss": 3.0769, "step": 440 }, { "epoch": 0.83, "grad_norm": 4.135356426239014, "learning_rate": 1.9667282809611833e-05, "loss": 2.881, "step": 450 }, { "epoch": 0.85, "grad_norm": 3.905519962310791, "learning_rate": 1.9659889094269872e-05, "loss": 3.0021, "step": 460 }, { "epoch": 0.87, "grad_norm": 4.349752902984619, "learning_rate": 1.9652495378927912e-05, "loss": 2.59, "step": 470 }, { "epoch": 0.89, "grad_norm": 7.4061808586120605, "learning_rate": 1.9645101663585952e-05, "loss": 2.6744, "step": 480 }, { "epoch": 0.91, "grad_norm": 7.038602828979492, "learning_rate": 1.9637707948243995e-05, "loss": 3.1262, "step": 490 }, { "epoch": 0.92, "grad_norm": 4.853476047515869, "learning_rate": 1.9630314232902035e-05, "loss": 3.2875, "step": 500 }, { "epoch": 0.94, "grad_norm": 8.122692108154297, "learning_rate": 1.9622920517560075e-05, "loss": 2.8782, "step": 510 }, { "epoch": 0.96, "grad_norm": 3.7128429412841797, "learning_rate": 1.9615526802218118e-05, "loss": 3.0457, "step": 520 }, { "epoch": 0.98, "grad_norm": 4.896644115447998, "learning_rate": 1.9608133086876158e-05, "loss": 2.9343, "step": 530 }, { "epoch": 1.0, "grad_norm": 6.1974968910217285, "learning_rate": 1.9600739371534197e-05, "loss": 3.0857, "step": 540 }, { "epoch": 1.0, "eval_accuracy": 0.5571725571725572, "eval_loss": 2.4484260082244873, "eval_runtime": 1.6634, "eval_samples_per_second": 289.169, "eval_steps_per_second": 36.672, "step": 541 }, { "epoch": 1.02, "grad_norm": 4.972373008728027, "learning_rate": 1.9593345656192237e-05, "loss": 2.8815, "step": 550 }, { "epoch": 1.04, "grad_norm": 5.010016918182373, "learning_rate": 1.9585951940850277e-05, "loss": 2.7107, "step": 560 }, { "epoch": 1.05, "grad_norm": 6.35477352142334, "learning_rate": 1.957855822550832e-05, "loss": 2.8684, "step": 570 }, { "epoch": 1.07, "grad_norm": 3.8225924968719482, "learning_rate": 1.957116451016636e-05, "loss": 2.7739, "step": 580 }, { "epoch": 1.09, "grad_norm": 5.407051086425781, "learning_rate": 1.9563770794824403e-05, "loss": 2.9076, "step": 590 }, { "epoch": 1.11, "grad_norm": 3.661503553390503, "learning_rate": 1.9556377079482443e-05, "loss": 2.8712, "step": 600 }, { "epoch": 1.13, "grad_norm": 4.298554420471191, "learning_rate": 1.9548983364140483e-05, "loss": 2.7706, "step": 610 }, { "epoch": 1.15, "grad_norm": 4.072779178619385, "learning_rate": 1.9541589648798522e-05, "loss": 2.865, "step": 620 }, { "epoch": 1.16, "grad_norm": 6.765620231628418, "learning_rate": 1.9534195933456562e-05, "loss": 2.5834, "step": 630 }, { "epoch": 1.18, "grad_norm": 4.58746862411499, "learning_rate": 1.9526802218114605e-05, "loss": 2.6017, "step": 640 }, { "epoch": 1.2, "grad_norm": 4.879488468170166, "learning_rate": 1.9519408502772645e-05, "loss": 2.7706, "step": 650 }, { "epoch": 1.22, "grad_norm": 6.828296661376953, "learning_rate": 1.9512014787430685e-05, "loss": 2.8006, "step": 660 }, { "epoch": 1.24, "grad_norm": 4.980978488922119, "learning_rate": 1.9504621072088725e-05, "loss": 2.6562, "step": 670 }, { "epoch": 1.26, "grad_norm": 6.915592670440674, "learning_rate": 1.9497227356746765e-05, "loss": 2.4987, "step": 680 }, { "epoch": 1.28, "grad_norm": 4.394547462463379, "learning_rate": 1.9489833641404808e-05, "loss": 2.8909, "step": 690 }, { "epoch": 1.29, "grad_norm": 7.548334121704102, "learning_rate": 1.9482439926062848e-05, "loss": 2.7494, "step": 700 }, { "epoch": 1.31, "grad_norm": 4.25028657913208, "learning_rate": 1.947504621072089e-05, "loss": 2.6962, "step": 710 }, { "epoch": 1.33, "grad_norm": 3.9434807300567627, "learning_rate": 1.946765249537893e-05, "loss": 2.5333, "step": 720 }, { "epoch": 1.35, "grad_norm": 4.3457841873168945, "learning_rate": 1.946025878003697e-05, "loss": 2.8117, "step": 730 }, { "epoch": 1.37, "grad_norm": 5.3315606117248535, "learning_rate": 1.945286506469501e-05, "loss": 2.9177, "step": 740 }, { "epoch": 1.39, "grad_norm": 5.058216094970703, "learning_rate": 1.944547134935305e-05, "loss": 2.573, "step": 750 }, { "epoch": 1.4, "grad_norm": 5.15712308883667, "learning_rate": 1.9438077634011093e-05, "loss": 3.0234, "step": 760 }, { "epoch": 1.42, "grad_norm": 5.003973960876465, "learning_rate": 1.9430683918669133e-05, "loss": 3.0634, "step": 770 }, { "epoch": 1.44, "grad_norm": 5.414166450500488, "learning_rate": 1.9423290203327173e-05, "loss": 2.8271, "step": 780 }, { "epoch": 1.46, "grad_norm": 5.76715612411499, "learning_rate": 1.9415896487985216e-05, "loss": 2.7979, "step": 790 }, { "epoch": 1.48, "grad_norm": 6.7035393714904785, "learning_rate": 1.9408502772643256e-05, "loss": 2.7153, "step": 800 }, { "epoch": 1.5, "grad_norm": 8.553651809692383, "learning_rate": 1.9401109057301295e-05, "loss": 2.5224, "step": 810 }, { "epoch": 1.52, "grad_norm": 7.570606708526611, "learning_rate": 1.9393715341959335e-05, "loss": 2.5144, "step": 820 }, { "epoch": 1.53, "grad_norm": 4.116618633270264, "learning_rate": 1.9386321626617378e-05, "loss": 2.7629, "step": 830 }, { "epoch": 1.55, "grad_norm": 6.356128692626953, "learning_rate": 1.9378927911275418e-05, "loss": 2.7204, "step": 840 }, { "epoch": 1.57, "grad_norm": 7.290828227996826, "learning_rate": 1.9371534195933458e-05, "loss": 2.841, "step": 850 }, { "epoch": 1.59, "grad_norm": 5.881626605987549, "learning_rate": 1.9364140480591498e-05, "loss": 2.6174, "step": 860 }, { "epoch": 1.61, "grad_norm": 5.055831432342529, "learning_rate": 1.9356746765249537e-05, "loss": 2.7439, "step": 870 }, { "epoch": 1.63, "grad_norm": 7.34787654876709, "learning_rate": 1.934935304990758e-05, "loss": 2.6421, "step": 880 }, { "epoch": 1.65, "grad_norm": 6.8356757164001465, "learning_rate": 1.934195933456562e-05, "loss": 2.7375, "step": 890 }, { "epoch": 1.66, "grad_norm": 3.9384195804595947, "learning_rate": 1.9334565619223664e-05, "loss": 2.8247, "step": 900 }, { "epoch": 1.68, "grad_norm": 5.555051803588867, "learning_rate": 1.9327171903881703e-05, "loss": 2.825, "step": 910 }, { "epoch": 1.7, "grad_norm": 5.856509208679199, "learning_rate": 1.9319778188539743e-05, "loss": 2.6824, "step": 920 }, { "epoch": 1.72, "grad_norm": 4.14664888381958, "learning_rate": 1.9312384473197783e-05, "loss": 2.5481, "step": 930 }, { "epoch": 1.74, "grad_norm": 4.0290303230285645, "learning_rate": 1.9304990757855823e-05, "loss": 2.7097, "step": 940 }, { "epoch": 1.76, "grad_norm": 6.699609756469727, "learning_rate": 1.9297597042513866e-05, "loss": 2.6901, "step": 950 }, { "epoch": 1.77, "grad_norm": 6.287888050079346, "learning_rate": 1.9290203327171906e-05, "loss": 2.5165, "step": 960 }, { "epoch": 1.79, "grad_norm": 6.18824577331543, "learning_rate": 1.9282809611829945e-05, "loss": 2.8045, "step": 970 }, { "epoch": 1.81, "grad_norm": 5.226430892944336, "learning_rate": 1.927541589648799e-05, "loss": 2.9172, "step": 980 }, { "epoch": 1.83, "grad_norm": 4.301969051361084, "learning_rate": 1.9268022181146025e-05, "loss": 2.575, "step": 990 }, { "epoch": 1.85, "grad_norm": 6.760509014129639, "learning_rate": 1.9260628465804068e-05, "loss": 2.5171, "step": 1000 }, { "epoch": 1.87, "grad_norm": 4.816244125366211, "learning_rate": 1.9253234750462108e-05, "loss": 2.4188, "step": 1010 }, { "epoch": 1.89, "grad_norm": 8.158167839050293, "learning_rate": 1.924584103512015e-05, "loss": 3.079, "step": 1020 }, { "epoch": 1.9, "grad_norm": 6.19435453414917, "learning_rate": 1.923844731977819e-05, "loss": 2.3048, "step": 1030 }, { "epoch": 1.92, "grad_norm": 4.324227809906006, "learning_rate": 1.923105360443623e-05, "loss": 2.6988, "step": 1040 }, { "epoch": 1.94, "grad_norm": 4.569825172424316, "learning_rate": 1.922365988909427e-05, "loss": 2.8718, "step": 1050 }, { "epoch": 1.96, "grad_norm": 5.378350734710693, "learning_rate": 1.921626617375231e-05, "loss": 2.6557, "step": 1060 }, { "epoch": 1.98, "grad_norm": 4.976410865783691, "learning_rate": 1.9208872458410353e-05, "loss": 2.6919, "step": 1070 }, { "epoch": 2.0, "grad_norm": 8.337263107299805, "learning_rate": 1.9201478743068393e-05, "loss": 2.3006, "step": 1080 }, { "epoch": 2.0, "eval_accuracy": 0.5904365904365905, "eval_loss": 2.158792018890381, "eval_runtime": 1.5197, "eval_samples_per_second": 316.511, "eval_steps_per_second": 40.14, "step": 1082 }, { "epoch": 2.01, "grad_norm": 4.764266014099121, "learning_rate": 1.9194085027726433e-05, "loss": 2.6972, "step": 1090 }, { "epoch": 2.03, "grad_norm": 10.492167472839355, "learning_rate": 1.9186691312384476e-05, "loss": 2.7993, "step": 1100 }, { "epoch": 2.05, "grad_norm": 5.655553340911865, "learning_rate": 1.9179297597042516e-05, "loss": 2.4557, "step": 1110 }, { "epoch": 2.07, "grad_norm": 5.0441436767578125, "learning_rate": 1.9171903881700556e-05, "loss": 2.6651, "step": 1120 }, { "epoch": 2.09, "grad_norm": 4.979348659515381, "learning_rate": 1.9164510166358596e-05, "loss": 2.6527, "step": 1130 }, { "epoch": 2.11, "grad_norm": 4.502183437347412, "learning_rate": 1.915711645101664e-05, "loss": 2.6776, "step": 1140 }, { "epoch": 2.13, "grad_norm": 6.17383337020874, "learning_rate": 1.914972273567468e-05, "loss": 2.7542, "step": 1150 }, { "epoch": 2.14, "grad_norm": 5.377480983734131, "learning_rate": 1.9142329020332718e-05, "loss": 2.4211, "step": 1160 }, { "epoch": 2.16, "grad_norm": 4.262052536010742, "learning_rate": 1.913493530499076e-05, "loss": 2.7208, "step": 1170 }, { "epoch": 2.18, "grad_norm": 7.188355922698975, "learning_rate": 1.9127541589648798e-05, "loss": 2.624, "step": 1180 }, { "epoch": 2.2, "grad_norm": 2.962965250015259, "learning_rate": 1.912014787430684e-05, "loss": 2.5303, "step": 1190 }, { "epoch": 2.22, "grad_norm": 4.486663818359375, "learning_rate": 1.911275415896488e-05, "loss": 2.5201, "step": 1200 }, { "epoch": 2.24, "grad_norm": 8.877091407775879, "learning_rate": 1.9105360443622924e-05, "loss": 2.6091, "step": 1210 }, { "epoch": 2.26, "grad_norm": 6.5678205490112305, "learning_rate": 1.9097966728280964e-05, "loss": 2.5881, "step": 1220 }, { "epoch": 2.27, "grad_norm": 6.904757976531982, "learning_rate": 1.9090573012939004e-05, "loss": 2.4029, "step": 1230 }, { "epoch": 2.29, "grad_norm": 6.389434814453125, "learning_rate": 1.9083179297597043e-05, "loss": 2.6486, "step": 1240 }, { "epoch": 2.31, "grad_norm": 5.92376708984375, "learning_rate": 1.9075785582255083e-05, "loss": 2.5807, "step": 1250 }, { "epoch": 2.33, "grad_norm": 6.494824409484863, "learning_rate": 1.9068391866913126e-05, "loss": 2.5927, "step": 1260 }, { "epoch": 2.35, "grad_norm": 4.639347553253174, "learning_rate": 1.9060998151571166e-05, "loss": 2.6715, "step": 1270 }, { "epoch": 2.37, "grad_norm": 4.74893856048584, "learning_rate": 1.9053604436229206e-05, "loss": 2.835, "step": 1280 }, { "epoch": 2.38, "grad_norm": 5.14687967300415, "learning_rate": 1.904621072088725e-05, "loss": 2.6049, "step": 1290 }, { "epoch": 2.4, "grad_norm": 4.7364277839660645, "learning_rate": 1.9038817005545285e-05, "loss": 2.5251, "step": 1300 }, { "epoch": 2.42, "grad_norm": 5.866145610809326, "learning_rate": 1.903142329020333e-05, "loss": 2.4343, "step": 1310 }, { "epoch": 2.44, "grad_norm": 5.459924697875977, "learning_rate": 1.902402957486137e-05, "loss": 2.5935, "step": 1320 }, { "epoch": 2.46, "grad_norm": 2.858912944793701, "learning_rate": 1.901663585951941e-05, "loss": 2.2931, "step": 1330 }, { "epoch": 2.48, "grad_norm": 6.834282875061035, "learning_rate": 1.900924214417745e-05, "loss": 2.4241, "step": 1340 }, { "epoch": 2.5, "grad_norm": 4.568212509155273, "learning_rate": 1.900184842883549e-05, "loss": 2.2663, "step": 1350 }, { "epoch": 2.51, "grad_norm": 3.8758466243743896, "learning_rate": 1.8994454713493534e-05, "loss": 2.782, "step": 1360 }, { "epoch": 2.53, "grad_norm": 4.505823612213135, "learning_rate": 1.898706099815157e-05, "loss": 2.5449, "step": 1370 }, { "epoch": 2.55, "grad_norm": 4.957785129547119, "learning_rate": 1.8979667282809614e-05, "loss": 2.4452, "step": 1380 }, { "epoch": 2.57, "grad_norm": 3.6909983158111572, "learning_rate": 1.8972273567467654e-05, "loss": 2.7413, "step": 1390 }, { "epoch": 2.59, "grad_norm": 4.87821102142334, "learning_rate": 1.8964879852125693e-05, "loss": 2.1905, "step": 1400 }, { "epoch": 2.61, "grad_norm": 5.593238353729248, "learning_rate": 1.8957486136783737e-05, "loss": 2.6415, "step": 1410 }, { "epoch": 2.62, "grad_norm": 7.167879104614258, "learning_rate": 1.8950092421441776e-05, "loss": 2.7471, "step": 1420 }, { "epoch": 2.64, "grad_norm": 3.1362695693969727, "learning_rate": 1.8942698706099816e-05, "loss": 2.6451, "step": 1430 }, { "epoch": 2.66, "grad_norm": 4.573113441467285, "learning_rate": 1.8935304990757856e-05, "loss": 2.63, "step": 1440 }, { "epoch": 2.68, "grad_norm": 5.238269805908203, "learning_rate": 1.89279112754159e-05, "loss": 2.2971, "step": 1450 }, { "epoch": 2.7, "grad_norm": 5.956825256347656, "learning_rate": 1.892051756007394e-05, "loss": 2.4407, "step": 1460 }, { "epoch": 2.72, "grad_norm": 4.380825042724609, "learning_rate": 1.891312384473198e-05, "loss": 2.4588, "step": 1470 }, { "epoch": 2.74, "grad_norm": 4.232132434844971, "learning_rate": 1.8905730129390022e-05, "loss": 2.473, "step": 1480 }, { "epoch": 2.75, "grad_norm": 5.387021541595459, "learning_rate": 1.8898336414048058e-05, "loss": 2.3206, "step": 1490 }, { "epoch": 2.77, "grad_norm": 4.617791175842285, "learning_rate": 1.88909426987061e-05, "loss": 2.3927, "step": 1500 }, { "epoch": 2.79, "grad_norm": 4.398989677429199, "learning_rate": 1.888354898336414e-05, "loss": 2.579, "step": 1510 }, { "epoch": 2.81, "grad_norm": 5.663540363311768, "learning_rate": 1.887615526802218e-05, "loss": 2.5161, "step": 1520 }, { "epoch": 2.83, "grad_norm": 3.4754462242126465, "learning_rate": 1.8868761552680224e-05, "loss": 2.3508, "step": 1530 }, { "epoch": 2.85, "grad_norm": 5.5402092933654785, "learning_rate": 1.8861367837338264e-05, "loss": 2.2675, "step": 1540 }, { "epoch": 2.87, "grad_norm": 4.493966102600098, "learning_rate": 1.8853974121996307e-05, "loss": 2.3501, "step": 1550 }, { "epoch": 2.88, "grad_norm": 5.757681846618652, "learning_rate": 1.8846580406654344e-05, "loss": 2.6911, "step": 1560 }, { "epoch": 2.9, "grad_norm": 4.580160617828369, "learning_rate": 1.8839186691312387e-05, "loss": 2.2739, "step": 1570 }, { "epoch": 2.92, "grad_norm": 5.2407307624816895, "learning_rate": 1.8831792975970426e-05, "loss": 2.3847, "step": 1580 }, { "epoch": 2.94, "grad_norm": 5.808749675750732, "learning_rate": 1.8824399260628466e-05, "loss": 2.318, "step": 1590 }, { "epoch": 2.96, "grad_norm": 3.379912853240967, "learning_rate": 1.881700554528651e-05, "loss": 2.1705, "step": 1600 }, { "epoch": 2.98, "grad_norm": 6.693317413330078, "learning_rate": 1.880961182994455e-05, "loss": 2.2205, "step": 1610 }, { "epoch": 2.99, "grad_norm": 7.003932952880859, "learning_rate": 1.880221811460259e-05, "loss": 2.4406, "step": 1620 }, { "epoch": 3.0, "eval_accuracy": 0.6444906444906445, "eval_loss": 1.887886643409729, "eval_runtime": 1.545, "eval_samples_per_second": 311.326, "eval_steps_per_second": 39.482, "step": 1623 }, { "epoch": 3.01, "grad_norm": 5.77971887588501, "learning_rate": 1.879482439926063e-05, "loss": 2.6285, "step": 1630 }, { "epoch": 3.03, "grad_norm": 5.434704780578613, "learning_rate": 1.8787430683918672e-05, "loss": 2.1978, "step": 1640 }, { "epoch": 3.05, "grad_norm": 5.5808000564575195, "learning_rate": 1.8780036968576712e-05, "loss": 2.3736, "step": 1650 }, { "epoch": 3.07, "grad_norm": 7.72202205657959, "learning_rate": 1.877264325323475e-05, "loss": 2.2857, "step": 1660 }, { "epoch": 3.09, "grad_norm": 4.7164740562438965, "learning_rate": 1.8765249537892795e-05, "loss": 2.2698, "step": 1670 }, { "epoch": 3.11, "grad_norm": 4.336970329284668, "learning_rate": 1.875785582255083e-05, "loss": 2.2654, "step": 1680 }, { "epoch": 3.12, "grad_norm": 4.242778301239014, "learning_rate": 1.8750462107208874e-05, "loss": 2.1881, "step": 1690 }, { "epoch": 3.14, "grad_norm": 6.460256576538086, "learning_rate": 1.8743068391866914e-05, "loss": 2.4307, "step": 1700 }, { "epoch": 3.16, "grad_norm": 4.839225769042969, "learning_rate": 1.8735674676524954e-05, "loss": 2.3347, "step": 1710 }, { "epoch": 3.18, "grad_norm": 4.808290004730225, "learning_rate": 1.8728280961182997e-05, "loss": 2.4675, "step": 1720 }, { "epoch": 3.2, "grad_norm": 6.206542491912842, "learning_rate": 1.8720887245841037e-05, "loss": 2.4566, "step": 1730 }, { "epoch": 3.22, "grad_norm": 5.714911937713623, "learning_rate": 1.871349353049908e-05, "loss": 2.4947, "step": 1740 }, { "epoch": 3.23, "grad_norm": 5.4967122077941895, "learning_rate": 1.8706099815157116e-05, "loss": 2.2926, "step": 1750 }, { "epoch": 3.25, "grad_norm": 4.162055969238281, "learning_rate": 1.869870609981516e-05, "loss": 2.4108, "step": 1760 }, { "epoch": 3.27, "grad_norm": 5.215890884399414, "learning_rate": 1.8692051756007395e-05, "loss": 2.6094, "step": 1770 }, { "epoch": 3.29, "grad_norm": 5.482348442077637, "learning_rate": 1.8684658040665434e-05, "loss": 2.3268, "step": 1780 }, { "epoch": 3.31, "grad_norm": 4.174530506134033, "learning_rate": 1.8677264325323478e-05, "loss": 2.4543, "step": 1790 }, { "epoch": 3.33, "grad_norm": 6.862208366394043, "learning_rate": 1.8669870609981517e-05, "loss": 2.389, "step": 1800 }, { "epoch": 3.35, "grad_norm": 6.486968994140625, "learning_rate": 1.8662476894639557e-05, "loss": 2.0596, "step": 1810 }, { "epoch": 3.36, "grad_norm": 5.249489784240723, "learning_rate": 1.86550831792976e-05, "loss": 2.2848, "step": 1820 }, { "epoch": 3.38, "grad_norm": 6.245966911315918, "learning_rate": 1.8647689463955637e-05, "loss": 2.5066, "step": 1830 }, { "epoch": 3.4, "grad_norm": 7.4608988761901855, "learning_rate": 1.864029574861368e-05, "loss": 2.5392, "step": 1840 }, { "epoch": 3.42, "grad_norm": 3.965076446533203, "learning_rate": 1.863290203327172e-05, "loss": 2.2119, "step": 1850 }, { "epoch": 3.44, "grad_norm": 5.980205535888672, "learning_rate": 1.8625508317929763e-05, "loss": 2.1613, "step": 1860 }, { "epoch": 3.46, "grad_norm": 4.33217716217041, "learning_rate": 1.8618114602587803e-05, "loss": 2.295, "step": 1870 }, { "epoch": 3.48, "grad_norm": 9.344121932983398, "learning_rate": 1.8610720887245842e-05, "loss": 1.8096, "step": 1880 }, { "epoch": 3.49, "grad_norm": 5.314825057983398, "learning_rate": 1.8603327171903882e-05, "loss": 2.4459, "step": 1890 }, { "epoch": 3.51, "grad_norm": 4.74118185043335, "learning_rate": 1.8595933456561922e-05, "loss": 2.1148, "step": 1900 }, { "epoch": 3.53, "grad_norm": 5.454382419586182, "learning_rate": 1.8588539741219965e-05, "loss": 2.5157, "step": 1910 }, { "epoch": 3.55, "grad_norm": 6.346231937408447, "learning_rate": 1.8581146025878005e-05, "loss": 2.5551, "step": 1920 }, { "epoch": 3.57, "grad_norm": 7.034316062927246, "learning_rate": 1.8573752310536045e-05, "loss": 2.5645, "step": 1930 }, { "epoch": 3.59, "grad_norm": 3.959404706954956, "learning_rate": 1.8566358595194088e-05, "loss": 2.1851, "step": 1940 }, { "epoch": 3.6, "grad_norm": 4.215963840484619, "learning_rate": 1.8558964879852128e-05, "loss": 2.4418, "step": 1950 }, { "epoch": 3.62, "grad_norm": 6.364134788513184, "learning_rate": 1.8551571164510167e-05, "loss": 2.292, "step": 1960 }, { "epoch": 3.64, "grad_norm": 4.857297420501709, "learning_rate": 1.8544177449168207e-05, "loss": 2.2575, "step": 1970 }, { "epoch": 3.66, "grad_norm": 5.284545421600342, "learning_rate": 1.853678373382625e-05, "loss": 2.461, "step": 1980 }, { "epoch": 3.68, "grad_norm": 4.974768161773682, "learning_rate": 1.852939001848429e-05, "loss": 2.2719, "step": 1990 }, { "epoch": 3.7, "grad_norm": 6.079647541046143, "learning_rate": 1.852199630314233e-05, "loss": 2.5226, "step": 2000 }, { "epoch": 3.72, "grad_norm": 4.387228012084961, "learning_rate": 1.8514602587800373e-05, "loss": 2.3531, "step": 2010 }, { "epoch": 3.73, "grad_norm": 4.595489978790283, "learning_rate": 1.850720887245841e-05, "loss": 2.2488, "step": 2020 }, { "epoch": 3.75, "grad_norm": 4.649653911590576, "learning_rate": 1.8499815157116453e-05, "loss": 2.2152, "step": 2030 }, { "epoch": 3.77, "grad_norm": 6.717101573944092, "learning_rate": 1.8492421441774493e-05, "loss": 2.3524, "step": 2040 }, { "epoch": 3.79, "grad_norm": 7.453444004058838, "learning_rate": 1.8485027726432532e-05, "loss": 2.2822, "step": 2050 }, { "epoch": 3.81, "grad_norm": 8.945061683654785, "learning_rate": 1.8477634011090576e-05, "loss": 2.319, "step": 2060 }, { "epoch": 3.83, "grad_norm": 4.923318862915039, "learning_rate": 1.8470240295748615e-05, "loss": 2.5616, "step": 2070 }, { "epoch": 3.84, "grad_norm": 5.03168249130249, "learning_rate": 1.8462846580406655e-05, "loss": 2.2336, "step": 2080 }, { "epoch": 3.86, "grad_norm": 2.8677268028259277, "learning_rate": 1.8455452865064695e-05, "loss": 2.4687, "step": 2090 }, { "epoch": 3.88, "grad_norm": 4.071888446807861, "learning_rate": 1.8448059149722738e-05, "loss": 2.2483, "step": 2100 }, { "epoch": 3.9, "grad_norm": 3.3788464069366455, "learning_rate": 1.8440665434380778e-05, "loss": 2.1346, "step": 2110 }, { "epoch": 3.92, "grad_norm": 4.101327419281006, "learning_rate": 1.8433271719038818e-05, "loss": 2.3493, "step": 2120 }, { "epoch": 3.94, "grad_norm": 4.824129581451416, "learning_rate": 1.842587800369686e-05, "loss": 1.9549, "step": 2130 }, { "epoch": 3.96, "grad_norm": 6.033672332763672, "learning_rate": 1.84184842883549e-05, "loss": 2.5284, "step": 2140 }, { "epoch": 3.97, "grad_norm": 3.3915722370147705, "learning_rate": 1.841109057301294e-05, "loss": 2.3212, "step": 2150 }, { "epoch": 3.99, "grad_norm": 5.525197982788086, "learning_rate": 1.840369685767098e-05, "loss": 2.342, "step": 2160 }, { "epoch": 4.0, "eval_accuracy": 0.6673596673596673, "eval_loss": 1.7121784687042236, "eval_runtime": 1.5337, "eval_samples_per_second": 313.613, "eval_steps_per_second": 39.772, "step": 2164 }, { "epoch": 4.01, "grad_norm": 6.444073677062988, "learning_rate": 1.8396303142329023e-05, "loss": 2.1484, "step": 2170 }, { "epoch": 4.03, "grad_norm": 5.4424662590026855, "learning_rate": 1.8388909426987063e-05, "loss": 2.2897, "step": 2180 }, { "epoch": 4.05, "grad_norm": 7.729257583618164, "learning_rate": 1.8381515711645103e-05, "loss": 2.1029, "step": 2190 }, { "epoch": 4.07, "grad_norm": 7.3023834228515625, "learning_rate": 1.8374121996303146e-05, "loss": 2.3332, "step": 2200 }, { "epoch": 4.09, "grad_norm": 4.459550857543945, "learning_rate": 1.8366728280961182e-05, "loss": 2.1083, "step": 2210 }, { "epoch": 4.1, "grad_norm": 5.390074729919434, "learning_rate": 1.8359334565619226e-05, "loss": 2.1204, "step": 2220 }, { "epoch": 4.12, "grad_norm": 4.704233169555664, "learning_rate": 1.8351940850277265e-05, "loss": 2.5474, "step": 2230 }, { "epoch": 4.14, "grad_norm": 4.656131267547607, "learning_rate": 1.8344547134935305e-05, "loss": 2.572, "step": 2240 }, { "epoch": 4.16, "grad_norm": 5.0121917724609375, "learning_rate": 1.833715341959335e-05, "loss": 2.3695, "step": 2250 }, { "epoch": 4.18, "grad_norm": 8.962514877319336, "learning_rate": 1.8329759704251388e-05, "loss": 2.0792, "step": 2260 }, { "epoch": 4.2, "grad_norm": 5.248993873596191, "learning_rate": 1.8322365988909428e-05, "loss": 2.298, "step": 2270 }, { "epoch": 4.21, "grad_norm": 4.159375190734863, "learning_rate": 1.8314972273567468e-05, "loss": 2.1424, "step": 2280 }, { "epoch": 4.23, "grad_norm": 2.9768073558807373, "learning_rate": 1.830757855822551e-05, "loss": 2.2786, "step": 2290 }, { "epoch": 4.25, "grad_norm": 3.906672716140747, "learning_rate": 1.830018484288355e-05, "loss": 2.2417, "step": 2300 }, { "epoch": 4.27, "grad_norm": 5.082000255584717, "learning_rate": 1.829279112754159e-05, "loss": 2.4165, "step": 2310 }, { "epoch": 4.29, "grad_norm": 4.825994491577148, "learning_rate": 1.8285397412199634e-05, "loss": 2.0159, "step": 2320 }, { "epoch": 4.31, "grad_norm": 7.900585651397705, "learning_rate": 1.8278003696857673e-05, "loss": 2.1615, "step": 2330 }, { "epoch": 4.33, "grad_norm": 7.153266906738281, "learning_rate": 1.8270609981515713e-05, "loss": 2.226, "step": 2340 }, { "epoch": 4.34, "grad_norm": 4.499361038208008, "learning_rate": 1.8263216266173753e-05, "loss": 2.2553, "step": 2350 }, { "epoch": 4.36, "grad_norm": 5.496757984161377, "learning_rate": 1.8255822550831793e-05, "loss": 2.0807, "step": 2360 }, { "epoch": 4.38, "grad_norm": 5.804848670959473, "learning_rate": 1.8248428835489836e-05, "loss": 2.1043, "step": 2370 }, { "epoch": 4.4, "grad_norm": 3.903456687927246, "learning_rate": 1.8241035120147876e-05, "loss": 1.7245, "step": 2380 }, { "epoch": 4.42, "grad_norm": 5.654705047607422, "learning_rate": 1.823364140480592e-05, "loss": 2.2464, "step": 2390 }, { "epoch": 4.44, "grad_norm": 6.445099830627441, "learning_rate": 1.8226247689463955e-05, "loss": 2.2765, "step": 2400 }, { "epoch": 4.45, "grad_norm": 6.518381595611572, "learning_rate": 1.8218853974122e-05, "loss": 2.1103, "step": 2410 }, { "epoch": 4.47, "grad_norm": 6.7880682945251465, "learning_rate": 1.8211460258780038e-05, "loss": 2.3617, "step": 2420 }, { "epoch": 4.49, "grad_norm": 4.15521764755249, "learning_rate": 1.8204066543438078e-05, "loss": 2.0655, "step": 2430 }, { "epoch": 4.51, "grad_norm": 4.148801803588867, "learning_rate": 1.819667282809612e-05, "loss": 2.0171, "step": 2440 }, { "epoch": 4.53, "grad_norm": 7.420351505279541, "learning_rate": 1.818927911275416e-05, "loss": 2.4612, "step": 2450 }, { "epoch": 4.55, "grad_norm": 7.1027655601501465, "learning_rate": 1.81818853974122e-05, "loss": 2.2333, "step": 2460 }, { "epoch": 4.57, "grad_norm": 7.96389627456665, "learning_rate": 1.817449168207024e-05, "loss": 2.334, "step": 2470 }, { "epoch": 4.58, "grad_norm": 3.440479278564453, "learning_rate": 1.8167097966728284e-05, "loss": 2.0689, "step": 2480 }, { "epoch": 4.6, "grad_norm": 4.944855690002441, "learning_rate": 1.8159704251386323e-05, "loss": 2.3026, "step": 2490 }, { "epoch": 4.62, "grad_norm": 10.164743423461914, "learning_rate": 1.8152310536044363e-05, "loss": 2.1141, "step": 2500 }, { "epoch": 4.64, "grad_norm": 3.4128763675689697, "learning_rate": 1.8144916820702406e-05, "loss": 2.2014, "step": 2510 }, { "epoch": 4.66, "grad_norm": 2.963181734085083, "learning_rate": 1.8137523105360446e-05, "loss": 2.1768, "step": 2520 }, { "epoch": 4.68, "grad_norm": 3.5528838634490967, "learning_rate": 1.8130129390018486e-05, "loss": 1.8215, "step": 2530 }, { "epoch": 4.7, "grad_norm": 5.219555854797363, "learning_rate": 1.8122735674676526e-05, "loss": 2.1584, "step": 2540 }, { "epoch": 4.71, "grad_norm": 7.532898902893066, "learning_rate": 1.8115341959334566e-05, "loss": 2.4033, "step": 2550 }, { "epoch": 4.73, "grad_norm": 6.051506996154785, "learning_rate": 1.810794824399261e-05, "loss": 2.1177, "step": 2560 }, { "epoch": 4.75, "grad_norm": 5.1289448738098145, "learning_rate": 1.810055452865065e-05, "loss": 1.9681, "step": 2570 }, { "epoch": 4.77, "grad_norm": 3.2103374004364014, "learning_rate": 1.809316081330869e-05, "loss": 2.0469, "step": 2580 }, { "epoch": 4.79, "grad_norm": 4.3876423835754395, "learning_rate": 1.8085767097966728e-05, "loss": 2.2053, "step": 2590 }, { "epoch": 4.81, "grad_norm": 8.629464149475098, "learning_rate": 1.807837338262477e-05, "loss": 2.1286, "step": 2600 }, { "epoch": 4.82, "grad_norm": 5.985232353210449, "learning_rate": 1.807097966728281e-05, "loss": 2.0127, "step": 2610 }, { "epoch": 4.84, "grad_norm": 5.886923313140869, "learning_rate": 1.806358595194085e-05, "loss": 2.3391, "step": 2620 }, { "epoch": 4.86, "grad_norm": 4.419260501861572, "learning_rate": 1.8056192236598894e-05, "loss": 2.0551, "step": 2630 }, { "epoch": 4.88, "grad_norm": 7.0475969314575195, "learning_rate": 1.8048798521256934e-05, "loss": 2.0977, "step": 2640 }, { "epoch": 4.9, "grad_norm": 6.6292405128479, "learning_rate": 1.8041404805914974e-05, "loss": 2.0159, "step": 2650 }, { "epoch": 4.92, "grad_norm": 5.401991367340088, "learning_rate": 1.8034011090573013e-05, "loss": 2.3721, "step": 2660 }, { "epoch": 4.94, "grad_norm": 6.96116828918457, "learning_rate": 1.8026617375231053e-05, "loss": 2.1951, "step": 2670 }, { "epoch": 4.95, "grad_norm": 4.288328647613525, "learning_rate": 1.8019223659889096e-05, "loss": 2.1259, "step": 2680 }, { "epoch": 4.97, "grad_norm": 3.4335110187530518, "learning_rate": 1.8011829944547136e-05, "loss": 2.0344, "step": 2690 }, { "epoch": 4.99, "grad_norm": 4.07798957824707, "learning_rate": 1.800443622920518e-05, "loss": 2.1852, "step": 2700 }, { "epoch": 5.0, "eval_accuracy": 0.6923076923076923, "eval_loss": 1.5466641187667847, "eval_runtime": 1.6831, "eval_samples_per_second": 285.779, "eval_steps_per_second": 36.242, "step": 2705 }, { "epoch": 5.01, "grad_norm": 5.298517227172852, "learning_rate": 1.799704251386322e-05, "loss": 2.0423, "step": 2710 }, { "epoch": 5.03, "grad_norm": 7.244368553161621, "learning_rate": 1.798964879852126e-05, "loss": 2.213, "step": 2720 }, { "epoch": 5.05, "grad_norm": 5.899136066436768, "learning_rate": 1.79822550831793e-05, "loss": 2.2225, "step": 2730 }, { "epoch": 5.06, "grad_norm": 3.8384222984313965, "learning_rate": 1.797486136783734e-05, "loss": 2.0061, "step": 2740 }, { "epoch": 5.08, "grad_norm": 5.2859392166137695, "learning_rate": 1.796746765249538e-05, "loss": 2.0581, "step": 2750 }, { "epoch": 5.1, "grad_norm": 5.703100204467773, "learning_rate": 1.796007393715342e-05, "loss": 1.717, "step": 2760 }, { "epoch": 5.12, "grad_norm": 4.230671405792236, "learning_rate": 1.795268022181146e-05, "loss": 1.894, "step": 2770 }, { "epoch": 5.14, "grad_norm": 6.351840496063232, "learning_rate": 1.79452865064695e-05, "loss": 2.15, "step": 2780 }, { "epoch": 5.16, "grad_norm": 5.351569652557373, "learning_rate": 1.7937892791127544e-05, "loss": 2.4203, "step": 2790 }, { "epoch": 5.18, "grad_norm": 4.467886447906494, "learning_rate": 1.7930499075785584e-05, "loss": 2.0776, "step": 2800 }, { "epoch": 5.19, "grad_norm": 5.716907024383545, "learning_rate": 1.7923105360443624e-05, "loss": 2.1391, "step": 2810 }, { "epoch": 5.21, "grad_norm": 4.919709205627441, "learning_rate": 1.7915711645101667e-05, "loss": 2.0863, "step": 2820 }, { "epoch": 5.23, "grad_norm": 4.775925159454346, "learning_rate": 1.7908317929759707e-05, "loss": 1.8478, "step": 2830 }, { "epoch": 5.25, "grad_norm": 5.981485843658447, "learning_rate": 1.7900924214417746e-05, "loss": 2.2735, "step": 2840 }, { "epoch": 5.27, "grad_norm": 6.747861862182617, "learning_rate": 1.7893530499075786e-05, "loss": 2.0071, "step": 2850 }, { "epoch": 5.29, "grad_norm": 8.530503273010254, "learning_rate": 1.7886136783733826e-05, "loss": 1.873, "step": 2860 }, { "epoch": 5.3, "grad_norm": 2.632251501083374, "learning_rate": 1.787874306839187e-05, "loss": 1.9939, "step": 2870 }, { "epoch": 5.32, "grad_norm": 8.007118225097656, "learning_rate": 1.787134935304991e-05, "loss": 2.0315, "step": 2880 }, { "epoch": 5.34, "grad_norm": 10.74657154083252, "learning_rate": 1.786395563770795e-05, "loss": 1.9395, "step": 2890 }, { "epoch": 5.36, "grad_norm": 6.513667106628418, "learning_rate": 1.7856561922365992e-05, "loss": 1.6863, "step": 2900 }, { "epoch": 5.38, "grad_norm": 8.592857360839844, "learning_rate": 1.7849168207024032e-05, "loss": 2.4774, "step": 2910 }, { "epoch": 5.4, "grad_norm": 3.0531833171844482, "learning_rate": 1.784177449168207e-05, "loss": 1.9043, "step": 2920 }, { "epoch": 5.42, "grad_norm": 7.819247245788574, "learning_rate": 1.783438077634011e-05, "loss": 2.0391, "step": 2930 }, { "epoch": 5.43, "grad_norm": 6.394385814666748, "learning_rate": 1.7826987060998154e-05, "loss": 2.1098, "step": 2940 }, { "epoch": 5.45, "grad_norm": 5.372478485107422, "learning_rate": 1.7819593345656194e-05, "loss": 2.0258, "step": 2950 }, { "epoch": 5.47, "grad_norm": 5.48792028427124, "learning_rate": 1.7812199630314234e-05, "loss": 1.9466, "step": 2960 }, { "epoch": 5.49, "grad_norm": 4.60893440246582, "learning_rate": 1.7804805914972274e-05, "loss": 1.8653, "step": 2970 }, { "epoch": 5.51, "grad_norm": 5.470660209655762, "learning_rate": 1.7797412199630314e-05, "loss": 2.5282, "step": 2980 }, { "epoch": 5.53, "grad_norm": 6.080894470214844, "learning_rate": 1.7790018484288357e-05, "loss": 2.3226, "step": 2990 }, { "epoch": 5.55, "grad_norm": 5.201182842254639, "learning_rate": 1.7782624768946397e-05, "loss": 2.0071, "step": 3000 }, { "epoch": 5.56, "grad_norm": 3.8985416889190674, "learning_rate": 1.777523105360444e-05, "loss": 1.7798, "step": 3010 }, { "epoch": 5.58, "grad_norm": 5.473669528961182, "learning_rate": 1.776783733826248e-05, "loss": 1.9088, "step": 3020 }, { "epoch": 5.6, "grad_norm": 5.637909889221191, "learning_rate": 1.776044362292052e-05, "loss": 2.0687, "step": 3030 }, { "epoch": 5.62, "grad_norm": 8.199572563171387, "learning_rate": 1.775304990757856e-05, "loss": 1.6683, "step": 3040 }, { "epoch": 5.64, "grad_norm": 6.898763179779053, "learning_rate": 1.77456561922366e-05, "loss": 2.1265, "step": 3050 }, { "epoch": 5.66, "grad_norm": 9.663229942321777, "learning_rate": 1.7738262476894642e-05, "loss": 2.1856, "step": 3060 }, { "epoch": 5.67, "grad_norm": 6.835959434509277, "learning_rate": 1.7730868761552682e-05, "loss": 1.9722, "step": 3070 }, { "epoch": 5.69, "grad_norm": 5.0374908447265625, "learning_rate": 1.772347504621072e-05, "loss": 2.1073, "step": 3080 }, { "epoch": 5.71, "grad_norm": 2.971951961517334, "learning_rate": 1.7716081330868765e-05, "loss": 2.5055, "step": 3090 }, { "epoch": 5.73, "grad_norm": 6.078326225280762, "learning_rate": 1.77086876155268e-05, "loss": 1.9683, "step": 3100 }, { "epoch": 5.75, "grad_norm": 4.573218822479248, "learning_rate": 1.7701293900184844e-05, "loss": 1.7455, "step": 3110 }, { "epoch": 5.77, "grad_norm": 5.869566440582275, "learning_rate": 1.7693900184842884e-05, "loss": 1.9752, "step": 3120 }, { "epoch": 5.79, "grad_norm": 7.765271186828613, "learning_rate": 1.7686506469500927e-05, "loss": 1.9578, "step": 3130 }, { "epoch": 5.8, "grad_norm": 7.2641167640686035, "learning_rate": 1.7679112754158967e-05, "loss": 1.9724, "step": 3140 }, { "epoch": 5.82, "grad_norm": 5.513419151306152, "learning_rate": 1.7671719038817007e-05, "loss": 1.8555, "step": 3150 }, { "epoch": 5.84, "grad_norm": 8.616771697998047, "learning_rate": 1.7664325323475047e-05, "loss": 2.2072, "step": 3160 }, { "epoch": 5.86, "grad_norm": 4.0062336921691895, "learning_rate": 1.7656931608133086e-05, "loss": 2.0517, "step": 3170 }, { "epoch": 5.88, "grad_norm": 9.101136207580566, "learning_rate": 1.764953789279113e-05, "loss": 2.1571, "step": 3180 }, { "epoch": 5.9, "grad_norm": 5.34522008895874, "learning_rate": 1.764214417744917e-05, "loss": 2.0756, "step": 3190 }, { "epoch": 5.91, "grad_norm": 7.170718669891357, "learning_rate": 1.763475046210721e-05, "loss": 2.2981, "step": 3200 }, { "epoch": 5.93, "grad_norm": 5.0911545753479, "learning_rate": 1.7627356746765252e-05, "loss": 2.2296, "step": 3210 }, { "epoch": 5.95, "grad_norm": 4.195282459259033, "learning_rate": 1.7619963031423292e-05, "loss": 2.3057, "step": 3220 }, { "epoch": 5.97, "grad_norm": 5.193121910095215, "learning_rate": 1.7612569316081332e-05, "loss": 2.1447, "step": 3230 }, { "epoch": 5.99, "grad_norm": 10.203285217285156, "learning_rate": 1.760517560073937e-05, "loss": 1.9535, "step": 3240 }, { "epoch": 6.0, "eval_accuracy": 0.7047817047817048, "eval_loss": 1.4113445281982422, "eval_runtime": 1.5398, "eval_samples_per_second": 312.378, "eval_steps_per_second": 39.616, "step": 3246 }, { "epoch": 6.01, "grad_norm": 5.486194610595703, "learning_rate": 1.7597781885397415e-05, "loss": 2.0906, "step": 3250 }, { "epoch": 6.03, "grad_norm": 7.852478504180908, "learning_rate": 1.7590388170055455e-05, "loss": 1.7803, "step": 3260 }, { "epoch": 6.04, "grad_norm": 6.984188079833984, "learning_rate": 1.7582994454713494e-05, "loss": 1.8691, "step": 3270 }, { "epoch": 6.06, "grad_norm": 6.279801845550537, "learning_rate": 1.7575600739371538e-05, "loss": 1.9502, "step": 3280 }, { "epoch": 6.08, "grad_norm": 4.827626705169678, "learning_rate": 1.7568207024029574e-05, "loss": 1.9988, "step": 3290 }, { "epoch": 6.1, "grad_norm": 6.406517505645752, "learning_rate": 1.7560813308687617e-05, "loss": 1.6651, "step": 3300 }, { "epoch": 6.12, "grad_norm": 3.3150088787078857, "learning_rate": 1.7553419593345657e-05, "loss": 2.2606, "step": 3310 }, { "epoch": 6.14, "grad_norm": 4.611109733581543, "learning_rate": 1.75460258780037e-05, "loss": 1.8671, "step": 3320 }, { "epoch": 6.16, "grad_norm": 4.767951488494873, "learning_rate": 1.753863216266174e-05, "loss": 2.0714, "step": 3330 }, { "epoch": 6.17, "grad_norm": 7.092107772827148, "learning_rate": 1.753123844731978e-05, "loss": 2.1358, "step": 3340 }, { "epoch": 6.19, "grad_norm": 8.061872482299805, "learning_rate": 1.752384473197782e-05, "loss": 2.3942, "step": 3350 }, { "epoch": 6.21, "grad_norm": 5.438494682312012, "learning_rate": 1.751645101663586e-05, "loss": 2.3551, "step": 3360 }, { "epoch": 6.23, "grad_norm": 4.006960391998291, "learning_rate": 1.7509057301293902e-05, "loss": 1.7538, "step": 3370 }, { "epoch": 6.25, "grad_norm": 8.417580604553223, "learning_rate": 1.7501663585951942e-05, "loss": 2.222, "step": 3380 }, { "epoch": 6.27, "grad_norm": 8.379271507263184, "learning_rate": 1.7494269870609982e-05, "loss": 2.066, "step": 3390 }, { "epoch": 6.28, "grad_norm": 4.969542503356934, "learning_rate": 1.7486876155268025e-05, "loss": 2.0801, "step": 3400 }, { "epoch": 6.3, "grad_norm": 7.461071968078613, "learning_rate": 1.747948243992606e-05, "loss": 1.9323, "step": 3410 }, { "epoch": 6.32, "grad_norm": 6.13628625869751, "learning_rate": 1.7472088724584105e-05, "loss": 2.0192, "step": 3420 }, { "epoch": 6.34, "grad_norm": 5.534074783325195, "learning_rate": 1.7464695009242145e-05, "loss": 1.967, "step": 3430 }, { "epoch": 6.36, "grad_norm": 4.8219733238220215, "learning_rate": 1.7457301293900188e-05, "loss": 2.1641, "step": 3440 }, { "epoch": 6.38, "grad_norm": 5.982542514801025, "learning_rate": 1.7449907578558227e-05, "loss": 2.0182, "step": 3450 }, { "epoch": 6.4, "grad_norm": 6.451911449432373, "learning_rate": 1.7442513863216267e-05, "loss": 1.826, "step": 3460 }, { "epoch": 6.41, "grad_norm": 3.6778299808502197, "learning_rate": 1.743512014787431e-05, "loss": 1.8454, "step": 3470 }, { "epoch": 6.43, "grad_norm": 6.107019901275635, "learning_rate": 1.7427726432532347e-05, "loss": 1.8224, "step": 3480 }, { "epoch": 6.45, "grad_norm": 5.207563877105713, "learning_rate": 1.742033271719039e-05, "loss": 2.1439, "step": 3490 }, { "epoch": 6.47, "grad_norm": 4.160572052001953, "learning_rate": 1.741293900184843e-05, "loss": 1.9476, "step": 3500 }, { "epoch": 6.49, "grad_norm": 6.487608432769775, "learning_rate": 1.740554528650647e-05, "loss": 1.8823, "step": 3510 }, { "epoch": 6.51, "grad_norm": 6.683421611785889, "learning_rate": 1.7398151571164513e-05, "loss": 2.009, "step": 3520 }, { "epoch": 6.52, "grad_norm": 7.4384684562683105, "learning_rate": 1.7390757855822553e-05, "loss": 2.1267, "step": 3530 }, { "epoch": 6.54, "grad_norm": 4.5928120613098145, "learning_rate": 1.7383364140480592e-05, "loss": 1.8083, "step": 3540 }, { "epoch": 6.56, "grad_norm": 8.427355766296387, "learning_rate": 1.7375970425138632e-05, "loss": 2.127, "step": 3550 }, { "epoch": 6.58, "grad_norm": 6.375779628753662, "learning_rate": 1.7368576709796675e-05, "loss": 1.9548, "step": 3560 }, { "epoch": 6.6, "grad_norm": 4.484184265136719, "learning_rate": 1.7361182994454715e-05, "loss": 1.9963, "step": 3570 }, { "epoch": 6.62, "grad_norm": 5.637427806854248, "learning_rate": 1.7353789279112755e-05, "loss": 2.1716, "step": 3580 }, { "epoch": 6.64, "grad_norm": 6.859792709350586, "learning_rate": 1.7346395563770798e-05, "loss": 2.0562, "step": 3590 }, { "epoch": 6.65, "grad_norm": 5.938406467437744, "learning_rate": 1.7339001848428834e-05, "loss": 1.7996, "step": 3600 }, { "epoch": 6.67, "grad_norm": 8.111343383789062, "learning_rate": 1.7331608133086878e-05, "loss": 1.8559, "step": 3610 }, { "epoch": 6.69, "grad_norm": 4.254898548126221, "learning_rate": 1.7324214417744917e-05, "loss": 2.1403, "step": 3620 }, { "epoch": 6.71, "grad_norm": 5.51759147644043, "learning_rate": 1.731682070240296e-05, "loss": 2.1104, "step": 3630 }, { "epoch": 6.73, "grad_norm": 4.796214580535889, "learning_rate": 1.7309426987061e-05, "loss": 2.0483, "step": 3640 }, { "epoch": 6.75, "grad_norm": 10.199125289916992, "learning_rate": 1.730203327171904e-05, "loss": 1.9072, "step": 3650 }, { "epoch": 6.77, "grad_norm": 3.6283345222473145, "learning_rate": 1.7294639556377083e-05, "loss": 1.617, "step": 3660 }, { "epoch": 6.78, "grad_norm": 9.140597343444824, "learning_rate": 1.728724584103512e-05, "loss": 1.693, "step": 3670 }, { "epoch": 6.8, "grad_norm": 4.946544170379639, "learning_rate": 1.7279852125693163e-05, "loss": 1.8071, "step": 3680 }, { "epoch": 6.82, "grad_norm": 5.619536876678467, "learning_rate": 1.7272458410351203e-05, "loss": 1.6218, "step": 3690 }, { "epoch": 6.84, "grad_norm": 4.349669456481934, "learning_rate": 1.7265064695009242e-05, "loss": 1.6641, "step": 3700 }, { "epoch": 6.86, "grad_norm": 7.926258087158203, "learning_rate": 1.7257670979667286e-05, "loss": 1.9527, "step": 3710 }, { "epoch": 6.88, "grad_norm": 6.45033597946167, "learning_rate": 1.7250277264325325e-05, "loss": 2.0904, "step": 3720 }, { "epoch": 6.89, "grad_norm": 3.526333808898926, "learning_rate": 1.7242883548983365e-05, "loss": 1.9981, "step": 3730 }, { "epoch": 6.91, "grad_norm": 6.21485710144043, "learning_rate": 1.7235489833641405e-05, "loss": 1.9026, "step": 3740 }, { "epoch": 6.93, "grad_norm": 5.945407867431641, "learning_rate": 1.7228096118299448e-05, "loss": 1.8212, "step": 3750 }, { "epoch": 6.95, "grad_norm": 6.2672200202941895, "learning_rate": 1.7220702402957488e-05, "loss": 1.9252, "step": 3760 }, { "epoch": 6.97, "grad_norm": 4.408575534820557, "learning_rate": 1.7213308687615528e-05, "loss": 2.1886, "step": 3770 }, { "epoch": 6.99, "grad_norm": 4.581439018249512, "learning_rate": 1.720591497227357e-05, "loss": 1.9061, "step": 3780 }, { "epoch": 7.0, "eval_accuracy": 0.6881496881496881, "eval_loss": 1.3136467933654785, "eval_runtime": 1.5345, "eval_samples_per_second": 313.462, "eval_steps_per_second": 39.753, "step": 3787 }, { "epoch": 7.01, "grad_norm": 4.586516380310059, "learning_rate": 1.7198521256931607e-05, "loss": 1.8657, "step": 3790 }, { "epoch": 7.02, "grad_norm": 4.44637393951416, "learning_rate": 1.719112754158965e-05, "loss": 1.8572, "step": 3800 }, { "epoch": 7.04, "grad_norm": 6.990758419036865, "learning_rate": 1.718373382624769e-05, "loss": 1.597, "step": 3810 }, { "epoch": 7.06, "grad_norm": 5.798426628112793, "learning_rate": 1.717634011090573e-05, "loss": 1.7034, "step": 3820 }, { "epoch": 7.08, "grad_norm": 8.482425689697266, "learning_rate": 1.7168946395563773e-05, "loss": 1.7854, "step": 3830 }, { "epoch": 7.1, "grad_norm": 3.766000509262085, "learning_rate": 1.7161552680221813e-05, "loss": 1.7495, "step": 3840 }, { "epoch": 7.12, "grad_norm": 4.311873912811279, "learning_rate": 1.7154158964879856e-05, "loss": 2.0306, "step": 3850 }, { "epoch": 7.13, "grad_norm": 5.808076858520508, "learning_rate": 1.7146765249537893e-05, "loss": 2.0037, "step": 3860 }, { "epoch": 7.15, "grad_norm": 6.657934665679932, "learning_rate": 1.7139371534195936e-05, "loss": 2.0147, "step": 3870 }, { "epoch": 7.17, "grad_norm": 6.266687870025635, "learning_rate": 1.7131977818853975e-05, "loss": 1.842, "step": 3880 }, { "epoch": 7.19, "grad_norm": 5.961197376251221, "learning_rate": 1.7124584103512015e-05, "loss": 1.9722, "step": 3890 }, { "epoch": 7.21, "grad_norm": 3.8531596660614014, "learning_rate": 1.711719038817006e-05, "loss": 1.9986, "step": 3900 }, { "epoch": 7.23, "grad_norm": 4.792156219482422, "learning_rate": 1.7109796672828098e-05, "loss": 2.1142, "step": 3910 }, { "epoch": 7.25, "grad_norm": 8.346179962158203, "learning_rate": 1.7102402957486138e-05, "loss": 1.8902, "step": 3920 }, { "epoch": 7.26, "grad_norm": 5.697525501251221, "learning_rate": 1.7095009242144178e-05, "loss": 2.1232, "step": 3930 }, { "epoch": 7.28, "grad_norm": 4.150140285491943, "learning_rate": 1.7087615526802218e-05, "loss": 1.9837, "step": 3940 }, { "epoch": 7.3, "grad_norm": 5.114126205444336, "learning_rate": 1.708022181146026e-05, "loss": 1.4826, "step": 3950 }, { "epoch": 7.32, "grad_norm": 12.21320629119873, "learning_rate": 1.70728280961183e-05, "loss": 2.2377, "step": 3960 }, { "epoch": 7.34, "grad_norm": 6.315988540649414, "learning_rate": 1.7065434380776344e-05, "loss": 1.7932, "step": 3970 }, { "epoch": 7.36, "grad_norm": 7.104751110076904, "learning_rate": 1.705804066543438e-05, "loss": 1.6013, "step": 3980 }, { "epoch": 7.38, "grad_norm": 7.56428337097168, "learning_rate": 1.7050646950092423e-05, "loss": 1.9246, "step": 3990 }, { "epoch": 7.39, "grad_norm": 5.263105392456055, "learning_rate": 1.7043253234750463e-05, "loss": 1.5609, "step": 4000 }, { "epoch": 7.41, "grad_norm": 6.601494789123535, "learning_rate": 1.7035859519408503e-05, "loss": 2.0788, "step": 4010 }, { "epoch": 7.43, "grad_norm": 6.946007251739502, "learning_rate": 1.7028465804066546e-05, "loss": 2.1174, "step": 4020 }, { "epoch": 7.45, "grad_norm": 6.089936256408691, "learning_rate": 1.7021072088724586e-05, "loss": 1.9115, "step": 4030 }, { "epoch": 7.47, "grad_norm": 10.277483940124512, "learning_rate": 1.7013678373382626e-05, "loss": 2.0313, "step": 4040 }, { "epoch": 7.49, "grad_norm": 6.100281238555908, "learning_rate": 1.7006284658040665e-05, "loss": 1.8331, "step": 4050 }, { "epoch": 7.5, "grad_norm": 5.153899669647217, "learning_rate": 1.699889094269871e-05, "loss": 1.6957, "step": 4060 }, { "epoch": 7.52, "grad_norm": 5.163270473480225, "learning_rate": 1.699149722735675e-05, "loss": 2.182, "step": 4070 }, { "epoch": 7.54, "grad_norm": 4.999426364898682, "learning_rate": 1.6984103512014788e-05, "loss": 1.8903, "step": 4080 }, { "epoch": 7.56, "grad_norm": 4.213411808013916, "learning_rate": 1.697670979667283e-05, "loss": 1.7585, "step": 4090 }, { "epoch": 7.58, "grad_norm": 6.371384143829346, "learning_rate": 1.696931608133087e-05, "loss": 1.9111, "step": 4100 }, { "epoch": 7.6, "grad_norm": 6.486601829528809, "learning_rate": 1.696192236598891e-05, "loss": 1.7223, "step": 4110 }, { "epoch": 7.62, "grad_norm": 4.265336513519287, "learning_rate": 1.695452865064695e-05, "loss": 2.2302, "step": 4120 }, { "epoch": 7.63, "grad_norm": 11.500389099121094, "learning_rate": 1.694713493530499e-05, "loss": 1.7558, "step": 4130 }, { "epoch": 7.65, "grad_norm": 7.355184078216553, "learning_rate": 1.6939741219963034e-05, "loss": 2.129, "step": 4140 }, { "epoch": 7.67, "grad_norm": 6.149376392364502, "learning_rate": 1.6932347504621073e-05, "loss": 2.249, "step": 4150 }, { "epoch": 7.69, "grad_norm": 8.594679832458496, "learning_rate": 1.692569316081331e-05, "loss": 2.0776, "step": 4160 }, { "epoch": 7.71, "grad_norm": 4.261440277099609, "learning_rate": 1.691829944547135e-05, "loss": 1.6514, "step": 4170 }, { "epoch": 7.73, "grad_norm": 6.84397554397583, "learning_rate": 1.691090573012939e-05, "loss": 1.9197, "step": 4180 }, { "epoch": 7.74, "grad_norm": 5.621590614318848, "learning_rate": 1.690351201478743e-05, "loss": 2.0232, "step": 4190 }, { "epoch": 7.76, "grad_norm": 9.00403118133545, "learning_rate": 1.689611829944547e-05, "loss": 2.1983, "step": 4200 }, { "epoch": 7.78, "grad_norm": 5.122200965881348, "learning_rate": 1.6888724584103514e-05, "loss": 2.0075, "step": 4210 }, { "epoch": 7.8, "grad_norm": 4.513713836669922, "learning_rate": 1.6881330868761554e-05, "loss": 1.8924, "step": 4220 }, { "epoch": 7.82, "grad_norm": 5.402963161468506, "learning_rate": 1.6873937153419594e-05, "loss": 1.8264, "step": 4230 }, { "epoch": 7.84, "grad_norm": 4.173924446105957, "learning_rate": 1.6866543438077637e-05, "loss": 1.881, "step": 4240 }, { "epoch": 7.86, "grad_norm": 9.862312316894531, "learning_rate": 1.6859149722735677e-05, "loss": 1.8464, "step": 4250 }, { "epoch": 7.87, "grad_norm": 7.036571502685547, "learning_rate": 1.6851756007393717e-05, "loss": 1.6236, "step": 4260 }, { "epoch": 7.89, "grad_norm": 4.979981422424316, "learning_rate": 1.6844362292051756e-05, "loss": 1.6973, "step": 4270 }, { "epoch": 7.91, "grad_norm": 5.2604079246521, "learning_rate": 1.68369685767098e-05, "loss": 1.6986, "step": 4280 }, { "epoch": 7.93, "grad_norm": 4.230659008026123, "learning_rate": 1.682957486136784e-05, "loss": 1.8497, "step": 4290 }, { "epoch": 7.95, "grad_norm": 6.562105178833008, "learning_rate": 1.682218114602588e-05, "loss": 1.7189, "step": 4300 }, { "epoch": 7.97, "grad_norm": 8.38758659362793, "learning_rate": 1.6814787430683922e-05, "loss": 1.4513, "step": 4310 }, { "epoch": 7.99, "grad_norm": 5.028570652008057, "learning_rate": 1.680739371534196e-05, "loss": 1.5934, "step": 4320 }, { "epoch": 8.0, "eval_accuracy": 0.7089397089397089, "eval_loss": 1.2058689594268799, "eval_runtime": 1.5461, "eval_samples_per_second": 311.1, "eval_steps_per_second": 39.453, "step": 4328 }, { "epoch": 8.0, "grad_norm": 4.681914806365967, "learning_rate": 1.6800000000000002e-05, "loss": 1.6128, "step": 4330 }, { "epoch": 8.02, "grad_norm": 4.084405899047852, "learning_rate": 1.679260628465804e-05, "loss": 1.8149, "step": 4340 }, { "epoch": 8.04, "grad_norm": 6.964240550994873, "learning_rate": 1.678521256931608e-05, "loss": 2.064, "step": 4350 }, { "epoch": 8.06, "grad_norm": 5.049116611480713, "learning_rate": 1.6777818853974125e-05, "loss": 1.9576, "step": 4360 }, { "epoch": 8.08, "grad_norm": 7.169123649597168, "learning_rate": 1.6770425138632164e-05, "loss": 1.6481, "step": 4370 }, { "epoch": 8.1, "grad_norm": 4.872652053833008, "learning_rate": 1.6763031423290204e-05, "loss": 1.9997, "step": 4380 }, { "epoch": 8.11, "grad_norm": 6.030390739440918, "learning_rate": 1.6755637707948244e-05, "loss": 1.9267, "step": 4390 }, { "epoch": 8.13, "grad_norm": 5.29840612411499, "learning_rate": 1.6748243992606287e-05, "loss": 2.0566, "step": 4400 }, { "epoch": 8.15, "grad_norm": 4.303355693817139, "learning_rate": 1.6740850277264327e-05, "loss": 1.6762, "step": 4410 }, { "epoch": 8.17, "grad_norm": 4.432554244995117, "learning_rate": 1.6733456561922367e-05, "loss": 1.9368, "step": 4420 }, { "epoch": 8.19, "grad_norm": 4.260908126831055, "learning_rate": 1.672606284658041e-05, "loss": 1.5083, "step": 4430 }, { "epoch": 8.21, "grad_norm": 6.840023517608643, "learning_rate": 1.671866913123845e-05, "loss": 1.8341, "step": 4440 }, { "epoch": 8.23, "grad_norm": 8.099714279174805, "learning_rate": 1.671127541589649e-05, "loss": 1.9728, "step": 4450 }, { "epoch": 8.24, "grad_norm": 3.785170555114746, "learning_rate": 1.670388170055453e-05, "loss": 1.7725, "step": 4460 }, { "epoch": 8.26, "grad_norm": 3.6673882007598877, "learning_rate": 1.669648798521257e-05, "loss": 1.7185, "step": 4470 }, { "epoch": 8.28, "grad_norm": 5.505701541900635, "learning_rate": 1.6689094269870612e-05, "loss": 1.8416, "step": 4480 }, { "epoch": 8.3, "grad_norm": 7.419302463531494, "learning_rate": 1.6681700554528652e-05, "loss": 1.7865, "step": 4490 }, { "epoch": 8.32, "grad_norm": 5.522082805633545, "learning_rate": 1.6674306839186695e-05, "loss": 2.2497, "step": 4500 }, { "epoch": 8.34, "grad_norm": 5.4933085441589355, "learning_rate": 1.666691312384473e-05, "loss": 1.996, "step": 4510 }, { "epoch": 8.35, "grad_norm": 5.113419055938721, "learning_rate": 1.6659519408502775e-05, "loss": 1.8892, "step": 4520 }, { "epoch": 8.37, "grad_norm": 6.080321788787842, "learning_rate": 1.6652125693160814e-05, "loss": 1.7554, "step": 4530 }, { "epoch": 8.39, "grad_norm": 9.923974990844727, "learning_rate": 1.6644731977818854e-05, "loss": 1.7445, "step": 4540 }, { "epoch": 8.41, "grad_norm": 5.210130214691162, "learning_rate": 1.6637338262476897e-05, "loss": 1.9865, "step": 4550 }, { "epoch": 8.43, "grad_norm": 7.728953838348389, "learning_rate": 1.6629944547134937e-05, "loss": 1.5186, "step": 4560 }, { "epoch": 8.45, "grad_norm": 8.815023422241211, "learning_rate": 1.6622550831792977e-05, "loss": 2.0233, "step": 4570 }, { "epoch": 8.47, "grad_norm": 5.50808572769165, "learning_rate": 1.6615157116451017e-05, "loss": 1.9297, "step": 4580 }, { "epoch": 8.48, "grad_norm": 6.375161647796631, "learning_rate": 1.660776340110906e-05, "loss": 1.7131, "step": 4590 }, { "epoch": 8.5, "grad_norm": 5.382516860961914, "learning_rate": 1.66003696857671e-05, "loss": 1.993, "step": 4600 }, { "epoch": 8.52, "grad_norm": 5.755428791046143, "learning_rate": 1.659297597042514e-05, "loss": 1.8935, "step": 4610 }, { "epoch": 8.54, "grad_norm": 5.815938472747803, "learning_rate": 1.6585582255083183e-05, "loss": 1.8336, "step": 4620 }, { "epoch": 8.56, "grad_norm": 4.193169116973877, "learning_rate": 1.6578188539741222e-05, "loss": 1.8713, "step": 4630 }, { "epoch": 8.58, "grad_norm": 6.436243534088135, "learning_rate": 1.6570794824399262e-05, "loss": 2.1586, "step": 4640 }, { "epoch": 8.6, "grad_norm": 10.52224063873291, "learning_rate": 1.6563401109057302e-05, "loss": 1.6671, "step": 4650 }, { "epoch": 8.61, "grad_norm": 6.4911298751831055, "learning_rate": 1.6556007393715342e-05, "loss": 2.2308, "step": 4660 }, { "epoch": 8.63, "grad_norm": 4.897397518157959, "learning_rate": 1.6548613678373385e-05, "loss": 1.8734, "step": 4670 }, { "epoch": 8.65, "grad_norm": 7.949416160583496, "learning_rate": 1.6541219963031425e-05, "loss": 1.8414, "step": 4680 }, { "epoch": 8.67, "grad_norm": 5.1268815994262695, "learning_rate": 1.6533826247689465e-05, "loss": 1.5545, "step": 4690 }, { "epoch": 8.69, "grad_norm": 4.521894931793213, "learning_rate": 1.6526432532347504e-05, "loss": 1.7879, "step": 4700 }, { "epoch": 8.71, "grad_norm": 9.684229850769043, "learning_rate": 1.6519038817005547e-05, "loss": 1.9793, "step": 4710 }, { "epoch": 8.72, "grad_norm": 3.4764626026153564, "learning_rate": 1.6511645101663587e-05, "loss": 2.0146, "step": 4720 }, { "epoch": 8.74, "grad_norm": 6.923727512359619, "learning_rate": 1.6504251386321627e-05, "loss": 1.8222, "step": 4730 }, { "epoch": 8.76, "grad_norm": 5.267928123474121, "learning_rate": 1.649685767097967e-05, "loss": 2.198, "step": 4740 }, { "epoch": 8.78, "grad_norm": 5.144866943359375, "learning_rate": 1.648946395563771e-05, "loss": 1.8322, "step": 4750 }, { "epoch": 8.8, "grad_norm": 5.443819522857666, "learning_rate": 1.648207024029575e-05, "loss": 1.9888, "step": 4760 }, { "epoch": 8.82, "grad_norm": 6.604461669921875, "learning_rate": 1.647467652495379e-05, "loss": 2.1875, "step": 4770 }, { "epoch": 8.84, "grad_norm": 7.767526149749756, "learning_rate": 1.646728280961183e-05, "loss": 2.0143, "step": 4780 }, { "epoch": 8.85, "grad_norm": 4.7501678466796875, "learning_rate": 1.6459889094269873e-05, "loss": 2.0738, "step": 4790 }, { "epoch": 8.87, "grad_norm": 6.003524303436279, "learning_rate": 1.6452495378927912e-05, "loss": 1.2932, "step": 4800 }, { "epoch": 8.89, "grad_norm": 8.089621543884277, "learning_rate": 1.6445101663585955e-05, "loss": 2.1161, "step": 4810 }, { "epoch": 8.91, "grad_norm": 6.005993843078613, "learning_rate": 1.6437707948243995e-05, "loss": 1.6404, "step": 4820 }, { "epoch": 8.93, "grad_norm": 6.290034294128418, "learning_rate": 1.6430314232902035e-05, "loss": 1.7748, "step": 4830 }, { "epoch": 8.95, "grad_norm": 4.321631908416748, "learning_rate": 1.6422920517560075e-05, "loss": 1.7497, "step": 4840 }, { "epoch": 8.96, "grad_norm": 6.173918724060059, "learning_rate": 1.6415526802218115e-05, "loss": 1.8344, "step": 4850 }, { "epoch": 8.98, "grad_norm": 9.61416244506836, "learning_rate": 1.6408133086876158e-05, "loss": 1.8755, "step": 4860 }, { "epoch": 9.0, "eval_accuracy": 0.7172557172557172, "eval_loss": 1.163785457611084, "eval_runtime": 1.533, "eval_samples_per_second": 313.766, "eval_steps_per_second": 39.791, "step": 4869 }, { "epoch": 9.0, "grad_norm": 6.215839385986328, "learning_rate": 1.6400739371534198e-05, "loss": 1.712, "step": 4870 }, { "epoch": 9.02, "grad_norm": 5.702755928039551, "learning_rate": 1.6393345656192237e-05, "loss": 1.7867, "step": 4880 }, { "epoch": 9.04, "grad_norm": 7.144179344177246, "learning_rate": 1.6385951940850277e-05, "loss": 1.5488, "step": 4890 }, { "epoch": 9.06, "grad_norm": 6.906232833862305, "learning_rate": 1.637855822550832e-05, "loss": 1.4695, "step": 4900 }, { "epoch": 9.08, "grad_norm": 7.913764953613281, "learning_rate": 1.637116451016636e-05, "loss": 1.8798, "step": 4910 }, { "epoch": 9.09, "grad_norm": 7.2428107261657715, "learning_rate": 1.63637707948244e-05, "loss": 1.7621, "step": 4920 }, { "epoch": 9.11, "grad_norm": 9.698939323425293, "learning_rate": 1.6356377079482443e-05, "loss": 1.7613, "step": 4930 }, { "epoch": 9.13, "grad_norm": 5.739033222198486, "learning_rate": 1.6348983364140483e-05, "loss": 1.8478, "step": 4940 }, { "epoch": 9.15, "grad_norm": 7.739719867706299, "learning_rate": 1.6341589648798523e-05, "loss": 1.7035, "step": 4950 }, { "epoch": 9.17, "grad_norm": 5.32645320892334, "learning_rate": 1.6334195933456562e-05, "loss": 1.5282, "step": 4960 }, { "epoch": 9.19, "grad_norm": 5.991862773895264, "learning_rate": 1.6326802218114602e-05, "loss": 1.8806, "step": 4970 }, { "epoch": 9.21, "grad_norm": 11.046886444091797, "learning_rate": 1.6319408502772645e-05, "loss": 1.646, "step": 4980 }, { "epoch": 9.22, "grad_norm": 7.490607261657715, "learning_rate": 1.6312014787430685e-05, "loss": 1.9067, "step": 4990 }, { "epoch": 9.24, "grad_norm": 4.589789867401123, "learning_rate": 1.6304621072088725e-05, "loss": 1.781, "step": 5000 }, { "epoch": 9.26, "grad_norm": 7.872344970703125, "learning_rate": 1.6297227356746768e-05, "loss": 1.8494, "step": 5010 }, { "epoch": 9.28, "grad_norm": 4.643067836761475, "learning_rate": 1.6289833641404808e-05, "loss": 1.5545, "step": 5020 }, { "epoch": 9.3, "grad_norm": 5.385990619659424, "learning_rate": 1.6282439926062848e-05, "loss": 1.7669, "step": 5030 }, { "epoch": 9.32, "grad_norm": 7.040643692016602, "learning_rate": 1.6275046210720887e-05, "loss": 1.8073, "step": 5040 }, { "epoch": 9.33, "grad_norm": 5.6456618309021, "learning_rate": 1.626765249537893e-05, "loss": 1.6957, "step": 5050 }, { "epoch": 9.35, "grad_norm": 8.07316780090332, "learning_rate": 1.626025878003697e-05, "loss": 1.6143, "step": 5060 }, { "epoch": 9.37, "grad_norm": 5.339541912078857, "learning_rate": 1.625286506469501e-05, "loss": 1.8137, "step": 5070 }, { "epoch": 9.39, "grad_norm": 5.7450408935546875, "learning_rate": 1.624547134935305e-05, "loss": 1.9434, "step": 5080 }, { "epoch": 9.41, "grad_norm": 5.424485206604004, "learning_rate": 1.623807763401109e-05, "loss": 1.6725, "step": 5090 }, { "epoch": 9.43, "grad_norm": 7.112890243530273, "learning_rate": 1.6230683918669133e-05, "loss": 1.7533, "step": 5100 }, { "epoch": 9.45, "grad_norm": 6.392171382904053, "learning_rate": 1.6223290203327173e-05, "loss": 1.7744, "step": 5110 }, { "epoch": 9.46, "grad_norm": 5.556612968444824, "learning_rate": 1.6215896487985216e-05, "loss": 1.8228, "step": 5120 }, { "epoch": 9.48, "grad_norm": 5.705777168273926, "learning_rate": 1.6208502772643256e-05, "loss": 1.9301, "step": 5130 }, { "epoch": 9.5, "grad_norm": 6.481344223022461, "learning_rate": 1.6201109057301295e-05, "loss": 1.4782, "step": 5140 }, { "epoch": 9.52, "grad_norm": 7.628378391265869, "learning_rate": 1.6193715341959335e-05, "loss": 1.9316, "step": 5150 }, { "epoch": 9.54, "grad_norm": 5.013953685760498, "learning_rate": 1.6186321626617375e-05, "loss": 1.7173, "step": 5160 }, { "epoch": 9.56, "grad_norm": 6.3282151222229, "learning_rate": 1.6178927911275418e-05, "loss": 1.6536, "step": 5170 }, { "epoch": 9.57, "grad_norm": 4.387366771697998, "learning_rate": 1.6171534195933458e-05, "loss": 1.9206, "step": 5180 }, { "epoch": 9.59, "grad_norm": 4.622939586639404, "learning_rate": 1.6164140480591498e-05, "loss": 1.9404, "step": 5190 }, { "epoch": 9.61, "grad_norm": 8.402334213256836, "learning_rate": 1.615674676524954e-05, "loss": 1.4884, "step": 5200 }, { "epoch": 9.63, "grad_norm": 5.279985427856445, "learning_rate": 1.6149353049907577e-05, "loss": 1.7794, "step": 5210 }, { "epoch": 9.65, "grad_norm": 6.160462379455566, "learning_rate": 1.614195933456562e-05, "loss": 1.8218, "step": 5220 }, { "epoch": 9.67, "grad_norm": 4.768271446228027, "learning_rate": 1.613456561922366e-05, "loss": 1.9448, "step": 5230 }, { "epoch": 9.69, "grad_norm": 7.608345985412598, "learning_rate": 1.6127171903881703e-05, "loss": 1.4963, "step": 5240 }, { "epoch": 9.7, "grad_norm": 6.557635307312012, "learning_rate": 1.6119778188539743e-05, "loss": 1.955, "step": 5250 }, { "epoch": 9.72, "grad_norm": 4.4564361572265625, "learning_rate": 1.6112384473197783e-05, "loss": 1.938, "step": 5260 }, { "epoch": 9.74, "grad_norm": 6.673394203186035, "learning_rate": 1.6104990757855823e-05, "loss": 2.1702, "step": 5270 }, { "epoch": 9.76, "grad_norm": 9.461224555969238, "learning_rate": 1.6097597042513863e-05, "loss": 1.7126, "step": 5280 }, { "epoch": 9.78, "grad_norm": 5.4480438232421875, "learning_rate": 1.6090203327171906e-05, "loss": 1.7907, "step": 5290 }, { "epoch": 9.8, "grad_norm": 5.746652603149414, "learning_rate": 1.6082809611829946e-05, "loss": 1.6066, "step": 5300 }, { "epoch": 9.82, "grad_norm": 8.280226707458496, "learning_rate": 1.6075415896487985e-05, "loss": 2.0288, "step": 5310 }, { "epoch": 9.83, "grad_norm": 6.322702407836914, "learning_rate": 1.606802218114603e-05, "loss": 2.0334, "step": 5320 }, { "epoch": 9.85, "grad_norm": 5.549195766448975, "learning_rate": 1.6060628465804068e-05, "loss": 1.7249, "step": 5330 }, { "epoch": 9.87, "grad_norm": 9.851544380187988, "learning_rate": 1.6053234750462108e-05, "loss": 1.7398, "step": 5340 }, { "epoch": 9.89, "grad_norm": 9.24363899230957, "learning_rate": 1.6045841035120148e-05, "loss": 1.8049, "step": 5350 }, { "epoch": 9.91, "grad_norm": 3.545948028564453, "learning_rate": 1.603844731977819e-05, "loss": 1.446, "step": 5360 }, { "epoch": 9.93, "grad_norm": 6.653872013092041, "learning_rate": 1.603105360443623e-05, "loss": 1.7196, "step": 5370 }, { "epoch": 9.94, "grad_norm": 5.5079874992370605, "learning_rate": 1.602365988909427e-05, "loss": 1.9835, "step": 5380 }, { "epoch": 9.96, "grad_norm": 8.454424858093262, "learning_rate": 1.6016266173752314e-05, "loss": 1.4913, "step": 5390 }, { "epoch": 9.98, "grad_norm": 3.126283884048462, "learning_rate": 1.600887245841035e-05, "loss": 1.7413, "step": 5400 }, { "epoch": 10.0, "grad_norm": 11.112178802490234, "learning_rate": 1.6001478743068393e-05, "loss": 1.6319, "step": 5410 }, { "epoch": 10.0, "eval_accuracy": 0.7234927234927235, "eval_loss": 1.10242760181427, "eval_runtime": 1.6666, "eval_samples_per_second": 288.619, "eval_steps_per_second": 36.602, "step": 5410 }, { "epoch": 10.02, "grad_norm": 6.6197123527526855, "learning_rate": 1.5994085027726433e-05, "loss": 1.9106, "step": 5420 }, { "epoch": 10.04, "grad_norm": 6.858761787414551, "learning_rate": 1.5986691312384476e-05, "loss": 1.9151, "step": 5430 }, { "epoch": 10.06, "grad_norm": 8.37847900390625, "learning_rate": 1.5979297597042516e-05, "loss": 1.5413, "step": 5440 }, { "epoch": 10.07, "grad_norm": 9.353206634521484, "learning_rate": 1.5971903881700556e-05, "loss": 1.6448, "step": 5450 }, { "epoch": 10.09, "grad_norm": 9.860567092895508, "learning_rate": 1.5964510166358596e-05, "loss": 1.6352, "step": 5460 }, { "epoch": 10.11, "grad_norm": 7.389817714691162, "learning_rate": 1.5957116451016635e-05, "loss": 1.8545, "step": 5470 }, { "epoch": 10.13, "grad_norm": 10.615865707397461, "learning_rate": 1.594972273567468e-05, "loss": 1.9411, "step": 5480 }, { "epoch": 10.15, "grad_norm": 2.8454349040985107, "learning_rate": 1.594232902033272e-05, "loss": 1.6815, "step": 5490 }, { "epoch": 10.17, "grad_norm": 11.584176063537598, "learning_rate": 1.5934935304990758e-05, "loss": 1.9164, "step": 5500 }, { "epoch": 10.18, "grad_norm": 8.103424072265625, "learning_rate": 1.59275415896488e-05, "loss": 2.1083, "step": 5510 }, { "epoch": 10.2, "grad_norm": 6.688677787780762, "learning_rate": 1.5920147874306838e-05, "loss": 1.6031, "step": 5520 }, { "epoch": 10.22, "grad_norm": 5.246740818023682, "learning_rate": 1.591275415896488e-05, "loss": 1.42, "step": 5530 }, { "epoch": 10.24, "grad_norm": 7.326999664306641, "learning_rate": 1.590536044362292e-05, "loss": 1.4732, "step": 5540 }, { "epoch": 10.26, "grad_norm": 8.370280265808105, "learning_rate": 1.5897966728280964e-05, "loss": 1.5685, "step": 5550 }, { "epoch": 10.28, "grad_norm": 6.881199836730957, "learning_rate": 1.5890573012939004e-05, "loss": 1.767, "step": 5560 }, { "epoch": 10.3, "grad_norm": 6.8406453132629395, "learning_rate": 1.5883179297597043e-05, "loss": 1.4783, "step": 5570 }, { "epoch": 10.31, "grad_norm": 9.51223373413086, "learning_rate": 1.5875785582255087e-05, "loss": 1.7611, "step": 5580 }, { "epoch": 10.33, "grad_norm": 6.8711466789245605, "learning_rate": 1.5868391866913123e-05, "loss": 2.1209, "step": 5590 }, { "epoch": 10.35, "grad_norm": 8.527156829833984, "learning_rate": 1.5860998151571166e-05, "loss": 1.5011, "step": 5600 }, { "epoch": 10.37, "grad_norm": 7.204869747161865, "learning_rate": 1.5853604436229206e-05, "loss": 1.8573, "step": 5610 }, { "epoch": 10.39, "grad_norm": 6.966355323791504, "learning_rate": 1.5846210720887246e-05, "loss": 1.7452, "step": 5620 }, { "epoch": 10.41, "grad_norm": 6.01743221282959, "learning_rate": 1.583881700554529e-05, "loss": 1.5316, "step": 5630 }, { "epoch": 10.43, "grad_norm": 3.1018764972686768, "learning_rate": 1.583142329020333e-05, "loss": 1.8752, "step": 5640 }, { "epoch": 10.44, "grad_norm": 6.013047695159912, "learning_rate": 1.582402957486137e-05, "loss": 1.5721, "step": 5650 }, { "epoch": 10.46, "grad_norm": 5.9063897132873535, "learning_rate": 1.5816635859519408e-05, "loss": 1.7681, "step": 5660 }, { "epoch": 10.48, "grad_norm": 5.383518218994141, "learning_rate": 1.580924214417745e-05, "loss": 1.7579, "step": 5670 }, { "epoch": 10.5, "grad_norm": 5.483528137207031, "learning_rate": 1.580184842883549e-05, "loss": 1.84, "step": 5680 }, { "epoch": 10.52, "grad_norm": 8.526223182678223, "learning_rate": 1.579445471349353e-05, "loss": 1.6757, "step": 5690 }, { "epoch": 10.54, "grad_norm": 9.638723373413086, "learning_rate": 1.5787060998151574e-05, "loss": 1.684, "step": 5700 }, { "epoch": 10.55, "grad_norm": 6.910712718963623, "learning_rate": 1.577966728280961e-05, "loss": 1.9138, "step": 5710 }, { "epoch": 10.57, "grad_norm": 6.714941024780273, "learning_rate": 1.5772273567467654e-05, "loss": 1.565, "step": 5720 }, { "epoch": 10.59, "grad_norm": 4.602434158325195, "learning_rate": 1.5764879852125694e-05, "loss": 1.7848, "step": 5730 }, { "epoch": 10.61, "grad_norm": 9.314746856689453, "learning_rate": 1.5757486136783737e-05, "loss": 1.6997, "step": 5740 }, { "epoch": 10.63, "grad_norm": 5.620849132537842, "learning_rate": 1.5750092421441777e-05, "loss": 1.9766, "step": 5750 }, { "epoch": 10.65, "grad_norm": 5.389798164367676, "learning_rate": 1.5742698706099816e-05, "loss": 1.4327, "step": 5760 }, { "epoch": 10.67, "grad_norm": 7.618233680725098, "learning_rate": 1.573530499075786e-05, "loss": 2.0241, "step": 5770 }, { "epoch": 10.68, "grad_norm": 7.156101703643799, "learning_rate": 1.5727911275415896e-05, "loss": 1.8515, "step": 5780 }, { "epoch": 10.7, "grad_norm": 7.632164001464844, "learning_rate": 1.572051756007394e-05, "loss": 1.5441, "step": 5790 }, { "epoch": 10.72, "grad_norm": 7.00955867767334, "learning_rate": 1.571312384473198e-05, "loss": 1.7605, "step": 5800 }, { "epoch": 10.74, "grad_norm": 7.532792568206787, "learning_rate": 1.570573012939002e-05, "loss": 1.641, "step": 5810 }, { "epoch": 10.76, "grad_norm": 7.597188949584961, "learning_rate": 1.5698336414048062e-05, "loss": 1.914, "step": 5820 }, { "epoch": 10.78, "grad_norm": 13.01384162902832, "learning_rate": 1.56909426987061e-05, "loss": 1.5948, "step": 5830 }, { "epoch": 10.79, "grad_norm": 4.868382453918457, "learning_rate": 1.568354898336414e-05, "loss": 1.4025, "step": 5840 }, { "epoch": 10.81, "grad_norm": 3.8863894939422607, "learning_rate": 1.567615526802218e-05, "loss": 1.5675, "step": 5850 }, { "epoch": 10.83, "grad_norm": 5.704493045806885, "learning_rate": 1.5668761552680224e-05, "loss": 1.6681, "step": 5860 }, { "epoch": 10.85, "grad_norm": 5.760870933532715, "learning_rate": 1.5661367837338264e-05, "loss": 1.6806, "step": 5870 }, { "epoch": 10.87, "grad_norm": 7.638999938964844, "learning_rate": 1.5653974121996304e-05, "loss": 1.8769, "step": 5880 }, { "epoch": 10.89, "grad_norm": 6.3926849365234375, "learning_rate": 1.5646580406654347e-05, "loss": 1.5922, "step": 5890 }, { "epoch": 10.91, "grad_norm": 5.591503143310547, "learning_rate": 1.5639186691312383e-05, "loss": 1.7111, "step": 5900 }, { "epoch": 10.92, "grad_norm": 7.386925220489502, "learning_rate": 1.5631792975970427e-05, "loss": 1.9865, "step": 5910 }, { "epoch": 10.94, "grad_norm": 6.559366703033447, "learning_rate": 1.5624399260628466e-05, "loss": 1.6646, "step": 5920 }, { "epoch": 10.96, "grad_norm": 7.8627095222473145, "learning_rate": 1.5617005545286506e-05, "loss": 1.3241, "step": 5930 }, { "epoch": 10.98, "grad_norm": 12.11966609954834, "learning_rate": 1.560961182994455e-05, "loss": 1.5248, "step": 5940 }, { "epoch": 11.0, "grad_norm": 7.013985633850098, "learning_rate": 1.560221811460259e-05, "loss": 1.5899, "step": 5950 }, { "epoch": 11.0, "eval_accuracy": 0.7338877338877339, "eval_loss": 1.037523627281189, "eval_runtime": 1.5543, "eval_samples_per_second": 309.467, "eval_steps_per_second": 39.246, "step": 5951 }, { "epoch": 11.02, "grad_norm": 5.60391092300415, "learning_rate": 1.5594824399260632e-05, "loss": 1.7301, "step": 5960 }, { "epoch": 11.04, "grad_norm": 6.589072227478027, "learning_rate": 1.558743068391867e-05, "loss": 1.7259, "step": 5970 }, { "epoch": 11.05, "grad_norm": 4.4540791511535645, "learning_rate": 1.5580036968576712e-05, "loss": 1.6178, "step": 5980 }, { "epoch": 11.07, "grad_norm": 12.74376106262207, "learning_rate": 1.557264325323475e-05, "loss": 1.513, "step": 5990 }, { "epoch": 11.09, "grad_norm": 9.285293579101562, "learning_rate": 1.556524953789279e-05, "loss": 1.6429, "step": 6000 }, { "epoch": 11.11, "grad_norm": 4.228150844573975, "learning_rate": 1.5557855822550835e-05, "loss": 1.8027, "step": 6010 }, { "epoch": 11.13, "grad_norm": 6.479814052581787, "learning_rate": 1.5550462107208874e-05, "loss": 1.7569, "step": 6020 }, { "epoch": 11.15, "grad_norm": 8.13913345336914, "learning_rate": 1.5543068391866914e-05, "loss": 1.5466, "step": 6030 }, { "epoch": 11.16, "grad_norm": 4.259387016296387, "learning_rate": 1.5535674676524954e-05, "loss": 1.5825, "step": 6040 }, { "epoch": 11.18, "grad_norm": 9.195317268371582, "learning_rate": 1.5528280961182994e-05, "loss": 1.6189, "step": 6050 }, { "epoch": 11.2, "grad_norm": 5.625119209289551, "learning_rate": 1.5520887245841037e-05, "loss": 1.4942, "step": 6060 }, { "epoch": 11.22, "grad_norm": 4.036601543426514, "learning_rate": 1.5513493530499077e-05, "loss": 1.3357, "step": 6070 }, { "epoch": 11.24, "grad_norm": 8.619260787963867, "learning_rate": 1.550609981515712e-05, "loss": 1.5838, "step": 6080 }, { "epoch": 11.26, "grad_norm": 4.570966720581055, "learning_rate": 1.5498706099815156e-05, "loss": 1.6877, "step": 6090 }, { "epoch": 11.28, "grad_norm": 5.669454097747803, "learning_rate": 1.54913123844732e-05, "loss": 1.6888, "step": 6100 }, { "epoch": 11.29, "grad_norm": 7.858445644378662, "learning_rate": 1.548391866913124e-05, "loss": 1.7782, "step": 6110 }, { "epoch": 11.31, "grad_norm": 5.75015115737915, "learning_rate": 1.547652495378928e-05, "loss": 1.6898, "step": 6120 }, { "epoch": 11.33, "grad_norm": 7.184450149536133, "learning_rate": 1.5469131238447322e-05, "loss": 1.6974, "step": 6130 }, { "epoch": 11.35, "grad_norm": 8.132901191711426, "learning_rate": 1.5461737523105362e-05, "loss": 1.6524, "step": 6140 }, { "epoch": 11.37, "grad_norm": 4.093606472015381, "learning_rate": 1.5454343807763402e-05, "loss": 1.4192, "step": 6150 }, { "epoch": 11.39, "grad_norm": 5.621367454528809, "learning_rate": 1.544695009242144e-05, "loss": 1.5851, "step": 6160 }, { "epoch": 11.4, "grad_norm": 7.379901885986328, "learning_rate": 1.5439556377079485e-05, "loss": 1.7208, "step": 6170 }, { "epoch": 11.42, "grad_norm": 10.03763198852539, "learning_rate": 1.5432162661737525e-05, "loss": 1.5085, "step": 6180 }, { "epoch": 11.44, "grad_norm": 7.815421104431152, "learning_rate": 1.5424768946395564e-05, "loss": 1.5802, "step": 6190 }, { "epoch": 11.46, "grad_norm": 4.954154014587402, "learning_rate": 1.5417375231053607e-05, "loss": 1.9409, "step": 6200 }, { "epoch": 11.48, "grad_norm": 5.130891799926758, "learning_rate": 1.5409981515711647e-05, "loss": 1.6532, "step": 6210 }, { "epoch": 11.5, "grad_norm": 1.637441873550415, "learning_rate": 1.5402587800369687e-05, "loss": 1.5477, "step": 6220 }, { "epoch": 11.52, "grad_norm": 9.160568237304688, "learning_rate": 1.5395194085027727e-05, "loss": 1.9149, "step": 6230 }, { "epoch": 11.53, "grad_norm": 7.1242194175720215, "learning_rate": 1.5387800369685767e-05, "loss": 1.6602, "step": 6240 }, { "epoch": 11.55, "grad_norm": 8.229101181030273, "learning_rate": 1.538040665434381e-05, "loss": 1.6501, "step": 6250 }, { "epoch": 11.57, "grad_norm": 8.466025352478027, "learning_rate": 1.537301293900185e-05, "loss": 1.732, "step": 6260 }, { "epoch": 11.59, "grad_norm": 5.168890476226807, "learning_rate": 1.5365619223659893e-05, "loss": 1.7798, "step": 6270 }, { "epoch": 11.61, "grad_norm": 4.901388168334961, "learning_rate": 1.535822550831793e-05, "loss": 1.5237, "step": 6280 }, { "epoch": 11.63, "grad_norm": 7.3732123374938965, "learning_rate": 1.5350831792975972e-05, "loss": 1.4315, "step": 6290 }, { "epoch": 11.65, "grad_norm": 7.440344333648682, "learning_rate": 1.5343438077634012e-05, "loss": 1.7513, "step": 6300 }, { "epoch": 11.66, "grad_norm": 7.809604644775391, "learning_rate": 1.5336044362292052e-05, "loss": 1.8531, "step": 6310 }, { "epoch": 11.68, "grad_norm": 6.435956001281738, "learning_rate": 1.5328650646950095e-05, "loss": 1.9228, "step": 6320 }, { "epoch": 11.7, "grad_norm": 3.976518154144287, "learning_rate": 1.5321256931608135e-05, "loss": 1.413, "step": 6330 }, { "epoch": 11.72, "grad_norm": 5.598086833953857, "learning_rate": 1.5313863216266175e-05, "loss": 1.8874, "step": 6340 }, { "epoch": 11.74, "grad_norm": 6.977279186248779, "learning_rate": 1.5306469500924214e-05, "loss": 1.6802, "step": 6350 }, { "epoch": 11.76, "grad_norm": 4.966686725616455, "learning_rate": 1.5299075785582254e-05, "loss": 1.6049, "step": 6360 }, { "epoch": 11.77, "grad_norm": 4.8055853843688965, "learning_rate": 1.5291682070240297e-05, "loss": 1.6506, "step": 6370 }, { "epoch": 11.79, "grad_norm": 3.892423391342163, "learning_rate": 1.5284288354898337e-05, "loss": 1.6055, "step": 6380 }, { "epoch": 11.81, "grad_norm": 6.14129638671875, "learning_rate": 1.527689463955638e-05, "loss": 1.4514, "step": 6390 }, { "epoch": 11.83, "grad_norm": 8.457772254943848, "learning_rate": 1.526950092421442e-05, "loss": 1.7927, "step": 6400 }, { "epoch": 11.85, "grad_norm": 6.5199079513549805, "learning_rate": 1.526210720887246e-05, "loss": 1.8552, "step": 6410 }, { "epoch": 11.87, "grad_norm": 6.28503942489624, "learning_rate": 1.52547134935305e-05, "loss": 1.648, "step": 6420 }, { "epoch": 11.89, "grad_norm": 8.903117179870605, "learning_rate": 1.5247319778188541e-05, "loss": 1.7005, "step": 6430 }, { "epoch": 11.9, "grad_norm": 5.986446857452393, "learning_rate": 1.5239926062846581e-05, "loss": 1.7298, "step": 6440 }, { "epoch": 11.92, "grad_norm": 9.497461318969727, "learning_rate": 1.5232532347504622e-05, "loss": 1.457, "step": 6450 }, { "epoch": 11.94, "grad_norm": 4.832051753997803, "learning_rate": 1.5225138632162664e-05, "loss": 1.5964, "step": 6460 }, { "epoch": 11.96, "grad_norm": 5.802597522735596, "learning_rate": 1.5217744916820702e-05, "loss": 1.6609, "step": 6470 }, { "epoch": 11.98, "grad_norm": 7.783098220825195, "learning_rate": 1.5210351201478743e-05, "loss": 1.6063, "step": 6480 }, { "epoch": 12.0, "grad_norm": 4.908028602600098, "learning_rate": 1.5202957486136785e-05, "loss": 1.6427, "step": 6490 }, { "epoch": 12.0, "eval_accuracy": 0.7525987525987526, "eval_loss": 0.9656060934066772, "eval_runtime": 1.661, "eval_samples_per_second": 289.579, "eval_steps_per_second": 36.724, "step": 6492 }, { "epoch": 12.01, "grad_norm": 3.448066473007202, "learning_rate": 1.5195563770794826e-05, "loss": 1.441, "step": 6500 }, { "epoch": 12.03, "grad_norm": 5.901543140411377, "learning_rate": 1.5188170055452866e-05, "loss": 1.4466, "step": 6510 }, { "epoch": 12.05, "grad_norm": 12.5198392868042, "learning_rate": 1.5180776340110908e-05, "loss": 1.6995, "step": 6520 }, { "epoch": 12.07, "grad_norm": 5.436539649963379, "learning_rate": 1.5173382624768949e-05, "loss": 1.4173, "step": 6530 }, { "epoch": 12.09, "grad_norm": 3.4112651348114014, "learning_rate": 1.5165988909426987e-05, "loss": 1.6636, "step": 6540 }, { "epoch": 12.11, "grad_norm": 7.081146717071533, "learning_rate": 1.5158595194085029e-05, "loss": 2.2208, "step": 6550 }, { "epoch": 12.13, "grad_norm": 5.584806442260742, "learning_rate": 1.515120147874307e-05, "loss": 1.3755, "step": 6560 }, { "epoch": 12.14, "grad_norm": 10.636102676391602, "learning_rate": 1.514380776340111e-05, "loss": 1.6048, "step": 6570 }, { "epoch": 12.16, "grad_norm": 10.160847663879395, "learning_rate": 1.5136414048059151e-05, "loss": 1.7038, "step": 6580 }, { "epoch": 12.18, "grad_norm": 5.678435325622559, "learning_rate": 1.5129020332717193e-05, "loss": 1.7492, "step": 6590 }, { "epoch": 12.2, "grad_norm": 3.134272813796997, "learning_rate": 1.5121626617375231e-05, "loss": 1.3304, "step": 6600 }, { "epoch": 12.22, "grad_norm": 6.381455898284912, "learning_rate": 1.5114232902033272e-05, "loss": 1.4498, "step": 6610 }, { "epoch": 12.24, "grad_norm": 7.086697578430176, "learning_rate": 1.5106839186691314e-05, "loss": 1.8536, "step": 6620 }, { "epoch": 12.26, "grad_norm": 4.185774326324463, "learning_rate": 1.5099445471349354e-05, "loss": 1.9374, "step": 6630 }, { "epoch": 12.27, "grad_norm": 5.7592926025390625, "learning_rate": 1.5092051756007395e-05, "loss": 1.9616, "step": 6640 }, { "epoch": 12.29, "grad_norm": 4.030360221862793, "learning_rate": 1.5084658040665437e-05, "loss": 1.6716, "step": 6650 }, { "epoch": 12.31, "grad_norm": 8.987281799316406, "learning_rate": 1.5077264325323475e-05, "loss": 1.7801, "step": 6660 }, { "epoch": 12.33, "grad_norm": 5.447028160095215, "learning_rate": 1.5069870609981516e-05, "loss": 1.353, "step": 6670 }, { "epoch": 12.35, "grad_norm": 7.787957191467285, "learning_rate": 1.5062476894639558e-05, "loss": 1.5683, "step": 6680 }, { "epoch": 12.37, "grad_norm": 3.8201277256011963, "learning_rate": 1.5055083179297598e-05, "loss": 1.5776, "step": 6690 }, { "epoch": 12.38, "grad_norm": 5.542779445648193, "learning_rate": 1.5047689463955639e-05, "loss": 1.9497, "step": 6700 }, { "epoch": 12.4, "grad_norm": 6.7789082527160645, "learning_rate": 1.504029574861368e-05, "loss": 1.5026, "step": 6710 }, { "epoch": 12.42, "grad_norm": 9.256391525268555, "learning_rate": 1.5032902033271722e-05, "loss": 1.7416, "step": 6720 }, { "epoch": 12.44, "grad_norm": 10.52377986907959, "learning_rate": 1.502550831792976e-05, "loss": 1.8205, "step": 6730 }, { "epoch": 12.46, "grad_norm": 5.957433223724365, "learning_rate": 1.5018114602587802e-05, "loss": 1.7258, "step": 6740 }, { "epoch": 12.48, "grad_norm": 6.628762245178223, "learning_rate": 1.5010720887245841e-05, "loss": 1.5374, "step": 6750 }, { "epoch": 12.5, "grad_norm": 6.431212902069092, "learning_rate": 1.5003327171903883e-05, "loss": 1.421, "step": 6760 }, { "epoch": 12.51, "grad_norm": 4.727957725524902, "learning_rate": 1.4995933456561924e-05, "loss": 1.5339, "step": 6770 }, { "epoch": 12.53, "grad_norm": 9.263405799865723, "learning_rate": 1.4988539741219966e-05, "loss": 1.6073, "step": 6780 }, { "epoch": 12.55, "grad_norm": 7.269367218017578, "learning_rate": 1.4981146025878004e-05, "loss": 1.9465, "step": 6790 }, { "epoch": 12.57, "grad_norm": 7.820775985717773, "learning_rate": 1.4973752310536045e-05, "loss": 1.6782, "step": 6800 }, { "epoch": 12.59, "grad_norm": 6.880367279052734, "learning_rate": 1.4966358595194085e-05, "loss": 1.4589, "step": 6810 }, { "epoch": 12.61, "grad_norm": 8.777186393737793, "learning_rate": 1.4958964879852127e-05, "loss": 1.7014, "step": 6820 }, { "epoch": 12.62, "grad_norm": 9.317229270935059, "learning_rate": 1.4951571164510168e-05, "loss": 1.589, "step": 6830 }, { "epoch": 12.64, "grad_norm": 7.191954612731934, "learning_rate": 1.494417744916821e-05, "loss": 1.8561, "step": 6840 }, { "epoch": 12.66, "grad_norm": 4.9460906982421875, "learning_rate": 1.493678373382625e-05, "loss": 1.7491, "step": 6850 }, { "epoch": 12.68, "grad_norm": 8.261725425720215, "learning_rate": 1.4929390018484289e-05, "loss": 1.2502, "step": 6860 }, { "epoch": 12.7, "grad_norm": 7.115355491638184, "learning_rate": 1.492199630314233e-05, "loss": 1.5045, "step": 6870 }, { "epoch": 12.72, "grad_norm": 9.270259857177734, "learning_rate": 1.491460258780037e-05, "loss": 1.8743, "step": 6880 }, { "epoch": 12.74, "grad_norm": 3.3993301391601562, "learning_rate": 1.4907208872458412e-05, "loss": 1.2419, "step": 6890 }, { "epoch": 12.75, "grad_norm": 11.75745964050293, "learning_rate": 1.4899815157116453e-05, "loss": 1.5002, "step": 6900 }, { "epoch": 12.77, "grad_norm": 5.36479377746582, "learning_rate": 1.4892421441774493e-05, "loss": 1.6219, "step": 6910 }, { "epoch": 12.79, "grad_norm": 7.44586706161499, "learning_rate": 1.4885027726432533e-05, "loss": 1.7933, "step": 6920 }, { "epoch": 12.81, "grad_norm": 9.066186904907227, "learning_rate": 1.4877634011090574e-05, "loss": 1.7978, "step": 6930 }, { "epoch": 12.83, "grad_norm": 6.805649757385254, "learning_rate": 1.4870240295748614e-05, "loss": 1.7624, "step": 6940 }, { "epoch": 12.85, "grad_norm": 7.065145015716553, "learning_rate": 1.4862846580406656e-05, "loss": 1.8904, "step": 6950 }, { "epoch": 12.87, "grad_norm": 9.71106243133545, "learning_rate": 1.4855452865064697e-05, "loss": 1.7664, "step": 6960 }, { "epoch": 12.88, "grad_norm": 5.458385467529297, "learning_rate": 1.4848059149722737e-05, "loss": 1.7716, "step": 6970 }, { "epoch": 12.9, "grad_norm": 9.14933967590332, "learning_rate": 1.4840665434380777e-05, "loss": 1.6279, "step": 6980 }, { "epoch": 12.92, "grad_norm": 11.714768409729004, "learning_rate": 1.4833271719038818e-05, "loss": 1.7414, "step": 6990 }, { "epoch": 12.94, "grad_norm": 7.316355228424072, "learning_rate": 1.4825878003696858e-05, "loss": 1.5052, "step": 7000 }, { "epoch": 12.96, "grad_norm": 7.466034412384033, "learning_rate": 1.48184842883549e-05, "loss": 1.4183, "step": 7010 }, { "epoch": 12.98, "grad_norm": 7.870053291320801, "learning_rate": 1.4811090573012941e-05, "loss": 1.7464, "step": 7020 }, { "epoch": 12.99, "grad_norm": 6.396212577819824, "learning_rate": 1.4803696857670982e-05, "loss": 1.8022, "step": 7030 }, { "epoch": 13.0, "eval_accuracy": 0.7422037422037422, "eval_loss": 0.9759501218795776, "eval_runtime": 1.5545, "eval_samples_per_second": 309.422, "eval_steps_per_second": 39.241, "step": 7033 }, { "epoch": 13.01, "grad_norm": 9.775729179382324, "learning_rate": 1.4796303142329022e-05, "loss": 1.7349, "step": 7040 }, { "epoch": 13.03, "grad_norm": 7.444331169128418, "learning_rate": 1.4788909426987062e-05, "loss": 1.8206, "step": 7050 }, { "epoch": 13.05, "grad_norm": 7.023839473724365, "learning_rate": 1.4781515711645102e-05, "loss": 1.5978, "step": 7060 }, { "epoch": 13.07, "grad_norm": 6.148096084594727, "learning_rate": 1.4774121996303143e-05, "loss": 1.7455, "step": 7070 }, { "epoch": 13.09, "grad_norm": 6.564537048339844, "learning_rate": 1.4766728280961185e-05, "loss": 1.3934, "step": 7080 }, { "epoch": 13.11, "grad_norm": 6.070539474487305, "learning_rate": 1.4759334565619226e-05, "loss": 1.5452, "step": 7090 }, { "epoch": 13.12, "grad_norm": 3.192253828048706, "learning_rate": 1.4751940850277266e-05, "loss": 1.8153, "step": 7100 }, { "epoch": 13.14, "grad_norm": 5.1541748046875, "learning_rate": 1.4744547134935306e-05, "loss": 1.5209, "step": 7110 }, { "epoch": 13.16, "grad_norm": 5.539430618286133, "learning_rate": 1.4737153419593346e-05, "loss": 1.9908, "step": 7120 }, { "epoch": 13.18, "grad_norm": 9.003644943237305, "learning_rate": 1.4729759704251387e-05, "loss": 1.5192, "step": 7130 }, { "epoch": 13.2, "grad_norm": 6.054264545440674, "learning_rate": 1.4722365988909429e-05, "loss": 1.6799, "step": 7140 }, { "epoch": 13.22, "grad_norm": 2.9800052642822266, "learning_rate": 1.471497227356747e-05, "loss": 1.7161, "step": 7150 }, { "epoch": 13.23, "grad_norm": 5.217889785766602, "learning_rate": 1.470757855822551e-05, "loss": 1.4401, "step": 7160 }, { "epoch": 13.25, "grad_norm": 6.164737224578857, "learning_rate": 1.470018484288355e-05, "loss": 1.709, "step": 7170 }, { "epoch": 13.27, "grad_norm": 6.545243263244629, "learning_rate": 1.4692791127541591e-05, "loss": 1.5946, "step": 7180 }, { "epoch": 13.29, "grad_norm": 10.040055274963379, "learning_rate": 1.468539741219963e-05, "loss": 1.8137, "step": 7190 }, { "epoch": 13.31, "grad_norm": 9.318700790405273, "learning_rate": 1.4678003696857672e-05, "loss": 1.8085, "step": 7200 }, { "epoch": 13.33, "grad_norm": 8.463800430297852, "learning_rate": 1.4670609981515714e-05, "loss": 1.8121, "step": 7210 }, { "epoch": 13.35, "grad_norm": 6.740206241607666, "learning_rate": 1.4663216266173754e-05, "loss": 1.4416, "step": 7220 }, { "epoch": 13.36, "grad_norm": 6.259397029876709, "learning_rate": 1.4655822550831795e-05, "loss": 1.4735, "step": 7230 }, { "epoch": 13.38, "grad_norm": 4.66219425201416, "learning_rate": 1.4648428835489835e-05, "loss": 1.7049, "step": 7240 }, { "epoch": 13.4, "grad_norm": 7.739853858947754, "learning_rate": 1.4641035120147875e-05, "loss": 1.4948, "step": 7250 }, { "epoch": 13.42, "grad_norm": 5.544546127319336, "learning_rate": 1.4633641404805916e-05, "loss": 1.5345, "step": 7260 }, { "epoch": 13.44, "grad_norm": 4.8158040046691895, "learning_rate": 1.4626987060998153e-05, "loss": 1.4794, "step": 7270 }, { "epoch": 13.46, "grad_norm": 4.196196556091309, "learning_rate": 1.4619593345656193e-05, "loss": 1.6506, "step": 7280 }, { "epoch": 13.48, "grad_norm": 3.980792999267578, "learning_rate": 1.4612199630314234e-05, "loss": 1.473, "step": 7290 }, { "epoch": 13.49, "grad_norm": 3.531205654144287, "learning_rate": 1.4604805914972276e-05, "loss": 1.5042, "step": 7300 }, { "epoch": 13.51, "grad_norm": 8.832963943481445, "learning_rate": 1.4597412199630317e-05, "loss": 1.4186, "step": 7310 }, { "epoch": 13.53, "grad_norm": 5.607369422912598, "learning_rate": 1.4590018484288355e-05, "loss": 1.6208, "step": 7320 }, { "epoch": 13.55, "grad_norm": 4.904766082763672, "learning_rate": 1.4582624768946397e-05, "loss": 1.664, "step": 7330 }, { "epoch": 13.57, "grad_norm": 13.896210670471191, "learning_rate": 1.4575231053604436e-05, "loss": 1.6129, "step": 7340 }, { "epoch": 13.59, "grad_norm": 8.129071235656738, "learning_rate": 1.4567837338262478e-05, "loss": 1.765, "step": 7350 }, { "epoch": 13.6, "grad_norm": 9.554658889770508, "learning_rate": 1.456044362292052e-05, "loss": 1.1582, "step": 7360 }, { "epoch": 13.62, "grad_norm": 7.690789222717285, "learning_rate": 1.4553049907578561e-05, "loss": 1.4821, "step": 7370 }, { "epoch": 13.64, "grad_norm": 7.09590482711792, "learning_rate": 1.4545656192236599e-05, "loss": 1.4884, "step": 7380 }, { "epoch": 13.66, "grad_norm": 3.245009422302246, "learning_rate": 1.453826247689464e-05, "loss": 1.6297, "step": 7390 }, { "epoch": 13.68, "grad_norm": 8.11549186706543, "learning_rate": 1.4530868761552682e-05, "loss": 1.5279, "step": 7400 }, { "epoch": 13.7, "grad_norm": 3.247609853744507, "learning_rate": 1.4523475046210722e-05, "loss": 1.5101, "step": 7410 }, { "epoch": 13.72, "grad_norm": 12.376482009887695, "learning_rate": 1.4516081330868763e-05, "loss": 1.7137, "step": 7420 }, { "epoch": 13.73, "grad_norm": 11.389594078063965, "learning_rate": 1.4508687615526805e-05, "loss": 1.3565, "step": 7430 }, { "epoch": 13.75, "grad_norm": 4.671752452850342, "learning_rate": 1.4501293900184844e-05, "loss": 1.8149, "step": 7440 }, { "epoch": 13.77, "grad_norm": 6.193767070770264, "learning_rate": 1.4493900184842884e-05, "loss": 1.5856, "step": 7450 }, { "epoch": 13.79, "grad_norm": 7.994163513183594, "learning_rate": 1.4486506469500926e-05, "loss": 2.1028, "step": 7460 }, { "epoch": 13.81, "grad_norm": 5.570643901824951, "learning_rate": 1.4479112754158966e-05, "loss": 1.6469, "step": 7470 }, { "epoch": 13.83, "grad_norm": 7.402944564819336, "learning_rate": 1.4471719038817007e-05, "loss": 1.6919, "step": 7480 }, { "epoch": 13.84, "grad_norm": 5.880756378173828, "learning_rate": 1.4464325323475048e-05, "loss": 1.3381, "step": 7490 }, { "epoch": 13.86, "grad_norm": 11.928231239318848, "learning_rate": 1.4456931608133088e-05, "loss": 1.7797, "step": 7500 }, { "epoch": 13.88, "grad_norm": 8.824029922485352, "learning_rate": 1.4449537892791128e-05, "loss": 1.5211, "step": 7510 }, { "epoch": 13.9, "grad_norm": 4.614089012145996, "learning_rate": 1.444214417744917e-05, "loss": 1.5793, "step": 7520 }, { "epoch": 13.92, "grad_norm": 7.449356555938721, "learning_rate": 1.443475046210721e-05, "loss": 1.8143, "step": 7530 }, { "epoch": 13.94, "grad_norm": 5.570579528808594, "learning_rate": 1.442735674676525e-05, "loss": 1.4603, "step": 7540 }, { "epoch": 13.96, "grad_norm": 5.946522235870361, "learning_rate": 1.4419963031423292e-05, "loss": 1.2679, "step": 7550 }, { "epoch": 13.97, "grad_norm": 4.31473445892334, "learning_rate": 1.4412569316081334e-05, "loss": 1.2853, "step": 7560 }, { "epoch": 13.99, "grad_norm": 3.5280280113220215, "learning_rate": 1.4405175600739372e-05, "loss": 1.7161, "step": 7570 }, { "epoch": 14.0, "eval_accuracy": 0.760914760914761, "eval_loss": 0.8951783776283264, "eval_runtime": 1.5304, "eval_samples_per_second": 314.294, "eval_steps_per_second": 39.859, "step": 7574 }, { "epoch": 14.01, "grad_norm": 9.568656921386719, "learning_rate": 1.4397781885397413e-05, "loss": 1.9271, "step": 7580 }, { "epoch": 14.03, "grad_norm": 3.4803380966186523, "learning_rate": 1.4390388170055453e-05, "loss": 1.5116, "step": 7590 }, { "epoch": 14.05, "grad_norm": 6.021322250366211, "learning_rate": 1.4382994454713495e-05, "loss": 1.8064, "step": 7600 }, { "epoch": 14.07, "grad_norm": 3.152942657470703, "learning_rate": 1.4375600739371536e-05, "loss": 1.7749, "step": 7610 }, { "epoch": 14.09, "grad_norm": 5.195441722869873, "learning_rate": 1.4368207024029578e-05, "loss": 1.4719, "step": 7620 }, { "epoch": 14.1, "grad_norm": 7.667139530181885, "learning_rate": 1.4360813308687617e-05, "loss": 1.4498, "step": 7630 }, { "epoch": 14.12, "grad_norm": 6.858312606811523, "learning_rate": 1.4353419593345657e-05, "loss": 1.5269, "step": 7640 }, { "epoch": 14.14, "grad_norm": 9.447962760925293, "learning_rate": 1.4346025878003697e-05, "loss": 1.3667, "step": 7650 }, { "epoch": 14.16, "grad_norm": 8.01750373840332, "learning_rate": 1.4338632162661738e-05, "loss": 1.9008, "step": 7660 }, { "epoch": 14.18, "grad_norm": 7.747750282287598, "learning_rate": 1.433123844731978e-05, "loss": 1.435, "step": 7670 }, { "epoch": 14.2, "grad_norm": 7.1059417724609375, "learning_rate": 1.4323844731977821e-05, "loss": 1.3909, "step": 7680 }, { "epoch": 14.21, "grad_norm": 7.670748233795166, "learning_rate": 1.4316451016635861e-05, "loss": 1.5908, "step": 7690 }, { "epoch": 14.23, "grad_norm": 5.720839500427246, "learning_rate": 1.4309057301293901e-05, "loss": 1.6515, "step": 7700 }, { "epoch": 14.25, "grad_norm": 4.928955078125, "learning_rate": 1.430166358595194e-05, "loss": 1.4803, "step": 7710 }, { "epoch": 14.27, "grad_norm": 6.57696533203125, "learning_rate": 1.4294269870609982e-05, "loss": 1.5027, "step": 7720 }, { "epoch": 14.29, "grad_norm": 7.476659774780273, "learning_rate": 1.4286876155268024e-05, "loss": 1.4742, "step": 7730 }, { "epoch": 14.31, "grad_norm": 9.980905532836914, "learning_rate": 1.4279482439926065e-05, "loss": 1.6126, "step": 7740 }, { "epoch": 14.33, "grad_norm": 7.370110511779785, "learning_rate": 1.4272088724584105e-05, "loss": 1.1437, "step": 7750 }, { "epoch": 14.34, "grad_norm": 8.434205055236816, "learning_rate": 1.4264695009242145e-05, "loss": 1.6809, "step": 7760 }, { "epoch": 14.36, "grad_norm": 6.557777404785156, "learning_rate": 1.4257301293900186e-05, "loss": 1.589, "step": 7770 }, { "epoch": 14.38, "grad_norm": 9.328699111938477, "learning_rate": 1.4249907578558226e-05, "loss": 1.743, "step": 7780 }, { "epoch": 14.4, "grad_norm": 8.024626731872559, "learning_rate": 1.4242513863216267e-05, "loss": 1.4591, "step": 7790 }, { "epoch": 14.42, "grad_norm": 4.080429553985596, "learning_rate": 1.4235120147874309e-05, "loss": 1.2619, "step": 7800 }, { "epoch": 14.44, "grad_norm": 6.075506210327148, "learning_rate": 1.4227726432532349e-05, "loss": 1.5155, "step": 7810 }, { "epoch": 14.45, "grad_norm": 6.258358478546143, "learning_rate": 1.422033271719039e-05, "loss": 1.6283, "step": 7820 }, { "epoch": 14.47, "grad_norm": 4.162814617156982, "learning_rate": 1.421293900184843e-05, "loss": 1.6274, "step": 7830 }, { "epoch": 14.49, "grad_norm": 7.838924407958984, "learning_rate": 1.420554528650647e-05, "loss": 1.3089, "step": 7840 }, { "epoch": 14.51, "grad_norm": 5.278045177459717, "learning_rate": 1.4198151571164511e-05, "loss": 1.7157, "step": 7850 }, { "epoch": 14.53, "grad_norm": 6.241564750671387, "learning_rate": 1.4190757855822553e-05, "loss": 1.4711, "step": 7860 }, { "epoch": 14.55, "grad_norm": 3.3926894664764404, "learning_rate": 1.4183364140480592e-05, "loss": 1.3169, "step": 7870 }, { "epoch": 14.57, "grad_norm": 7.819077014923096, "learning_rate": 1.4175970425138634e-05, "loss": 1.5253, "step": 7880 }, { "epoch": 14.58, "grad_norm": 8.876294136047363, "learning_rate": 1.4168576709796674e-05, "loss": 1.3643, "step": 7890 }, { "epoch": 14.6, "grad_norm": 7.137850284576416, "learning_rate": 1.4161182994454714e-05, "loss": 1.5533, "step": 7900 }, { "epoch": 14.62, "grad_norm": 8.039826393127441, "learning_rate": 1.4153789279112755e-05, "loss": 1.7456, "step": 7910 }, { "epoch": 14.64, "grad_norm": 7.523586750030518, "learning_rate": 1.4146395563770796e-05, "loss": 1.4071, "step": 7920 }, { "epoch": 14.66, "grad_norm": 7.60550594329834, "learning_rate": 1.4139001848428838e-05, "loss": 1.4125, "step": 7930 }, { "epoch": 14.68, "grad_norm": 7.06260871887207, "learning_rate": 1.4131608133086878e-05, "loss": 1.7399, "step": 7940 }, { "epoch": 14.7, "grad_norm": 4.78956937789917, "learning_rate": 1.4124214417744918e-05, "loss": 1.5122, "step": 7950 }, { "epoch": 14.71, "grad_norm": 6.314175605773926, "learning_rate": 1.4116820702402957e-05, "loss": 1.2775, "step": 7960 }, { "epoch": 14.73, "grad_norm": 9.40743637084961, "learning_rate": 1.4109426987060999e-05, "loss": 1.4765, "step": 7970 }, { "epoch": 14.75, "grad_norm": 4.410609722137451, "learning_rate": 1.410203327171904e-05, "loss": 1.4447, "step": 7980 }, { "epoch": 14.77, "grad_norm": 6.0591583251953125, "learning_rate": 1.4094639556377082e-05, "loss": 1.6114, "step": 7990 }, { "epoch": 14.79, "grad_norm": 8.225895881652832, "learning_rate": 1.4087245841035122e-05, "loss": 1.4964, "step": 8000 }, { "epoch": 14.81, "grad_norm": 5.904598236083984, "learning_rate": 1.4079852125693163e-05, "loss": 1.696, "step": 8010 }, { "epoch": 14.82, "grad_norm": 6.0454912185668945, "learning_rate": 1.4072458410351201e-05, "loss": 1.5459, "step": 8020 }, { "epoch": 14.84, "grad_norm": 9.243764877319336, "learning_rate": 1.4065064695009243e-05, "loss": 1.8469, "step": 8030 }, { "epoch": 14.86, "grad_norm": 5.916385173797607, "learning_rate": 1.4057670979667284e-05, "loss": 1.5535, "step": 8040 }, { "epoch": 14.88, "grad_norm": 5.863710880279541, "learning_rate": 1.4050277264325326e-05, "loss": 1.6623, "step": 8050 }, { "epoch": 14.9, "grad_norm": 4.047174453735352, "learning_rate": 1.4042883548983365e-05, "loss": 1.4213, "step": 8060 }, { "epoch": 14.92, "grad_norm": 7.472527980804443, "learning_rate": 1.4035489833641407e-05, "loss": 1.4917, "step": 8070 }, { "epoch": 14.94, "grad_norm": 10.257954597473145, "learning_rate": 1.4028096118299447e-05, "loss": 1.3152, "step": 8080 }, { "epoch": 14.95, "grad_norm": 4.463430404663086, "learning_rate": 1.4020702402957486e-05, "loss": 1.602, "step": 8090 }, { "epoch": 14.97, "grad_norm": 6.369846820831299, "learning_rate": 1.4013308687615528e-05, "loss": 1.3432, "step": 8100 }, { "epoch": 14.99, "grad_norm": 8.309425354003906, "learning_rate": 1.400591497227357e-05, "loss": 1.2123, "step": 8110 }, { "epoch": 15.0, "eval_accuracy": 0.7692307692307693, "eval_loss": 0.875005304813385, "eval_runtime": 1.6208, "eval_samples_per_second": 296.76, "eval_steps_per_second": 37.635, "step": 8115 }, { "epoch": 15.01, "grad_norm": 7.835675239562988, "learning_rate": 1.3998521256931609e-05, "loss": 1.7383, "step": 8120 }, { "epoch": 15.03, "grad_norm": 3.2767491340637207, "learning_rate": 1.399112754158965e-05, "loss": 1.432, "step": 8130 }, { "epoch": 15.05, "grad_norm": 5.480152130126953, "learning_rate": 1.398373382624769e-05, "loss": 1.2541, "step": 8140 }, { "epoch": 15.06, "grad_norm": 6.756795406341553, "learning_rate": 1.397634011090573e-05, "loss": 1.5545, "step": 8150 }, { "epoch": 15.08, "grad_norm": 5.441122531890869, "learning_rate": 1.3968946395563772e-05, "loss": 1.3774, "step": 8160 }, { "epoch": 15.1, "grad_norm": 6.860857963562012, "learning_rate": 1.3961552680221813e-05, "loss": 1.5567, "step": 8170 }, { "epoch": 15.12, "grad_norm": 7.389680862426758, "learning_rate": 1.3954158964879853e-05, "loss": 1.7104, "step": 8180 }, { "epoch": 15.14, "grad_norm": 5.605249881744385, "learning_rate": 1.3946765249537894e-05, "loss": 1.887, "step": 8190 }, { "epoch": 15.16, "grad_norm": 5.693309307098389, "learning_rate": 1.3939371534195936e-05, "loss": 1.224, "step": 8200 }, { "epoch": 15.18, "grad_norm": 5.397551536560059, "learning_rate": 1.3931977818853974e-05, "loss": 1.5648, "step": 8210 }, { "epoch": 15.19, "grad_norm": 7.162982940673828, "learning_rate": 1.3924584103512015e-05, "loss": 1.5644, "step": 8220 }, { "epoch": 15.21, "grad_norm": 6.452093601226807, "learning_rate": 1.3917190388170057e-05, "loss": 1.5655, "step": 8230 }, { "epoch": 15.23, "grad_norm": 8.003186225891113, "learning_rate": 1.3909796672828098e-05, "loss": 1.2743, "step": 8240 }, { "epoch": 15.25, "grad_norm": 6.923473358154297, "learning_rate": 1.3902402957486138e-05, "loss": 1.4848, "step": 8250 }, { "epoch": 15.27, "grad_norm": 5.648501873016357, "learning_rate": 1.389500924214418e-05, "loss": 1.7805, "step": 8260 }, { "epoch": 15.29, "grad_norm": 5.158925533294678, "learning_rate": 1.3887615526802218e-05, "loss": 1.381, "step": 8270 }, { "epoch": 15.3, "grad_norm": 4.301165580749512, "learning_rate": 1.388022181146026e-05, "loss": 1.6663, "step": 8280 }, { "epoch": 15.32, "grad_norm": 3.019256114959717, "learning_rate": 1.38728280961183e-05, "loss": 1.519, "step": 8290 }, { "epoch": 15.34, "grad_norm": 6.327902317047119, "learning_rate": 1.3865434380776342e-05, "loss": 1.2965, "step": 8300 }, { "epoch": 15.36, "grad_norm": 6.554246425628662, "learning_rate": 1.3858040665434382e-05, "loss": 1.7524, "step": 8310 }, { "epoch": 15.38, "grad_norm": 7.9888811111450195, "learning_rate": 1.3850646950092423e-05, "loss": 1.3809, "step": 8320 }, { "epoch": 15.4, "grad_norm": 4.879361629486084, "learning_rate": 1.3843253234750462e-05, "loss": 1.2404, "step": 8330 }, { "epoch": 15.42, "grad_norm": 7.885076999664307, "learning_rate": 1.3835859519408503e-05, "loss": 1.4931, "step": 8340 }, { "epoch": 15.43, "grad_norm": 6.635437488555908, "learning_rate": 1.3828465804066544e-05, "loss": 1.521, "step": 8350 }, { "epoch": 15.45, "grad_norm": 3.67458438873291, "learning_rate": 1.3821072088724586e-05, "loss": 1.4295, "step": 8360 }, { "epoch": 15.47, "grad_norm": 9.082315444946289, "learning_rate": 1.3813678373382626e-05, "loss": 1.6446, "step": 8370 }, { "epoch": 15.49, "grad_norm": 7.202337265014648, "learning_rate": 1.3806284658040667e-05, "loss": 1.6391, "step": 8380 }, { "epoch": 15.51, "grad_norm": 8.268771171569824, "learning_rate": 1.3798890942698709e-05, "loss": 1.5757, "step": 8390 }, { "epoch": 15.53, "grad_norm": 6.023488998413086, "learning_rate": 1.3791497227356747e-05, "loss": 1.6294, "step": 8400 }, { "epoch": 15.55, "grad_norm": 6.259392261505127, "learning_rate": 1.3784103512014788e-05, "loss": 1.5776, "step": 8410 }, { "epoch": 15.56, "grad_norm": 6.201175212860107, "learning_rate": 1.377670979667283e-05, "loss": 1.3506, "step": 8420 }, { "epoch": 15.58, "grad_norm": 3.1131412982940674, "learning_rate": 1.376931608133087e-05, "loss": 1.6446, "step": 8430 }, { "epoch": 15.6, "grad_norm": 5.762711524963379, "learning_rate": 1.3761922365988911e-05, "loss": 1.3444, "step": 8440 }, { "epoch": 15.62, "grad_norm": 5.656573295593262, "learning_rate": 1.3754528650646952e-05, "loss": 1.916, "step": 8450 }, { "epoch": 15.64, "grad_norm": 6.152479648590088, "learning_rate": 1.374713493530499e-05, "loss": 1.1917, "step": 8460 }, { "epoch": 15.66, "grad_norm": 5.258658409118652, "learning_rate": 1.3739741219963032e-05, "loss": 1.6051, "step": 8470 }, { "epoch": 15.67, "grad_norm": 5.550984859466553, "learning_rate": 1.3732347504621074e-05, "loss": 1.5288, "step": 8480 }, { "epoch": 15.69, "grad_norm": 9.263429641723633, "learning_rate": 1.3724953789279113e-05, "loss": 1.6536, "step": 8490 }, { "epoch": 15.71, "grad_norm": 9.853463172912598, "learning_rate": 1.3717560073937155e-05, "loss": 1.475, "step": 8500 }, { "epoch": 15.73, "grad_norm": 12.064788818359375, "learning_rate": 1.3710166358595196e-05, "loss": 1.4831, "step": 8510 }, { "epoch": 15.75, "grad_norm": 8.902283668518066, "learning_rate": 1.3702772643253234e-05, "loss": 1.6931, "step": 8520 }, { "epoch": 15.77, "grad_norm": 5.320046424865723, "learning_rate": 1.3696118299445473e-05, "loss": 1.6629, "step": 8530 }, { "epoch": 15.79, "grad_norm": 6.144475936889648, "learning_rate": 1.3688724584103513e-05, "loss": 1.3841, "step": 8540 }, { "epoch": 15.8, "grad_norm": 6.10235595703125, "learning_rate": 1.3681330868761552e-05, "loss": 1.4384, "step": 8550 }, { "epoch": 15.82, "grad_norm": 4.8999481201171875, "learning_rate": 1.3673937153419594e-05, "loss": 1.8586, "step": 8560 }, { "epoch": 15.84, "grad_norm": 5.523200035095215, "learning_rate": 1.3666543438077635e-05, "loss": 1.3838, "step": 8570 }, { "epoch": 15.86, "grad_norm": 10.619403839111328, "learning_rate": 1.3659149722735677e-05, "loss": 1.625, "step": 8580 }, { "epoch": 15.88, "grad_norm": 8.076568603515625, "learning_rate": 1.3651756007393717e-05, "loss": 1.5098, "step": 8590 }, { "epoch": 15.9, "grad_norm": 5.3539605140686035, "learning_rate": 1.3644362292051758e-05, "loss": 1.2286, "step": 8600 }, { "epoch": 15.91, "grad_norm": 8.34234619140625, "learning_rate": 1.3636968576709796e-05, "loss": 1.6488, "step": 8610 }, { "epoch": 15.93, "grad_norm": 6.0026631355285645, "learning_rate": 1.3629574861367838e-05, "loss": 1.3101, "step": 8620 }, { "epoch": 15.95, "grad_norm": 6.406212329864502, "learning_rate": 1.362218114602588e-05, "loss": 1.4498, "step": 8630 }, { "epoch": 15.97, "grad_norm": 6.359390735626221, "learning_rate": 1.361478743068392e-05, "loss": 1.336, "step": 8640 }, { "epoch": 15.99, "grad_norm": 8.859796524047852, "learning_rate": 1.360739371534196e-05, "loss": 1.5721, "step": 8650 }, { "epoch": 16.0, "eval_accuracy": 0.7754677754677755, "eval_loss": 0.8586370944976807, "eval_runtime": 1.516, "eval_samples_per_second": 317.289, "eval_steps_per_second": 40.238, "step": 8656 }, { "epoch": 16.01, "grad_norm": 5.12769079208374, "learning_rate": 1.3600000000000002e-05, "loss": 1.6776, "step": 8660 }, { "epoch": 16.03, "grad_norm": 10.545654296875, "learning_rate": 1.3592606284658042e-05, "loss": 1.5368, "step": 8670 }, { "epoch": 16.04, "grad_norm": 5.564544200897217, "learning_rate": 1.3585212569316081e-05, "loss": 1.3882, "step": 8680 }, { "epoch": 16.06, "grad_norm": 3.665189266204834, "learning_rate": 1.3577818853974123e-05, "loss": 1.3827, "step": 8690 }, { "epoch": 16.08, "grad_norm": 6.4969635009765625, "learning_rate": 1.3570425138632164e-05, "loss": 1.6836, "step": 8700 }, { "epoch": 16.1, "grad_norm": 10.244078636169434, "learning_rate": 1.3563031423290204e-05, "loss": 1.6026, "step": 8710 }, { "epoch": 16.12, "grad_norm": 10.84144401550293, "learning_rate": 1.3555637707948246e-05, "loss": 1.4057, "step": 8720 }, { "epoch": 16.14, "grad_norm": 6.495248794555664, "learning_rate": 1.3548243992606285e-05, "loss": 1.3694, "step": 8730 }, { "epoch": 16.16, "grad_norm": 8.24008846282959, "learning_rate": 1.3540850277264325e-05, "loss": 1.4311, "step": 8740 }, { "epoch": 16.17, "grad_norm": 10.281193733215332, "learning_rate": 1.3533456561922367e-05, "loss": 1.4035, "step": 8750 }, { "epoch": 16.19, "grad_norm": 7.745493412017822, "learning_rate": 1.3526062846580408e-05, "loss": 1.4378, "step": 8760 }, { "epoch": 16.21, "grad_norm": 4.766057968139648, "learning_rate": 1.3518669131238448e-05, "loss": 1.2844, "step": 8770 }, { "epoch": 16.23, "grad_norm": 3.8194901943206787, "learning_rate": 1.351127541589649e-05, "loss": 1.5725, "step": 8780 }, { "epoch": 16.25, "grad_norm": 8.318746566772461, "learning_rate": 1.3503881700554531e-05, "loss": 1.5977, "step": 8790 }, { "epoch": 16.27, "grad_norm": 9.354193687438965, "learning_rate": 1.3496487985212569e-05, "loss": 1.688, "step": 8800 }, { "epoch": 16.28, "grad_norm": 6.076450347900391, "learning_rate": 1.348909426987061e-05, "loss": 1.3037, "step": 8810 }, { "epoch": 16.3, "grad_norm": 6.12235164642334, "learning_rate": 1.3481700554528652e-05, "loss": 1.5377, "step": 8820 }, { "epoch": 16.32, "grad_norm": 11.015954971313477, "learning_rate": 1.3474306839186694e-05, "loss": 2.0443, "step": 8830 }, { "epoch": 16.34, "grad_norm": 10.13499641418457, "learning_rate": 1.3466913123844733e-05, "loss": 1.4183, "step": 8840 }, { "epoch": 16.36, "grad_norm": 6.375120639801025, "learning_rate": 1.3459519408502775e-05, "loss": 1.3935, "step": 8850 }, { "epoch": 16.38, "grad_norm": 8.752510070800781, "learning_rate": 1.3452125693160813e-05, "loss": 1.2887, "step": 8860 }, { "epoch": 16.4, "grad_norm": 8.627189636230469, "learning_rate": 1.3444731977818854e-05, "loss": 1.3106, "step": 8870 }, { "epoch": 16.41, "grad_norm": 8.70999813079834, "learning_rate": 1.3437338262476896e-05, "loss": 1.4999, "step": 8880 }, { "epoch": 16.43, "grad_norm": 8.14850902557373, "learning_rate": 1.3429944547134937e-05, "loss": 1.5329, "step": 8890 }, { "epoch": 16.45, "grad_norm": 8.647690773010254, "learning_rate": 1.3422550831792977e-05, "loss": 1.1374, "step": 8900 }, { "epoch": 16.47, "grad_norm": 6.208197593688965, "learning_rate": 1.3415157116451019e-05, "loss": 1.6697, "step": 8910 }, { "epoch": 16.49, "grad_norm": 4.9114251136779785, "learning_rate": 1.3407763401109057e-05, "loss": 1.2153, "step": 8920 }, { "epoch": 16.51, "grad_norm": 9.172587394714355, "learning_rate": 1.3400369685767098e-05, "loss": 1.879, "step": 8930 }, { "epoch": 16.52, "grad_norm": 8.116153717041016, "learning_rate": 1.339297597042514e-05, "loss": 1.3952, "step": 8940 }, { "epoch": 16.54, "grad_norm": 8.664998054504395, "learning_rate": 1.3385582255083181e-05, "loss": 1.382, "step": 8950 }, { "epoch": 16.56, "grad_norm": 6.572618007659912, "learning_rate": 1.3378188539741221e-05, "loss": 1.4774, "step": 8960 }, { "epoch": 16.58, "grad_norm": 6.832912445068359, "learning_rate": 1.3370794824399262e-05, "loss": 1.3948, "step": 8970 }, { "epoch": 16.6, "grad_norm": 3.3924314975738525, "learning_rate": 1.3363401109057304e-05, "loss": 1.376, "step": 8980 }, { "epoch": 16.62, "grad_norm": 6.005743503570557, "learning_rate": 1.3356007393715342e-05, "loss": 1.6756, "step": 8990 }, { "epoch": 16.64, "grad_norm": 3.31657075881958, "learning_rate": 1.3348613678373383e-05, "loss": 1.516, "step": 9000 }, { "epoch": 16.65, "grad_norm": 9.30213737487793, "learning_rate": 1.3341219963031425e-05, "loss": 1.2478, "step": 9010 }, { "epoch": 16.67, "grad_norm": 8.852334976196289, "learning_rate": 1.3333826247689465e-05, "loss": 1.5828, "step": 9020 }, { "epoch": 16.69, "grad_norm": 3.939584493637085, "learning_rate": 1.3326432532347506e-05, "loss": 1.4563, "step": 9030 }, { "epoch": 16.71, "grad_norm": 7.800804615020752, "learning_rate": 1.3319038817005548e-05, "loss": 1.6485, "step": 9040 }, { "epoch": 16.73, "grad_norm": 5.7343854904174805, "learning_rate": 1.3311645101663586e-05, "loss": 1.2587, "step": 9050 }, { "epoch": 16.75, "grad_norm": 7.064696311950684, "learning_rate": 1.3304251386321627e-05, "loss": 1.5612, "step": 9060 }, { "epoch": 16.77, "grad_norm": 6.211050033569336, "learning_rate": 1.3296857670979669e-05, "loss": 1.7245, "step": 9070 }, { "epoch": 16.78, "grad_norm": 11.96396541595459, "learning_rate": 1.3289463955637708e-05, "loss": 1.4341, "step": 9080 }, { "epoch": 16.8, "grad_norm": 5.944769382476807, "learning_rate": 1.328207024029575e-05, "loss": 1.5934, "step": 9090 }, { "epoch": 16.82, "grad_norm": 4.271668910980225, "learning_rate": 1.3274676524953791e-05, "loss": 1.1793, "step": 9100 }, { "epoch": 16.84, "grad_norm": 6.431438446044922, "learning_rate": 1.326728280961183e-05, "loss": 1.6721, "step": 9110 }, { "epoch": 16.86, "grad_norm": 8.872383117675781, "learning_rate": 1.3259889094269871e-05, "loss": 1.451, "step": 9120 }, { "epoch": 16.88, "grad_norm": 4.067172527313232, "learning_rate": 1.3252495378927912e-05, "loss": 1.7775, "step": 9130 }, { "epoch": 16.89, "grad_norm": 2.9331016540527344, "learning_rate": 1.3245101663585952e-05, "loss": 1.52, "step": 9140 }, { "epoch": 16.91, "grad_norm": 6.984873294830322, "learning_rate": 1.3237707948243994e-05, "loss": 1.8387, "step": 9150 }, { "epoch": 16.93, "grad_norm": 7.6282806396484375, "learning_rate": 1.3230314232902035e-05, "loss": 1.4469, "step": 9160 }, { "epoch": 16.95, "grad_norm": 7.831760406494141, "learning_rate": 1.3222920517560077e-05, "loss": 1.6411, "step": 9170 }, { "epoch": 16.97, "grad_norm": 8.402664184570312, "learning_rate": 1.3215526802218115e-05, "loss": 1.183, "step": 9180 }, { "epoch": 16.99, "grad_norm": 7.562252521514893, "learning_rate": 1.3208133086876156e-05, "loss": 1.7482, "step": 9190 }, { "epoch": 17.0, "eval_accuracy": 0.7754677754677755, "eval_loss": 0.8279228806495667, "eval_runtime": 1.4925, "eval_samples_per_second": 322.283, "eval_steps_per_second": 40.872, "step": 9197 }, { "epoch": 17.01, "grad_norm": 3.149251699447632, "learning_rate": 1.3200739371534198e-05, "loss": 1.6168, "step": 9200 }, { "epoch": 17.02, "grad_norm": 7.7856903076171875, "learning_rate": 1.3193345656192237e-05, "loss": 1.3576, "step": 9210 }, { "epoch": 17.04, "grad_norm": 8.813015937805176, "learning_rate": 1.3185951940850279e-05, "loss": 1.8225, "step": 9220 }, { "epoch": 17.06, "grad_norm": 6.7569403648376465, "learning_rate": 1.317855822550832e-05, "loss": 1.2781, "step": 9230 }, { "epoch": 17.08, "grad_norm": 7.602080821990967, "learning_rate": 1.3171164510166359e-05, "loss": 1.3553, "step": 9240 }, { "epoch": 17.1, "grad_norm": 4.170512676239014, "learning_rate": 1.31637707948244e-05, "loss": 1.8786, "step": 9250 }, { "epoch": 17.12, "grad_norm": 9.786137580871582, "learning_rate": 1.3156377079482441e-05, "loss": 1.5521, "step": 9260 }, { "epoch": 17.13, "grad_norm": 9.290695190429688, "learning_rate": 1.3148983364140481e-05, "loss": 1.4705, "step": 9270 }, { "epoch": 17.15, "grad_norm": 3.0399742126464844, "learning_rate": 1.3141589648798523e-05, "loss": 1.1759, "step": 9280 }, { "epoch": 17.17, "grad_norm": 4.46043586730957, "learning_rate": 1.3134195933456564e-05, "loss": 1.5453, "step": 9290 }, { "epoch": 17.19, "grad_norm": 6.542462348937988, "learning_rate": 1.3126802218114602e-05, "loss": 1.389, "step": 9300 }, { "epoch": 17.21, "grad_norm": 13.405878067016602, "learning_rate": 1.3119408502772644e-05, "loss": 1.4248, "step": 9310 }, { "epoch": 17.23, "grad_norm": 2.8430697917938232, "learning_rate": 1.3112014787430685e-05, "loss": 1.3599, "step": 9320 }, { "epoch": 17.25, "grad_norm": 8.180906295776367, "learning_rate": 1.3104621072088725e-05, "loss": 1.5578, "step": 9330 }, { "epoch": 17.26, "grad_norm": 10.318791389465332, "learning_rate": 1.3097227356746767e-05, "loss": 1.6348, "step": 9340 }, { "epoch": 17.28, "grad_norm": 8.326462745666504, "learning_rate": 1.3089833641404808e-05, "loss": 1.5988, "step": 9350 }, { "epoch": 17.3, "grad_norm": 3.963932991027832, "learning_rate": 1.308243992606285e-05, "loss": 1.4168, "step": 9360 }, { "epoch": 17.32, "grad_norm": 3.2407381534576416, "learning_rate": 1.3075046210720888e-05, "loss": 1.2951, "step": 9370 }, { "epoch": 17.34, "grad_norm": 10.019411087036133, "learning_rate": 1.3067652495378929e-05, "loss": 1.6042, "step": 9380 }, { "epoch": 17.36, "grad_norm": 7.4781107902526855, "learning_rate": 1.3060258780036969e-05, "loss": 1.1096, "step": 9390 }, { "epoch": 17.38, "grad_norm": 4.153941631317139, "learning_rate": 1.305286506469501e-05, "loss": 1.6911, "step": 9400 }, { "epoch": 17.39, "grad_norm": 5.362704277038574, "learning_rate": 1.3045471349353052e-05, "loss": 1.0497, "step": 9410 }, { "epoch": 17.41, "grad_norm": 8.797815322875977, "learning_rate": 1.3038077634011093e-05, "loss": 1.6615, "step": 9420 }, { "epoch": 17.43, "grad_norm": 3.8004565238952637, "learning_rate": 1.3030683918669131e-05, "loss": 1.3274, "step": 9430 }, { "epoch": 17.45, "grad_norm": 3.993600606918335, "learning_rate": 1.3023290203327173e-05, "loss": 1.3547, "step": 9440 }, { "epoch": 17.47, "grad_norm": 7.846746444702148, "learning_rate": 1.3015896487985213e-05, "loss": 1.5397, "step": 9450 }, { "epoch": 17.49, "grad_norm": 7.167776584625244, "learning_rate": 1.3008502772643254e-05, "loss": 1.6744, "step": 9460 }, { "epoch": 17.5, "grad_norm": 8.235572814941406, "learning_rate": 1.3001109057301296e-05, "loss": 1.5762, "step": 9470 }, { "epoch": 17.52, "grad_norm": 7.707884311676025, "learning_rate": 1.2993715341959337e-05, "loss": 1.2509, "step": 9480 }, { "epoch": 17.54, "grad_norm": 5.796050548553467, "learning_rate": 1.2986321626617375e-05, "loss": 1.5313, "step": 9490 }, { "epoch": 17.56, "grad_norm": 7.025285720825195, "learning_rate": 1.2978927911275417e-05, "loss": 1.432, "step": 9500 }, { "epoch": 17.58, "grad_norm": 8.63866901397705, "learning_rate": 1.2971534195933458e-05, "loss": 1.7734, "step": 9510 }, { "epoch": 17.6, "grad_norm": 12.833883285522461, "learning_rate": 1.2964140480591498e-05, "loss": 1.626, "step": 9520 }, { "epoch": 17.62, "grad_norm": 3.8539888858795166, "learning_rate": 1.295674676524954e-05, "loss": 1.4128, "step": 9530 }, { "epoch": 17.63, "grad_norm": 4.270190238952637, "learning_rate": 1.2949353049907581e-05, "loss": 1.6275, "step": 9540 }, { "epoch": 17.65, "grad_norm": 6.014353275299072, "learning_rate": 1.294195933456562e-05, "loss": 1.5981, "step": 9550 }, { "epoch": 17.67, "grad_norm": 11.311755180358887, "learning_rate": 1.293456561922366e-05, "loss": 1.5159, "step": 9560 }, { "epoch": 17.69, "grad_norm": 8.78183650970459, "learning_rate": 1.2927171903881702e-05, "loss": 1.6537, "step": 9570 }, { "epoch": 17.71, "grad_norm": 9.42581558227539, "learning_rate": 1.2919778188539742e-05, "loss": 1.2195, "step": 9580 }, { "epoch": 17.73, "grad_norm": 5.744399070739746, "learning_rate": 1.2912384473197783e-05, "loss": 1.6063, "step": 9590 }, { "epoch": 17.74, "grad_norm": 8.238062858581543, "learning_rate": 1.2904990757855825e-05, "loss": 1.317, "step": 9600 }, { "epoch": 17.76, "grad_norm": 8.46562671661377, "learning_rate": 1.2897597042513864e-05, "loss": 1.5665, "step": 9610 }, { "epoch": 17.78, "grad_norm": 8.612417221069336, "learning_rate": 1.2890203327171904e-05, "loss": 1.6357, "step": 9620 }, { "epoch": 17.8, "grad_norm": 4.533140659332275, "learning_rate": 1.2882809611829946e-05, "loss": 1.4719, "step": 9630 }, { "epoch": 17.82, "grad_norm": 4.71198034286499, "learning_rate": 1.2875415896487985e-05, "loss": 1.2568, "step": 9640 }, { "epoch": 17.84, "grad_norm": 8.648571014404297, "learning_rate": 1.2868022181146027e-05, "loss": 1.2746, "step": 9650 }, { "epoch": 17.86, "grad_norm": 4.99550199508667, "learning_rate": 1.2860628465804068e-05, "loss": 1.2616, "step": 9660 }, { "epoch": 17.87, "grad_norm": 10.317298889160156, "learning_rate": 1.285323475046211e-05, "loss": 1.49, "step": 9670 }, { "epoch": 17.89, "grad_norm": 5.85258150100708, "learning_rate": 1.2845841035120148e-05, "loss": 1.4027, "step": 9680 }, { "epoch": 17.91, "grad_norm": 5.361637115478516, "learning_rate": 1.283844731977819e-05, "loss": 1.5443, "step": 9690 }, { "epoch": 17.93, "grad_norm": 7.594693183898926, "learning_rate": 1.283105360443623e-05, "loss": 1.5194, "step": 9700 }, { "epoch": 17.95, "grad_norm": 11.394500732421875, "learning_rate": 1.282365988909427e-05, "loss": 1.8359, "step": 9710 }, { "epoch": 17.97, "grad_norm": 8.06380844116211, "learning_rate": 1.2816266173752312e-05, "loss": 1.1302, "step": 9720 }, { "epoch": 17.99, "grad_norm": 10.186760902404785, "learning_rate": 1.2808872458410354e-05, "loss": 1.5992, "step": 9730 }, { "epoch": 18.0, "eval_accuracy": 0.7546777546777547, "eval_loss": 0.8320666551589966, "eval_runtime": 1.6097, "eval_samples_per_second": 298.816, "eval_steps_per_second": 37.896, "step": 9738 }, { "epoch": 18.0, "grad_norm": 6.030892848968506, "learning_rate": 1.2801478743068393e-05, "loss": 1.6808, "step": 9740 }, { "epoch": 18.02, "grad_norm": 5.221505641937256, "learning_rate": 1.2794085027726433e-05, "loss": 1.4358, "step": 9750 }, { "epoch": 18.04, "grad_norm": 6.155542850494385, "learning_rate": 1.2786691312384473e-05, "loss": 1.27, "step": 9760 }, { "epoch": 18.06, "grad_norm": 5.94996452331543, "learning_rate": 1.2779297597042515e-05, "loss": 1.3568, "step": 9770 }, { "epoch": 18.08, "grad_norm": 6.435792922973633, "learning_rate": 1.2771903881700556e-05, "loss": 1.6233, "step": 9780 }, { "epoch": 18.1, "grad_norm": 7.447518825531006, "learning_rate": 1.2764510166358598e-05, "loss": 1.3817, "step": 9790 }, { "epoch": 18.11, "grad_norm": 9.756739616394043, "learning_rate": 1.2757116451016637e-05, "loss": 1.6855, "step": 9800 }, { "epoch": 18.13, "grad_norm": 5.1456217765808105, "learning_rate": 1.2749722735674677e-05, "loss": 1.3089, "step": 9810 }, { "epoch": 18.15, "grad_norm": 6.5887451171875, "learning_rate": 1.2742329020332717e-05, "loss": 1.3499, "step": 9820 }, { "epoch": 18.17, "grad_norm": 7.386535167694092, "learning_rate": 1.2734935304990758e-05, "loss": 1.7566, "step": 9830 }, { "epoch": 18.19, "grad_norm": 3.991417407989502, "learning_rate": 1.27275415896488e-05, "loss": 1.5086, "step": 9840 }, { "epoch": 18.21, "grad_norm": 5.839792251586914, "learning_rate": 1.2720147874306841e-05, "loss": 1.3474, "step": 9850 }, { "epoch": 18.23, "grad_norm": 2.688180446624756, "learning_rate": 1.2712754158964881e-05, "loss": 1.2837, "step": 9860 }, { "epoch": 18.24, "grad_norm": 7.764678001403809, "learning_rate": 1.2705360443622921e-05, "loss": 1.7067, "step": 9870 }, { "epoch": 18.26, "grad_norm": 5.33721923828125, "learning_rate": 1.2697966728280962e-05, "loss": 1.1767, "step": 9880 }, { "epoch": 18.28, "grad_norm": 5.904231071472168, "learning_rate": 1.2690573012939002e-05, "loss": 1.4742, "step": 9890 }, { "epoch": 18.3, "grad_norm": 6.462328910827637, "learning_rate": 1.2683179297597044e-05, "loss": 1.6092, "step": 9900 }, { "epoch": 18.32, "grad_norm": 6.495627403259277, "learning_rate": 1.2675785582255085e-05, "loss": 1.2973, "step": 9910 }, { "epoch": 18.34, "grad_norm": 4.568264961242676, "learning_rate": 1.2668391866913125e-05, "loss": 1.3722, "step": 9920 }, { "epoch": 18.35, "grad_norm": 6.186374187469482, "learning_rate": 1.2660998151571166e-05, "loss": 1.1674, "step": 9930 }, { "epoch": 18.37, "grad_norm": 4.376631736755371, "learning_rate": 1.2653604436229206e-05, "loss": 1.4119, "step": 9940 }, { "epoch": 18.39, "grad_norm": 8.93445110321045, "learning_rate": 1.2646210720887246e-05, "loss": 1.4664, "step": 9950 }, { "epoch": 18.41, "grad_norm": 7.3121490478515625, "learning_rate": 1.2638817005545287e-05, "loss": 1.3694, "step": 9960 }, { "epoch": 18.43, "grad_norm": 4.904880046844482, "learning_rate": 1.2631423290203329e-05, "loss": 1.3683, "step": 9970 }, { "epoch": 18.45, "grad_norm": 6.554380416870117, "learning_rate": 1.2624029574861369e-05, "loss": 1.658, "step": 9980 }, { "epoch": 18.47, "grad_norm": 6.931053638458252, "learning_rate": 1.261663585951941e-05, "loss": 1.262, "step": 9990 }, { "epoch": 18.48, "grad_norm": 6.729274749755859, "learning_rate": 1.260924214417745e-05, "loss": 1.6231, "step": 10000 }, { "epoch": 18.5, "grad_norm": 11.126877784729004, "learning_rate": 1.260184842883549e-05, "loss": 1.7487, "step": 10010 }, { "epoch": 18.52, "grad_norm": 9.691969871520996, "learning_rate": 1.2594454713493531e-05, "loss": 1.6371, "step": 10020 }, { "epoch": 18.54, "grad_norm": 5.114473342895508, "learning_rate": 1.2587060998151573e-05, "loss": 1.267, "step": 10030 }, { "epoch": 18.56, "grad_norm": 7.113764762878418, "learning_rate": 1.2579667282809614e-05, "loss": 1.4248, "step": 10040 }, { "epoch": 18.58, "grad_norm": 6.77772331237793, "learning_rate": 1.2572273567467654e-05, "loss": 1.6878, "step": 10050 }, { "epoch": 18.6, "grad_norm": 5.896045684814453, "learning_rate": 1.2564879852125694e-05, "loss": 1.6482, "step": 10060 }, { "epoch": 18.61, "grad_norm": 4.896398067474365, "learning_rate": 1.2557486136783733e-05, "loss": 1.4376, "step": 10070 }, { "epoch": 18.63, "grad_norm": 4.7241644859313965, "learning_rate": 1.2550092421441775e-05, "loss": 1.3774, "step": 10080 }, { "epoch": 18.65, "grad_norm": 10.624163627624512, "learning_rate": 1.2542698706099816e-05, "loss": 1.7449, "step": 10090 }, { "epoch": 18.67, "grad_norm": 4.23441219329834, "learning_rate": 1.2535304990757858e-05, "loss": 1.5109, "step": 10100 }, { "epoch": 18.69, "grad_norm": 4.992342948913574, "learning_rate": 1.2527911275415898e-05, "loss": 1.2969, "step": 10110 }, { "epoch": 18.71, "grad_norm": 7.287111282348633, "learning_rate": 1.252051756007394e-05, "loss": 1.3997, "step": 10120 }, { "epoch": 18.72, "grad_norm": 3.4431307315826416, "learning_rate": 1.2513123844731977e-05, "loss": 1.0759, "step": 10130 }, { "epoch": 18.74, "grad_norm": 7.204595565795898, "learning_rate": 1.2505730129390019e-05, "loss": 1.3099, "step": 10140 }, { "epoch": 18.76, "grad_norm": 12.001493453979492, "learning_rate": 1.249833641404806e-05, "loss": 1.7803, "step": 10150 }, { "epoch": 18.78, "grad_norm": 6.2655253410339355, "learning_rate": 1.2490942698706102e-05, "loss": 1.4059, "step": 10160 }, { "epoch": 18.8, "grad_norm": 12.253315925598145, "learning_rate": 1.2483548983364141e-05, "loss": 1.6807, "step": 10170 }, { "epoch": 18.82, "grad_norm": 6.592883110046387, "learning_rate": 1.2476155268022183e-05, "loss": 1.6814, "step": 10180 }, { "epoch": 18.84, "grad_norm": 4.955015182495117, "learning_rate": 1.2468761552680223e-05, "loss": 1.4147, "step": 10190 }, { "epoch": 18.85, "grad_norm": 4.897569179534912, "learning_rate": 1.2461367837338263e-05, "loss": 1.3169, "step": 10200 }, { "epoch": 18.87, "grad_norm": 14.085545539855957, "learning_rate": 1.2453974121996304e-05, "loss": 1.4625, "step": 10210 }, { "epoch": 18.89, "grad_norm": 7.442602634429932, "learning_rate": 1.2446580406654345e-05, "loss": 1.6029, "step": 10220 }, { "epoch": 18.91, "grad_norm": 3.8085179328918457, "learning_rate": 1.2439186691312385e-05, "loss": 1.3249, "step": 10230 }, { "epoch": 18.93, "grad_norm": 10.436812400817871, "learning_rate": 1.2431792975970427e-05, "loss": 1.4368, "step": 10240 }, { "epoch": 18.95, "grad_norm": 6.233061790466309, "learning_rate": 1.2424399260628467e-05, "loss": 1.3933, "step": 10250 }, { "epoch": 18.96, "grad_norm": 10.712681770324707, "learning_rate": 1.2417005545286506e-05, "loss": 1.2646, "step": 10260 }, { "epoch": 18.98, "grad_norm": 5.828792095184326, "learning_rate": 1.2409611829944548e-05, "loss": 1.8179, "step": 10270 }, { "epoch": 19.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.7898217439651489, "eval_runtime": 1.6243, "eval_samples_per_second": 296.124, "eval_steps_per_second": 37.554, "step": 10279 }, { "epoch": 19.0, "grad_norm": 7.006405353546143, "learning_rate": 1.240221811460259e-05, "loss": 1.452, "step": 10280 }, { "epoch": 19.02, "grad_norm": 9.134038925170898, "learning_rate": 1.2394824399260629e-05, "loss": 1.3542, "step": 10290 }, { "epoch": 19.04, "grad_norm": 7.86276388168335, "learning_rate": 1.238743068391867e-05, "loss": 1.3791, "step": 10300 }, { "epoch": 19.06, "grad_norm": 5.034814834594727, "learning_rate": 1.2380036968576712e-05, "loss": 1.3692, "step": 10310 }, { "epoch": 19.08, "grad_norm": 10.681830406188965, "learning_rate": 1.237264325323475e-05, "loss": 1.6174, "step": 10320 }, { "epoch": 19.09, "grad_norm": 3.9336700439453125, "learning_rate": 1.2365249537892792e-05, "loss": 1.6806, "step": 10330 }, { "epoch": 19.11, "grad_norm": 11.411152839660645, "learning_rate": 1.2357855822550833e-05, "loss": 1.3346, "step": 10340 }, { "epoch": 19.13, "grad_norm": 11.952852249145508, "learning_rate": 1.2350462107208875e-05, "loss": 1.3694, "step": 10350 }, { "epoch": 19.15, "grad_norm": 6.249612331390381, "learning_rate": 1.2343068391866914e-05, "loss": 1.5051, "step": 10360 }, { "epoch": 19.17, "grad_norm": 9.855717658996582, "learning_rate": 1.2335674676524956e-05, "loss": 1.5284, "step": 10370 }, { "epoch": 19.19, "grad_norm": 6.176255226135254, "learning_rate": 1.2328280961182994e-05, "loss": 1.4085, "step": 10380 }, { "epoch": 19.21, "grad_norm": 3.8083300590515137, "learning_rate": 1.2320887245841035e-05, "loss": 1.429, "step": 10390 }, { "epoch": 19.22, "grad_norm": 13.354677200317383, "learning_rate": 1.2313493530499077e-05, "loss": 1.3068, "step": 10400 }, { "epoch": 19.24, "grad_norm": 7.089704990386963, "learning_rate": 1.2306099815157118e-05, "loss": 1.3767, "step": 10410 }, { "epoch": 19.26, "grad_norm": 8.874341011047363, "learning_rate": 1.2298706099815158e-05, "loss": 1.5543, "step": 10420 }, { "epoch": 19.28, "grad_norm": 10.434432983398438, "learning_rate": 1.22913123844732e-05, "loss": 1.329, "step": 10430 }, { "epoch": 19.3, "grad_norm": 6.649686336517334, "learning_rate": 1.2283918669131238e-05, "loss": 1.2412, "step": 10440 }, { "epoch": 19.32, "grad_norm": 4.1714301109313965, "learning_rate": 1.227652495378928e-05, "loss": 1.3319, "step": 10450 }, { "epoch": 19.33, "grad_norm": 4.813749313354492, "learning_rate": 1.226913123844732e-05, "loss": 1.4715, "step": 10460 }, { "epoch": 19.35, "grad_norm": 7.098138332366943, "learning_rate": 1.2261737523105362e-05, "loss": 1.5407, "step": 10470 }, { "epoch": 19.37, "grad_norm": 2.7659730911254883, "learning_rate": 1.2254343807763402e-05, "loss": 0.9396, "step": 10480 }, { "epoch": 19.39, "grad_norm": 6.7903032302856445, "learning_rate": 1.2246950092421443e-05, "loss": 1.5616, "step": 10490 }, { "epoch": 19.41, "grad_norm": 3.182398557662964, "learning_rate": 1.2239556377079485e-05, "loss": 1.4365, "step": 10500 }, { "epoch": 19.43, "grad_norm": 7.8735527992248535, "learning_rate": 1.2232162661737523e-05, "loss": 1.338, "step": 10510 }, { "epoch": 19.45, "grad_norm": 8.822565078735352, "learning_rate": 1.2224768946395564e-05, "loss": 1.3726, "step": 10520 }, { "epoch": 19.46, "grad_norm": 6.738836765289307, "learning_rate": 1.2217375231053606e-05, "loss": 1.4868, "step": 10530 }, { "epoch": 19.48, "grad_norm": 7.850865364074707, "learning_rate": 1.2209981515711646e-05, "loss": 1.5718, "step": 10540 }, { "epoch": 19.5, "grad_norm": 7.542879581451416, "learning_rate": 1.2202587800369687e-05, "loss": 1.5315, "step": 10550 }, { "epoch": 19.52, "grad_norm": 8.251657485961914, "learning_rate": 1.2195194085027729e-05, "loss": 1.5267, "step": 10560 }, { "epoch": 19.54, "grad_norm": 8.844817161560059, "learning_rate": 1.2187800369685767e-05, "loss": 1.3571, "step": 10570 }, { "epoch": 19.56, "grad_norm": 3.1401000022888184, "learning_rate": 1.2180406654343808e-05, "loss": 1.5162, "step": 10580 }, { "epoch": 19.57, "grad_norm": 7.41256856918335, "learning_rate": 1.217301293900185e-05, "loss": 1.1233, "step": 10590 }, { "epoch": 19.59, "grad_norm": 6.249854564666748, "learning_rate": 1.216561922365989e-05, "loss": 1.3556, "step": 10600 }, { "epoch": 19.61, "grad_norm": 6.959690093994141, "learning_rate": 1.2158225508317931e-05, "loss": 1.3167, "step": 10610 }, { "epoch": 19.63, "grad_norm": 7.864748954772949, "learning_rate": 1.2150831792975972e-05, "loss": 1.2773, "step": 10620 }, { "epoch": 19.65, "grad_norm": 8.013951301574707, "learning_rate": 1.214343807763401e-05, "loss": 1.1527, "step": 10630 }, { "epoch": 19.67, "grad_norm": 2.274090051651001, "learning_rate": 1.2136044362292052e-05, "loss": 1.4895, "step": 10640 }, { "epoch": 19.69, "grad_norm": 7.520959854125977, "learning_rate": 1.2128650646950093e-05, "loss": 1.5651, "step": 10650 }, { "epoch": 19.7, "grad_norm": 11.09343433380127, "learning_rate": 1.2121256931608133e-05, "loss": 1.4007, "step": 10660 }, { "epoch": 19.72, "grad_norm": 6.151890277862549, "learning_rate": 1.2113863216266175e-05, "loss": 1.724, "step": 10670 }, { "epoch": 19.74, "grad_norm": 7.829218864440918, "learning_rate": 1.2106469500924216e-05, "loss": 1.5576, "step": 10680 }, { "epoch": 19.76, "grad_norm": 9.747720718383789, "learning_rate": 1.2099075785582258e-05, "loss": 1.2377, "step": 10690 }, { "epoch": 19.78, "grad_norm": 5.929236888885498, "learning_rate": 1.2091682070240296e-05, "loss": 1.1713, "step": 10700 }, { "epoch": 19.8, "grad_norm": 9.261822700500488, "learning_rate": 1.2084288354898337e-05, "loss": 1.3808, "step": 10710 }, { "epoch": 19.82, "grad_norm": 4.972837448120117, "learning_rate": 1.2076894639556379e-05, "loss": 1.4994, "step": 10720 }, { "epoch": 19.83, "grad_norm": 9.350780487060547, "learning_rate": 1.2069500924214419e-05, "loss": 1.1865, "step": 10730 }, { "epoch": 19.85, "grad_norm": 8.722793579101562, "learning_rate": 1.206210720887246e-05, "loss": 1.5371, "step": 10740 }, { "epoch": 19.87, "grad_norm": 5.95665168762207, "learning_rate": 1.2054713493530501e-05, "loss": 1.7162, "step": 10750 }, { "epoch": 19.89, "grad_norm": 5.458383083343506, "learning_rate": 1.204731977818854e-05, "loss": 1.2392, "step": 10760 }, { "epoch": 19.91, "grad_norm": 5.060382843017578, "learning_rate": 1.2039926062846581e-05, "loss": 1.4746, "step": 10770 }, { "epoch": 19.93, "grad_norm": 4.198613166809082, "learning_rate": 1.2032532347504623e-05, "loss": 1.4029, "step": 10780 }, { "epoch": 19.94, "grad_norm": 6.239277362823486, "learning_rate": 1.2025138632162662e-05, "loss": 1.2455, "step": 10790 }, { "epoch": 19.96, "grad_norm": 4.410442352294922, "learning_rate": 1.2017744916820704e-05, "loss": 1.8577, "step": 10800 }, { "epoch": 19.98, "grad_norm": 8.323649406433105, "learning_rate": 1.2010351201478745e-05, "loss": 1.055, "step": 10810 }, { "epoch": 20.0, "grad_norm": 8.968486785888672, "learning_rate": 1.2002957486136783e-05, "loss": 1.2744, "step": 10820 }, { "epoch": 20.0, "eval_accuracy": 0.7671517671517671, "eval_loss": 0.7984076142311096, "eval_runtime": 1.5371, "eval_samples_per_second": 312.933, "eval_steps_per_second": 39.686, "step": 10820 }, { "epoch": 20.02, "grad_norm": 7.444639205932617, "learning_rate": 1.1995563770794825e-05, "loss": 1.3443, "step": 10830 }, { "epoch": 20.04, "grad_norm": 8.463935852050781, "learning_rate": 1.1988170055452866e-05, "loss": 1.5753, "step": 10840 }, { "epoch": 20.06, "grad_norm": 6.217259407043457, "learning_rate": 1.1980776340110906e-05, "loss": 1.252, "step": 10850 }, { "epoch": 20.07, "grad_norm": 5.693349361419678, "learning_rate": 1.1973382624768948e-05, "loss": 1.3776, "step": 10860 }, { "epoch": 20.09, "grad_norm": 5.214580535888672, "learning_rate": 1.1965988909426989e-05, "loss": 1.4569, "step": 10870 }, { "epoch": 20.11, "grad_norm": 6.8569416999816895, "learning_rate": 1.195859519408503e-05, "loss": 1.3386, "step": 10880 }, { "epoch": 20.13, "grad_norm": 8.65273380279541, "learning_rate": 1.1951201478743069e-05, "loss": 1.5625, "step": 10890 }, { "epoch": 20.15, "grad_norm": 5.471249580383301, "learning_rate": 1.194380776340111e-05, "loss": 1.6992, "step": 10900 }, { "epoch": 20.17, "grad_norm": 7.4084248542785645, "learning_rate": 1.193641404805915e-05, "loss": 1.4351, "step": 10910 }, { "epoch": 20.18, "grad_norm": 12.477338790893555, "learning_rate": 1.1929020332717191e-05, "loss": 1.3198, "step": 10920 }, { "epoch": 20.2, "grad_norm": 5.534790515899658, "learning_rate": 1.1921626617375233e-05, "loss": 1.3664, "step": 10930 }, { "epoch": 20.22, "grad_norm": 4.709255695343018, "learning_rate": 1.1914232902033274e-05, "loss": 1.5079, "step": 10940 }, { "epoch": 20.24, "grad_norm": 5.911303520202637, "learning_rate": 1.1906839186691312e-05, "loss": 1.364, "step": 10950 }, { "epoch": 20.26, "grad_norm": 8.19422435760498, "learning_rate": 1.1899445471349354e-05, "loss": 1.3433, "step": 10960 }, { "epoch": 20.28, "grad_norm": 4.987353801727295, "learning_rate": 1.1892051756007394e-05, "loss": 1.3886, "step": 10970 }, { "epoch": 20.3, "grad_norm": 8.56406307220459, "learning_rate": 1.1884658040665435e-05, "loss": 1.0547, "step": 10980 }, { "epoch": 20.31, "grad_norm": 7.828549861907959, "learning_rate": 1.1877264325323477e-05, "loss": 1.446, "step": 10990 }, { "epoch": 20.33, "grad_norm": 8.155370712280273, "learning_rate": 1.1869870609981518e-05, "loss": 1.3824, "step": 11000 }, { "epoch": 20.35, "grad_norm": 5.673706531524658, "learning_rate": 1.1862476894639556e-05, "loss": 1.4146, "step": 11010 }, { "epoch": 20.37, "grad_norm": 8.849701881408691, "learning_rate": 1.1855083179297598e-05, "loss": 1.4007, "step": 11020 }, { "epoch": 20.39, "grad_norm": 4.663218975067139, "learning_rate": 1.184768946395564e-05, "loss": 0.9633, "step": 11030 }, { "epoch": 20.41, "grad_norm": 5.9611616134643555, "learning_rate": 1.1840295748613679e-05, "loss": 1.4722, "step": 11040 }, { "epoch": 20.43, "grad_norm": 3.2603774070739746, "learning_rate": 1.183290203327172e-05, "loss": 1.3949, "step": 11050 }, { "epoch": 20.44, "grad_norm": 9.5177583694458, "learning_rate": 1.1825508317929762e-05, "loss": 1.2295, "step": 11060 }, { "epoch": 20.46, "grad_norm": 12.529622077941895, "learning_rate": 1.1818114602587802e-05, "loss": 1.5634, "step": 11070 }, { "epoch": 20.48, "grad_norm": 4.962494850158691, "learning_rate": 1.1810720887245841e-05, "loss": 1.2392, "step": 11080 }, { "epoch": 20.5, "grad_norm": 7.194056034088135, "learning_rate": 1.1803327171903883e-05, "loss": 1.467, "step": 11090 }, { "epoch": 20.52, "grad_norm": 5.552194118499756, "learning_rate": 1.1795933456561923e-05, "loss": 1.1735, "step": 11100 }, { "epoch": 20.54, "grad_norm": 11.066177368164062, "learning_rate": 1.1788539741219964e-05, "loss": 1.1682, "step": 11110 }, { "epoch": 20.55, "grad_norm": 7.39406681060791, "learning_rate": 1.1781146025878006e-05, "loss": 1.4874, "step": 11120 }, { "epoch": 20.57, "grad_norm": 10.703259468078613, "learning_rate": 1.1773752310536045e-05, "loss": 1.5149, "step": 11130 }, { "epoch": 20.59, "grad_norm": 6.987980365753174, "learning_rate": 1.1766358595194085e-05, "loss": 1.2936, "step": 11140 }, { "epoch": 20.61, "grad_norm": 9.74182415008545, "learning_rate": 1.1758964879852127e-05, "loss": 1.6116, "step": 11150 }, { "epoch": 20.63, "grad_norm": 3.1223936080932617, "learning_rate": 1.1751571164510167e-05, "loss": 1.6728, "step": 11160 }, { "epoch": 20.65, "grad_norm": 3.2789130210876465, "learning_rate": 1.1744177449168208e-05, "loss": 1.229, "step": 11170 }, { "epoch": 20.67, "grad_norm": 5.551230430603027, "learning_rate": 1.173678373382625e-05, "loss": 1.438, "step": 11180 }, { "epoch": 20.68, "grad_norm": 8.53386402130127, "learning_rate": 1.1729390018484291e-05, "loss": 1.525, "step": 11190 }, { "epoch": 20.7, "grad_norm": 9.85353946685791, "learning_rate": 1.1721996303142329e-05, "loss": 1.6619, "step": 11200 }, { "epoch": 20.72, "grad_norm": 6.715352535247803, "learning_rate": 1.171460258780037e-05, "loss": 1.5195, "step": 11210 }, { "epoch": 20.74, "grad_norm": 7.618119716644287, "learning_rate": 1.170720887245841e-05, "loss": 1.3879, "step": 11220 }, { "epoch": 20.76, "grad_norm": 5.73505163192749, "learning_rate": 1.1699815157116452e-05, "loss": 1.2323, "step": 11230 }, { "epoch": 20.78, "grad_norm": 5.210812091827393, "learning_rate": 1.1692421441774493e-05, "loss": 1.2055, "step": 11240 }, { "epoch": 20.79, "grad_norm": 10.23471450805664, "learning_rate": 1.1685027726432535e-05, "loss": 1.3801, "step": 11250 }, { "epoch": 20.81, "grad_norm": 9.552277565002441, "learning_rate": 1.1677634011090575e-05, "loss": 1.5845, "step": 11260 }, { "epoch": 20.83, "grad_norm": 4.311861991882324, "learning_rate": 1.1670240295748614e-05, "loss": 1.2026, "step": 11270 }, { "epoch": 20.85, "grad_norm": 8.047245025634766, "learning_rate": 1.1662846580406654e-05, "loss": 1.4383, "step": 11280 }, { "epoch": 20.87, "grad_norm": 12.119587898254395, "learning_rate": 1.1655452865064696e-05, "loss": 1.5055, "step": 11290 }, { "epoch": 20.89, "grad_norm": 7.067628860473633, "learning_rate": 1.1648059149722737e-05, "loss": 1.238, "step": 11300 }, { "epoch": 20.91, "grad_norm": 5.558277606964111, "learning_rate": 1.1640665434380779e-05, "loss": 1.4432, "step": 11310 }, { "epoch": 20.92, "grad_norm": 6.233585834503174, "learning_rate": 1.1633271719038818e-05, "loss": 1.4838, "step": 11320 }, { "epoch": 20.94, "grad_norm": 8.576188087463379, "learning_rate": 1.1625878003696858e-05, "loss": 1.6853, "step": 11330 }, { "epoch": 20.96, "grad_norm": 6.557576656341553, "learning_rate": 1.1618484288354898e-05, "loss": 1.481, "step": 11340 }, { "epoch": 20.98, "grad_norm": 9.163666725158691, "learning_rate": 1.161109057301294e-05, "loss": 1.8968, "step": 11350 }, { "epoch": 21.0, "grad_norm": 6.775120258331299, "learning_rate": 1.1603696857670981e-05, "loss": 1.2221, "step": 11360 }, { "epoch": 21.0, "eval_accuracy": 0.7733887733887734, "eval_loss": 0.7756778001785278, "eval_runtime": 1.6925, "eval_samples_per_second": 284.2, "eval_steps_per_second": 36.042, "step": 11361 }, { "epoch": 21.02, "grad_norm": 9.552371978759766, "learning_rate": 1.1596303142329022e-05, "loss": 1.2142, "step": 11370 }, { "epoch": 21.04, "grad_norm": 6.913117408752441, "learning_rate": 1.1588909426987062e-05, "loss": 1.2891, "step": 11380 }, { "epoch": 21.05, "grad_norm": 7.510538578033447, "learning_rate": 1.1581515711645102e-05, "loss": 1.3165, "step": 11390 }, { "epoch": 21.07, "grad_norm": 6.96502161026001, "learning_rate": 1.1574121996303143e-05, "loss": 1.3794, "step": 11400 }, { "epoch": 21.09, "grad_norm": 8.72270393371582, "learning_rate": 1.1566728280961183e-05, "loss": 1.3884, "step": 11410 }, { "epoch": 21.11, "grad_norm": 2.992285966873169, "learning_rate": 1.1559334565619225e-05, "loss": 1.6561, "step": 11420 }, { "epoch": 21.13, "grad_norm": 7.449506759643555, "learning_rate": 1.1551940850277266e-05, "loss": 1.2738, "step": 11430 }, { "epoch": 21.15, "grad_norm": 5.459431171417236, "learning_rate": 1.1544547134935306e-05, "loss": 1.3718, "step": 11440 }, { "epoch": 21.16, "grad_norm": 4.572870254516602, "learning_rate": 1.1537153419593347e-05, "loss": 1.1349, "step": 11450 }, { "epoch": 21.18, "grad_norm": 5.0146002769470215, "learning_rate": 1.1529759704251387e-05, "loss": 1.4084, "step": 11460 }, { "epoch": 21.2, "grad_norm": 4.384791374206543, "learning_rate": 1.1522365988909427e-05, "loss": 1.7393, "step": 11470 }, { "epoch": 21.22, "grad_norm": 9.380276679992676, "learning_rate": 1.1514972273567468e-05, "loss": 1.268, "step": 11480 }, { "epoch": 21.24, "grad_norm": 9.275566101074219, "learning_rate": 1.150757855822551e-05, "loss": 1.4464, "step": 11490 }, { "epoch": 21.26, "grad_norm": 4.497946262359619, "learning_rate": 1.150018484288355e-05, "loss": 1.4365, "step": 11500 }, { "epoch": 21.28, "grad_norm": 4.356363773345947, "learning_rate": 1.1492791127541591e-05, "loss": 1.5069, "step": 11510 }, { "epoch": 21.29, "grad_norm": 5.398430824279785, "learning_rate": 1.1485397412199631e-05, "loss": 1.3863, "step": 11520 }, { "epoch": 21.31, "grad_norm": 9.603318214416504, "learning_rate": 1.147800369685767e-05, "loss": 1.103, "step": 11530 }, { "epoch": 21.33, "grad_norm": 10.079381942749023, "learning_rate": 1.1470609981515712e-05, "loss": 1.3287, "step": 11540 }, { "epoch": 21.35, "grad_norm": 3.8888871669769287, "learning_rate": 1.1463216266173754e-05, "loss": 1.3512, "step": 11550 }, { "epoch": 21.37, "grad_norm": 8.649396896362305, "learning_rate": 1.1455822550831795e-05, "loss": 1.5965, "step": 11560 }, { "epoch": 21.39, "grad_norm": 8.849939346313477, "learning_rate": 1.1448428835489835e-05, "loss": 1.3704, "step": 11570 }, { "epoch": 21.4, "grad_norm": 6.654789924621582, "learning_rate": 1.1441035120147875e-05, "loss": 1.1488, "step": 11580 }, { "epoch": 21.42, "grad_norm": 5.095791816711426, "learning_rate": 1.1433641404805915e-05, "loss": 1.2938, "step": 11590 }, { "epoch": 21.44, "grad_norm": 7.757185935974121, "learning_rate": 1.1426247689463956e-05, "loss": 1.5896, "step": 11600 }, { "epoch": 21.46, "grad_norm": 8.986946105957031, "learning_rate": 1.1418853974121997e-05, "loss": 1.7398, "step": 11610 }, { "epoch": 21.48, "grad_norm": 6.265936851501465, "learning_rate": 1.1411460258780039e-05, "loss": 1.665, "step": 11620 }, { "epoch": 21.5, "grad_norm": 8.76430606842041, "learning_rate": 1.1404066543438079e-05, "loss": 1.4463, "step": 11630 }, { "epoch": 21.52, "grad_norm": 9.338278770446777, "learning_rate": 1.139667282809612e-05, "loss": 1.5227, "step": 11640 }, { "epoch": 21.53, "grad_norm": 8.59951114654541, "learning_rate": 1.1389279112754158e-05, "loss": 1.439, "step": 11650 }, { "epoch": 21.55, "grad_norm": 7.271739959716797, "learning_rate": 1.13818853974122e-05, "loss": 1.4116, "step": 11660 }, { "epoch": 21.57, "grad_norm": 4.01560115814209, "learning_rate": 1.1374491682070241e-05, "loss": 1.2463, "step": 11670 }, { "epoch": 21.59, "grad_norm": 4.797820568084717, "learning_rate": 1.1367097966728283e-05, "loss": 1.6659, "step": 11680 }, { "epoch": 21.61, "grad_norm": 13.547115325927734, "learning_rate": 1.1359704251386323e-05, "loss": 1.3175, "step": 11690 }, { "epoch": 21.63, "grad_norm": 9.515949249267578, "learning_rate": 1.1352310536044364e-05, "loss": 1.4198, "step": 11700 }, { "epoch": 21.65, "grad_norm": 5.9182963371276855, "learning_rate": 1.1344916820702404e-05, "loss": 1.2058, "step": 11710 }, { "epoch": 21.66, "grad_norm": 2.581352472305298, "learning_rate": 1.1337523105360444e-05, "loss": 1.1383, "step": 11720 }, { "epoch": 21.68, "grad_norm": 8.78309440612793, "learning_rate": 1.1330129390018485e-05, "loss": 1.3943, "step": 11730 }, { "epoch": 21.7, "grad_norm": 5.156717777252197, "learning_rate": 1.1322735674676527e-05, "loss": 1.0223, "step": 11740 }, { "epoch": 21.72, "grad_norm": 4.120117664337158, "learning_rate": 1.1315341959334566e-05, "loss": 1.1894, "step": 11750 }, { "epoch": 21.74, "grad_norm": 9.779801368713379, "learning_rate": 1.1307948243992608e-05, "loss": 1.2488, "step": 11760 }, { "epoch": 21.76, "grad_norm": 8.527020454406738, "learning_rate": 1.1300554528650648e-05, "loss": 1.4801, "step": 11770 }, { "epoch": 21.77, "grad_norm": 6.712395191192627, "learning_rate": 1.1293160813308687e-05, "loss": 1.2826, "step": 11780 }, { "epoch": 21.79, "grad_norm": 4.997123718261719, "learning_rate": 1.1285767097966729e-05, "loss": 1.2712, "step": 11790 }, { "epoch": 21.81, "grad_norm": 7.852758407592773, "learning_rate": 1.127837338262477e-05, "loss": 1.7154, "step": 11800 }, { "epoch": 21.83, "grad_norm": 8.347437858581543, "learning_rate": 1.127097966728281e-05, "loss": 1.514, "step": 11810 }, { "epoch": 21.85, "grad_norm": 8.173813819885254, "learning_rate": 1.1263585951940852e-05, "loss": 1.1839, "step": 11820 }, { "epoch": 21.87, "grad_norm": 9.060689926147461, "learning_rate": 1.1256192236598893e-05, "loss": 1.4605, "step": 11830 }, { "epoch": 21.89, "grad_norm": 6.16258430480957, "learning_rate": 1.1248798521256931e-05, "loss": 1.324, "step": 11840 }, { "epoch": 21.9, "grad_norm": 6.443804740905762, "learning_rate": 1.1241404805914973e-05, "loss": 1.204, "step": 11850 }, { "epoch": 21.92, "grad_norm": 5.231833457946777, "learning_rate": 1.1234011090573014e-05, "loss": 1.7226, "step": 11860 }, { "epoch": 21.94, "grad_norm": 7.275303363800049, "learning_rate": 1.1226617375231056e-05, "loss": 1.5458, "step": 11870 }, { "epoch": 21.96, "grad_norm": 7.830960750579834, "learning_rate": 1.1219223659889095e-05, "loss": 1.035, "step": 11880 }, { "epoch": 21.98, "grad_norm": 7.463703155517578, "learning_rate": 1.1211829944547137e-05, "loss": 0.9821, "step": 11890 }, { "epoch": 22.0, "grad_norm": 8.260432243347168, "learning_rate": 1.1204436229205175e-05, "loss": 1.4893, "step": 11900 }, { "epoch": 22.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.7512285709381104, "eval_runtime": 1.5131, "eval_samples_per_second": 317.9, "eval_steps_per_second": 40.316, "step": 11902 }, { "epoch": 22.01, "grad_norm": 8.712335586547852, "learning_rate": 1.1197042513863216e-05, "loss": 1.1745, "step": 11910 }, { "epoch": 22.03, "grad_norm": 7.98191499710083, "learning_rate": 1.1189648798521258e-05, "loss": 1.3002, "step": 11920 }, { "epoch": 22.05, "grad_norm": 5.209779739379883, "learning_rate": 1.11822550831793e-05, "loss": 1.4761, "step": 11930 }, { "epoch": 22.07, "grad_norm": 9.63319206237793, "learning_rate": 1.117486136783734e-05, "loss": 1.564, "step": 11940 }, { "epoch": 22.09, "grad_norm": 11.607734680175781, "learning_rate": 1.116746765249538e-05, "loss": 1.4209, "step": 11950 }, { "epoch": 22.11, "grad_norm": 7.571458339691162, "learning_rate": 1.1160073937153419e-05, "loss": 1.2007, "step": 11960 }, { "epoch": 22.13, "grad_norm": 7.840938091278076, "learning_rate": 1.115268022181146e-05, "loss": 1.1568, "step": 11970 }, { "epoch": 22.14, "grad_norm": 8.122832298278809, "learning_rate": 1.1145286506469502e-05, "loss": 1.6201, "step": 11980 }, { "epoch": 22.16, "grad_norm": 11.827096939086914, "learning_rate": 1.1137892791127543e-05, "loss": 1.3367, "step": 11990 }, { "epoch": 22.18, "grad_norm": 6.626908779144287, "learning_rate": 1.1130499075785583e-05, "loss": 1.6053, "step": 12000 }, { "epoch": 22.2, "grad_norm": 7.2154364585876465, "learning_rate": 1.1123105360443624e-05, "loss": 1.5185, "step": 12010 }, { "epoch": 22.22, "grad_norm": 4.73091983795166, "learning_rate": 1.1115711645101666e-05, "loss": 1.3093, "step": 12020 }, { "epoch": 22.24, "grad_norm": 6.9771223068237305, "learning_rate": 1.1108317929759704e-05, "loss": 1.1898, "step": 12030 }, { "epoch": 22.26, "grad_norm": 8.85807991027832, "learning_rate": 1.1100924214417745e-05, "loss": 1.4207, "step": 12040 }, { "epoch": 22.27, "grad_norm": 8.001974105834961, "learning_rate": 1.1093530499075787e-05, "loss": 1.3306, "step": 12050 }, { "epoch": 22.29, "grad_norm": 5.023130893707275, "learning_rate": 1.1086136783733827e-05, "loss": 1.4293, "step": 12060 }, { "epoch": 22.31, "grad_norm": 8.700455665588379, "learning_rate": 1.1078743068391868e-05, "loss": 1.6895, "step": 12070 }, { "epoch": 22.33, "grad_norm": 11.155216217041016, "learning_rate": 1.107134935304991e-05, "loss": 1.5393, "step": 12080 }, { "epoch": 22.35, "grad_norm": 7.856583118438721, "learning_rate": 1.1063955637707948e-05, "loss": 1.3634, "step": 12090 }, { "epoch": 22.37, "grad_norm": 8.946937561035156, "learning_rate": 1.105656192236599e-05, "loss": 1.2095, "step": 12100 }, { "epoch": 22.38, "grad_norm": 9.07411003112793, "learning_rate": 1.104916820702403e-05, "loss": 1.2862, "step": 12110 }, { "epoch": 22.4, "grad_norm": 13.299642562866211, "learning_rate": 1.104177449168207e-05, "loss": 1.327, "step": 12120 }, { "epoch": 22.42, "grad_norm": 9.356437683105469, "learning_rate": 1.1034380776340112e-05, "loss": 1.4057, "step": 12130 }, { "epoch": 22.44, "grad_norm": 13.375534057617188, "learning_rate": 1.1026987060998153e-05, "loss": 1.1101, "step": 12140 }, { "epoch": 22.46, "grad_norm": 8.681534767150879, "learning_rate": 1.1019593345656192e-05, "loss": 1.1423, "step": 12150 }, { "epoch": 22.48, "grad_norm": 8.931329727172852, "learning_rate": 1.1012199630314233e-05, "loss": 1.4668, "step": 12160 }, { "epoch": 22.5, "grad_norm": 3.99316143989563, "learning_rate": 1.1004805914972275e-05, "loss": 1.0081, "step": 12170 }, { "epoch": 22.51, "grad_norm": 7.102502346038818, "learning_rate": 1.0997412199630314e-05, "loss": 0.8837, "step": 12180 }, { "epoch": 22.53, "grad_norm": 7.58033561706543, "learning_rate": 1.0990018484288356e-05, "loss": 1.1121, "step": 12190 }, { "epoch": 22.55, "grad_norm": 9.564661979675293, "learning_rate": 1.0982624768946397e-05, "loss": 1.4887, "step": 12200 }, { "epoch": 22.57, "grad_norm": 5.7400922775268555, "learning_rate": 1.0975231053604439e-05, "loss": 1.3586, "step": 12210 }, { "epoch": 22.59, "grad_norm": 6.424672603607178, "learning_rate": 1.0967837338262477e-05, "loss": 1.4396, "step": 12220 }, { "epoch": 22.61, "grad_norm": 4.439335823059082, "learning_rate": 1.0960443622920518e-05, "loss": 1.1746, "step": 12230 }, { "epoch": 22.62, "grad_norm": 5.517796516418457, "learning_rate": 1.095304990757856e-05, "loss": 1.4222, "step": 12240 }, { "epoch": 22.64, "grad_norm": 7.033897399902344, "learning_rate": 1.09456561922366e-05, "loss": 1.6831, "step": 12250 }, { "epoch": 22.66, "grad_norm": 3.94873046875, "learning_rate": 1.0938262476894641e-05, "loss": 1.5636, "step": 12260 }, { "epoch": 22.68, "grad_norm": 5.798871040344238, "learning_rate": 1.0930868761552683e-05, "loss": 1.4525, "step": 12270 }, { "epoch": 22.7, "grad_norm": 5.797488212585449, "learning_rate": 1.092347504621072e-05, "loss": 1.5089, "step": 12280 }, { "epoch": 22.72, "grad_norm": 7.582828998565674, "learning_rate": 1.0916081330868762e-05, "loss": 1.4726, "step": 12290 }, { "epoch": 22.74, "grad_norm": 5.29063081741333, "learning_rate": 1.0908687615526804e-05, "loss": 1.4478, "step": 12300 }, { "epoch": 22.75, "grad_norm": 6.107141971588135, "learning_rate": 1.0901293900184843e-05, "loss": 1.2315, "step": 12310 }, { "epoch": 22.77, "grad_norm": 10.169814109802246, "learning_rate": 1.0893900184842885e-05, "loss": 1.4084, "step": 12320 }, { "epoch": 22.79, "grad_norm": 8.42687702178955, "learning_rate": 1.0886506469500926e-05, "loss": 1.3832, "step": 12330 }, { "epoch": 22.81, "grad_norm": 7.695104122161865, "learning_rate": 1.0879112754158964e-05, "loss": 1.4754, "step": 12340 }, { "epoch": 22.83, "grad_norm": 6.806877613067627, "learning_rate": 1.0871719038817006e-05, "loss": 1.36, "step": 12350 }, { "epoch": 22.85, "grad_norm": 5.7832207679748535, "learning_rate": 1.0864325323475047e-05, "loss": 1.2236, "step": 12360 }, { "epoch": 22.87, "grad_norm": 8.538604736328125, "learning_rate": 1.0856931608133087e-05, "loss": 1.517, "step": 12370 }, { "epoch": 22.88, "grad_norm": 7.029665470123291, "learning_rate": 1.0849537892791129e-05, "loss": 1.2717, "step": 12380 }, { "epoch": 22.9, "grad_norm": 5.0061421394348145, "learning_rate": 1.084214417744917e-05, "loss": 1.1746, "step": 12390 }, { "epoch": 22.92, "grad_norm": 6.185608386993408, "learning_rate": 1.0834750462107212e-05, "loss": 1.3941, "step": 12400 }, { "epoch": 22.94, "grad_norm": 6.6581807136535645, "learning_rate": 1.082735674676525e-05, "loss": 1.6048, "step": 12410 }, { "epoch": 22.96, "grad_norm": 7.305965423583984, "learning_rate": 1.0819963031423291e-05, "loss": 1.3392, "step": 12420 }, { "epoch": 22.98, "grad_norm": 7.749025344848633, "learning_rate": 1.0812569316081331e-05, "loss": 1.1504, "step": 12430 }, { "epoch": 22.99, "grad_norm": 8.142027854919434, "learning_rate": 1.0805175600739372e-05, "loss": 1.5184, "step": 12440 }, { "epoch": 23.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.7511539459228516, "eval_runtime": 1.5077, "eval_samples_per_second": 319.032, "eval_steps_per_second": 40.459, "step": 12443 }, { "epoch": 23.01, "grad_norm": 6.971617698669434, "learning_rate": 1.0797781885397414e-05, "loss": 1.301, "step": 12450 }, { "epoch": 23.03, "grad_norm": 11.049004554748535, "learning_rate": 1.0790388170055455e-05, "loss": 1.5958, "step": 12460 }, { "epoch": 23.05, "grad_norm": 9.78603458404541, "learning_rate": 1.0782994454713493e-05, "loss": 1.5679, "step": 12470 }, { "epoch": 23.07, "grad_norm": 5.134902000427246, "learning_rate": 1.0775600739371535e-05, "loss": 1.0408, "step": 12480 }, { "epoch": 23.09, "grad_norm": 7.398831844329834, "learning_rate": 1.0768207024029575e-05, "loss": 1.0493, "step": 12490 }, { "epoch": 23.11, "grad_norm": 4.287550926208496, "learning_rate": 1.0760813308687616e-05, "loss": 1.4846, "step": 12500 }, { "epoch": 23.12, "grad_norm": 10.557262420654297, "learning_rate": 1.0753419593345658e-05, "loss": 1.1735, "step": 12510 }, { "epoch": 23.14, "grad_norm": 5.413837432861328, "learning_rate": 1.07460258780037e-05, "loss": 1.5405, "step": 12520 }, { "epoch": 23.16, "grad_norm": 5.713277816772461, "learning_rate": 1.0739371534195934e-05, "loss": 1.4044, "step": 12530 }, { "epoch": 23.18, "grad_norm": 9.963788986206055, "learning_rate": 1.0731977818853976e-05, "loss": 1.5834, "step": 12540 }, { "epoch": 23.2, "grad_norm": 6.729143142700195, "learning_rate": 1.0724584103512014e-05, "loss": 1.2664, "step": 12550 }, { "epoch": 23.22, "grad_norm": 8.462384223937988, "learning_rate": 1.0717190388170055e-05, "loss": 1.4841, "step": 12560 }, { "epoch": 23.23, "grad_norm": 6.65444278717041, "learning_rate": 1.0709796672828097e-05, "loss": 1.3949, "step": 12570 }, { "epoch": 23.25, "grad_norm": 11.833718299865723, "learning_rate": 1.0702402957486138e-05, "loss": 1.254, "step": 12580 }, { "epoch": 23.27, "grad_norm": 5.106475353240967, "learning_rate": 1.0695009242144178e-05, "loss": 1.1509, "step": 12590 }, { "epoch": 23.29, "grad_norm": 7.56912088394165, "learning_rate": 1.068761552680222e-05, "loss": 1.4075, "step": 12600 }, { "epoch": 23.31, "grad_norm": 6.965387344360352, "learning_rate": 1.0680221811460261e-05, "loss": 1.2153, "step": 12610 }, { "epoch": 23.33, "grad_norm": 2.974879264831543, "learning_rate": 1.0672828096118299e-05, "loss": 1.5849, "step": 12620 }, { "epoch": 23.35, "grad_norm": 5.574324131011963, "learning_rate": 1.066543438077634e-05, "loss": 1.7795, "step": 12630 }, { "epoch": 23.36, "grad_norm": 9.204728126525879, "learning_rate": 1.0658040665434382e-05, "loss": 1.3422, "step": 12640 }, { "epoch": 23.38, "grad_norm": 8.958700180053711, "learning_rate": 1.0650646950092422e-05, "loss": 1.5861, "step": 12650 }, { "epoch": 23.4, "grad_norm": 8.261181831359863, "learning_rate": 1.0643253234750463e-05, "loss": 1.4616, "step": 12660 }, { "epoch": 23.42, "grad_norm": 4.647804260253906, "learning_rate": 1.0635859519408505e-05, "loss": 1.7042, "step": 12670 }, { "epoch": 23.44, "grad_norm": 7.64040994644165, "learning_rate": 1.0628465804066543e-05, "loss": 1.4248, "step": 12680 }, { "epoch": 23.46, "grad_norm": 5.6376752853393555, "learning_rate": 1.0621072088724584e-05, "loss": 1.1861, "step": 12690 }, { "epoch": 23.48, "grad_norm": 7.128305912017822, "learning_rate": 1.0613678373382626e-05, "loss": 1.1855, "step": 12700 }, { "epoch": 23.49, "grad_norm": 5.917848110198975, "learning_rate": 1.0606284658040666e-05, "loss": 1.4552, "step": 12710 }, { "epoch": 23.51, "grad_norm": 6.24388313293457, "learning_rate": 1.0598890942698707e-05, "loss": 1.0989, "step": 12720 }, { "epoch": 23.53, "grad_norm": 5.792581081390381, "learning_rate": 1.0591497227356749e-05, "loss": 1.4022, "step": 12730 }, { "epoch": 23.55, "grad_norm": 6.877856254577637, "learning_rate": 1.0584103512014787e-05, "loss": 1.4989, "step": 12740 }, { "epoch": 23.57, "grad_norm": 7.886204719543457, "learning_rate": 1.0576709796672828e-05, "loss": 1.1803, "step": 12750 }, { "epoch": 23.59, "grad_norm": 6.434450626373291, "learning_rate": 1.056931608133087e-05, "loss": 1.5539, "step": 12760 }, { "epoch": 23.6, "grad_norm": 8.343116760253906, "learning_rate": 1.0561922365988911e-05, "loss": 1.5861, "step": 12770 }, { "epoch": 23.62, "grad_norm": 5.1958723068237305, "learning_rate": 1.0554528650646951e-05, "loss": 1.4414, "step": 12780 }, { "epoch": 23.64, "grad_norm": 6.79095983505249, "learning_rate": 1.0547134935304992e-05, "loss": 1.27, "step": 12790 }, { "epoch": 23.66, "grad_norm": 7.6770148277282715, "learning_rate": 1.0539741219963034e-05, "loss": 1.4357, "step": 12800 }, { "epoch": 23.68, "grad_norm": 6.101117134094238, "learning_rate": 1.0532347504621072e-05, "loss": 1.3508, "step": 12810 }, { "epoch": 23.7, "grad_norm": 8.285028457641602, "learning_rate": 1.0524953789279113e-05, "loss": 1.3846, "step": 12820 }, { "epoch": 23.72, "grad_norm": 8.789499282836914, "learning_rate": 1.0517560073937155e-05, "loss": 1.1099, "step": 12830 }, { "epoch": 23.73, "grad_norm": 5.468909740447998, "learning_rate": 1.0510166358595195e-05, "loss": 1.193, "step": 12840 }, { "epoch": 23.75, "grad_norm": 6.093325614929199, "learning_rate": 1.0502772643253236e-05, "loss": 1.4671, "step": 12850 }, { "epoch": 23.77, "grad_norm": 7.999007701873779, "learning_rate": 1.0495378927911278e-05, "loss": 1.3592, "step": 12860 }, { "epoch": 23.79, "grad_norm": 7.912302017211914, "learning_rate": 1.0487985212569316e-05, "loss": 1.3435, "step": 12870 }, { "epoch": 23.81, "grad_norm": 3.6465883255004883, "learning_rate": 1.0480591497227357e-05, "loss": 1.2532, "step": 12880 }, { "epoch": 23.83, "grad_norm": 10.52814769744873, "learning_rate": 1.0473197781885399e-05, "loss": 1.5627, "step": 12890 }, { "epoch": 23.84, "grad_norm": 8.57843017578125, "learning_rate": 1.0465804066543439e-05, "loss": 1.256, "step": 12900 }, { "epoch": 23.86, "grad_norm": 5.653956890106201, "learning_rate": 1.045841035120148e-05, "loss": 1.634, "step": 12910 }, { "epoch": 23.88, "grad_norm": 3.431403636932373, "learning_rate": 1.0451016635859521e-05, "loss": 1.1348, "step": 12920 }, { "epoch": 23.9, "grad_norm": 6.52334451675415, "learning_rate": 1.044362292051756e-05, "loss": 1.5457, "step": 12930 }, { "epoch": 23.92, "grad_norm": 13.523636817932129, "learning_rate": 1.0436229205175601e-05, "loss": 1.4608, "step": 12940 }, { "epoch": 23.94, "grad_norm": 6.483691215515137, "learning_rate": 1.0428835489833643e-05, "loss": 1.436, "step": 12950 }, { "epoch": 23.96, "grad_norm": 8.430671691894531, "learning_rate": 1.0421441774491682e-05, "loss": 1.4332, "step": 12960 }, { "epoch": 23.97, "grad_norm": 8.225566864013672, "learning_rate": 1.0414048059149724e-05, "loss": 1.2968, "step": 12970 }, { "epoch": 23.99, "grad_norm": 5.112839698791504, "learning_rate": 1.0406654343807765e-05, "loss": 1.6562, "step": 12980 }, { "epoch": 24.0, "eval_accuracy": 0.7796257796257796, "eval_loss": 0.75136399269104, "eval_runtime": 1.5308, "eval_samples_per_second": 314.214, "eval_steps_per_second": 39.848, "step": 12984 }, { "epoch": 24.01, "grad_norm": 5.909259796142578, "learning_rate": 1.0399260628465807e-05, "loss": 1.3883, "step": 12990 }, { "epoch": 24.03, "grad_norm": 8.141520500183105, "learning_rate": 1.0391866913123845e-05, "loss": 1.1229, "step": 13000 }, { "epoch": 24.05, "grad_norm": 8.980448722839355, "learning_rate": 1.0384473197781886e-05, "loss": 1.5767, "step": 13010 }, { "epoch": 24.07, "grad_norm": 6.66533899307251, "learning_rate": 1.0377079482439926e-05, "loss": 1.5113, "step": 13020 }, { "epoch": 24.09, "grad_norm": 10.45879077911377, "learning_rate": 1.0369685767097968e-05, "loss": 1.3386, "step": 13030 }, { "epoch": 24.1, "grad_norm": 9.305734634399414, "learning_rate": 1.0362292051756009e-05, "loss": 1.7952, "step": 13040 }, { "epoch": 24.12, "grad_norm": 7.619143962860107, "learning_rate": 1.035489833641405e-05, "loss": 1.4291, "step": 13050 }, { "epoch": 24.14, "grad_norm": 11.002583503723145, "learning_rate": 1.0347504621072089e-05, "loss": 1.3501, "step": 13060 }, { "epoch": 24.16, "grad_norm": 7.014692306518555, "learning_rate": 1.034011090573013e-05, "loss": 1.4116, "step": 13070 }, { "epoch": 24.18, "grad_norm": 6.4368896484375, "learning_rate": 1.033271719038817e-05, "loss": 1.4815, "step": 13080 }, { "epoch": 24.2, "grad_norm": 9.591557502746582, "learning_rate": 1.0325323475046211e-05, "loss": 1.6322, "step": 13090 }, { "epoch": 24.21, "grad_norm": 5.805912494659424, "learning_rate": 1.0317929759704253e-05, "loss": 1.3219, "step": 13100 }, { "epoch": 24.23, "grad_norm": 6.553102970123291, "learning_rate": 1.0310536044362294e-05, "loss": 1.3192, "step": 13110 }, { "epoch": 24.25, "grad_norm": 4.791707515716553, "learning_rate": 1.0303142329020332e-05, "loss": 1.2892, "step": 13120 }, { "epoch": 24.27, "grad_norm": 5.719997406005859, "learning_rate": 1.0295748613678374e-05, "loss": 0.9475, "step": 13130 }, { "epoch": 24.29, "grad_norm": 3.7858450412750244, "learning_rate": 1.0288354898336415e-05, "loss": 1.3077, "step": 13140 }, { "epoch": 24.31, "grad_norm": 4.12766170501709, "learning_rate": 1.0280961182994455e-05, "loss": 1.3598, "step": 13150 }, { "epoch": 24.33, "grad_norm": 6.629365921020508, "learning_rate": 1.0273567467652497e-05, "loss": 1.3573, "step": 13160 }, { "epoch": 24.34, "grad_norm": 5.114104270935059, "learning_rate": 1.0266173752310538e-05, "loss": 0.9737, "step": 13170 }, { "epoch": 24.36, "grad_norm": 8.67841911315918, "learning_rate": 1.0258780036968578e-05, "loss": 1.2265, "step": 13180 }, { "epoch": 24.38, "grad_norm": 4.377577781677246, "learning_rate": 1.0251386321626618e-05, "loss": 0.9165, "step": 13190 }, { "epoch": 24.4, "grad_norm": 3.7978522777557373, "learning_rate": 1.0243992606284659e-05, "loss": 0.9896, "step": 13200 }, { "epoch": 24.42, "grad_norm": 2.9300734996795654, "learning_rate": 1.0236598890942699e-05, "loss": 1.451, "step": 13210 }, { "epoch": 24.44, "grad_norm": 6.089378833770752, "learning_rate": 1.022920517560074e-05, "loss": 1.3423, "step": 13220 }, { "epoch": 24.45, "grad_norm": 6.38794469833374, "learning_rate": 1.0221811460258782e-05, "loss": 1.4446, "step": 13230 }, { "epoch": 24.47, "grad_norm": 10.86343765258789, "learning_rate": 1.0214417744916822e-05, "loss": 1.5197, "step": 13240 }, { "epoch": 24.49, "grad_norm": 6.462056636810303, "learning_rate": 1.0207024029574861e-05, "loss": 1.1117, "step": 13250 }, { "epoch": 24.51, "grad_norm": 6.570525169372559, "learning_rate": 1.0199630314232903e-05, "loss": 1.5143, "step": 13260 }, { "epoch": 24.53, "grad_norm": 8.937602996826172, "learning_rate": 1.0192236598890943e-05, "loss": 1.3126, "step": 13270 }, { "epoch": 24.55, "grad_norm": 5.952121734619141, "learning_rate": 1.0184842883548984e-05, "loss": 1.0465, "step": 13280 }, { "epoch": 24.57, "grad_norm": 10.19490909576416, "learning_rate": 1.0177449168207026e-05, "loss": 1.3609, "step": 13290 }, { "epoch": 24.58, "grad_norm": 7.464567184448242, "learning_rate": 1.0170055452865067e-05, "loss": 1.434, "step": 13300 }, { "epoch": 24.6, "grad_norm": 8.430435180664062, "learning_rate": 1.0162661737523105e-05, "loss": 1.3657, "step": 13310 }, { "epoch": 24.62, "grad_norm": 8.548328399658203, "learning_rate": 1.0155268022181147e-05, "loss": 1.3866, "step": 13320 }, { "epoch": 24.64, "grad_norm": 10.647490501403809, "learning_rate": 1.0147874306839186e-05, "loss": 1.3026, "step": 13330 }, { "epoch": 24.66, "grad_norm": 9.02955150604248, "learning_rate": 1.0140480591497228e-05, "loss": 1.4993, "step": 13340 }, { "epoch": 24.68, "grad_norm": 6.8586883544921875, "learning_rate": 1.013308687615527e-05, "loss": 1.5833, "step": 13350 }, { "epoch": 24.7, "grad_norm": 6.771891117095947, "learning_rate": 1.0125693160813311e-05, "loss": 1.2607, "step": 13360 }, { "epoch": 24.71, "grad_norm": 9.178412437438965, "learning_rate": 1.011829944547135e-05, "loss": 1.4189, "step": 13370 }, { "epoch": 24.73, "grad_norm": 6.844827175140381, "learning_rate": 1.011090573012939e-05, "loss": 1.3194, "step": 13380 }, { "epoch": 24.75, "grad_norm": 9.727232933044434, "learning_rate": 1.010351201478743e-05, "loss": 1.6348, "step": 13390 }, { "epoch": 24.77, "grad_norm": 5.503018379211426, "learning_rate": 1.0096118299445472e-05, "loss": 1.3988, "step": 13400 }, { "epoch": 24.79, "grad_norm": 7.7515082359313965, "learning_rate": 1.0088724584103513e-05, "loss": 1.5255, "step": 13410 }, { "epoch": 24.81, "grad_norm": 2.694542407989502, "learning_rate": 1.0081330868761555e-05, "loss": 1.3314, "step": 13420 }, { "epoch": 24.82, "grad_norm": 9.787741661071777, "learning_rate": 1.0073937153419595e-05, "loss": 1.2178, "step": 13430 }, { "epoch": 24.84, "grad_norm": 5.796849250793457, "learning_rate": 1.0066543438077634e-05, "loss": 1.3848, "step": 13440 }, { "epoch": 24.86, "grad_norm": 2.8658974170684814, "learning_rate": 1.0059149722735674e-05, "loss": 1.5989, "step": 13450 }, { "epoch": 24.88, "grad_norm": 8.567291259765625, "learning_rate": 1.0051756007393716e-05, "loss": 1.7145, "step": 13460 }, { "epoch": 24.9, "grad_norm": 5.449319362640381, "learning_rate": 1.0044362292051757e-05, "loss": 1.3861, "step": 13470 }, { "epoch": 24.92, "grad_norm": 10.145344734191895, "learning_rate": 1.0036968576709799e-05, "loss": 1.245, "step": 13480 }, { "epoch": 24.94, "grad_norm": 8.282411575317383, "learning_rate": 1.0029574861367838e-05, "loss": 1.6717, "step": 13490 }, { "epoch": 24.95, "grad_norm": 5.21071195602417, "learning_rate": 1.0022181146025878e-05, "loss": 1.0497, "step": 13500 }, { "epoch": 24.97, "grad_norm": 8.37203311920166, "learning_rate": 1.001478743068392e-05, "loss": 1.5947, "step": 13510 }, { "epoch": 24.99, "grad_norm": 8.847264289855957, "learning_rate": 1.000739371534196e-05, "loss": 1.4148, "step": 13520 }, { "epoch": 25.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.724145770072937, "eval_runtime": 1.4782, "eval_samples_per_second": 325.402, "eval_steps_per_second": 41.267, "step": 13525 }, { "epoch": 25.01, "grad_norm": 6.254756927490234, "learning_rate": 1e-05, "loss": 1.5484, "step": 13530 }, { "epoch": 25.03, "grad_norm": 9.738399505615234, "learning_rate": 9.992606284658042e-06, "loss": 1.119, "step": 13540 }, { "epoch": 25.05, "grad_norm": 7.1488447189331055, "learning_rate": 9.985212569316082e-06, "loss": 1.5696, "step": 13550 }, { "epoch": 25.06, "grad_norm": 4.991587162017822, "learning_rate": 9.977818853974124e-06, "loss": 1.1105, "step": 13560 }, { "epoch": 25.08, "grad_norm": 6.0340070724487305, "learning_rate": 9.970425138632163e-06, "loss": 1.2808, "step": 13570 }, { "epoch": 25.1, "grad_norm": 9.68111515045166, "learning_rate": 9.963031423290203e-06, "loss": 1.3558, "step": 13580 }, { "epoch": 25.12, "grad_norm": 9.910112380981445, "learning_rate": 9.955637707948245e-06, "loss": 1.1733, "step": 13590 }, { "epoch": 25.14, "grad_norm": 6.2155537605285645, "learning_rate": 9.948243992606286e-06, "loss": 1.1914, "step": 13600 }, { "epoch": 25.16, "grad_norm": 13.912396430969238, "learning_rate": 9.940850277264326e-06, "loss": 1.5712, "step": 13610 }, { "epoch": 25.18, "grad_norm": 6.503040313720703, "learning_rate": 9.933456561922367e-06, "loss": 1.3086, "step": 13620 }, { "epoch": 25.19, "grad_norm": 11.806987762451172, "learning_rate": 9.926062846580407e-06, "loss": 1.3033, "step": 13630 }, { "epoch": 25.21, "grad_norm": 9.485701560974121, "learning_rate": 9.918669131238449e-06, "loss": 1.8192, "step": 13640 }, { "epoch": 25.23, "grad_norm": 4.342387676239014, "learning_rate": 9.911275415896488e-06, "loss": 1.3754, "step": 13650 }, { "epoch": 25.25, "grad_norm": 7.248023509979248, "learning_rate": 9.90388170055453e-06, "loss": 1.4199, "step": 13660 }, { "epoch": 25.27, "grad_norm": 6.55515718460083, "learning_rate": 9.896487985212571e-06, "loss": 1.3879, "step": 13670 }, { "epoch": 25.29, "grad_norm": 4.802649021148682, "learning_rate": 9.889094269870611e-06, "loss": 1.4696, "step": 13680 }, { "epoch": 25.3, "grad_norm": 8.461181640625, "learning_rate": 9.881700554528651e-06, "loss": 1.2484, "step": 13690 }, { "epoch": 25.32, "grad_norm": 6.425735950469971, "learning_rate": 9.874306839186692e-06, "loss": 1.5578, "step": 13700 }, { "epoch": 25.34, "grad_norm": 10.806066513061523, "learning_rate": 9.866913123844732e-06, "loss": 1.1959, "step": 13710 }, { "epoch": 25.36, "grad_norm": 8.461750984191895, "learning_rate": 9.859519408502774e-06, "loss": 1.7533, "step": 13720 }, { "epoch": 25.38, "grad_norm": 8.579279899597168, "learning_rate": 9.852125693160815e-06, "loss": 1.0918, "step": 13730 }, { "epoch": 25.4, "grad_norm": 7.084902763366699, "learning_rate": 9.844731977818855e-06, "loss": 1.1729, "step": 13740 }, { "epoch": 25.42, "grad_norm": 11.286319732666016, "learning_rate": 9.837338262476895e-06, "loss": 1.0522, "step": 13750 }, { "epoch": 25.43, "grad_norm": 5.057124614715576, "learning_rate": 9.829944547134936e-06, "loss": 1.2116, "step": 13760 }, { "epoch": 25.45, "grad_norm": 7.2415642738342285, "learning_rate": 9.822550831792976e-06, "loss": 1.3494, "step": 13770 }, { "epoch": 25.47, "grad_norm": 3.6078479290008545, "learning_rate": 9.815157116451017e-06, "loss": 1.3964, "step": 13780 }, { "epoch": 25.49, "grad_norm": 7.002992153167725, "learning_rate": 9.807763401109059e-06, "loss": 1.1733, "step": 13790 }, { "epoch": 25.51, "grad_norm": 9.333163261413574, "learning_rate": 9.800369685767099e-06, "loss": 1.496, "step": 13800 }, { "epoch": 25.53, "grad_norm": 3.3670945167541504, "learning_rate": 9.792975970425138e-06, "loss": 0.8684, "step": 13810 }, { "epoch": 25.55, "grad_norm": 5.817105293273926, "learning_rate": 9.78558225508318e-06, "loss": 1.3999, "step": 13820 }, { "epoch": 25.56, "grad_norm": 7.8115386962890625, "learning_rate": 9.778188539741221e-06, "loss": 1.3617, "step": 13830 }, { "epoch": 25.58, "grad_norm": 6.651092052459717, "learning_rate": 9.770794824399261e-06, "loss": 1.2798, "step": 13840 }, { "epoch": 25.6, "grad_norm": 7.794206142425537, "learning_rate": 9.763401109057303e-06, "loss": 1.2758, "step": 13850 }, { "epoch": 25.62, "grad_norm": 7.335329532623291, "learning_rate": 9.756007393715342e-06, "loss": 1.1982, "step": 13860 }, { "epoch": 25.64, "grad_norm": 5.3805251121521, "learning_rate": 9.748613678373382e-06, "loss": 1.356, "step": 13870 }, { "epoch": 25.66, "grad_norm": 4.9748616218566895, "learning_rate": 9.741219963031424e-06, "loss": 1.4839, "step": 13880 }, { "epoch": 25.67, "grad_norm": 7.438746452331543, "learning_rate": 9.733826247689465e-06, "loss": 1.3433, "step": 13890 }, { "epoch": 25.69, "grad_norm": 6.420422554016113, "learning_rate": 9.726432532347505e-06, "loss": 1.4756, "step": 13900 }, { "epoch": 25.71, "grad_norm": 4.756766319274902, "learning_rate": 9.719038817005547e-06, "loss": 1.2008, "step": 13910 }, { "epoch": 25.73, "grad_norm": 6.0768022537231445, "learning_rate": 9.711645101663586e-06, "loss": 1.2823, "step": 13920 }, { "epoch": 25.75, "grad_norm": 9.590805053710938, "learning_rate": 9.704251386321628e-06, "loss": 1.4277, "step": 13930 }, { "epoch": 25.77, "grad_norm": 6.328165054321289, "learning_rate": 9.696857670979668e-06, "loss": 1.3879, "step": 13940 }, { "epoch": 25.79, "grad_norm": 7.806405544281006, "learning_rate": 9.689463955637709e-06, "loss": 1.3072, "step": 13950 }, { "epoch": 25.8, "grad_norm": 9.090680122375488, "learning_rate": 9.682070240295749e-06, "loss": 1.3761, "step": 13960 }, { "epoch": 25.82, "grad_norm": 6.800095081329346, "learning_rate": 9.67467652495379e-06, "loss": 1.2226, "step": 13970 }, { "epoch": 25.84, "grad_norm": 6.206569671630859, "learning_rate": 9.667282809611832e-06, "loss": 1.6557, "step": 13980 }, { "epoch": 25.86, "grad_norm": 8.505800247192383, "learning_rate": 9.659889094269872e-06, "loss": 1.305, "step": 13990 }, { "epoch": 25.88, "grad_norm": 5.879108905792236, "learning_rate": 9.652495378927911e-06, "loss": 1.2434, "step": 14000 }, { "epoch": 25.9, "grad_norm": 6.93980598449707, "learning_rate": 9.645101663585953e-06, "loss": 1.5734, "step": 14010 }, { "epoch": 25.91, "grad_norm": 3.7322256565093994, "learning_rate": 9.637707948243994e-06, "loss": 1.4805, "step": 14020 }, { "epoch": 25.93, "grad_norm": 9.904962539672852, "learning_rate": 9.630314232902034e-06, "loss": 1.3385, "step": 14030 }, { "epoch": 25.95, "grad_norm": 5.665058612823486, "learning_rate": 9.622920517560076e-06, "loss": 1.412, "step": 14040 }, { "epoch": 25.97, "grad_norm": 4.041703224182129, "learning_rate": 9.615526802218115e-06, "loss": 1.2419, "step": 14050 }, { "epoch": 25.99, "grad_norm": 7.898007392883301, "learning_rate": 9.608133086876155e-06, "loss": 1.2765, "step": 14060 }, { "epoch": 26.0, "eval_accuracy": 0.8045738045738046, "eval_loss": 0.6907182335853577, "eval_runtime": 1.5485, "eval_samples_per_second": 310.627, "eval_steps_per_second": 39.393, "step": 14066 }, { "epoch": 26.01, "grad_norm": 5.852095603942871, "learning_rate": 9.600739371534197e-06, "loss": 1.4553, "step": 14070 }, { "epoch": 26.03, "grad_norm": 6.461130142211914, "learning_rate": 9.593345656192238e-06, "loss": 1.6001, "step": 14080 }, { "epoch": 26.04, "grad_norm": 6.564659118652344, "learning_rate": 9.585951940850278e-06, "loss": 1.4111, "step": 14090 }, { "epoch": 26.06, "grad_norm": 3.214164972305298, "learning_rate": 9.57855822550832e-06, "loss": 1.2773, "step": 14100 }, { "epoch": 26.08, "grad_norm": 7.175471782684326, "learning_rate": 9.571164510166359e-06, "loss": 1.3384, "step": 14110 }, { "epoch": 26.1, "grad_norm": 7.555532455444336, "learning_rate": 9.563770794824399e-06, "loss": 1.2625, "step": 14120 }, { "epoch": 26.12, "grad_norm": 6.505243301391602, "learning_rate": 9.55637707948244e-06, "loss": 1.2277, "step": 14130 }, { "epoch": 26.14, "grad_norm": 6.616326808929443, "learning_rate": 9.548983364140482e-06, "loss": 1.5304, "step": 14140 }, { "epoch": 26.16, "grad_norm": 5.286740303039551, "learning_rate": 9.541589648798522e-06, "loss": 1.2549, "step": 14150 }, { "epoch": 26.17, "grad_norm": 2.6066131591796875, "learning_rate": 9.534195933456563e-06, "loss": 1.4153, "step": 14160 }, { "epoch": 26.19, "grad_norm": 6.474798679351807, "learning_rate": 9.526802218114603e-06, "loss": 1.1627, "step": 14170 }, { "epoch": 26.21, "grad_norm": 10.11211109161377, "learning_rate": 9.519408502772643e-06, "loss": 1.3931, "step": 14180 }, { "epoch": 26.23, "grad_norm": 7.025973796844482, "learning_rate": 9.512014787430684e-06, "loss": 1.4916, "step": 14190 }, { "epoch": 26.25, "grad_norm": 8.274212837219238, "learning_rate": 9.504621072088726e-06, "loss": 1.2192, "step": 14200 }, { "epoch": 26.27, "grad_norm": 7.697746276855469, "learning_rate": 9.497227356746767e-06, "loss": 1.3365, "step": 14210 }, { "epoch": 26.28, "grad_norm": 2.852369546890259, "learning_rate": 9.489833641404807e-06, "loss": 1.011, "step": 14220 }, { "epoch": 26.3, "grad_norm": 8.063301086425781, "learning_rate": 9.482439926062847e-06, "loss": 1.3058, "step": 14230 }, { "epoch": 26.32, "grad_norm": 9.499516487121582, "learning_rate": 9.475046210720888e-06, "loss": 1.5021, "step": 14240 }, { "epoch": 26.34, "grad_norm": 12.37392520904541, "learning_rate": 9.467652495378928e-06, "loss": 1.4362, "step": 14250 }, { "epoch": 26.36, "grad_norm": 3.8952250480651855, "learning_rate": 9.46025878003697e-06, "loss": 1.4146, "step": 14260 }, { "epoch": 26.38, "grad_norm": 5.7135772705078125, "learning_rate": 9.452865064695011e-06, "loss": 1.04, "step": 14270 }, { "epoch": 26.4, "grad_norm": 6.937562465667725, "learning_rate": 9.44547134935305e-06, "loss": 1.3356, "step": 14280 }, { "epoch": 26.41, "grad_norm": 5.418099403381348, "learning_rate": 9.43807763401109e-06, "loss": 1.3119, "step": 14290 }, { "epoch": 26.43, "grad_norm": 8.156275749206543, "learning_rate": 9.430683918669132e-06, "loss": 1.5926, "step": 14300 }, { "epoch": 26.45, "grad_norm": 6.923559665679932, "learning_rate": 9.423290203327172e-06, "loss": 1.6164, "step": 14310 }, { "epoch": 26.47, "grad_norm": 8.875688552856445, "learning_rate": 9.415896487985213e-06, "loss": 1.3263, "step": 14320 }, { "epoch": 26.49, "grad_norm": 5.2747931480407715, "learning_rate": 9.408502772643255e-06, "loss": 1.1156, "step": 14330 }, { "epoch": 26.51, "grad_norm": 8.07530403137207, "learning_rate": 9.401109057301294e-06, "loss": 1.1775, "step": 14340 }, { "epoch": 26.52, "grad_norm": 8.123104095458984, "learning_rate": 9.393715341959336e-06, "loss": 1.3658, "step": 14350 }, { "epoch": 26.54, "grad_norm": 7.007016658782959, "learning_rate": 9.386321626617376e-06, "loss": 1.613, "step": 14360 }, { "epoch": 26.56, "grad_norm": 4.060016632080078, "learning_rate": 9.378927911275416e-06, "loss": 1.1538, "step": 14370 }, { "epoch": 26.58, "grad_norm": 9.160844802856445, "learning_rate": 9.371534195933457e-06, "loss": 1.0632, "step": 14380 }, { "epoch": 26.6, "grad_norm": 5.1939849853515625, "learning_rate": 9.364140480591499e-06, "loss": 1.2383, "step": 14390 }, { "epoch": 26.62, "grad_norm": 5.127926349639893, "learning_rate": 9.35674676524954e-06, "loss": 1.2155, "step": 14400 }, { "epoch": 26.64, "grad_norm": 2.890629529953003, "learning_rate": 9.34935304990758e-06, "loss": 1.0936, "step": 14410 }, { "epoch": 26.65, "grad_norm": 13.15788459777832, "learning_rate": 9.34195933456562e-06, "loss": 1.3045, "step": 14420 }, { "epoch": 26.67, "grad_norm": 7.378518581390381, "learning_rate": 9.334565619223661e-06, "loss": 1.2686, "step": 14430 }, { "epoch": 26.69, "grad_norm": 7.857909679412842, "learning_rate": 9.3271719038817e-06, "loss": 1.2499, "step": 14440 }, { "epoch": 26.71, "grad_norm": 5.0521016120910645, "learning_rate": 9.319778188539742e-06, "loss": 1.363, "step": 14450 }, { "epoch": 26.73, "grad_norm": 6.694394111633301, "learning_rate": 9.312384473197784e-06, "loss": 1.7492, "step": 14460 }, { "epoch": 26.75, "grad_norm": 11.328523635864258, "learning_rate": 9.304990757855824e-06, "loss": 1.2564, "step": 14470 }, { "epoch": 26.77, "grad_norm": 10.425835609436035, "learning_rate": 9.297597042513863e-06, "loss": 1.1301, "step": 14480 }, { "epoch": 26.78, "grad_norm": 5.862189292907715, "learning_rate": 9.290203327171905e-06, "loss": 1.3059, "step": 14490 }, { "epoch": 26.8, "grad_norm": 14.182328224182129, "learning_rate": 9.282809611829945e-06, "loss": 1.6083, "step": 14500 }, { "epoch": 26.82, "grad_norm": 6.300644397735596, "learning_rate": 9.275415896487986e-06, "loss": 1.6482, "step": 14510 }, { "epoch": 26.84, "grad_norm": 3.3233535289764404, "learning_rate": 9.268022181146028e-06, "loss": 1.1329, "step": 14520 }, { "epoch": 26.86, "grad_norm": 10.081510543823242, "learning_rate": 9.260628465804067e-06, "loss": 1.2548, "step": 14530 }, { "epoch": 26.88, "grad_norm": 7.304401397705078, "learning_rate": 9.253234750462107e-06, "loss": 1.5201, "step": 14540 }, { "epoch": 26.89, "grad_norm": 5.282886981964111, "learning_rate": 9.245841035120149e-06, "loss": 1.1858, "step": 14550 }, { "epoch": 26.91, "grad_norm": 2.7714195251464844, "learning_rate": 9.238447319778188e-06, "loss": 1.3915, "step": 14560 }, { "epoch": 26.93, "grad_norm": 8.017133712768555, "learning_rate": 9.23105360443623e-06, "loss": 1.2421, "step": 14570 }, { "epoch": 26.95, "grad_norm": 6.862401485443115, "learning_rate": 9.223659889094271e-06, "loss": 1.2611, "step": 14580 }, { "epoch": 26.97, "grad_norm": 7.767041206359863, "learning_rate": 9.216266173752311e-06, "loss": 1.2408, "step": 14590 }, { "epoch": 26.99, "grad_norm": 5.0078325271606445, "learning_rate": 9.208872458410351e-06, "loss": 1.3378, "step": 14600 }, { "epoch": 27.0, "eval_accuracy": 0.7900207900207901, "eval_loss": 0.7132007479667664, "eval_runtime": 1.5095, "eval_samples_per_second": 318.658, "eval_steps_per_second": 40.412, "step": 14607 }, { "epoch": 27.01, "grad_norm": 5.039859771728516, "learning_rate": 9.201478743068392e-06, "loss": 1.2002, "step": 14610 }, { "epoch": 27.02, "grad_norm": 8.75201416015625, "learning_rate": 9.194085027726434e-06, "loss": 1.1214, "step": 14620 }, { "epoch": 27.04, "grad_norm": 5.487382888793945, "learning_rate": 9.186691312384474e-06, "loss": 1.1483, "step": 14630 }, { "epoch": 27.06, "grad_norm": 7.221423625946045, "learning_rate": 9.179297597042515e-06, "loss": 0.9272, "step": 14640 }, { "epoch": 27.08, "grad_norm": 4.308627128601074, "learning_rate": 9.171903881700555e-06, "loss": 1.1841, "step": 14650 }, { "epoch": 27.1, "grad_norm": 9.542632102966309, "learning_rate": 9.164510166358596e-06, "loss": 1.2789, "step": 14660 }, { "epoch": 27.12, "grad_norm": 5.294569492340088, "learning_rate": 9.157116451016636e-06, "loss": 0.9547, "step": 14670 }, { "epoch": 27.13, "grad_norm": 5.783150672912598, "learning_rate": 9.149722735674678e-06, "loss": 1.1741, "step": 14680 }, { "epoch": 27.15, "grad_norm": 5.159487724304199, "learning_rate": 9.142329020332717e-06, "loss": 1.4548, "step": 14690 }, { "epoch": 27.17, "grad_norm": 8.204315185546875, "learning_rate": 9.134935304990759e-06, "loss": 1.3422, "step": 14700 }, { "epoch": 27.19, "grad_norm": 3.7133233547210693, "learning_rate": 9.1275415896488e-06, "loss": 1.4382, "step": 14710 }, { "epoch": 27.21, "grad_norm": 7.855378150939941, "learning_rate": 9.12014787430684e-06, "loss": 1.2139, "step": 14720 }, { "epoch": 27.23, "grad_norm": 5.29605770111084, "learning_rate": 9.11275415896488e-06, "loss": 0.9072, "step": 14730 }, { "epoch": 27.25, "grad_norm": 7.356898784637451, "learning_rate": 9.105360443622921e-06, "loss": 1.577, "step": 14740 }, { "epoch": 27.26, "grad_norm": 7.92119836807251, "learning_rate": 9.097966728280961e-06, "loss": 1.1956, "step": 14750 }, { "epoch": 27.28, "grad_norm": 10.34352970123291, "learning_rate": 9.090573012939003e-06, "loss": 1.3853, "step": 14760 }, { "epoch": 27.3, "grad_norm": 5.029427528381348, "learning_rate": 9.083179297597044e-06, "loss": 1.58, "step": 14770 }, { "epoch": 27.32, "grad_norm": 4.419121742248535, "learning_rate": 9.075785582255084e-06, "loss": 1.3275, "step": 14780 }, { "epoch": 27.34, "grad_norm": 11.74969482421875, "learning_rate": 9.068391866913124e-06, "loss": 1.3252, "step": 14790 }, { "epoch": 27.36, "grad_norm": 7.207937717437744, "learning_rate": 9.060998151571165e-06, "loss": 1.1703, "step": 14800 }, { "epoch": 27.38, "grad_norm": 9.68798828125, "learning_rate": 9.053604436229207e-06, "loss": 1.2536, "step": 14810 }, { "epoch": 27.39, "grad_norm": 8.07341194152832, "learning_rate": 9.046210720887246e-06, "loss": 1.2533, "step": 14820 }, { "epoch": 27.41, "grad_norm": 6.094292640686035, "learning_rate": 9.039556377079483e-06, "loss": 1.1355, "step": 14830 }, { "epoch": 27.43, "grad_norm": 6.428986549377441, "learning_rate": 9.032162661737523e-06, "loss": 1.0233, "step": 14840 }, { "epoch": 27.45, "grad_norm": 6.226885795593262, "learning_rate": 9.024768946395565e-06, "loss": 1.4319, "step": 14850 }, { "epoch": 27.47, "grad_norm": 6.993590354919434, "learning_rate": 9.017375231053606e-06, "loss": 1.154, "step": 14860 }, { "epoch": 27.49, "grad_norm": 6.802440166473389, "learning_rate": 9.009981515711646e-06, "loss": 1.6827, "step": 14870 }, { "epoch": 27.5, "grad_norm": 8.937495231628418, "learning_rate": 9.002587800369687e-06, "loss": 1.1589, "step": 14880 }, { "epoch": 27.52, "grad_norm": 8.33581256866455, "learning_rate": 8.995194085027727e-06, "loss": 1.7357, "step": 14890 }, { "epoch": 27.54, "grad_norm": 9.489198684692383, "learning_rate": 8.987800369685767e-06, "loss": 1.4406, "step": 14900 }, { "epoch": 27.56, "grad_norm": 8.746750831604004, "learning_rate": 8.980406654343808e-06, "loss": 1.3446, "step": 14910 }, { "epoch": 27.58, "grad_norm": 4.285055160522461, "learning_rate": 8.97301293900185e-06, "loss": 1.5638, "step": 14920 }, { "epoch": 27.6, "grad_norm": 5.376425266265869, "learning_rate": 8.96561922365989e-06, "loss": 1.1001, "step": 14930 }, { "epoch": 27.62, "grad_norm": 5.555987358093262, "learning_rate": 8.958225508317931e-06, "loss": 1.2352, "step": 14940 }, { "epoch": 27.63, "grad_norm": 8.335539817810059, "learning_rate": 8.950831792975971e-06, "loss": 1.2746, "step": 14950 }, { "epoch": 27.65, "grad_norm": 6.783169746398926, "learning_rate": 8.94343807763401e-06, "loss": 1.0976, "step": 14960 }, { "epoch": 27.67, "grad_norm": 4.414501667022705, "learning_rate": 8.936044362292052e-06, "loss": 1.4421, "step": 14970 }, { "epoch": 27.69, "grad_norm": 7.350251197814941, "learning_rate": 8.928650646950094e-06, "loss": 1.4582, "step": 14980 }, { "epoch": 27.71, "grad_norm": 9.339743614196777, "learning_rate": 8.921256931608135e-06, "loss": 1.6669, "step": 14990 }, { "epoch": 27.73, "grad_norm": 8.79870319366455, "learning_rate": 8.913863216266175e-06, "loss": 1.3409, "step": 15000 }, { "epoch": 27.74, "grad_norm": 11.597415924072266, "learning_rate": 8.906469500924215e-06, "loss": 1.2699, "step": 15010 }, { "epoch": 27.76, "grad_norm": 6.414793014526367, "learning_rate": 8.899075785582256e-06, "loss": 1.4489, "step": 15020 }, { "epoch": 27.78, "grad_norm": 7.066689491271973, "learning_rate": 8.891682070240296e-06, "loss": 1.436, "step": 15030 }, { "epoch": 27.8, "grad_norm": 4.509892463684082, "learning_rate": 8.884288354898337e-06, "loss": 1.0885, "step": 15040 }, { "epoch": 27.82, "grad_norm": 6.194611072540283, "learning_rate": 8.876894639556379e-06, "loss": 0.9849, "step": 15050 }, { "epoch": 27.84, "grad_norm": 12.320152282714844, "learning_rate": 8.869500924214419e-06, "loss": 1.2442, "step": 15060 }, { "epoch": 27.86, "grad_norm": 7.026414394378662, "learning_rate": 8.862107208872458e-06, "loss": 1.2594, "step": 15070 }, { "epoch": 27.87, "grad_norm": 12.537293434143066, "learning_rate": 8.8547134935305e-06, "loss": 1.2708, "step": 15080 }, { "epoch": 27.89, "grad_norm": 9.10591983795166, "learning_rate": 8.84731977818854e-06, "loss": 1.2788, "step": 15090 }, { "epoch": 27.91, "grad_norm": 5.453493118286133, "learning_rate": 8.839926062846581e-06, "loss": 1.2222, "step": 15100 }, { "epoch": 27.93, "grad_norm": 3.5297040939331055, "learning_rate": 8.832532347504623e-06, "loss": 1.2393, "step": 15110 }, { "epoch": 27.95, "grad_norm": 9.551054000854492, "learning_rate": 8.825138632162662e-06, "loss": 1.166, "step": 15120 }, { "epoch": 27.97, "grad_norm": 12.537240982055664, "learning_rate": 8.817744916820702e-06, "loss": 1.4701, "step": 15130 }, { "epoch": 27.99, "grad_norm": 5.152378082275391, "learning_rate": 8.810351201478744e-06, "loss": 1.5446, "step": 15140 }, { "epoch": 28.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6972686648368835, "eval_runtime": 1.5521, "eval_samples_per_second": 309.911, "eval_steps_per_second": 39.303, "step": 15148 }, { "epoch": 28.0, "grad_norm": 7.87412166595459, "learning_rate": 8.802957486136784e-06, "loss": 1.0494, "step": 15150 }, { "epoch": 28.02, "grad_norm": 7.107452869415283, "learning_rate": 8.795563770794825e-06, "loss": 1.0685, "step": 15160 }, { "epoch": 28.04, "grad_norm": 9.718912124633789, "learning_rate": 8.788170055452866e-06, "loss": 1.1993, "step": 15170 }, { "epoch": 28.06, "grad_norm": 5.451140880584717, "learning_rate": 8.780776340110906e-06, "loss": 1.2504, "step": 15180 }, { "epoch": 28.08, "grad_norm": 4.5500993728637695, "learning_rate": 8.773382624768946e-06, "loss": 1.3243, "step": 15190 }, { "epoch": 28.1, "grad_norm": 6.536294460296631, "learning_rate": 8.765988909426988e-06, "loss": 1.1439, "step": 15200 }, { "epoch": 28.11, "grad_norm": 4.668808937072754, "learning_rate": 8.758595194085029e-06, "loss": 0.9068, "step": 15210 }, { "epoch": 28.13, "grad_norm": 5.109579086303711, "learning_rate": 8.751201478743069e-06, "loss": 1.2546, "step": 15220 }, { "epoch": 28.15, "grad_norm": 10.000188827514648, "learning_rate": 8.74380776340111e-06, "loss": 1.4949, "step": 15230 }, { "epoch": 28.17, "grad_norm": 6.8773674964904785, "learning_rate": 8.73641404805915e-06, "loss": 1.2631, "step": 15240 }, { "epoch": 28.19, "grad_norm": 13.287697792053223, "learning_rate": 8.729020332717192e-06, "loss": 1.2647, "step": 15250 }, { "epoch": 28.21, "grad_norm": 3.204146146774292, "learning_rate": 8.721626617375231e-06, "loss": 1.0784, "step": 15260 }, { "epoch": 28.23, "grad_norm": 6.592350006103516, "learning_rate": 8.714232902033273e-06, "loss": 1.2008, "step": 15270 }, { "epoch": 28.24, "grad_norm": 5.222545146942139, "learning_rate": 8.706839186691313e-06, "loss": 1.2095, "step": 15280 }, { "epoch": 28.26, "grad_norm": 5.712159633636475, "learning_rate": 8.699445471349354e-06, "loss": 1.2254, "step": 15290 }, { "epoch": 28.28, "grad_norm": 9.38041877746582, "learning_rate": 8.692051756007396e-06, "loss": 1.1407, "step": 15300 }, { "epoch": 28.3, "grad_norm": 9.5460205078125, "learning_rate": 8.684658040665435e-06, "loss": 0.9492, "step": 15310 }, { "epoch": 28.32, "grad_norm": 1.8266931772232056, "learning_rate": 8.677264325323475e-06, "loss": 1.2681, "step": 15320 }, { "epoch": 28.34, "grad_norm": 11.784220695495605, "learning_rate": 8.669870609981517e-06, "loss": 1.2142, "step": 15330 }, { "epoch": 28.35, "grad_norm": 4.592172145843506, "learning_rate": 8.662476894639556e-06, "loss": 1.046, "step": 15340 }, { "epoch": 28.37, "grad_norm": 4.752230167388916, "learning_rate": 8.655083179297598e-06, "loss": 1.272, "step": 15350 }, { "epoch": 28.39, "grad_norm": 4.362618446350098, "learning_rate": 8.64768946395564e-06, "loss": 1.1461, "step": 15360 }, { "epoch": 28.41, "grad_norm": 7.554275989532471, "learning_rate": 8.640295748613679e-06, "loss": 1.0551, "step": 15370 }, { "epoch": 28.43, "grad_norm": 5.906393527984619, "learning_rate": 8.632902033271719e-06, "loss": 1.2083, "step": 15380 }, { "epoch": 28.45, "grad_norm": 6.439271926879883, "learning_rate": 8.62550831792976e-06, "loss": 1.1542, "step": 15390 }, { "epoch": 28.47, "grad_norm": 7.519855976104736, "learning_rate": 8.618114602587802e-06, "loss": 1.0011, "step": 15400 }, { "epoch": 28.48, "grad_norm": 6.3917236328125, "learning_rate": 8.610720887245842e-06, "loss": 1.5012, "step": 15410 }, { "epoch": 28.5, "grad_norm": 6.3163957595825195, "learning_rate": 8.603327171903883e-06, "loss": 1.1734, "step": 15420 }, { "epoch": 28.52, "grad_norm": 5.384838581085205, "learning_rate": 8.595933456561923e-06, "loss": 1.4773, "step": 15430 }, { "epoch": 28.54, "grad_norm": 4.750423908233643, "learning_rate": 8.588539741219963e-06, "loss": 1.4691, "step": 15440 }, { "epoch": 28.56, "grad_norm": 11.62020206451416, "learning_rate": 8.581146025878004e-06, "loss": 1.0499, "step": 15450 }, { "epoch": 28.58, "grad_norm": 9.687432289123535, "learning_rate": 8.573752310536046e-06, "loss": 1.1584, "step": 15460 }, { "epoch": 28.6, "grad_norm": 6.975427627563477, "learning_rate": 8.566358595194085e-06, "loss": 1.5315, "step": 15470 }, { "epoch": 28.61, "grad_norm": 4.771568775177002, "learning_rate": 8.558964879852127e-06, "loss": 1.653, "step": 15480 }, { "epoch": 28.63, "grad_norm": 9.714116096496582, "learning_rate": 8.551571164510167e-06, "loss": 1.0704, "step": 15490 }, { "epoch": 28.65, "grad_norm": 9.999991416931152, "learning_rate": 8.544177449168206e-06, "loss": 1.0808, "step": 15500 }, { "epoch": 28.67, "grad_norm": 8.696097373962402, "learning_rate": 8.536783733826248e-06, "loss": 1.3472, "step": 15510 }, { "epoch": 28.69, "grad_norm": 10.327348709106445, "learning_rate": 8.52939001848429e-06, "loss": 1.1789, "step": 15520 }, { "epoch": 28.71, "grad_norm": 6.994972229003906, "learning_rate": 8.52199630314233e-06, "loss": 1.3167, "step": 15530 }, { "epoch": 28.72, "grad_norm": 5.305590629577637, "learning_rate": 8.51460258780037e-06, "loss": 1.1263, "step": 15540 }, { "epoch": 28.74, "grad_norm": 4.192887306213379, "learning_rate": 8.50720887245841e-06, "loss": 1.3978, "step": 15550 }, { "epoch": 28.76, "grad_norm": 7.248758792877197, "learning_rate": 8.499815157116452e-06, "loss": 1.485, "step": 15560 }, { "epoch": 28.78, "grad_norm": 9.405587196350098, "learning_rate": 8.492421441774492e-06, "loss": 1.6036, "step": 15570 }, { "epoch": 28.8, "grad_norm": 3.8119053840637207, "learning_rate": 8.485027726432533e-06, "loss": 1.2698, "step": 15580 }, { "epoch": 28.82, "grad_norm": 9.90836238861084, "learning_rate": 8.477634011090575e-06, "loss": 1.1654, "step": 15590 }, { "epoch": 28.84, "grad_norm": 10.753583908081055, "learning_rate": 8.470240295748614e-06, "loss": 1.3658, "step": 15600 }, { "epoch": 28.85, "grad_norm": 7.11258602142334, "learning_rate": 8.462846580406654e-06, "loss": 0.9433, "step": 15610 }, { "epoch": 28.87, "grad_norm": 6.00776481628418, "learning_rate": 8.455452865064696e-06, "loss": 1.3721, "step": 15620 }, { "epoch": 28.89, "grad_norm": 7.134599685668945, "learning_rate": 8.448059149722736e-06, "loss": 1.0444, "step": 15630 }, { "epoch": 28.91, "grad_norm": 4.846619129180908, "learning_rate": 8.440665434380777e-06, "loss": 1.2056, "step": 15640 }, { "epoch": 28.93, "grad_norm": 10.130400657653809, "learning_rate": 8.433271719038818e-06, "loss": 1.3394, "step": 15650 }, { "epoch": 28.95, "grad_norm": 11.903362274169922, "learning_rate": 8.425878003696858e-06, "loss": 1.0248, "step": 15660 }, { "epoch": 28.96, "grad_norm": 6.009066581726074, "learning_rate": 8.4184842883549e-06, "loss": 1.2324, "step": 15670 }, { "epoch": 28.98, "grad_norm": 7.966669082641602, "learning_rate": 8.41109057301294e-06, "loss": 1.1969, "step": 15680 }, { "epoch": 29.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.7010128498077393, "eval_runtime": 1.6199, "eval_samples_per_second": 296.935, "eval_steps_per_second": 37.657, "step": 15689 }, { "epoch": 29.0, "grad_norm": 5.567436218261719, "learning_rate": 8.40369685767098e-06, "loss": 1.2257, "step": 15690 }, { "epoch": 29.02, "grad_norm": 9.6458158493042, "learning_rate": 8.39630314232902e-06, "loss": 1.2979, "step": 15700 }, { "epoch": 29.04, "grad_norm": 11.840112686157227, "learning_rate": 8.388909426987062e-06, "loss": 1.6028, "step": 15710 }, { "epoch": 29.06, "grad_norm": 6.399205684661865, "learning_rate": 8.381515711645102e-06, "loss": 1.4181, "step": 15720 }, { "epoch": 29.08, "grad_norm": 8.72557258605957, "learning_rate": 8.374121996303144e-06, "loss": 1.3381, "step": 15730 }, { "epoch": 29.09, "grad_norm": 8.467247009277344, "learning_rate": 8.366728280961183e-06, "loss": 1.4641, "step": 15740 }, { "epoch": 29.11, "grad_norm": 9.742775917053223, "learning_rate": 8.359334565619225e-06, "loss": 1.6675, "step": 15750 }, { "epoch": 29.13, "grad_norm": 6.297660827636719, "learning_rate": 8.351940850277265e-06, "loss": 1.0966, "step": 15760 }, { "epoch": 29.15, "grad_norm": 7.11913537979126, "learning_rate": 8.344547134935306e-06, "loss": 1.2878, "step": 15770 }, { "epoch": 29.17, "grad_norm": 6.332286834716797, "learning_rate": 8.337153419593348e-06, "loss": 1.0085, "step": 15780 }, { "epoch": 29.19, "grad_norm": 11.746231079101562, "learning_rate": 8.329759704251387e-06, "loss": 1.5078, "step": 15790 }, { "epoch": 29.21, "grad_norm": 5.886460304260254, "learning_rate": 8.322365988909427e-06, "loss": 1.2869, "step": 15800 }, { "epoch": 29.22, "grad_norm": 10.093762397766113, "learning_rate": 8.314972273567469e-06, "loss": 1.4391, "step": 15810 }, { "epoch": 29.24, "grad_norm": 9.548881530761719, "learning_rate": 8.307578558225508e-06, "loss": 1.294, "step": 15820 }, { "epoch": 29.26, "grad_norm": 6.443811893463135, "learning_rate": 8.30018484288355e-06, "loss": 1.3018, "step": 15830 }, { "epoch": 29.28, "grad_norm": 4.717520236968994, "learning_rate": 8.292791127541591e-06, "loss": 1.1304, "step": 15840 }, { "epoch": 29.3, "grad_norm": 6.638114929199219, "learning_rate": 8.285397412199631e-06, "loss": 1.2045, "step": 15850 }, { "epoch": 29.32, "grad_norm": 7.875908851623535, "learning_rate": 8.278003696857671e-06, "loss": 1.3939, "step": 15860 }, { "epoch": 29.33, "grad_norm": 2.890753984451294, "learning_rate": 8.270609981515712e-06, "loss": 1.2045, "step": 15870 }, { "epoch": 29.35, "grad_norm": 7.841060638427734, "learning_rate": 8.263216266173752e-06, "loss": 1.1545, "step": 15880 }, { "epoch": 29.37, "grad_norm": 6.483351230621338, "learning_rate": 8.255822550831794e-06, "loss": 1.2846, "step": 15890 }, { "epoch": 29.39, "grad_norm": 5.128380298614502, "learning_rate": 8.248428835489835e-06, "loss": 1.1731, "step": 15900 }, { "epoch": 29.41, "grad_norm": 7.426737308502197, "learning_rate": 8.241035120147875e-06, "loss": 0.9312, "step": 15910 }, { "epoch": 29.43, "grad_norm": 8.756804466247559, "learning_rate": 8.233641404805915e-06, "loss": 1.1994, "step": 15920 }, { "epoch": 29.45, "grad_norm": 7.084136962890625, "learning_rate": 8.226247689463956e-06, "loss": 1.4171, "step": 15930 }, { "epoch": 29.46, "grad_norm": 6.971207618713379, "learning_rate": 8.218853974121998e-06, "loss": 1.1942, "step": 15940 }, { "epoch": 29.48, "grad_norm": 8.228928565979004, "learning_rate": 8.211460258780037e-06, "loss": 1.2862, "step": 15950 }, { "epoch": 29.5, "grad_norm": 11.807385444641113, "learning_rate": 8.204066543438079e-06, "loss": 1.6667, "step": 15960 }, { "epoch": 29.52, "grad_norm": 8.3796968460083, "learning_rate": 8.196672828096119e-06, "loss": 1.3335, "step": 15970 }, { "epoch": 29.54, "grad_norm": 7.363246917724609, "learning_rate": 8.18927911275416e-06, "loss": 1.3897, "step": 15980 }, { "epoch": 29.56, "grad_norm": 5.294849872589111, "learning_rate": 8.1818853974122e-06, "loss": 1.2631, "step": 15990 }, { "epoch": 29.57, "grad_norm": 9.223872184753418, "learning_rate": 8.174491682070241e-06, "loss": 1.3449, "step": 16000 }, { "epoch": 29.59, "grad_norm": 10.656721115112305, "learning_rate": 8.167097966728281e-06, "loss": 1.4282, "step": 16010 }, { "epoch": 29.61, "grad_norm": 7.928171157836914, "learning_rate": 8.159704251386323e-06, "loss": 1.1596, "step": 16020 }, { "epoch": 29.63, "grad_norm": 7.142699241638184, "learning_rate": 8.152310536044362e-06, "loss": 1.5319, "step": 16030 }, { "epoch": 29.65, "grad_norm": 10.496225357055664, "learning_rate": 8.144916820702404e-06, "loss": 1.0255, "step": 16040 }, { "epoch": 29.67, "grad_norm": 11.129826545715332, "learning_rate": 8.137523105360444e-06, "loss": 1.4079, "step": 16050 }, { "epoch": 29.69, "grad_norm": 3.1209216117858887, "learning_rate": 8.130129390018485e-06, "loss": 1.334, "step": 16060 }, { "epoch": 29.7, "grad_norm": 14.10083293914795, "learning_rate": 8.122735674676525e-06, "loss": 1.2565, "step": 16070 }, { "epoch": 29.72, "grad_norm": 6.532052993774414, "learning_rate": 8.115341959334566e-06, "loss": 1.4587, "step": 16080 }, { "epoch": 29.74, "grad_norm": 9.775726318359375, "learning_rate": 8.107948243992608e-06, "loss": 1.5397, "step": 16090 }, { "epoch": 29.76, "grad_norm": 6.1632280349731445, "learning_rate": 8.100554528650648e-06, "loss": 1.2672, "step": 16100 }, { "epoch": 29.78, "grad_norm": 13.242855072021484, "learning_rate": 8.093160813308688e-06, "loss": 1.3383, "step": 16110 }, { "epoch": 29.8, "grad_norm": 5.912835121154785, "learning_rate": 8.085767097966729e-06, "loss": 1.2295, "step": 16120 }, { "epoch": 29.82, "grad_norm": 7.528906345367432, "learning_rate": 8.07837338262477e-06, "loss": 1.0706, "step": 16130 }, { "epoch": 29.83, "grad_norm": 2.9443514347076416, "learning_rate": 8.07097966728281e-06, "loss": 1.2177, "step": 16140 }, { "epoch": 29.85, "grad_norm": 6.097892761230469, "learning_rate": 8.063585951940852e-06, "loss": 1.186, "step": 16150 }, { "epoch": 29.87, "grad_norm": 11.343749046325684, "learning_rate": 8.056192236598892e-06, "loss": 1.4584, "step": 16160 }, { "epoch": 29.89, "grad_norm": 6.992361068725586, "learning_rate": 8.048798521256931e-06, "loss": 0.8855, "step": 16170 }, { "epoch": 29.91, "grad_norm": 8.224095344543457, "learning_rate": 8.041404805914973e-06, "loss": 1.4394, "step": 16180 }, { "epoch": 29.93, "grad_norm": 5.304134845733643, "learning_rate": 8.034011090573014e-06, "loss": 1.06, "step": 16190 }, { "epoch": 29.94, "grad_norm": 6.789124011993408, "learning_rate": 8.026617375231054e-06, "loss": 0.9003, "step": 16200 }, { "epoch": 29.96, "grad_norm": 12.227429389953613, "learning_rate": 8.019223659889096e-06, "loss": 1.2345, "step": 16210 }, { "epoch": 29.98, "grad_norm": 9.553756713867188, "learning_rate": 8.011829944547135e-06, "loss": 1.2106, "step": 16220 }, { "epoch": 30.0, "grad_norm": 10.147976875305176, "learning_rate": 8.004436229205175e-06, "loss": 1.3721, "step": 16230 }, { "epoch": 30.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.692821204662323, "eval_runtime": 1.5307, "eval_samples_per_second": 314.245, "eval_steps_per_second": 39.852, "step": 16230 }, { "epoch": 30.02, "grad_norm": 8.6131010055542, "learning_rate": 7.997042513863217e-06, "loss": 1.1851, "step": 16240 }, { "epoch": 30.04, "grad_norm": 6.777403831481934, "learning_rate": 7.989648798521258e-06, "loss": 0.8933, "step": 16250 }, { "epoch": 30.06, "grad_norm": 11.944092750549316, "learning_rate": 7.982255083179298e-06, "loss": 1.2604, "step": 16260 }, { "epoch": 30.07, "grad_norm": 7.5707926750183105, "learning_rate": 7.97486136783734e-06, "loss": 1.6531, "step": 16270 }, { "epoch": 30.09, "grad_norm": 14.896036148071289, "learning_rate": 7.967467652495379e-06, "loss": 1.3492, "step": 16280 }, { "epoch": 30.11, "grad_norm": 13.501241683959961, "learning_rate": 7.960073937153419e-06, "loss": 1.2151, "step": 16290 }, { "epoch": 30.13, "grad_norm": 9.752790451049805, "learning_rate": 7.95268022181146e-06, "loss": 1.3371, "step": 16300 }, { "epoch": 30.15, "grad_norm": 5.216186046600342, "learning_rate": 7.945286506469502e-06, "loss": 1.2952, "step": 16310 }, { "epoch": 30.17, "grad_norm": 8.047064781188965, "learning_rate": 7.937892791127543e-06, "loss": 1.4769, "step": 16320 }, { "epoch": 30.18, "grad_norm": 6.031919956207275, "learning_rate": 7.930499075785583e-06, "loss": 1.116, "step": 16330 }, { "epoch": 30.2, "grad_norm": 11.455366134643555, "learning_rate": 7.923105360443623e-06, "loss": 1.084, "step": 16340 }, { "epoch": 30.22, "grad_norm": 8.373392105102539, "learning_rate": 7.915711645101664e-06, "loss": 1.148, "step": 16350 }, { "epoch": 30.24, "grad_norm": 8.634023666381836, "learning_rate": 7.908317929759704e-06, "loss": 1.1253, "step": 16360 }, { "epoch": 30.26, "grad_norm": 6.110973834991455, "learning_rate": 7.900924214417746e-06, "loss": 1.1571, "step": 16370 }, { "epoch": 30.28, "grad_norm": 12.797734260559082, "learning_rate": 7.893530499075787e-06, "loss": 1.3428, "step": 16380 }, { "epoch": 30.3, "grad_norm": 8.543042182922363, "learning_rate": 7.886136783733827e-06, "loss": 1.2249, "step": 16390 }, { "epoch": 30.31, "grad_norm": 3.9834508895874023, "learning_rate": 7.878743068391868e-06, "loss": 1.3553, "step": 16400 }, { "epoch": 30.33, "grad_norm": 7.662215709686279, "learning_rate": 7.871349353049908e-06, "loss": 1.3138, "step": 16410 }, { "epoch": 30.35, "grad_norm": 4.957363128662109, "learning_rate": 7.863955637707948e-06, "loss": 1.4089, "step": 16420 }, { "epoch": 30.37, "grad_norm": 5.618158340454102, "learning_rate": 7.85656192236599e-06, "loss": 1.4505, "step": 16430 }, { "epoch": 30.39, "grad_norm": 10.890913009643555, "learning_rate": 7.849168207024031e-06, "loss": 1.5035, "step": 16440 }, { "epoch": 30.41, "grad_norm": 6.355422019958496, "learning_rate": 7.84177449168207e-06, "loss": 1.1115, "step": 16450 }, { "epoch": 30.43, "grad_norm": 9.117525100708008, "learning_rate": 7.834380776340112e-06, "loss": 0.7368, "step": 16460 }, { "epoch": 30.44, "grad_norm": 8.585612297058105, "learning_rate": 7.826987060998152e-06, "loss": 1.1929, "step": 16470 }, { "epoch": 30.46, "grad_norm": 5.227377414703369, "learning_rate": 7.819593345656192e-06, "loss": 1.2802, "step": 16480 }, { "epoch": 30.48, "grad_norm": 6.8948750495910645, "learning_rate": 7.812199630314233e-06, "loss": 1.0912, "step": 16490 }, { "epoch": 30.5, "grad_norm": 4.962432384490967, "learning_rate": 7.804805914972275e-06, "loss": 1.03, "step": 16500 }, { "epoch": 30.52, "grad_norm": 5.336109638214111, "learning_rate": 7.797412199630316e-06, "loss": 1.0357, "step": 16510 }, { "epoch": 30.54, "grad_norm": 9.584413528442383, "learning_rate": 7.790018484288356e-06, "loss": 1.3317, "step": 16520 }, { "epoch": 30.55, "grad_norm": 6.0132951736450195, "learning_rate": 7.782624768946396e-06, "loss": 1.0719, "step": 16530 }, { "epoch": 30.57, "grad_norm": 7.007846355438232, "learning_rate": 7.775231053604437e-06, "loss": 1.1868, "step": 16540 }, { "epoch": 30.59, "grad_norm": 9.673008918762207, "learning_rate": 7.767837338262477e-06, "loss": 1.3189, "step": 16550 }, { "epoch": 30.61, "grad_norm": 7.435279369354248, "learning_rate": 7.760443622920518e-06, "loss": 1.1872, "step": 16560 }, { "epoch": 30.63, "grad_norm": 7.4520440101623535, "learning_rate": 7.75304990757856e-06, "loss": 1.2143, "step": 16570 }, { "epoch": 30.65, "grad_norm": 7.968234539031982, "learning_rate": 7.7456561922366e-06, "loss": 1.3216, "step": 16580 }, { "epoch": 30.67, "grad_norm": 10.14477252960205, "learning_rate": 7.73826247689464e-06, "loss": 1.1187, "step": 16590 }, { "epoch": 30.68, "grad_norm": 6.552546501159668, "learning_rate": 7.730868761552681e-06, "loss": 1.3897, "step": 16600 }, { "epoch": 30.7, "grad_norm": 4.302252769470215, "learning_rate": 7.72347504621072e-06, "loss": 1.2405, "step": 16610 }, { "epoch": 30.72, "grad_norm": 4.336849689483643, "learning_rate": 7.716081330868762e-06, "loss": 1.2856, "step": 16620 }, { "epoch": 30.74, "grad_norm": 9.771384239196777, "learning_rate": 7.708687615526804e-06, "loss": 1.1053, "step": 16630 }, { "epoch": 30.76, "grad_norm": 5.152838706970215, "learning_rate": 7.701293900184844e-06, "loss": 1.4193, "step": 16640 }, { "epoch": 30.78, "grad_norm": 11.440196990966797, "learning_rate": 7.693900184842883e-06, "loss": 1.4242, "step": 16650 }, { "epoch": 30.79, "grad_norm": 3.1950082778930664, "learning_rate": 7.686506469500925e-06, "loss": 0.9671, "step": 16660 }, { "epoch": 30.81, "grad_norm": 4.913647174835205, "learning_rate": 7.679112754158965e-06, "loss": 1.4778, "step": 16670 }, { "epoch": 30.83, "grad_norm": 5.210831642150879, "learning_rate": 7.671719038817006e-06, "loss": 1.3307, "step": 16680 }, { "epoch": 30.85, "grad_norm": 5.174943447113037, "learning_rate": 7.664325323475048e-06, "loss": 1.3062, "step": 16690 }, { "epoch": 30.87, "grad_norm": 2.9148120880126953, "learning_rate": 7.656931608133087e-06, "loss": 1.3512, "step": 16700 }, { "epoch": 30.89, "grad_norm": 4.952943325042725, "learning_rate": 7.649537892791127e-06, "loss": 1.3043, "step": 16710 }, { "epoch": 30.91, "grad_norm": 5.879188537597656, "learning_rate": 7.642144177449169e-06, "loss": 1.3746, "step": 16720 }, { "epoch": 30.92, "grad_norm": 6.974434852600098, "learning_rate": 7.63475046210721e-06, "loss": 1.5869, "step": 16730 }, { "epoch": 30.94, "grad_norm": 4.4884772300720215, "learning_rate": 7.62735674676525e-06, "loss": 1.2257, "step": 16740 }, { "epoch": 30.96, "grad_norm": 9.00014591217041, "learning_rate": 7.6199630314232905e-06, "loss": 1.1639, "step": 16750 }, { "epoch": 30.98, "grad_norm": 7.1107635498046875, "learning_rate": 7.612569316081332e-06, "loss": 1.3184, "step": 16760 }, { "epoch": 31.0, "grad_norm": 5.960830211639404, "learning_rate": 7.605175600739372e-06, "loss": 1.4051, "step": 16770 }, { "epoch": 31.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.697636604309082, "eval_runtime": 1.5409, "eval_samples_per_second": 312.162, "eval_steps_per_second": 39.588, "step": 16771 }, { "epoch": 31.02, "grad_norm": 4.825220108032227, "learning_rate": 7.597781885397413e-06, "loss": 1.3419, "step": 16780 }, { "epoch": 31.04, "grad_norm": 5.570434093475342, "learning_rate": 7.590388170055454e-06, "loss": 1.4342, "step": 16790 }, { "epoch": 31.05, "grad_norm": 8.77595329284668, "learning_rate": 7.582994454713494e-06, "loss": 1.5835, "step": 16800 }, { "epoch": 31.07, "grad_norm": 6.570970058441162, "learning_rate": 7.575600739371535e-06, "loss": 1.269, "step": 16810 }, { "epoch": 31.09, "grad_norm": 7.393406391143799, "learning_rate": 7.568207024029576e-06, "loss": 1.1071, "step": 16820 }, { "epoch": 31.11, "grad_norm": 8.447117805480957, "learning_rate": 7.5608133086876155e-06, "loss": 1.4124, "step": 16830 }, { "epoch": 31.13, "grad_norm": 10.204466819763184, "learning_rate": 7.554158964879853e-06, "loss": 1.1437, "step": 16840 }, { "epoch": 31.15, "grad_norm": 8.4327392578125, "learning_rate": 7.546765249537893e-06, "loss": 1.3628, "step": 16850 }, { "epoch": 31.16, "grad_norm": 5.890074729919434, "learning_rate": 7.539371534195934e-06, "loss": 1.5305, "step": 16860 }, { "epoch": 31.18, "grad_norm": 7.917720794677734, "learning_rate": 7.531977818853975e-06, "loss": 1.1154, "step": 16870 }, { "epoch": 31.2, "grad_norm": 11.313860893249512, "learning_rate": 7.524584103512015e-06, "loss": 1.1409, "step": 16880 }, { "epoch": 31.22, "grad_norm": 8.94122314453125, "learning_rate": 7.5171903881700555e-06, "loss": 1.2609, "step": 16890 }, { "epoch": 31.24, "grad_norm": 10.668384552001953, "learning_rate": 7.509796672828097e-06, "loss": 1.0821, "step": 16900 }, { "epoch": 31.26, "grad_norm": 10.178790092468262, "learning_rate": 7.502402957486138e-06, "loss": 1.3295, "step": 16910 }, { "epoch": 31.28, "grad_norm": 10.923539161682129, "learning_rate": 7.495009242144177e-06, "loss": 1.0994, "step": 16920 }, { "epoch": 31.29, "grad_norm": 8.921259880065918, "learning_rate": 7.487615526802219e-06, "loss": 1.1725, "step": 16930 }, { "epoch": 31.31, "grad_norm": 3.7828547954559326, "learning_rate": 7.4802218114602595e-06, "loss": 1.2786, "step": 16940 }, { "epoch": 31.33, "grad_norm": 6.517751216888428, "learning_rate": 7.4728280961183e-06, "loss": 1.0124, "step": 16950 }, { "epoch": 31.35, "grad_norm": 7.060566425323486, "learning_rate": 7.465434380776341e-06, "loss": 1.3213, "step": 16960 }, { "epoch": 31.37, "grad_norm": 6.9080119132995605, "learning_rate": 7.458040665434381e-06, "loss": 1.2211, "step": 16970 }, { "epoch": 31.39, "grad_norm": 9.345340728759766, "learning_rate": 7.450646950092422e-06, "loss": 1.1817, "step": 16980 }, { "epoch": 31.4, "grad_norm": 8.587270736694336, "learning_rate": 7.443253234750463e-06, "loss": 1.5553, "step": 16990 }, { "epoch": 31.42, "grad_norm": 6.62686014175415, "learning_rate": 7.435859519408503e-06, "loss": 1.3772, "step": 17000 }, { "epoch": 31.44, "grad_norm": 7.4031877517700195, "learning_rate": 7.428465804066544e-06, "loss": 1.2541, "step": 17010 }, { "epoch": 31.46, "grad_norm": 9.940272331237793, "learning_rate": 7.4210720887245845e-06, "loss": 1.2061, "step": 17020 }, { "epoch": 31.48, "grad_norm": 5.82899284362793, "learning_rate": 7.413678373382626e-06, "loss": 0.9043, "step": 17030 }, { "epoch": 31.5, "grad_norm": 5.559067249298096, "learning_rate": 7.406284658040666e-06, "loss": 1.0859, "step": 17040 }, { "epoch": 31.52, "grad_norm": 6.501069068908691, "learning_rate": 7.3988909426987064e-06, "loss": 1.5186, "step": 17050 }, { "epoch": 31.53, "grad_norm": 6.8692779541015625, "learning_rate": 7.391497227356748e-06, "loss": 1.4445, "step": 17060 }, { "epoch": 31.55, "grad_norm": 7.1207804679870605, "learning_rate": 7.3841035120147885e-06, "loss": 1.292, "step": 17070 }, { "epoch": 31.57, "grad_norm": 6.940738677978516, "learning_rate": 7.376709796672828e-06, "loss": 1.0003, "step": 17080 }, { "epoch": 31.59, "grad_norm": 12.646124839782715, "learning_rate": 7.36931608133087e-06, "loss": 1.4767, "step": 17090 }, { "epoch": 31.61, "grad_norm": 6.561670780181885, "learning_rate": 7.3619223659889104e-06, "loss": 1.2954, "step": 17100 }, { "epoch": 31.63, "grad_norm": 6.054847717285156, "learning_rate": 7.35452865064695e-06, "loss": 1.1465, "step": 17110 }, { "epoch": 31.65, "grad_norm": 5.609902381896973, "learning_rate": 7.347134935304992e-06, "loss": 1.2942, "step": 17120 }, { "epoch": 31.66, "grad_norm": 5.242161273956299, "learning_rate": 7.339741219963032e-06, "loss": 1.1172, "step": 17130 }, { "epoch": 31.68, "grad_norm": 8.074248313903809, "learning_rate": 7.332347504621072e-06, "loss": 1.2816, "step": 17140 }, { "epoch": 31.7, "grad_norm": 6.766181468963623, "learning_rate": 7.324953789279114e-06, "loss": 1.3732, "step": 17150 }, { "epoch": 31.72, "grad_norm": 14.79443073272705, "learning_rate": 7.317560073937154e-06, "loss": 1.2403, "step": 17160 }, { "epoch": 31.74, "grad_norm": 10.086904525756836, "learning_rate": 7.310166358595194e-06, "loss": 1.5099, "step": 17170 }, { "epoch": 31.76, "grad_norm": 6.018208026885986, "learning_rate": 7.3027726432532355e-06, "loss": 1.147, "step": 17180 }, { "epoch": 31.77, "grad_norm": 7.212084770202637, "learning_rate": 7.295378927911276e-06, "loss": 1.0345, "step": 17190 }, { "epoch": 31.79, "grad_norm": 6.524225234985352, "learning_rate": 7.287985212569316e-06, "loss": 1.2752, "step": 17200 }, { "epoch": 31.81, "grad_norm": 7.317665100097656, "learning_rate": 7.280591497227357e-06, "loss": 0.9531, "step": 17210 }, { "epoch": 31.83, "grad_norm": 6.2105712890625, "learning_rate": 7.273197781885398e-06, "loss": 1.404, "step": 17220 }, { "epoch": 31.85, "grad_norm": 8.775808334350586, "learning_rate": 7.265804066543438e-06, "loss": 1.3005, "step": 17230 }, { "epoch": 31.87, "grad_norm": 12.767324447631836, "learning_rate": 7.258410351201479e-06, "loss": 1.3752, "step": 17240 }, { "epoch": 31.89, "grad_norm": 4.5290846824646, "learning_rate": 7.25101663585952e-06, "loss": 1.2338, "step": 17250 }, { "epoch": 31.9, "grad_norm": 12.816914558410645, "learning_rate": 7.243622920517561e-06, "loss": 1.4183, "step": 17260 }, { "epoch": 31.92, "grad_norm": 7.383005619049072, "learning_rate": 7.236229205175601e-06, "loss": 1.3048, "step": 17270 }, { "epoch": 31.94, "grad_norm": 11.442276000976562, "learning_rate": 7.228835489833642e-06, "loss": 1.3854, "step": 17280 }, { "epoch": 31.96, "grad_norm": 4.898057460784912, "learning_rate": 7.221441774491683e-06, "loss": 1.289, "step": 17290 }, { "epoch": 31.98, "grad_norm": 4.743943214416504, "learning_rate": 7.214048059149723e-06, "loss": 1.1381, "step": 17300 }, { "epoch": 32.0, "grad_norm": 7.166031360626221, "learning_rate": 7.206654343807764e-06, "loss": 1.1004, "step": 17310 }, { "epoch": 32.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.67845219373703, "eval_runtime": 1.5683, "eval_samples_per_second": 306.711, "eval_steps_per_second": 38.897, "step": 17312 }, { "epoch": 32.01, "grad_norm": 5.707307815551758, "learning_rate": 7.199260628465805e-06, "loss": 1.648, "step": 17320 }, { "epoch": 32.03, "grad_norm": 5.773965835571289, "learning_rate": 7.191866913123845e-06, "loss": 1.2303, "step": 17330 }, { "epoch": 32.05, "grad_norm": 7.7907185554504395, "learning_rate": 7.184473197781886e-06, "loss": 1.051, "step": 17340 }, { "epoch": 32.07, "grad_norm": 4.109180450439453, "learning_rate": 7.177079482439927e-06, "loss": 0.9623, "step": 17350 }, { "epoch": 32.09, "grad_norm": 5.0084662437438965, "learning_rate": 7.169685767097967e-06, "loss": 0.9338, "step": 17360 }, { "epoch": 32.11, "grad_norm": 8.391366004943848, "learning_rate": 7.162292051756008e-06, "loss": 1.6567, "step": 17370 }, { "epoch": 32.13, "grad_norm": 9.359806060791016, "learning_rate": 7.154898336414049e-06, "loss": 1.4063, "step": 17380 }, { "epoch": 32.14, "grad_norm": 9.267990112304688, "learning_rate": 7.147504621072089e-06, "loss": 1.3316, "step": 17390 }, { "epoch": 32.16, "grad_norm": 4.867043495178223, "learning_rate": 7.14011090573013e-06, "loss": 1.0945, "step": 17400 }, { "epoch": 32.18, "grad_norm": 7.608358383178711, "learning_rate": 7.132717190388171e-06, "loss": 1.1167, "step": 17410 }, { "epoch": 32.2, "grad_norm": 7.783542156219482, "learning_rate": 7.125323475046211e-06, "loss": 1.3756, "step": 17420 }, { "epoch": 32.22, "grad_norm": 9.59104061126709, "learning_rate": 7.117929759704252e-06, "loss": 1.4766, "step": 17430 }, { "epoch": 32.24, "grad_norm": 6.353194236755371, "learning_rate": 7.110536044362293e-06, "loss": 1.0154, "step": 17440 }, { "epoch": 32.26, "grad_norm": 6.217390060424805, "learning_rate": 7.103142329020334e-06, "loss": 1.2947, "step": 17450 }, { "epoch": 32.27, "grad_norm": 5.801082611083984, "learning_rate": 7.095748613678374e-06, "loss": 1.2958, "step": 17460 }, { "epoch": 32.29, "grad_norm": 7.492624759674072, "learning_rate": 7.088354898336415e-06, "loss": 1.2027, "step": 17470 }, { "epoch": 32.31, "grad_norm": 4.2885637283325195, "learning_rate": 7.080961182994456e-06, "loss": 1.2297, "step": 17480 }, { "epoch": 32.33, "grad_norm": 4.926486492156982, "learning_rate": 7.073567467652496e-06, "loss": 1.4456, "step": 17490 }, { "epoch": 32.35, "grad_norm": 6.511451721191406, "learning_rate": 7.0661737523105365e-06, "loss": 1.2293, "step": 17500 }, { "epoch": 32.37, "grad_norm": 8.372035026550293, "learning_rate": 7.058780036968578e-06, "loss": 1.155, "step": 17510 }, { "epoch": 32.38, "grad_norm": 5.211967945098877, "learning_rate": 7.051386321626618e-06, "loss": 1.022, "step": 17520 }, { "epoch": 32.4, "grad_norm": 5.743246555328369, "learning_rate": 7.0439926062846584e-06, "loss": 1.4818, "step": 17530 }, { "epoch": 32.42, "grad_norm": 8.470646858215332, "learning_rate": 7.0365988909427e-06, "loss": 1.3009, "step": 17540 }, { "epoch": 32.44, "grad_norm": 10.274312019348145, "learning_rate": 7.02920517560074e-06, "loss": 1.2225, "step": 17550 }, { "epoch": 32.46, "grad_norm": 13.005684852600098, "learning_rate": 7.02181146025878e-06, "loss": 1.665, "step": 17560 }, { "epoch": 32.48, "grad_norm": 10.854597091674805, "learning_rate": 7.014417744916822e-06, "loss": 0.9618, "step": 17570 }, { "epoch": 32.5, "grad_norm": 10.92041301727295, "learning_rate": 7.007024029574862e-06, "loss": 1.3036, "step": 17580 }, { "epoch": 32.51, "grad_norm": 11.127786636352539, "learning_rate": 6.999630314232902e-06, "loss": 1.2389, "step": 17590 }, { "epoch": 32.53, "grad_norm": 7.430180549621582, "learning_rate": 6.992236598890944e-06, "loss": 1.0505, "step": 17600 }, { "epoch": 32.55, "grad_norm": 9.821120262145996, "learning_rate": 6.9848428835489835e-06, "loss": 1.1291, "step": 17610 }, { "epoch": 32.57, "grad_norm": 10.013116836547852, "learning_rate": 6.977449168207024e-06, "loss": 0.7212, "step": 17620 }, { "epoch": 32.59, "grad_norm": 3.905520439147949, "learning_rate": 6.970055452865066e-06, "loss": 0.9073, "step": 17630 }, { "epoch": 32.61, "grad_norm": 8.337102890014648, "learning_rate": 6.962661737523106e-06, "loss": 1.2913, "step": 17640 }, { "epoch": 32.62, "grad_norm": 8.829279899597168, "learning_rate": 6.955268022181146e-06, "loss": 1.1015, "step": 17650 }, { "epoch": 32.64, "grad_norm": 8.414956092834473, "learning_rate": 6.9478743068391875e-06, "loss": 1.3008, "step": 17660 }, { "epoch": 32.66, "grad_norm": 4.26358699798584, "learning_rate": 6.940480591497228e-06, "loss": 1.2029, "step": 17670 }, { "epoch": 32.68, "grad_norm": 8.385793685913086, "learning_rate": 6.933086876155268e-06, "loss": 1.0967, "step": 17680 }, { "epoch": 32.7, "grad_norm": 9.37621784210205, "learning_rate": 6.925693160813309e-06, "loss": 1.5326, "step": 17690 }, { "epoch": 32.72, "grad_norm": 11.120820045471191, "learning_rate": 6.91829944547135e-06, "loss": 1.128, "step": 17700 }, { "epoch": 32.74, "grad_norm": 6.421444892883301, "learning_rate": 6.910905730129391e-06, "loss": 1.4186, "step": 17710 }, { "epoch": 32.75, "grad_norm": 9.533405303955078, "learning_rate": 6.903512014787431e-06, "loss": 1.3424, "step": 17720 }, { "epoch": 32.77, "grad_norm": 8.0885648727417, "learning_rate": 6.896118299445472e-06, "loss": 1.5594, "step": 17730 }, { "epoch": 32.79, "grad_norm": 6.89553689956665, "learning_rate": 6.8887245841035125e-06, "loss": 1.2528, "step": 17740 }, { "epoch": 32.81, "grad_norm": 5.13557767868042, "learning_rate": 6.881330868761553e-06, "loss": 1.444, "step": 17750 }, { "epoch": 32.83, "grad_norm": 10.878037452697754, "learning_rate": 6.873937153419594e-06, "loss": 1.3816, "step": 17760 }, { "epoch": 32.85, "grad_norm": 7.203150749206543, "learning_rate": 6.8665434380776344e-06, "loss": 1.2447, "step": 17770 }, { "epoch": 32.87, "grad_norm": 7.941763877868652, "learning_rate": 6.859149722735675e-06, "loss": 1.3524, "step": 17780 }, { "epoch": 32.88, "grad_norm": 4.7530622482299805, "learning_rate": 6.8517560073937165e-06, "loss": 1.3147, "step": 17790 }, { "epoch": 32.9, "grad_norm": 11.4851713180542, "learning_rate": 6.844362292051756e-06, "loss": 1.2484, "step": 17800 }, { "epoch": 32.92, "grad_norm": 5.883292198181152, "learning_rate": 6.836968576709797e-06, "loss": 1.0525, "step": 17810 }, { "epoch": 32.94, "grad_norm": 8.723095893859863, "learning_rate": 6.8295748613678384e-06, "loss": 1.4816, "step": 17820 }, { "epoch": 32.96, "grad_norm": 8.809372901916504, "learning_rate": 6.822181146025879e-06, "loss": 1.4973, "step": 17830 }, { "epoch": 32.98, "grad_norm": 7.0233683586120605, "learning_rate": 6.814787430683919e-06, "loss": 1.2779, "step": 17840 }, { "epoch": 32.99, "grad_norm": 10.026683807373047, "learning_rate": 6.80739371534196e-06, "loss": 1.2668, "step": 17850 }, { "epoch": 33.0, "eval_accuracy": 0.7817047817047817, "eval_loss": 0.688318133354187, "eval_runtime": 1.5012, "eval_samples_per_second": 320.418, "eval_steps_per_second": 40.635, "step": 17853 }, { "epoch": 33.01, "grad_norm": 10.550139427185059, "learning_rate": 6.800000000000001e-06, "loss": 0.861, "step": 17860 }, { "epoch": 33.03, "grad_norm": 10.409921646118164, "learning_rate": 6.792606284658041e-06, "loss": 1.1056, "step": 17870 }, { "epoch": 33.05, "grad_norm": 6.504714488983154, "learning_rate": 6.785212569316082e-06, "loss": 0.9693, "step": 17880 }, { "epoch": 33.07, "grad_norm": 3.7629971504211426, "learning_rate": 6.777818853974123e-06, "loss": 1.1907, "step": 17890 }, { "epoch": 33.09, "grad_norm": 9.16115665435791, "learning_rate": 6.770425138632163e-06, "loss": 1.4635, "step": 17900 }, { "epoch": 33.11, "grad_norm": 8.367034912109375, "learning_rate": 6.763031423290204e-06, "loss": 1.2151, "step": 17910 }, { "epoch": 33.12, "grad_norm": 12.835128784179688, "learning_rate": 6.755637707948245e-06, "loss": 1.3527, "step": 17920 }, { "epoch": 33.14, "grad_norm": 15.745380401611328, "learning_rate": 6.7482439926062845e-06, "loss": 1.2163, "step": 17930 }, { "epoch": 33.16, "grad_norm": 7.038750171661377, "learning_rate": 6.740850277264326e-06, "loss": 1.6452, "step": 17940 }, { "epoch": 33.18, "grad_norm": 5.353537082672119, "learning_rate": 6.733456561922367e-06, "loss": 1.1218, "step": 17950 }, { "epoch": 33.2, "grad_norm": 8.98630142211914, "learning_rate": 6.7260628465804064e-06, "loss": 1.3246, "step": 17960 }, { "epoch": 33.22, "grad_norm": 6.300439357757568, "learning_rate": 6.718669131238448e-06, "loss": 1.2071, "step": 17970 }, { "epoch": 33.23, "grad_norm": 6.620423316955566, "learning_rate": 6.7112754158964885e-06, "loss": 1.1539, "step": 17980 }, { "epoch": 33.25, "grad_norm": 5.563384532928467, "learning_rate": 6.703881700554528e-06, "loss": 1.3814, "step": 17990 }, { "epoch": 33.27, "grad_norm": 6.96130895614624, "learning_rate": 6.69648798521257e-06, "loss": 0.8264, "step": 18000 }, { "epoch": 33.29, "grad_norm": 12.329885482788086, "learning_rate": 6.6890942698706104e-06, "loss": 1.6784, "step": 18010 }, { "epoch": 33.31, "grad_norm": 5.123650550842285, "learning_rate": 6.681700554528652e-06, "loss": 1.0895, "step": 18020 }, { "epoch": 33.33, "grad_norm": 4.630773544311523, "learning_rate": 6.674306839186692e-06, "loss": 1.0869, "step": 18030 }, { "epoch": 33.35, "grad_norm": 6.212345600128174, "learning_rate": 6.666913123844732e-06, "loss": 1.1783, "step": 18040 }, { "epoch": 33.36, "grad_norm": 12.417391777038574, "learning_rate": 6.659519408502774e-06, "loss": 1.1704, "step": 18050 }, { "epoch": 33.38, "grad_norm": 1.7101982831954956, "learning_rate": 6.652125693160814e-06, "loss": 1.3238, "step": 18060 }, { "epoch": 33.4, "grad_norm": 11.24477481842041, "learning_rate": 6.644731977818854e-06, "loss": 1.3389, "step": 18070 }, { "epoch": 33.42, "grad_norm": 12.994150161743164, "learning_rate": 6.637338262476896e-06, "loss": 1.1288, "step": 18080 }, { "epoch": 33.44, "grad_norm": 8.610638618469238, "learning_rate": 6.6299445471349355e-06, "loss": 1.3168, "step": 18090 }, { "epoch": 33.46, "grad_norm": 9.99787425994873, "learning_rate": 6.622550831792976e-06, "loss": 1.1469, "step": 18100 }, { "epoch": 33.48, "grad_norm": 5.676076412200928, "learning_rate": 6.615157116451018e-06, "loss": 1.1002, "step": 18110 }, { "epoch": 33.49, "grad_norm": 7.070483207702637, "learning_rate": 6.607763401109057e-06, "loss": 1.8303, "step": 18120 }, { "epoch": 33.51, "grad_norm": 6.711025238037109, "learning_rate": 6.600369685767099e-06, "loss": 1.4334, "step": 18130 }, { "epoch": 33.53, "grad_norm": 6.475217342376709, "learning_rate": 6.5929759704251395e-06, "loss": 0.8842, "step": 18140 }, { "epoch": 33.55, "grad_norm": 11.096541404724121, "learning_rate": 6.585582255083179e-06, "loss": 1.1342, "step": 18150 }, { "epoch": 33.57, "grad_norm": 12.372204780578613, "learning_rate": 6.578188539741221e-06, "loss": 1.6054, "step": 18160 }, { "epoch": 33.59, "grad_norm": 8.56574821472168, "learning_rate": 6.570794824399261e-06, "loss": 0.9483, "step": 18170 }, { "epoch": 33.6, "grad_norm": 3.999720335006714, "learning_rate": 6.563401109057301e-06, "loss": 1.1578, "step": 18180 }, { "epoch": 33.62, "grad_norm": 2.6338484287261963, "learning_rate": 6.556007393715343e-06, "loss": 1.1265, "step": 18190 }, { "epoch": 33.64, "grad_norm": 12.133766174316406, "learning_rate": 6.548613678373383e-06, "loss": 1.2312, "step": 18200 }, { "epoch": 33.66, "grad_norm": 5.5973310470581055, "learning_rate": 6.541219963031425e-06, "loss": 0.9836, "step": 18210 }, { "epoch": 33.68, "grad_norm": 6.721661567687988, "learning_rate": 6.5338262476894645e-06, "loss": 1.3389, "step": 18220 }, { "epoch": 33.7, "grad_norm": 7.737293720245361, "learning_rate": 6.526432532347505e-06, "loss": 1.3561, "step": 18230 }, { "epoch": 33.72, "grad_norm": 6.108405113220215, "learning_rate": 6.519038817005547e-06, "loss": 1.1974, "step": 18240 }, { "epoch": 33.73, "grad_norm": 8.008671760559082, "learning_rate": 6.5116451016635864e-06, "loss": 1.1124, "step": 18250 }, { "epoch": 33.75, "grad_norm": 12.08618450164795, "learning_rate": 6.504251386321627e-06, "loss": 1.173, "step": 18260 }, { "epoch": 33.77, "grad_norm": 11.471029281616211, "learning_rate": 6.4968576709796685e-06, "loss": 1.0785, "step": 18270 }, { "epoch": 33.79, "grad_norm": 8.31908130645752, "learning_rate": 6.489463955637708e-06, "loss": 1.2155, "step": 18280 }, { "epoch": 33.81, "grad_norm": 4.640090465545654, "learning_rate": 6.482070240295749e-06, "loss": 1.1254, "step": 18290 }, { "epoch": 33.83, "grad_norm": 7.596045017242432, "learning_rate": 6.4746765249537904e-06, "loss": 1.3003, "step": 18300 }, { "epoch": 33.84, "grad_norm": 7.694844722747803, "learning_rate": 6.46728280961183e-06, "loss": 1.333, "step": 18310 }, { "epoch": 33.86, "grad_norm": 14.580756187438965, "learning_rate": 6.459889094269871e-06, "loss": 1.2227, "step": 18320 }, { "epoch": 33.88, "grad_norm": 4.898778438568115, "learning_rate": 6.452495378927912e-06, "loss": 1.2323, "step": 18330 }, { "epoch": 33.9, "grad_norm": 8.674956321716309, "learning_rate": 6.445101663585952e-06, "loss": 1.2953, "step": 18340 }, { "epoch": 33.92, "grad_norm": 15.566268920898438, "learning_rate": 6.437707948243993e-06, "loss": 1.1345, "step": 18350 }, { "epoch": 33.94, "grad_norm": 7.324656963348389, "learning_rate": 6.430314232902034e-06, "loss": 0.8886, "step": 18360 }, { "epoch": 33.96, "grad_norm": 7.430301666259766, "learning_rate": 6.422920517560074e-06, "loss": 1.3072, "step": 18370 }, { "epoch": 33.97, "grad_norm": 5.876465797424316, "learning_rate": 6.415526802218115e-06, "loss": 1.6009, "step": 18380 }, { "epoch": 33.99, "grad_norm": 9.634885787963867, "learning_rate": 6.408133086876156e-06, "loss": 1.0728, "step": 18390 }, { "epoch": 34.0, "eval_accuracy": 0.7858627858627859, "eval_loss": 0.6924498081207275, "eval_runtime": 1.5156, "eval_samples_per_second": 317.361, "eval_steps_per_second": 40.247, "step": 18394 }, { "epoch": 34.01, "grad_norm": 15.661543846130371, "learning_rate": 6.400739371534197e-06, "loss": 1.3721, "step": 18400 }, { "epoch": 34.03, "grad_norm": 6.9028730392456055, "learning_rate": 6.3933456561922365e-06, "loss": 1.3244, "step": 18410 }, { "epoch": 34.05, "grad_norm": 8.469842910766602, "learning_rate": 6.385951940850278e-06, "loss": 1.227, "step": 18420 }, { "epoch": 34.07, "grad_norm": 10.288074493408203, "learning_rate": 6.378558225508319e-06, "loss": 1.2154, "step": 18430 }, { "epoch": 34.09, "grad_norm": 7.508502006530762, "learning_rate": 6.3711645101663584e-06, "loss": 1.295, "step": 18440 }, { "epoch": 34.1, "grad_norm": 9.22084903717041, "learning_rate": 6.3637707948244e-06, "loss": 1.2361, "step": 18450 }, { "epoch": 34.12, "grad_norm": 4.880374431610107, "learning_rate": 6.3563770794824405e-06, "loss": 1.2164, "step": 18460 }, { "epoch": 34.14, "grad_norm": 6.935220718383789, "learning_rate": 6.348983364140481e-06, "loss": 1.0363, "step": 18470 }, { "epoch": 34.16, "grad_norm": 5.247091770172119, "learning_rate": 6.341589648798522e-06, "loss": 1.1903, "step": 18480 }, { "epoch": 34.18, "grad_norm": 3.9794464111328125, "learning_rate": 6.3341959334565624e-06, "loss": 1.337, "step": 18490 }, { "epoch": 34.2, "grad_norm": 8.390508651733398, "learning_rate": 6.326802218114603e-06, "loss": 1.2261, "step": 18500 }, { "epoch": 34.21, "grad_norm": 10.047736167907715, "learning_rate": 6.319408502772644e-06, "loss": 1.1517, "step": 18510 }, { "epoch": 34.23, "grad_norm": 8.966001510620117, "learning_rate": 6.312014787430684e-06, "loss": 0.9307, "step": 18520 }, { "epoch": 34.25, "grad_norm": 5.132773399353027, "learning_rate": 6.304621072088725e-06, "loss": 0.9421, "step": 18530 }, { "epoch": 34.27, "grad_norm": 6.657975196838379, "learning_rate": 6.297227356746766e-06, "loss": 1.1881, "step": 18540 }, { "epoch": 34.29, "grad_norm": 6.071409225463867, "learning_rate": 6.289833641404807e-06, "loss": 1.1646, "step": 18550 }, { "epoch": 34.31, "grad_norm": 8.549930572509766, "learning_rate": 6.282439926062847e-06, "loss": 1.1996, "step": 18560 }, { "epoch": 34.33, "grad_norm": 11.600802421569824, "learning_rate": 6.2750462107208875e-06, "loss": 1.2849, "step": 18570 }, { "epoch": 34.34, "grad_norm": 10.691652297973633, "learning_rate": 6.267652495378929e-06, "loss": 1.091, "step": 18580 }, { "epoch": 34.36, "grad_norm": 2.8452277183532715, "learning_rate": 6.26025878003697e-06, "loss": 1.2909, "step": 18590 }, { "epoch": 34.38, "grad_norm": 1.5564590692520142, "learning_rate": 6.252865064695009e-06, "loss": 0.8964, "step": 18600 }, { "epoch": 34.4, "grad_norm": 6.090004920959473, "learning_rate": 6.245471349353051e-06, "loss": 1.4593, "step": 18610 }, { "epoch": 34.42, "grad_norm": 11.408190727233887, "learning_rate": 6.2380776340110915e-06, "loss": 1.0514, "step": 18620 }, { "epoch": 34.44, "grad_norm": 13.99205207824707, "learning_rate": 6.230683918669131e-06, "loss": 1.2961, "step": 18630 }, { "epoch": 34.45, "grad_norm": 6.7173309326171875, "learning_rate": 6.223290203327173e-06, "loss": 1.3463, "step": 18640 }, { "epoch": 34.47, "grad_norm": 4.755782127380371, "learning_rate": 6.215896487985213e-06, "loss": 0.9707, "step": 18650 }, { "epoch": 34.49, "grad_norm": 11.078941345214844, "learning_rate": 6.208502772643253e-06, "loss": 1.3301, "step": 18660 }, { "epoch": 34.51, "grad_norm": 5.8802642822265625, "learning_rate": 6.201109057301295e-06, "loss": 1.1897, "step": 18670 }, { "epoch": 34.53, "grad_norm": 5.200064659118652, "learning_rate": 6.193715341959335e-06, "loss": 1.0824, "step": 18680 }, { "epoch": 34.55, "grad_norm": 7.96475076675415, "learning_rate": 6.186321626617375e-06, "loss": 0.902, "step": 18690 }, { "epoch": 34.57, "grad_norm": 4.493646144866943, "learning_rate": 6.1789279112754165e-06, "loss": 1.2459, "step": 18700 }, { "epoch": 34.58, "grad_norm": 11.057312965393066, "learning_rate": 6.171534195933457e-06, "loss": 1.303, "step": 18710 }, { "epoch": 34.6, "grad_norm": 5.7545294761657715, "learning_rate": 6.164140480591497e-06, "loss": 1.3534, "step": 18720 }, { "epoch": 34.62, "grad_norm": 7.117556571960449, "learning_rate": 6.1567467652495384e-06, "loss": 1.6746, "step": 18730 }, { "epoch": 34.64, "grad_norm": 4.606558322906494, "learning_rate": 6.149353049907579e-06, "loss": 1.0669, "step": 18740 }, { "epoch": 34.66, "grad_norm": 7.435132026672363, "learning_rate": 6.141959334565619e-06, "loss": 1.2874, "step": 18750 }, { "epoch": 34.68, "grad_norm": 4.1121015548706055, "learning_rate": 6.13456561922366e-06, "loss": 0.9312, "step": 18760 }, { "epoch": 34.7, "grad_norm": 8.007678985595703, "learning_rate": 6.127171903881701e-06, "loss": 1.4374, "step": 18770 }, { "epoch": 34.71, "grad_norm": 5.395533561706543, "learning_rate": 6.1197781885397424e-06, "loss": 1.1522, "step": 18780 }, { "epoch": 34.73, "grad_norm": 15.870338439941406, "learning_rate": 6.112384473197782e-06, "loss": 1.4808, "step": 18790 }, { "epoch": 34.75, "grad_norm": 9.240372657775879, "learning_rate": 6.104990757855823e-06, "loss": 1.1266, "step": 18800 }, { "epoch": 34.77, "grad_norm": 8.89116382598877, "learning_rate": 6.097597042513864e-06, "loss": 1.1094, "step": 18810 }, { "epoch": 34.79, "grad_norm": 8.478248596191406, "learning_rate": 6.090203327171904e-06, "loss": 1.3733, "step": 18820 }, { "epoch": 34.81, "grad_norm": 10.576647758483887, "learning_rate": 6.082809611829945e-06, "loss": 1.2942, "step": 18830 }, { "epoch": 34.82, "grad_norm": 7.021605968475342, "learning_rate": 6.075415896487986e-06, "loss": 1.0428, "step": 18840 }, { "epoch": 34.84, "grad_norm": 3.395364761352539, "learning_rate": 6.068022181146026e-06, "loss": 1.0285, "step": 18850 }, { "epoch": 34.86, "grad_norm": 6.5613813400268555, "learning_rate": 6.060628465804067e-06, "loss": 1.4247, "step": 18860 }, { "epoch": 34.88, "grad_norm": 8.559307098388672, "learning_rate": 6.053234750462108e-06, "loss": 1.4103, "step": 18870 }, { "epoch": 34.9, "grad_norm": 8.162131309509277, "learning_rate": 6.045841035120148e-06, "loss": 0.923, "step": 18880 }, { "epoch": 34.92, "grad_norm": 5.598612308502197, "learning_rate": 6.038447319778189e-06, "loss": 1.3314, "step": 18890 }, { "epoch": 34.94, "grad_norm": 5.433062553405762, "learning_rate": 6.03105360443623e-06, "loss": 1.0424, "step": 18900 }, { "epoch": 34.95, "grad_norm": 10.263800621032715, "learning_rate": 6.02365988909427e-06, "loss": 1.1689, "step": 18910 }, { "epoch": 34.97, "grad_norm": 4.347888469696045, "learning_rate": 6.016266173752311e-06, "loss": 1.0856, "step": 18920 }, { "epoch": 34.99, "grad_norm": 8.751306533813477, "learning_rate": 6.008872458410352e-06, "loss": 1.1856, "step": 18930 }, { "epoch": 35.0, "eval_accuracy": 0.7920997920997921, "eval_loss": 0.6840357184410095, "eval_runtime": 1.5259, "eval_samples_per_second": 315.218, "eval_steps_per_second": 39.976, "step": 18935 }, { "epoch": 35.01, "grad_norm": 5.611832141876221, "learning_rate": 6.001478743068392e-06, "loss": 1.0581, "step": 18940 }, { "epoch": 35.03, "grad_norm": 8.494256019592285, "learning_rate": 5.994085027726433e-06, "loss": 1.2681, "step": 18950 }, { "epoch": 35.05, "grad_norm": 9.69924259185791, "learning_rate": 5.986691312384474e-06, "loss": 1.4476, "step": 18960 }, { "epoch": 35.06, "grad_norm": 8.78618049621582, "learning_rate": 5.979297597042515e-06, "loss": 1.1937, "step": 18970 }, { "epoch": 35.08, "grad_norm": 6.897728443145752, "learning_rate": 5.971903881700555e-06, "loss": 1.1329, "step": 18980 }, { "epoch": 35.1, "grad_norm": 6.76364803314209, "learning_rate": 5.964510166358596e-06, "loss": 1.3099, "step": 18990 }, { "epoch": 35.12, "grad_norm": 11.277626991271973, "learning_rate": 5.957116451016637e-06, "loss": 1.0708, "step": 19000 }, { "epoch": 35.14, "grad_norm": 5.4572272300720215, "learning_rate": 5.949722735674677e-06, "loss": 1.0739, "step": 19010 }, { "epoch": 35.16, "grad_norm": 6.137432098388672, "learning_rate": 5.942329020332718e-06, "loss": 1.2161, "step": 19020 }, { "epoch": 35.18, "grad_norm": 10.644075393676758, "learning_rate": 5.934935304990759e-06, "loss": 1.2948, "step": 19030 }, { "epoch": 35.19, "grad_norm": 14.130032539367676, "learning_rate": 5.927541589648799e-06, "loss": 1.0563, "step": 19040 }, { "epoch": 35.21, "grad_norm": 5.532588481903076, "learning_rate": 5.9201478743068395e-06, "loss": 1.3656, "step": 19050 }, { "epoch": 35.23, "grad_norm": 2.7407379150390625, "learning_rate": 5.912754158964881e-06, "loss": 1.123, "step": 19060 }, { "epoch": 35.25, "grad_norm": 6.711956024169922, "learning_rate": 5.905360443622921e-06, "loss": 1.1315, "step": 19070 }, { "epoch": 35.27, "grad_norm": 7.860185146331787, "learning_rate": 5.897966728280961e-06, "loss": 1.3374, "step": 19080 }, { "epoch": 35.29, "grad_norm": 8.43912410736084, "learning_rate": 5.890573012939003e-06, "loss": 1.1211, "step": 19090 }, { "epoch": 35.3, "grad_norm": 4.875197410583496, "learning_rate": 5.883179297597043e-06, "loss": 1.2133, "step": 19100 }, { "epoch": 35.32, "grad_norm": 3.6006457805633545, "learning_rate": 5.875785582255083e-06, "loss": 1.4415, "step": 19110 }, { "epoch": 35.34, "grad_norm": 14.244126319885254, "learning_rate": 5.868391866913125e-06, "loss": 1.3729, "step": 19120 }, { "epoch": 35.36, "grad_norm": 5.184271335601807, "learning_rate": 5.8609981515711645e-06, "loss": 1.37, "step": 19130 }, { "epoch": 35.38, "grad_norm": 3.7604968547821045, "learning_rate": 5.853604436229205e-06, "loss": 0.9154, "step": 19140 }, { "epoch": 35.4, "grad_norm": 4.972818851470947, "learning_rate": 5.846210720887247e-06, "loss": 1.2202, "step": 19150 }, { "epoch": 35.42, "grad_norm": 8.350935935974121, "learning_rate": 5.838817005545287e-06, "loss": 1.0243, "step": 19160 }, { "epoch": 35.43, "grad_norm": 4.86630392074585, "learning_rate": 5.831423290203327e-06, "loss": 0.9893, "step": 19170 }, { "epoch": 35.45, "grad_norm": 4.281905651092529, "learning_rate": 5.8240295748613685e-06, "loss": 1.2693, "step": 19180 }, { "epoch": 35.47, "grad_norm": 9.322166442871094, "learning_rate": 5.816635859519409e-06, "loss": 1.4563, "step": 19190 }, { "epoch": 35.49, "grad_norm": 9.57886028289795, "learning_rate": 5.809242144177449e-06, "loss": 1.1918, "step": 19200 }, { "epoch": 35.51, "grad_norm": 6.605780124664307, "learning_rate": 5.8018484288354904e-06, "loss": 1.2863, "step": 19210 }, { "epoch": 35.53, "grad_norm": 7.068185806274414, "learning_rate": 5.794454713493531e-06, "loss": 1.1953, "step": 19220 }, { "epoch": 35.55, "grad_norm": 6.858496189117432, "learning_rate": 5.787060998151572e-06, "loss": 1.0251, "step": 19230 }, { "epoch": 35.56, "grad_norm": 2.9089362621307373, "learning_rate": 5.779667282809612e-06, "loss": 1.186, "step": 19240 }, { "epoch": 35.58, "grad_norm": 7.164134502410889, "learning_rate": 5.772273567467653e-06, "loss": 1.3147, "step": 19250 }, { "epoch": 35.6, "grad_norm": 8.177092552185059, "learning_rate": 5.764879852125694e-06, "loss": 1.3164, "step": 19260 }, { "epoch": 35.62, "grad_norm": 5.0043625831604, "learning_rate": 5.757486136783734e-06, "loss": 1.165, "step": 19270 }, { "epoch": 35.64, "grad_norm": 9.111473083496094, "learning_rate": 5.750092421441775e-06, "loss": 1.0097, "step": 19280 }, { "epoch": 35.66, "grad_norm": 7.070840835571289, "learning_rate": 5.7426987060998155e-06, "loss": 0.9336, "step": 19290 }, { "epoch": 35.67, "grad_norm": 12.756954193115234, "learning_rate": 5.735304990757856e-06, "loss": 1.2013, "step": 19300 }, { "epoch": 35.69, "grad_norm": 6.637971878051758, "learning_rate": 5.727911275415898e-06, "loss": 1.4618, "step": 19310 }, { "epoch": 35.71, "grad_norm": 11.127894401550293, "learning_rate": 5.720517560073937e-06, "loss": 1.1037, "step": 19320 }, { "epoch": 35.73, "grad_norm": 8.454278945922852, "learning_rate": 5.713123844731978e-06, "loss": 1.0803, "step": 19330 }, { "epoch": 35.75, "grad_norm": 8.275283813476562, "learning_rate": 5.7057301293900195e-06, "loss": 1.6451, "step": 19340 }, { "epoch": 35.77, "grad_norm": 6.5415120124816895, "learning_rate": 5.69833641404806e-06, "loss": 1.2544, "step": 19350 }, { "epoch": 35.79, "grad_norm": 4.170315742492676, "learning_rate": 5.6909426987061e-06, "loss": 1.2569, "step": 19360 }, { "epoch": 35.8, "grad_norm": 7.194453239440918, "learning_rate": 5.683548983364141e-06, "loss": 1.3341, "step": 19370 }, { "epoch": 35.82, "grad_norm": 4.819294452667236, "learning_rate": 5.676155268022182e-06, "loss": 1.3075, "step": 19380 }, { "epoch": 35.84, "grad_norm": 9.019405364990234, "learning_rate": 5.668761552680222e-06, "loss": 0.8948, "step": 19390 }, { "epoch": 35.86, "grad_norm": 12.656298637390137, "learning_rate": 5.661367837338263e-06, "loss": 1.0023, "step": 19400 }, { "epoch": 35.88, "grad_norm": 9.310526847839355, "learning_rate": 5.653974121996304e-06, "loss": 1.0307, "step": 19410 }, { "epoch": 35.9, "grad_norm": 4.940120220184326, "learning_rate": 5.646580406654344e-06, "loss": 1.3207, "step": 19420 }, { "epoch": 35.91, "grad_norm": 9.295770645141602, "learning_rate": 5.639186691312385e-06, "loss": 1.2356, "step": 19430 }, { "epoch": 35.93, "grad_norm": 15.025140762329102, "learning_rate": 5.631792975970426e-06, "loss": 1.7289, "step": 19440 }, { "epoch": 35.95, "grad_norm": 8.046751022338867, "learning_rate": 5.624399260628466e-06, "loss": 1.1371, "step": 19450 }, { "epoch": 35.97, "grad_norm": 5.763049125671387, "learning_rate": 5.617005545286507e-06, "loss": 1.1711, "step": 19460 }, { "epoch": 35.99, "grad_norm": 5.953471660614014, "learning_rate": 5.609611829944548e-06, "loss": 1.2387, "step": 19470 }, { "epoch": 36.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.6738823056221008, "eval_runtime": 1.5314, "eval_samples_per_second": 314.093, "eval_steps_per_second": 39.833, "step": 19476 }, { "epoch": 36.01, "grad_norm": 12.695279121398926, "learning_rate": 5.6022181146025875e-06, "loss": 1.1871, "step": 19480 }, { "epoch": 36.03, "grad_norm": 4.35639762878418, "learning_rate": 5.594824399260629e-06, "loss": 1.3423, "step": 19490 }, { "epoch": 36.04, "grad_norm": 6.1324543952941895, "learning_rate": 5.58743068391867e-06, "loss": 1.2871, "step": 19500 }, { "epoch": 36.06, "grad_norm": 5.5742058753967285, "learning_rate": 5.580036968576709e-06, "loss": 1.0298, "step": 19510 }, { "epoch": 36.08, "grad_norm": 11.901124000549316, "learning_rate": 5.572643253234751e-06, "loss": 1.2485, "step": 19520 }, { "epoch": 36.1, "grad_norm": 8.859539031982422, "learning_rate": 5.5652495378927915e-06, "loss": 1.3228, "step": 19530 }, { "epoch": 36.12, "grad_norm": 4.1057448387146, "learning_rate": 5.557855822550833e-06, "loss": 1.1322, "step": 19540 }, { "epoch": 36.14, "grad_norm": 8.657859802246094, "learning_rate": 5.551201478743069e-06, "loss": 1.0688, "step": 19550 }, { "epoch": 36.16, "grad_norm": 5.6578369140625, "learning_rate": 5.54380776340111e-06, "loss": 1.3766, "step": 19560 }, { "epoch": 36.17, "grad_norm": 9.522768020629883, "learning_rate": 5.53641404805915e-06, "loss": 1.0283, "step": 19570 }, { "epoch": 36.19, "grad_norm": 4.761446952819824, "learning_rate": 5.529020332717191e-06, "loss": 0.9452, "step": 19580 }, { "epoch": 36.21, "grad_norm": 6.248351097106934, "learning_rate": 5.521626617375232e-06, "loss": 1.3383, "step": 19590 }, { "epoch": 36.23, "grad_norm": 5.121849060058594, "learning_rate": 5.514232902033272e-06, "loss": 1.4322, "step": 19600 }, { "epoch": 36.25, "grad_norm": 4.045438289642334, "learning_rate": 5.506839186691313e-06, "loss": 0.9801, "step": 19610 }, { "epoch": 36.27, "grad_norm": 5.671826362609863, "learning_rate": 5.499445471349354e-06, "loss": 1.1938, "step": 19620 }, { "epoch": 36.28, "grad_norm": 7.795522212982178, "learning_rate": 5.492051756007394e-06, "loss": 1.1524, "step": 19630 }, { "epoch": 36.3, "grad_norm": 6.905058860778809, "learning_rate": 5.484658040665435e-06, "loss": 1.3333, "step": 19640 }, { "epoch": 36.32, "grad_norm": 9.50643253326416, "learning_rate": 5.477264325323476e-06, "loss": 1.0708, "step": 19650 }, { "epoch": 36.34, "grad_norm": 4.257543563842773, "learning_rate": 5.469870609981516e-06, "loss": 1.0438, "step": 19660 }, { "epoch": 36.36, "grad_norm": 14.501640319824219, "learning_rate": 5.4624768946395565e-06, "loss": 1.0386, "step": 19670 }, { "epoch": 36.38, "grad_norm": 5.424924373626709, "learning_rate": 5.455083179297598e-06, "loss": 1.1903, "step": 19680 }, { "epoch": 36.4, "grad_norm": 15.840132713317871, "learning_rate": 5.447689463955638e-06, "loss": 1.1542, "step": 19690 }, { "epoch": 36.41, "grad_norm": 10.763233184814453, "learning_rate": 5.440295748613678e-06, "loss": 0.9721, "step": 19700 }, { "epoch": 36.43, "grad_norm": 4.40644645690918, "learning_rate": 5.43290203327172e-06, "loss": 1.2103, "step": 19710 }, { "epoch": 36.45, "grad_norm": 9.434952735900879, "learning_rate": 5.42550831792976e-06, "loss": 1.2747, "step": 19720 }, { "epoch": 36.47, "grad_norm": 5.235437870025635, "learning_rate": 5.4181146025878e-06, "loss": 1.0841, "step": 19730 }, { "epoch": 36.49, "grad_norm": 10.476019859313965, "learning_rate": 5.410720887245842e-06, "loss": 1.0193, "step": 19740 }, { "epoch": 36.51, "grad_norm": 10.128366470336914, "learning_rate": 5.403327171903882e-06, "loss": 1.2471, "step": 19750 }, { "epoch": 36.52, "grad_norm": 5.93746280670166, "learning_rate": 5.395933456561922e-06, "loss": 1.0741, "step": 19760 }, { "epoch": 36.54, "grad_norm": 4.750613689422607, "learning_rate": 5.388539741219964e-06, "loss": 1.4469, "step": 19770 }, { "epoch": 36.56, "grad_norm": 5.176764011383057, "learning_rate": 5.381146025878004e-06, "loss": 1.2243, "step": 19780 }, { "epoch": 36.58, "grad_norm": 10.176244735717773, "learning_rate": 5.373752310536045e-06, "loss": 1.4638, "step": 19790 }, { "epoch": 36.6, "grad_norm": 4.858523845672607, "learning_rate": 5.3663585951940856e-06, "loss": 0.9815, "step": 19800 }, { "epoch": 36.62, "grad_norm": 15.762124061584473, "learning_rate": 5.358964879852126e-06, "loss": 1.3727, "step": 19810 }, { "epoch": 36.64, "grad_norm": 13.113272666931152, "learning_rate": 5.351571164510167e-06, "loss": 1.4776, "step": 19820 }, { "epoch": 36.65, "grad_norm": 13.374110221862793, "learning_rate": 5.3441774491682075e-06, "loss": 1.2718, "step": 19830 }, { "epoch": 36.67, "grad_norm": 12.258637428283691, "learning_rate": 5.336783733826248e-06, "loss": 1.465, "step": 19840 }, { "epoch": 36.69, "grad_norm": 7.793469429016113, "learning_rate": 5.329390018484289e-06, "loss": 1.1117, "step": 19850 }, { "epoch": 36.71, "grad_norm": 6.209737300872803, "learning_rate": 5.321996303142329e-06, "loss": 1.3921, "step": 19860 }, { "epoch": 36.73, "grad_norm": 8.460219383239746, "learning_rate": 5.314602587800371e-06, "loss": 1.0176, "step": 19870 }, { "epoch": 36.75, "grad_norm": 8.175382614135742, "learning_rate": 5.307208872458411e-06, "loss": 1.2899, "step": 19880 }, { "epoch": 36.77, "grad_norm": 9.314245223999023, "learning_rate": 5.299815157116451e-06, "loss": 1.1337, "step": 19890 }, { "epoch": 36.78, "grad_norm": 6.835625648498535, "learning_rate": 5.292421441774493e-06, "loss": 1.2373, "step": 19900 }, { "epoch": 36.8, "grad_norm": 6.811435222625732, "learning_rate": 5.2850277264325325e-06, "loss": 0.8922, "step": 19910 }, { "epoch": 36.82, "grad_norm": 9.42974853515625, "learning_rate": 5.277634011090573e-06, "loss": 1.3595, "step": 19920 }, { "epoch": 36.84, "grad_norm": 7.185044288635254, "learning_rate": 5.270240295748615e-06, "loss": 1.0692, "step": 19930 }, { "epoch": 36.86, "grad_norm": 6.472748756408691, "learning_rate": 5.262846580406655e-06, "loss": 1.0877, "step": 19940 }, { "epoch": 36.88, "grad_norm": 11.556644439697266, "learning_rate": 5.255452865064695e-06, "loss": 1.0314, "step": 19950 }, { "epoch": 36.89, "grad_norm": 10.657917022705078, "learning_rate": 5.2480591497227365e-06, "loss": 1.6622, "step": 19960 }, { "epoch": 36.91, "grad_norm": 7.276571273803711, "learning_rate": 5.240665434380777e-06, "loss": 1.0786, "step": 19970 }, { "epoch": 36.93, "grad_norm": 9.749002456665039, "learning_rate": 5.233271719038817e-06, "loss": 1.0402, "step": 19980 }, { "epoch": 36.95, "grad_norm": 9.379003524780273, "learning_rate": 5.225878003696858e-06, "loss": 1.2869, "step": 19990 }, { "epoch": 36.97, "grad_norm": 10.972497940063477, "learning_rate": 5.218484288354899e-06, "loss": 1.1502, "step": 20000 }, { "epoch": 36.99, "grad_norm": 9.125678062438965, "learning_rate": 5.211090573012939e-06, "loss": 1.5242, "step": 20010 }, { "epoch": 37.0, "eval_accuracy": 0.7962577962577962, "eval_loss": 0.6554017663002014, "eval_runtime": 1.4922, "eval_samples_per_second": 322.35, "eval_steps_per_second": 40.88, "step": 20017 }, { "epoch": 37.01, "grad_norm": 5.834267616271973, "learning_rate": 5.20369685767098e-06, "loss": 1.2175, "step": 20020 }, { "epoch": 37.02, "grad_norm": 7.283585548400879, "learning_rate": 5.196303142329021e-06, "loss": 1.4105, "step": 20030 }, { "epoch": 37.04, "grad_norm": 7.902415752410889, "learning_rate": 5.188909426987061e-06, "loss": 0.9568, "step": 20040 }, { "epoch": 37.06, "grad_norm": 5.0116286277771, "learning_rate": 5.181515711645102e-06, "loss": 1.4063, "step": 20050 }, { "epoch": 37.08, "grad_norm": 7.539445877075195, "learning_rate": 5.174121996303143e-06, "loss": 1.4133, "step": 20060 }, { "epoch": 37.1, "grad_norm": 7.313731670379639, "learning_rate": 5.166728280961183e-06, "loss": 1.2133, "step": 20070 }, { "epoch": 37.12, "grad_norm": 4.551146507263184, "learning_rate": 5.159334565619224e-06, "loss": 1.1639, "step": 20080 }, { "epoch": 37.13, "grad_norm": 12.317727088928223, "learning_rate": 5.151940850277265e-06, "loss": 1.453, "step": 20090 }, { "epoch": 37.15, "grad_norm": 11.362765312194824, "learning_rate": 5.1445471349353045e-06, "loss": 1.3044, "step": 20100 }, { "epoch": 37.17, "grad_norm": 4.082790851593018, "learning_rate": 5.137153419593346e-06, "loss": 1.3534, "step": 20110 }, { "epoch": 37.19, "grad_norm": 6.857915878295898, "learning_rate": 5.129759704251387e-06, "loss": 1.255, "step": 20120 }, { "epoch": 37.21, "grad_norm": 10.075772285461426, "learning_rate": 5.122365988909428e-06, "loss": 1.4907, "step": 20130 }, { "epoch": 37.23, "grad_norm": 5.6129608154296875, "learning_rate": 5.114972273567468e-06, "loss": 1.1381, "step": 20140 }, { "epoch": 37.25, "grad_norm": 8.669118881225586, "learning_rate": 5.1075785582255085e-06, "loss": 1.1591, "step": 20150 }, { "epoch": 37.26, "grad_norm": 5.681878089904785, "learning_rate": 5.10018484288355e-06, "loss": 1.3144, "step": 20160 }, { "epoch": 37.28, "grad_norm": 7.634646415710449, "learning_rate": 5.09279112754159e-06, "loss": 1.3138, "step": 20170 }, { "epoch": 37.3, "grad_norm": 4.998110771179199, "learning_rate": 5.08539741219963e-06, "loss": 1.1665, "step": 20180 }, { "epoch": 37.32, "grad_norm": 9.576936721801758, "learning_rate": 5.078003696857672e-06, "loss": 1.1718, "step": 20190 }, { "epoch": 37.34, "grad_norm": 8.292411804199219, "learning_rate": 5.070609981515712e-06, "loss": 1.2499, "step": 20200 }, { "epoch": 37.36, "grad_norm": 3.7511072158813477, "learning_rate": 5.063216266173753e-06, "loss": 1.2346, "step": 20210 }, { "epoch": 37.38, "grad_norm": 1.9749755859375, "learning_rate": 5.055822550831794e-06, "loss": 1.1448, "step": 20220 }, { "epoch": 37.39, "grad_norm": 12.68874740600586, "learning_rate": 5.0484288354898336e-06, "loss": 1.3072, "step": 20230 }, { "epoch": 37.41, "grad_norm": 5.783342361450195, "learning_rate": 5.041035120147875e-06, "loss": 1.0589, "step": 20240 }, { "epoch": 37.43, "grad_norm": 8.21955680847168, "learning_rate": 5.033641404805916e-06, "loss": 1.2851, "step": 20250 }, { "epoch": 37.45, "grad_norm": 8.555632591247559, "learning_rate": 5.0262476894639554e-06, "loss": 1.0472, "step": 20260 }, { "epoch": 37.47, "grad_norm": 7.716969966888428, "learning_rate": 5.018853974121997e-06, "loss": 1.2857, "step": 20270 }, { "epoch": 37.49, "grad_norm": 8.176843643188477, "learning_rate": 5.0114602587800376e-06, "loss": 1.2132, "step": 20280 }, { "epoch": 37.5, "grad_norm": 5.631040096282959, "learning_rate": 5.004066543438077e-06, "loss": 1.396, "step": 20290 }, { "epoch": 37.52, "grad_norm": 8.99039077758789, "learning_rate": 4.996672828096119e-06, "loss": 1.0291, "step": 20300 }, { "epoch": 37.54, "grad_norm": 1.7645901441574097, "learning_rate": 4.9892791127541595e-06, "loss": 0.8452, "step": 20310 }, { "epoch": 37.56, "grad_norm": 3.0897340774536133, "learning_rate": 4.9818853974122e-06, "loss": 1.2887, "step": 20320 }, { "epoch": 37.58, "grad_norm": 7.579039573669434, "learning_rate": 4.974491682070241e-06, "loss": 1.5666, "step": 20330 }, { "epoch": 37.6, "grad_norm": 5.660084247589111, "learning_rate": 4.967097966728281e-06, "loss": 1.0828, "step": 20340 }, { "epoch": 37.62, "grad_norm": 3.780691623687744, "learning_rate": 4.959704251386322e-06, "loss": 1.2129, "step": 20350 }, { "epoch": 37.63, "grad_norm": 9.319971084594727, "learning_rate": 4.952310536044363e-06, "loss": 1.3015, "step": 20360 }, { "epoch": 37.65, "grad_norm": 4.647171974182129, "learning_rate": 4.944916820702403e-06, "loss": 1.053, "step": 20370 }, { "epoch": 37.67, "grad_norm": 7.176387310028076, "learning_rate": 4.937523105360444e-06, "loss": 1.1532, "step": 20380 }, { "epoch": 37.69, "grad_norm": 8.905428886413574, "learning_rate": 4.9301293900184845e-06, "loss": 1.1751, "step": 20390 }, { "epoch": 37.71, "grad_norm": 8.644600868225098, "learning_rate": 4.922735674676525e-06, "loss": 1.0972, "step": 20400 }, { "epoch": 37.73, "grad_norm": 6.438190460205078, "learning_rate": 4.915341959334566e-06, "loss": 0.911, "step": 20410 }, { "epoch": 37.74, "grad_norm": 8.395973205566406, "learning_rate": 4.907948243992607e-06, "loss": 1.224, "step": 20420 }, { "epoch": 37.76, "grad_norm": 8.687971115112305, "learning_rate": 4.900554528650647e-06, "loss": 1.245, "step": 20430 }, { "epoch": 37.78, "grad_norm": 5.946033477783203, "learning_rate": 4.893160813308688e-06, "loss": 0.9506, "step": 20440 }, { "epoch": 37.8, "grad_norm": 5.8480095863342285, "learning_rate": 4.885767097966729e-06, "loss": 1.2278, "step": 20450 }, { "epoch": 37.82, "grad_norm": 8.44746208190918, "learning_rate": 4.878373382624769e-06, "loss": 1.1109, "step": 20460 }, { "epoch": 37.84, "grad_norm": 7.337038993835449, "learning_rate": 4.87097966728281e-06, "loss": 1.1812, "step": 20470 }, { "epoch": 37.86, "grad_norm": 6.468664646148682, "learning_rate": 4.863585951940851e-06, "loss": 1.2973, "step": 20480 }, { "epoch": 37.87, "grad_norm": 2.8116157054901123, "learning_rate": 4.856192236598891e-06, "loss": 0.9425, "step": 20490 }, { "epoch": 37.89, "grad_norm": 11.060296058654785, "learning_rate": 4.848798521256932e-06, "loss": 1.0687, "step": 20500 }, { "epoch": 37.91, "grad_norm": 7.166051387786865, "learning_rate": 4.841404805914973e-06, "loss": 1.6505, "step": 20510 }, { "epoch": 37.93, "grad_norm": 8.214900016784668, "learning_rate": 4.834011090573013e-06, "loss": 1.3019, "step": 20520 }, { "epoch": 37.95, "grad_norm": 11.252281188964844, "learning_rate": 4.826617375231054e-06, "loss": 1.1939, "step": 20530 }, { "epoch": 37.97, "grad_norm": 10.687776565551758, "learning_rate": 4.819223659889095e-06, "loss": 1.2535, "step": 20540 }, { "epoch": 37.99, "grad_norm": 12.307971954345703, "learning_rate": 4.8118299445471355e-06, "loss": 1.351, "step": 20550 }, { "epoch": 38.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6736065745353699, "eval_runtime": 1.5537, "eval_samples_per_second": 309.591, "eval_steps_per_second": 39.262, "step": 20558 }, { "epoch": 38.0, "grad_norm": 5.2955193519592285, "learning_rate": 4.804436229205176e-06, "loss": 1.1671, "step": 20560 }, { "epoch": 38.02, "grad_norm": 9.18361759185791, "learning_rate": 4.797042513863217e-06, "loss": 1.0592, "step": 20570 }, { "epoch": 38.04, "grad_norm": 12.313794136047363, "learning_rate": 4.789648798521257e-06, "loss": 1.1253, "step": 20580 }, { "epoch": 38.06, "grad_norm": 8.513861656188965, "learning_rate": 4.782255083179298e-06, "loss": 1.3498, "step": 20590 }, { "epoch": 38.08, "grad_norm": 11.283888816833496, "learning_rate": 4.774861367837339e-06, "loss": 1.4679, "step": 20600 }, { "epoch": 38.1, "grad_norm": 10.420369148254395, "learning_rate": 4.767467652495379e-06, "loss": 1.1148, "step": 20610 }, { "epoch": 38.11, "grad_norm": 11.679603576660156, "learning_rate": 4.76007393715342e-06, "loss": 1.2223, "step": 20620 }, { "epoch": 38.13, "grad_norm": 10.630248069763184, "learning_rate": 4.7526802218114605e-06, "loss": 1.3072, "step": 20630 }, { "epoch": 38.15, "grad_norm": 6.0593743324279785, "learning_rate": 4.745286506469501e-06, "loss": 1.3688, "step": 20640 }, { "epoch": 38.17, "grad_norm": 10.472369194030762, "learning_rate": 4.737892791127542e-06, "loss": 1.0908, "step": 20650 }, { "epoch": 38.19, "grad_norm": 7.963718414306641, "learning_rate": 4.730499075785583e-06, "loss": 1.0935, "step": 20660 }, { "epoch": 38.21, "grad_norm": 6.958829879760742, "learning_rate": 4.723105360443623e-06, "loss": 1.1043, "step": 20670 }, { "epoch": 38.23, "grad_norm": 8.587081909179688, "learning_rate": 4.715711645101664e-06, "loss": 1.1644, "step": 20680 }, { "epoch": 38.24, "grad_norm": 3.3204185962677, "learning_rate": 4.708317929759705e-06, "loss": 0.799, "step": 20690 }, { "epoch": 38.26, "grad_norm": 7.987104415893555, "learning_rate": 4.700924214417745e-06, "loss": 1.3043, "step": 20700 }, { "epoch": 38.28, "grad_norm": 9.932259559631348, "learning_rate": 4.6935304990757856e-06, "loss": 1.0142, "step": 20710 }, { "epoch": 38.3, "grad_norm": 5.825223922729492, "learning_rate": 4.686136783733827e-06, "loss": 1.17, "step": 20720 }, { "epoch": 38.32, "grad_norm": 13.828930854797363, "learning_rate": 4.678743068391867e-06, "loss": 1.0342, "step": 20730 }, { "epoch": 38.34, "grad_norm": 8.067893028259277, "learning_rate": 4.671349353049908e-06, "loss": 0.9711, "step": 20740 }, { "epoch": 38.35, "grad_norm": 9.437845230102539, "learning_rate": 4.663955637707949e-06, "loss": 1.2967, "step": 20750 }, { "epoch": 38.37, "grad_norm": 10.447977066040039, "learning_rate": 4.6565619223659896e-06, "loss": 1.2303, "step": 20760 }, { "epoch": 38.39, "grad_norm": 9.687793731689453, "learning_rate": 4.64916820702403e-06, "loss": 1.2785, "step": 20770 }, { "epoch": 38.41, "grad_norm": 12.428980827331543, "learning_rate": 4.641774491682071e-06, "loss": 1.057, "step": 20780 }, { "epoch": 38.43, "grad_norm": 12.352936744689941, "learning_rate": 4.6343807763401114e-06, "loss": 1.3453, "step": 20790 }, { "epoch": 38.45, "grad_norm": 9.060121536254883, "learning_rate": 4.626987060998152e-06, "loss": 1.2238, "step": 20800 }, { "epoch": 38.47, "grad_norm": 9.057785034179688, "learning_rate": 4.619593345656193e-06, "loss": 1.4452, "step": 20810 }, { "epoch": 38.48, "grad_norm": 6.135321140289307, "learning_rate": 4.612199630314233e-06, "loss": 1.385, "step": 20820 }, { "epoch": 38.5, "grad_norm": 7.854264736175537, "learning_rate": 4.604805914972274e-06, "loss": 1.3182, "step": 20830 }, { "epoch": 38.52, "grad_norm": 13.271817207336426, "learning_rate": 4.597412199630315e-06, "loss": 1.1293, "step": 20840 }, { "epoch": 38.54, "grad_norm": 4.039278507232666, "learning_rate": 4.590018484288355e-06, "loss": 1.6177, "step": 20850 }, { "epoch": 38.56, "grad_norm": 10.522032737731934, "learning_rate": 4.582624768946396e-06, "loss": 1.4771, "step": 20860 }, { "epoch": 38.58, "grad_norm": 5.412691593170166, "learning_rate": 4.5752310536044365e-06, "loss": 1.1049, "step": 20870 }, { "epoch": 38.6, "grad_norm": 5.959412097930908, "learning_rate": 4.567837338262477e-06, "loss": 1.1755, "step": 20880 }, { "epoch": 38.61, "grad_norm": 3.4956252574920654, "learning_rate": 4.560443622920518e-06, "loss": 0.8149, "step": 20890 }, { "epoch": 38.63, "grad_norm": 7.759819984436035, "learning_rate": 4.553049907578558e-06, "loss": 1.183, "step": 20900 }, { "epoch": 38.65, "grad_norm": 7.2698140144348145, "learning_rate": 4.545656192236599e-06, "loss": 1.0231, "step": 20910 }, { "epoch": 38.67, "grad_norm": 7.051075458526611, "learning_rate": 4.53826247689464e-06, "loss": 1.2146, "step": 20920 }, { "epoch": 38.69, "grad_norm": 6.184384346008301, "learning_rate": 4.530868761552681e-06, "loss": 1.2993, "step": 20930 }, { "epoch": 38.71, "grad_norm": 6.691850185394287, "learning_rate": 4.523475046210721e-06, "loss": 0.8796, "step": 20940 }, { "epoch": 38.72, "grad_norm": 16.33539390563965, "learning_rate": 4.5160813308687616e-06, "loss": 1.305, "step": 20950 }, { "epoch": 38.74, "grad_norm": 7.03787088394165, "learning_rate": 4.508687615526803e-06, "loss": 1.3484, "step": 20960 }, { "epoch": 38.76, "grad_norm": 12.721395492553711, "learning_rate": 4.501293900184844e-06, "loss": 1.1831, "step": 20970 }, { "epoch": 38.78, "grad_norm": 5.511435031890869, "learning_rate": 4.4939001848428834e-06, "loss": 1.3844, "step": 20980 }, { "epoch": 38.8, "grad_norm": 4.584927082061768, "learning_rate": 4.486506469500925e-06, "loss": 0.8155, "step": 20990 }, { "epoch": 38.82, "grad_norm": 6.571628570556641, "learning_rate": 4.4791127541589656e-06, "loss": 1.0337, "step": 21000 }, { "epoch": 38.84, "grad_norm": 10.851576805114746, "learning_rate": 4.471719038817005e-06, "loss": 1.375, "step": 21010 }, { "epoch": 38.85, "grad_norm": 10.04033374786377, "learning_rate": 4.464325323475047e-06, "loss": 1.1856, "step": 21020 }, { "epoch": 38.87, "grad_norm": 11.2889986038208, "learning_rate": 4.4569316081330874e-06, "loss": 1.1926, "step": 21030 }, { "epoch": 38.89, "grad_norm": 5.160542011260986, "learning_rate": 4.449537892791128e-06, "loss": 1.1373, "step": 21040 }, { "epoch": 38.91, "grad_norm": 12.31169319152832, "learning_rate": 4.442144177449169e-06, "loss": 1.2447, "step": 21050 }, { "epoch": 38.93, "grad_norm": 1.767460584640503, "learning_rate": 4.434750462107209e-06, "loss": 0.9691, "step": 21060 }, { "epoch": 38.95, "grad_norm": 7.609524726867676, "learning_rate": 4.42735674676525e-06, "loss": 1.1238, "step": 21070 }, { "epoch": 38.96, "grad_norm": 8.611181259155273, "learning_rate": 4.419963031423291e-06, "loss": 1.3049, "step": 21080 }, { "epoch": 38.98, "grad_norm": 8.551547050476074, "learning_rate": 4.412569316081331e-06, "loss": 1.2441, "step": 21090 }, { "epoch": 39.0, "eval_accuracy": 0.8045738045738046, "eval_loss": 0.6659223437309265, "eval_runtime": 1.5688, "eval_samples_per_second": 306.612, "eval_steps_per_second": 38.884, "step": 21099 }, { "epoch": 39.0, "grad_norm": 5.903079986572266, "learning_rate": 4.405175600739372e-06, "loss": 0.9798, "step": 21100 }, { "epoch": 39.02, "grad_norm": 4.473245143890381, "learning_rate": 4.3977818853974125e-06, "loss": 1.1198, "step": 21110 }, { "epoch": 39.04, "grad_norm": 8.966312408447266, "learning_rate": 4.390388170055453e-06, "loss": 1.0961, "step": 21120 }, { "epoch": 39.06, "grad_norm": 7.582473278045654, "learning_rate": 4.382994454713494e-06, "loss": 1.1297, "step": 21130 }, { "epoch": 39.08, "grad_norm": 7.349318027496338, "learning_rate": 4.375600739371534e-06, "loss": 1.0849, "step": 21140 }, { "epoch": 39.09, "grad_norm": 6.938796520233154, "learning_rate": 4.368207024029575e-06, "loss": 1.3813, "step": 21150 }, { "epoch": 39.11, "grad_norm": 5.211213111877441, "learning_rate": 4.360813308687616e-06, "loss": 0.9574, "step": 21160 }, { "epoch": 39.13, "grad_norm": 3.8877034187316895, "learning_rate": 4.353419593345656e-06, "loss": 1.1691, "step": 21170 }, { "epoch": 39.15, "grad_norm": 10.763919830322266, "learning_rate": 4.346025878003698e-06, "loss": 1.4307, "step": 21180 }, { "epoch": 39.17, "grad_norm": 5.094861030578613, "learning_rate": 4.3386321626617375e-06, "loss": 1.022, "step": 21190 }, { "epoch": 39.19, "grad_norm": 17.348848342895508, "learning_rate": 4.331238447319778e-06, "loss": 1.3522, "step": 21200 }, { "epoch": 39.21, "grad_norm": 7.888131618499756, "learning_rate": 4.32384473197782e-06, "loss": 1.1083, "step": 21210 }, { "epoch": 39.22, "grad_norm": 4.147839546203613, "learning_rate": 4.3164510166358594e-06, "loss": 0.805, "step": 21220 }, { "epoch": 39.24, "grad_norm": 13.205792427062988, "learning_rate": 4.309057301293901e-06, "loss": 0.9715, "step": 21230 }, { "epoch": 39.26, "grad_norm": 4.6850762367248535, "learning_rate": 4.3016635859519416e-06, "loss": 1.0367, "step": 21240 }, { "epoch": 39.28, "grad_norm": 9.4832124710083, "learning_rate": 4.294269870609981e-06, "loss": 1.2558, "step": 21250 }, { "epoch": 39.3, "grad_norm": 10.164603233337402, "learning_rate": 4.286876155268023e-06, "loss": 1.3905, "step": 21260 }, { "epoch": 39.32, "grad_norm": 7.906488418579102, "learning_rate": 4.2794824399260634e-06, "loss": 1.1648, "step": 21270 }, { "epoch": 39.33, "grad_norm": 4.285562992095947, "learning_rate": 4.272088724584103e-06, "loss": 1.0882, "step": 21280 }, { "epoch": 39.35, "grad_norm": 8.726879119873047, "learning_rate": 4.264695009242145e-06, "loss": 1.5265, "step": 21290 }, { "epoch": 39.37, "grad_norm": 5.4351091384887695, "learning_rate": 4.257301293900185e-06, "loss": 1.1346, "step": 21300 }, { "epoch": 39.39, "grad_norm": 7.091952800750732, "learning_rate": 4.249907578558226e-06, "loss": 1.0584, "step": 21310 }, { "epoch": 39.41, "grad_norm": 8.407645225524902, "learning_rate": 4.242513863216267e-06, "loss": 1.3722, "step": 21320 }, { "epoch": 39.43, "grad_norm": 5.552674293518066, "learning_rate": 4.235120147874307e-06, "loss": 1.5521, "step": 21330 }, { "epoch": 39.45, "grad_norm": 9.063770294189453, "learning_rate": 4.227726432532348e-06, "loss": 1.2355, "step": 21340 }, { "epoch": 39.46, "grad_norm": 2.432421922683716, "learning_rate": 4.2203327171903885e-06, "loss": 1.109, "step": 21350 }, { "epoch": 39.48, "grad_norm": 4.620524883270264, "learning_rate": 4.212939001848429e-06, "loss": 1.1057, "step": 21360 }, { "epoch": 39.5, "grad_norm": 7.647237300872803, "learning_rate": 4.20554528650647e-06, "loss": 1.5667, "step": 21370 }, { "epoch": 39.52, "grad_norm": 9.697209358215332, "learning_rate": 4.19815157116451e-06, "loss": 1.3528, "step": 21380 }, { "epoch": 39.54, "grad_norm": 4.901544570922852, "learning_rate": 4.190757855822551e-06, "loss": 1.1718, "step": 21390 }, { "epoch": 39.56, "grad_norm": 5.735502243041992, "learning_rate": 4.183364140480592e-06, "loss": 1.0697, "step": 21400 }, { "epoch": 39.57, "grad_norm": 4.579695224761963, "learning_rate": 4.175970425138632e-06, "loss": 1.1813, "step": 21410 }, { "epoch": 39.59, "grad_norm": 5.901272296905518, "learning_rate": 4.168576709796674e-06, "loss": 1.0705, "step": 21420 }, { "epoch": 39.61, "grad_norm": 4.7719831466674805, "learning_rate": 4.1611829944547135e-06, "loss": 1.1433, "step": 21430 }, { "epoch": 39.63, "grad_norm": 3.0104100704193115, "learning_rate": 4.153789279112754e-06, "loss": 1.2149, "step": 21440 }, { "epoch": 39.65, "grad_norm": 3.282778739929199, "learning_rate": 4.146395563770796e-06, "loss": 1.0669, "step": 21450 }, { "epoch": 39.67, "grad_norm": 9.81007194519043, "learning_rate": 4.1390018484288354e-06, "loss": 1.5888, "step": 21460 }, { "epoch": 39.69, "grad_norm": 8.664229393005371, "learning_rate": 4.131608133086876e-06, "loss": 1.1155, "step": 21470 }, { "epoch": 39.7, "grad_norm": 11.130331039428711, "learning_rate": 4.1242144177449176e-06, "loss": 1.1204, "step": 21480 }, { "epoch": 39.72, "grad_norm": 8.101975440979004, "learning_rate": 4.116820702402957e-06, "loss": 1.38, "step": 21490 }, { "epoch": 39.74, "grad_norm": 7.883039474487305, "learning_rate": 4.109426987060999e-06, "loss": 1.2703, "step": 21500 }, { "epoch": 39.76, "grad_norm": 3.2466225624084473, "learning_rate": 4.1020332717190394e-06, "loss": 1.193, "step": 21510 }, { "epoch": 39.78, "grad_norm": 5.406262397766113, "learning_rate": 4.09463955637708e-06, "loss": 1.3175, "step": 21520 }, { "epoch": 39.8, "grad_norm": 1.568410873413086, "learning_rate": 4.087245841035121e-06, "loss": 1.0543, "step": 21530 }, { "epoch": 39.82, "grad_norm": 6.236209392547607, "learning_rate": 4.079852125693161e-06, "loss": 1.1797, "step": 21540 }, { "epoch": 39.83, "grad_norm": 4.249458312988281, "learning_rate": 4.072458410351202e-06, "loss": 1.3599, "step": 21550 }, { "epoch": 39.85, "grad_norm": 9.525856971740723, "learning_rate": 4.065064695009243e-06, "loss": 1.0938, "step": 21560 }, { "epoch": 39.87, "grad_norm": 8.077798843383789, "learning_rate": 4.057670979667283e-06, "loss": 1.0463, "step": 21570 }, { "epoch": 39.89, "grad_norm": 2.116171360015869, "learning_rate": 4.050277264325324e-06, "loss": 1.2397, "step": 21580 }, { "epoch": 39.91, "grad_norm": 7.62498664855957, "learning_rate": 4.0428835489833645e-06, "loss": 1.0805, "step": 21590 }, { "epoch": 39.93, "grad_norm": 4.843806266784668, "learning_rate": 4.035489833641405e-06, "loss": 1.2728, "step": 21600 }, { "epoch": 39.94, "grad_norm": 5.291747570037842, "learning_rate": 4.028096118299446e-06, "loss": 1.3373, "step": 21610 }, { "epoch": 39.96, "grad_norm": 2.741041898727417, "learning_rate": 4.020702402957486e-06, "loss": 1.1997, "step": 21620 }, { "epoch": 39.98, "grad_norm": 13.031542778015137, "learning_rate": 4.013308687615527e-06, "loss": 1.6366, "step": 21630 }, { "epoch": 40.0, "grad_norm": 6.157989501953125, "learning_rate": 4.005914972273568e-06, "loss": 1.2113, "step": 21640 }, { "epoch": 40.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.6709253191947937, "eval_runtime": 1.5477, "eval_samples_per_second": 310.788, "eval_steps_per_second": 39.414, "step": 21640 }, { "epoch": 40.02, "grad_norm": 10.915889739990234, "learning_rate": 3.998521256931608e-06, "loss": 1.0584, "step": 21650 }, { "epoch": 40.04, "grad_norm": 7.415562629699707, "learning_rate": 3.991127541589649e-06, "loss": 1.2491, "step": 21660 }, { "epoch": 40.06, "grad_norm": 8.103927612304688, "learning_rate": 3.9837338262476895e-06, "loss": 0.8975, "step": 21670 }, { "epoch": 40.07, "grad_norm": 3.8943428993225098, "learning_rate": 3.97634011090573e-06, "loss": 1.0223, "step": 21680 }, { "epoch": 40.09, "grad_norm": 7.421829700469971, "learning_rate": 3.968946395563772e-06, "loss": 1.0764, "step": 21690 }, { "epoch": 40.11, "grad_norm": 10.446610450744629, "learning_rate": 3.9615526802218114e-06, "loss": 1.24, "step": 21700 }, { "epoch": 40.13, "grad_norm": 5.46060848236084, "learning_rate": 3.954158964879852e-06, "loss": 0.9262, "step": 21710 }, { "epoch": 40.15, "grad_norm": 9.252046585083008, "learning_rate": 3.9467652495378936e-06, "loss": 1.1027, "step": 21720 }, { "epoch": 40.17, "grad_norm": 7.471773624420166, "learning_rate": 3.939371534195934e-06, "loss": 1.1581, "step": 21730 }, { "epoch": 40.18, "grad_norm": 6.522534370422363, "learning_rate": 3.931977818853974e-06, "loss": 1.1721, "step": 21740 }, { "epoch": 40.2, "grad_norm": 5.755692005157471, "learning_rate": 3.9245841035120154e-06, "loss": 1.0068, "step": 21750 }, { "epoch": 40.22, "grad_norm": 9.490312576293945, "learning_rate": 3.917190388170056e-06, "loss": 1.3492, "step": 21760 }, { "epoch": 40.24, "grad_norm": 10.51118278503418, "learning_rate": 3.909796672828096e-06, "loss": 1.3208, "step": 21770 }, { "epoch": 40.26, "grad_norm": 3.8750641345977783, "learning_rate": 3.902402957486137e-06, "loss": 1.0993, "step": 21780 }, { "epoch": 40.28, "grad_norm": 8.260970115661621, "learning_rate": 3.895009242144178e-06, "loss": 1.2171, "step": 21790 }, { "epoch": 40.3, "grad_norm": 6.847070693969727, "learning_rate": 3.887615526802219e-06, "loss": 1.2901, "step": 21800 }, { "epoch": 40.31, "grad_norm": 14.17623519897461, "learning_rate": 3.880221811460259e-06, "loss": 1.2127, "step": 21810 }, { "epoch": 40.33, "grad_norm": 6.718871116638184, "learning_rate": 3.8728280961183e-06, "loss": 1.1787, "step": 21820 }, { "epoch": 40.35, "grad_norm": 6.763284683227539, "learning_rate": 3.8654343807763405e-06, "loss": 1.0129, "step": 21830 }, { "epoch": 40.37, "grad_norm": 7.93693208694458, "learning_rate": 3.858040665434381e-06, "loss": 1.131, "step": 21840 }, { "epoch": 40.39, "grad_norm": 6.473512649536133, "learning_rate": 3.850646950092422e-06, "loss": 0.9589, "step": 21850 }, { "epoch": 40.41, "grad_norm": 8.757797241210938, "learning_rate": 3.843253234750462e-06, "loss": 1.3537, "step": 21860 }, { "epoch": 40.43, "grad_norm": 6.4209699630737305, "learning_rate": 3.835859519408503e-06, "loss": 1.133, "step": 21870 }, { "epoch": 40.44, "grad_norm": 9.058856964111328, "learning_rate": 3.828465804066544e-06, "loss": 1.3107, "step": 21880 }, { "epoch": 40.46, "grad_norm": 6.221926212310791, "learning_rate": 3.821072088724584e-06, "loss": 1.0265, "step": 21890 }, { "epoch": 40.48, "grad_norm": 9.985230445861816, "learning_rate": 3.813678373382625e-06, "loss": 1.462, "step": 21900 }, { "epoch": 40.5, "grad_norm": 7.49562406539917, "learning_rate": 3.806284658040666e-06, "loss": 1.1859, "step": 21910 }, { "epoch": 40.52, "grad_norm": 8.545808792114258, "learning_rate": 3.7996303142329024e-06, "loss": 1.3747, "step": 21920 }, { "epoch": 40.54, "grad_norm": 12.572465896606445, "learning_rate": 3.792236598890943e-06, "loss": 1.2045, "step": 21930 }, { "epoch": 40.55, "grad_norm": 9.94540786743164, "learning_rate": 3.7848428835489836e-06, "loss": 1.3367, "step": 21940 }, { "epoch": 40.57, "grad_norm": 6.534420013427734, "learning_rate": 3.7774491682070243e-06, "loss": 0.9764, "step": 21950 }, { "epoch": 40.59, "grad_norm": 8.174333572387695, "learning_rate": 3.7700554528650653e-06, "loss": 1.0631, "step": 21960 }, { "epoch": 40.61, "grad_norm": 8.270979881286621, "learning_rate": 3.7626617375231055e-06, "loss": 1.3404, "step": 21970 }, { "epoch": 40.63, "grad_norm": 12.575870513916016, "learning_rate": 3.755268022181146e-06, "loss": 1.4472, "step": 21980 }, { "epoch": 40.65, "grad_norm": 13.39450454711914, "learning_rate": 3.747874306839187e-06, "loss": 1.1765, "step": 21990 }, { "epoch": 40.67, "grad_norm": 6.792745113372803, "learning_rate": 3.7404805914972274e-06, "loss": 1.0369, "step": 22000 }, { "epoch": 40.68, "grad_norm": 10.410972595214844, "learning_rate": 3.7330868761552685e-06, "loss": 1.2502, "step": 22010 }, { "epoch": 40.7, "grad_norm": 8.359578132629395, "learning_rate": 3.725693160813309e-06, "loss": 1.2176, "step": 22020 }, { "epoch": 40.72, "grad_norm": 12.54337215423584, "learning_rate": 3.7182994454713493e-06, "loss": 1.4204, "step": 22030 }, { "epoch": 40.74, "grad_norm": 8.721573829650879, "learning_rate": 3.7109057301293904e-06, "loss": 1.1316, "step": 22040 }, { "epoch": 40.76, "grad_norm": 3.8864657878875732, "learning_rate": 3.703512014787431e-06, "loss": 0.8573, "step": 22050 }, { "epoch": 40.78, "grad_norm": 3.844533681869507, "learning_rate": 3.696118299445471e-06, "loss": 1.0257, "step": 22060 }, { "epoch": 40.79, "grad_norm": 4.964743614196777, "learning_rate": 3.6887245841035123e-06, "loss": 1.1429, "step": 22070 }, { "epoch": 40.81, "grad_norm": 6.386358737945557, "learning_rate": 3.681330868761553e-06, "loss": 1.167, "step": 22080 }, { "epoch": 40.83, "grad_norm": 4.903900146484375, "learning_rate": 3.673937153419594e-06, "loss": 1.2082, "step": 22090 }, { "epoch": 40.85, "grad_norm": 3.482051372528076, "learning_rate": 3.666543438077634e-06, "loss": 1.1616, "step": 22100 }, { "epoch": 40.87, "grad_norm": 6.738770484924316, "learning_rate": 3.6591497227356748e-06, "loss": 1.0618, "step": 22110 }, { "epoch": 40.89, "grad_norm": 6.552333354949951, "learning_rate": 3.651756007393716e-06, "loss": 1.0336, "step": 22120 }, { "epoch": 40.91, "grad_norm": 9.507050514221191, "learning_rate": 3.6443622920517565e-06, "loss": 1.2359, "step": 22130 }, { "epoch": 40.92, "grad_norm": 7.443932056427002, "learning_rate": 3.6369685767097967e-06, "loss": 0.9336, "step": 22140 }, { "epoch": 40.94, "grad_norm": 5.044437885284424, "learning_rate": 3.6295748613678377e-06, "loss": 1.2695, "step": 22150 }, { "epoch": 40.96, "grad_norm": 6.375547409057617, "learning_rate": 3.6221811460258784e-06, "loss": 1.2259, "step": 22160 }, { "epoch": 40.98, "grad_norm": 7.536994457244873, "learning_rate": 3.6147874306839194e-06, "loss": 1.2648, "step": 22170 }, { "epoch": 41.0, "grad_norm": 8.181557655334473, "learning_rate": 3.6073937153419596e-06, "loss": 1.1608, "step": 22180 }, { "epoch": 41.0, "eval_accuracy": 0.7983367983367984, "eval_loss": 0.6629524230957031, "eval_runtime": 1.6653, "eval_samples_per_second": 288.841, "eval_steps_per_second": 36.631, "step": 22181 }, { "epoch": 41.02, "grad_norm": 12.933876037597656, "learning_rate": 3.6000000000000003e-06, "loss": 1.1286, "step": 22190 }, { "epoch": 41.04, "grad_norm": 4.056218147277832, "learning_rate": 3.5926062846580413e-06, "loss": 1.0519, "step": 22200 }, { "epoch": 41.05, "grad_norm": 2.6176669597625732, "learning_rate": 3.5852125693160815e-06, "loss": 0.8233, "step": 22210 }, { "epoch": 41.07, "grad_norm": 9.081538200378418, "learning_rate": 3.577818853974122e-06, "loss": 1.012, "step": 22220 }, { "epoch": 41.09, "grad_norm": 2.9023985862731934, "learning_rate": 3.570425138632163e-06, "loss": 1.1743, "step": 22230 }, { "epoch": 41.11, "grad_norm": 7.250444412231445, "learning_rate": 3.5630314232902034e-06, "loss": 1.083, "step": 22240 }, { "epoch": 41.13, "grad_norm": 4.03535795211792, "learning_rate": 3.555637707948244e-06, "loss": 0.7915, "step": 22250 }, { "epoch": 41.15, "grad_norm": 4.8667893409729, "learning_rate": 3.548243992606285e-06, "loss": 1.2715, "step": 22260 }, { "epoch": 41.16, "grad_norm": 7.642014980316162, "learning_rate": 3.5408502772643253e-06, "loss": 1.1286, "step": 22270 }, { "epoch": 41.18, "grad_norm": 8.162959098815918, "learning_rate": 3.5334565619223664e-06, "loss": 1.2848, "step": 22280 }, { "epoch": 41.2, "grad_norm": 8.562190055847168, "learning_rate": 3.526062846580407e-06, "loss": 1.2994, "step": 22290 }, { "epoch": 41.22, "grad_norm": 1.5803712606430054, "learning_rate": 3.5186691312384476e-06, "loss": 1.0266, "step": 22300 }, { "epoch": 41.24, "grad_norm": 17.228378295898438, "learning_rate": 3.5112754158964883e-06, "loss": 1.1853, "step": 22310 }, { "epoch": 41.26, "grad_norm": 6.266294002532959, "learning_rate": 3.503881700554529e-06, "loss": 1.1093, "step": 22320 }, { "epoch": 41.28, "grad_norm": 13.267244338989258, "learning_rate": 3.4964879852125695e-06, "loss": 1.4198, "step": 22330 }, { "epoch": 41.29, "grad_norm": 8.88495922088623, "learning_rate": 3.4890942698706106e-06, "loss": 1.3771, "step": 22340 }, { "epoch": 41.31, "grad_norm": 12.439823150634766, "learning_rate": 3.4817005545286508e-06, "loss": 0.9893, "step": 22350 }, { "epoch": 41.33, "grad_norm": 4.663150787353516, "learning_rate": 3.474306839186692e-06, "loss": 0.8591, "step": 22360 }, { "epoch": 41.35, "grad_norm": 21.777122497558594, "learning_rate": 3.4669131238447325e-06, "loss": 1.199, "step": 22370 }, { "epoch": 41.37, "grad_norm": 11.409698486328125, "learning_rate": 3.4595194085027727e-06, "loss": 1.1828, "step": 22380 }, { "epoch": 41.39, "grad_norm": 4.921295166015625, "learning_rate": 3.4521256931608137e-06, "loss": 1.263, "step": 22390 }, { "epoch": 41.4, "grad_norm": 13.66185188293457, "learning_rate": 3.4447319778188544e-06, "loss": 1.4518, "step": 22400 }, { "epoch": 41.42, "grad_norm": 4.852383136749268, "learning_rate": 3.4373382624768946e-06, "loss": 1.0418, "step": 22410 }, { "epoch": 41.44, "grad_norm": 9.027070999145508, "learning_rate": 3.4299445471349356e-06, "loss": 1.3631, "step": 22420 }, { "epoch": 41.46, "grad_norm": 6.748485565185547, "learning_rate": 3.4225508317929763e-06, "loss": 0.9902, "step": 22430 }, { "epoch": 41.48, "grad_norm": 6.7547287940979, "learning_rate": 3.4151571164510165e-06, "loss": 1.0483, "step": 22440 }, { "epoch": 41.5, "grad_norm": 3.6746206283569336, "learning_rate": 3.4077634011090575e-06, "loss": 1.1947, "step": 22450 }, { "epoch": 41.52, "grad_norm": 9.561907768249512, "learning_rate": 3.400369685767098e-06, "loss": 1.0977, "step": 22460 }, { "epoch": 41.53, "grad_norm": 7.021154403686523, "learning_rate": 3.392975970425139e-06, "loss": 1.2763, "step": 22470 }, { "epoch": 41.55, "grad_norm": 12.588698387145996, "learning_rate": 3.3855822550831794e-06, "loss": 0.8164, "step": 22480 }, { "epoch": 41.57, "grad_norm": 9.636119842529297, "learning_rate": 3.37818853974122e-06, "loss": 1.1789, "step": 22490 }, { "epoch": 41.59, "grad_norm": 6.060165882110596, "learning_rate": 3.370794824399261e-06, "loss": 1.1034, "step": 22500 }, { "epoch": 41.61, "grad_norm": 7.080883026123047, "learning_rate": 3.3634011090573017e-06, "loss": 1.0106, "step": 22510 }, { "epoch": 41.63, "grad_norm": 7.8832502365112305, "learning_rate": 3.356007393715342e-06, "loss": 1.4297, "step": 22520 }, { "epoch": 41.65, "grad_norm": 11.75303840637207, "learning_rate": 3.348613678373383e-06, "loss": 1.1544, "step": 22530 }, { "epoch": 41.66, "grad_norm": 2.066429853439331, "learning_rate": 3.3412199630314236e-06, "loss": 0.8044, "step": 22540 }, { "epoch": 41.68, "grad_norm": 8.18702220916748, "learning_rate": 3.3338262476894647e-06, "loss": 1.3205, "step": 22550 }, { "epoch": 41.7, "grad_norm": 15.409958839416504, "learning_rate": 3.326432532347505e-06, "loss": 1.5227, "step": 22560 }, { "epoch": 41.72, "grad_norm": 9.410429954528809, "learning_rate": 3.3190388170055455e-06, "loss": 1.3215, "step": 22570 }, { "epoch": 41.74, "grad_norm": 11.729928016662598, "learning_rate": 3.3116451016635866e-06, "loss": 1.4807, "step": 22580 }, { "epoch": 41.76, "grad_norm": 8.896803855895996, "learning_rate": 3.3042513863216268e-06, "loss": 1.0058, "step": 22590 }, { "epoch": 41.77, "grad_norm": 8.337052345275879, "learning_rate": 3.2968576709796674e-06, "loss": 1.0914, "step": 22600 }, { "epoch": 41.79, "grad_norm": 5.618338584899902, "learning_rate": 3.2894639556377085e-06, "loss": 1.3185, "step": 22610 }, { "epoch": 41.81, "grad_norm": 8.748095512390137, "learning_rate": 3.2820702402957487e-06, "loss": 1.1208, "step": 22620 }, { "epoch": 41.83, "grad_norm": 5.469058990478516, "learning_rate": 3.2746765249537893e-06, "loss": 0.8211, "step": 22630 }, { "epoch": 41.85, "grad_norm": 6.410799026489258, "learning_rate": 3.2672828096118304e-06, "loss": 1.1915, "step": 22640 }, { "epoch": 41.87, "grad_norm": 16.191852569580078, "learning_rate": 3.2598890942698706e-06, "loss": 1.483, "step": 22650 }, { "epoch": 41.89, "grad_norm": 6.657191276550293, "learning_rate": 3.2524953789279116e-06, "loss": 1.1749, "step": 22660 }, { "epoch": 41.9, "grad_norm": 4.957394123077393, "learning_rate": 3.2451016635859523e-06, "loss": 1.0837, "step": 22670 }, { "epoch": 41.92, "grad_norm": 6.187136173248291, "learning_rate": 3.237707948243993e-06, "loss": 1.0458, "step": 22680 }, { "epoch": 41.94, "grad_norm": 10.597700119018555, "learning_rate": 3.2303142329020335e-06, "loss": 1.2621, "step": 22690 }, { "epoch": 41.96, "grad_norm": 10.295279502868652, "learning_rate": 3.222920517560074e-06, "loss": 1.1766, "step": 22700 }, { "epoch": 41.98, "grad_norm": 6.097641944885254, "learning_rate": 3.2155268022181148e-06, "loss": 0.9648, "step": 22710 }, { "epoch": 42.0, "grad_norm": 7.094331741333008, "learning_rate": 3.208133086876156e-06, "loss": 1.266, "step": 22720 }, { "epoch": 42.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.6693481802940369, "eval_runtime": 1.5743, "eval_samples_per_second": 305.529, "eval_steps_per_second": 38.747, "step": 22722 }, { "epoch": 42.01, "grad_norm": 16.76295280456543, "learning_rate": 3.200739371534196e-06, "loss": 1.3903, "step": 22730 }, { "epoch": 42.03, "grad_norm": 14.50536823272705, "learning_rate": 3.193345656192237e-06, "loss": 1.3731, "step": 22740 }, { "epoch": 42.05, "grad_norm": 10.18923568725586, "learning_rate": 3.1859519408502777e-06, "loss": 1.1523, "step": 22750 }, { "epoch": 42.07, "grad_norm": 7.3155927658081055, "learning_rate": 3.178558225508318e-06, "loss": 1.1706, "step": 22760 }, { "epoch": 42.09, "grad_norm": 5.523043632507324, "learning_rate": 3.171164510166359e-06, "loss": 1.5178, "step": 22770 }, { "epoch": 42.11, "grad_norm": 6.9987263679504395, "learning_rate": 3.1637707948243996e-06, "loss": 1.277, "step": 22780 }, { "epoch": 42.13, "grad_norm": 6.800637722015381, "learning_rate": 3.15637707948244e-06, "loss": 1.1725, "step": 22790 }, { "epoch": 42.14, "grad_norm": 8.869393348693848, "learning_rate": 3.148983364140481e-06, "loss": 1.2961, "step": 22800 }, { "epoch": 42.16, "grad_norm": 6.978027820587158, "learning_rate": 3.1415896487985215e-06, "loss": 0.9686, "step": 22810 }, { "epoch": 42.18, "grad_norm": 6.952821254730225, "learning_rate": 3.1341959334565617e-06, "loss": 0.9869, "step": 22820 }, { "epoch": 42.2, "grad_norm": 10.829163551330566, "learning_rate": 3.1268022181146028e-06, "loss": 1.1908, "step": 22830 }, { "epoch": 42.22, "grad_norm": 8.889128684997559, "learning_rate": 3.1194085027726434e-06, "loss": 1.3801, "step": 22840 }, { "epoch": 42.24, "grad_norm": 9.083499908447266, "learning_rate": 3.1120147874306845e-06, "loss": 1.4646, "step": 22850 }, { "epoch": 42.26, "grad_norm": 8.524020195007324, "learning_rate": 3.1046210720887247e-06, "loss": 1.2184, "step": 22860 }, { "epoch": 42.27, "grad_norm": 6.408885955810547, "learning_rate": 3.0972273567467653e-06, "loss": 0.8566, "step": 22870 }, { "epoch": 42.29, "grad_norm": 7.174074649810791, "learning_rate": 3.0898336414048064e-06, "loss": 0.922, "step": 22880 }, { "epoch": 42.31, "grad_norm": 10.796005249023438, "learning_rate": 3.082439926062847e-06, "loss": 1.0765, "step": 22890 }, { "epoch": 42.33, "grad_norm": 11.088624954223633, "learning_rate": 3.075046210720887e-06, "loss": 1.2475, "step": 22900 }, { "epoch": 42.35, "grad_norm": 10.426238059997559, "learning_rate": 3.0676524953789283e-06, "loss": 0.9394, "step": 22910 }, { "epoch": 42.37, "grad_norm": 3.8717520236968994, "learning_rate": 3.060258780036969e-06, "loss": 1.0511, "step": 22920 }, { "epoch": 42.38, "grad_norm": 15.05356502532959, "learning_rate": 3.05286506469501e-06, "loss": 1.098, "step": 22930 }, { "epoch": 42.4, "grad_norm": 4.535237789154053, "learning_rate": 3.04547134935305e-06, "loss": 1.0687, "step": 22940 }, { "epoch": 42.42, "grad_norm": 6.5346293449401855, "learning_rate": 3.0380776340110908e-06, "loss": 0.9871, "step": 22950 }, { "epoch": 42.44, "grad_norm": 13.312052726745605, "learning_rate": 3.030683918669132e-06, "loss": 1.1112, "step": 22960 }, { "epoch": 42.46, "grad_norm": 6.180700302124023, "learning_rate": 3.023290203327172e-06, "loss": 1.1445, "step": 22970 }, { "epoch": 42.48, "grad_norm": 12.029324531555176, "learning_rate": 3.0158964879852127e-06, "loss": 1.1781, "step": 22980 }, { "epoch": 42.5, "grad_norm": 4.146298885345459, "learning_rate": 3.0085027726432537e-06, "loss": 0.858, "step": 22990 }, { "epoch": 42.51, "grad_norm": 6.0574774742126465, "learning_rate": 3.001109057301294e-06, "loss": 1.0819, "step": 23000 }, { "epoch": 42.53, "grad_norm": 6.028679847717285, "learning_rate": 2.9937153419593346e-06, "loss": 1.0059, "step": 23010 }, { "epoch": 42.55, "grad_norm": 8.024667739868164, "learning_rate": 2.9863216266173756e-06, "loss": 1.1921, "step": 23020 }, { "epoch": 42.57, "grad_norm": 16.767513275146484, "learning_rate": 2.978927911275416e-06, "loss": 0.8649, "step": 23030 }, { "epoch": 42.59, "grad_norm": 3.708163261413574, "learning_rate": 2.971534195933457e-06, "loss": 1.0568, "step": 23040 }, { "epoch": 42.61, "grad_norm": 8.879960060119629, "learning_rate": 2.9641404805914975e-06, "loss": 1.0181, "step": 23050 }, { "epoch": 42.62, "grad_norm": 4.3958611488342285, "learning_rate": 2.956746765249538e-06, "loss": 1.4971, "step": 23060 }, { "epoch": 42.64, "grad_norm": 5.4099931716918945, "learning_rate": 2.9493530499075788e-06, "loss": 1.226, "step": 23070 }, { "epoch": 42.66, "grad_norm": 4.1348557472229, "learning_rate": 2.9419593345656194e-06, "loss": 1.0288, "step": 23080 }, { "epoch": 42.68, "grad_norm": 5.87477970123291, "learning_rate": 2.93456561922366e-06, "loss": 1.2729, "step": 23090 }, { "epoch": 42.7, "grad_norm": 6.150369167327881, "learning_rate": 2.927171903881701e-06, "loss": 1.087, "step": 23100 }, { "epoch": 42.72, "grad_norm": 6.187104225158691, "learning_rate": 2.9197781885397413e-06, "loss": 1.3971, "step": 23110 }, { "epoch": 42.74, "grad_norm": 3.1527178287506104, "learning_rate": 2.9123844731977824e-06, "loss": 1.1299, "step": 23120 }, { "epoch": 42.75, "grad_norm": 10.348135948181152, "learning_rate": 2.9057301293900188e-06, "loss": 1.5085, "step": 23130 }, { "epoch": 42.77, "grad_norm": 10.59716796875, "learning_rate": 2.89833641404806e-06, "loss": 1.146, "step": 23140 }, { "epoch": 42.79, "grad_norm": 9.978230476379395, "learning_rate": 2.8909426987061e-06, "loss": 1.1461, "step": 23150 }, { "epoch": 42.81, "grad_norm": 14.321269035339355, "learning_rate": 2.8835489833641407e-06, "loss": 1.1547, "step": 23160 }, { "epoch": 42.83, "grad_norm": 7.4958672523498535, "learning_rate": 2.8761552680221817e-06, "loss": 0.9557, "step": 23170 }, { "epoch": 42.85, "grad_norm": 9.327839851379395, "learning_rate": 2.868761552680222e-06, "loss": 1.255, "step": 23180 }, { "epoch": 42.87, "grad_norm": 9.457281112670898, "learning_rate": 2.8613678373382625e-06, "loss": 1.1878, "step": 23190 }, { "epoch": 42.88, "grad_norm": 6.100465297698975, "learning_rate": 2.8539741219963036e-06, "loss": 1.0268, "step": 23200 }, { "epoch": 42.9, "grad_norm": 7.094675064086914, "learning_rate": 2.846580406654344e-06, "loss": 1.4708, "step": 23210 }, { "epoch": 42.92, "grad_norm": 2.2383298873901367, "learning_rate": 2.8391866913123844e-06, "loss": 1.4038, "step": 23220 }, { "epoch": 42.94, "grad_norm": 3.7855396270751953, "learning_rate": 2.8317929759704255e-06, "loss": 0.9866, "step": 23230 }, { "epoch": 42.96, "grad_norm": 14.928522109985352, "learning_rate": 2.824399260628466e-06, "loss": 1.0129, "step": 23240 }, { "epoch": 42.98, "grad_norm": 10.252041816711426, "learning_rate": 2.8170055452865068e-06, "loss": 1.0213, "step": 23250 }, { "epoch": 42.99, "grad_norm": 9.599827766418457, "learning_rate": 2.8096118299445474e-06, "loss": 0.9426, "step": 23260 }, { "epoch": 43.0, "eval_accuracy": 0.8045738045738046, "eval_loss": 0.6638891696929932, "eval_runtime": 1.6023, "eval_samples_per_second": 300.196, "eval_steps_per_second": 38.071, "step": 23263 }, { "epoch": 43.01, "grad_norm": 10.553892135620117, "learning_rate": 2.802218114602588e-06, "loss": 0.9214, "step": 23270 }, { "epoch": 43.03, "grad_norm": 6.096414089202881, "learning_rate": 2.794824399260629e-06, "loss": 1.1185, "step": 23280 }, { "epoch": 43.05, "grad_norm": 5.5379157066345215, "learning_rate": 2.7874306839186693e-06, "loss": 1.545, "step": 23290 }, { "epoch": 43.07, "grad_norm": 11.210318565368652, "learning_rate": 2.78003696857671e-06, "loss": 1.1231, "step": 23300 }, { "epoch": 43.09, "grad_norm": 4.23773193359375, "learning_rate": 2.772643253234751e-06, "loss": 1.2371, "step": 23310 }, { "epoch": 43.11, "grad_norm": 5.900633811950684, "learning_rate": 2.765249537892791e-06, "loss": 1.1485, "step": 23320 }, { "epoch": 43.12, "grad_norm": 8.045002937316895, "learning_rate": 2.7578558225508322e-06, "loss": 0.874, "step": 23330 }, { "epoch": 43.14, "grad_norm": 9.322183609008789, "learning_rate": 2.750462107208873e-06, "loss": 1.1256, "step": 23340 }, { "epoch": 43.16, "grad_norm": 7.504339218139648, "learning_rate": 2.743068391866913e-06, "loss": 1.0906, "step": 23350 }, { "epoch": 43.18, "grad_norm": 6.6838202476501465, "learning_rate": 2.735674676524954e-06, "loss": 1.2106, "step": 23360 }, { "epoch": 43.2, "grad_norm": 7.609870433807373, "learning_rate": 2.7282809611829948e-06, "loss": 1.0514, "step": 23370 }, { "epoch": 43.22, "grad_norm": 12.557002067565918, "learning_rate": 2.720887245841035e-06, "loss": 0.9509, "step": 23380 }, { "epoch": 43.23, "grad_norm": 8.096417427062988, "learning_rate": 2.713493530499076e-06, "loss": 1.1292, "step": 23390 }, { "epoch": 43.25, "grad_norm": 5.552764892578125, "learning_rate": 2.7060998151571167e-06, "loss": 1.0721, "step": 23400 }, { "epoch": 43.27, "grad_norm": 8.039366722106934, "learning_rate": 2.6987060998151573e-06, "loss": 1.0084, "step": 23410 }, { "epoch": 43.29, "grad_norm": 4.067732810974121, "learning_rate": 2.691312384473198e-06, "loss": 1.2759, "step": 23420 }, { "epoch": 43.31, "grad_norm": 7.529853343963623, "learning_rate": 2.6839186691312385e-06, "loss": 1.0306, "step": 23430 }, { "epoch": 43.33, "grad_norm": 5.951377868652344, "learning_rate": 2.6765249537892796e-06, "loss": 1.161, "step": 23440 }, { "epoch": 43.35, "grad_norm": 11.756484031677246, "learning_rate": 2.6691312384473202e-06, "loss": 0.8822, "step": 23450 }, { "epoch": 43.36, "grad_norm": 8.193309783935547, "learning_rate": 2.6617375231053604e-06, "loss": 0.905, "step": 23460 }, { "epoch": 43.38, "grad_norm": 8.248974800109863, "learning_rate": 2.6543438077634015e-06, "loss": 1.2437, "step": 23470 }, { "epoch": 43.4, "grad_norm": 8.967804908752441, "learning_rate": 2.646950092421442e-06, "loss": 1.2375, "step": 23480 }, { "epoch": 43.42, "grad_norm": 8.81059455871582, "learning_rate": 2.6395563770794823e-06, "loss": 1.0427, "step": 23490 }, { "epoch": 43.44, "grad_norm": 13.394253730773926, "learning_rate": 2.6321626617375234e-06, "loss": 1.0139, "step": 23500 }, { "epoch": 43.46, "grad_norm": 6.634346008300781, "learning_rate": 2.624768946395564e-06, "loss": 1.4453, "step": 23510 }, { "epoch": 43.48, "grad_norm": 8.625813484191895, "learning_rate": 2.617375231053605e-06, "loss": 1.5067, "step": 23520 }, { "epoch": 43.49, "grad_norm": 6.4123854637146, "learning_rate": 2.6099815157116453e-06, "loss": 0.9091, "step": 23530 }, { "epoch": 43.51, "grad_norm": 12.923445701599121, "learning_rate": 2.602587800369686e-06, "loss": 1.3522, "step": 23540 }, { "epoch": 43.53, "grad_norm": 6.408738136291504, "learning_rate": 2.595194085027727e-06, "loss": 1.2697, "step": 23550 }, { "epoch": 43.55, "grad_norm": 4.713378429412842, "learning_rate": 2.587800369685767e-06, "loss": 1.3214, "step": 23560 }, { "epoch": 43.57, "grad_norm": 5.740024566650391, "learning_rate": 2.580406654343808e-06, "loss": 1.0147, "step": 23570 }, { "epoch": 43.59, "grad_norm": 8.7548828125, "learning_rate": 2.573012939001849e-06, "loss": 0.9403, "step": 23580 }, { "epoch": 43.6, "grad_norm": 11.830516815185547, "learning_rate": 2.565619223659889e-06, "loss": 1.0061, "step": 23590 }, { "epoch": 43.62, "grad_norm": 10.120180130004883, "learning_rate": 2.5582255083179297e-06, "loss": 1.0429, "step": 23600 }, { "epoch": 43.64, "grad_norm": 8.686736106872559, "learning_rate": 2.5508317929759708e-06, "loss": 1.0404, "step": 23610 }, { "epoch": 43.66, "grad_norm": 9.89268684387207, "learning_rate": 2.5434380776340114e-06, "loss": 1.039, "step": 23620 }, { "epoch": 43.68, "grad_norm": 8.8170804977417, "learning_rate": 2.536044362292052e-06, "loss": 1.3099, "step": 23630 }, { "epoch": 43.7, "grad_norm": 10.836409568786621, "learning_rate": 2.5286506469500927e-06, "loss": 1.101, "step": 23640 }, { "epoch": 43.72, "grad_norm": 9.239439010620117, "learning_rate": 2.5212569316081333e-06, "loss": 1.5293, "step": 23650 }, { "epoch": 43.73, "grad_norm": 5.881317615509033, "learning_rate": 2.5138632162661743e-06, "loss": 0.9526, "step": 23660 }, { "epoch": 43.75, "grad_norm": 6.735204696655273, "learning_rate": 2.5064695009242145e-06, "loss": 1.2219, "step": 23670 }, { "epoch": 43.77, "grad_norm": 6.46926736831665, "learning_rate": 2.499075785582255e-06, "loss": 1.3036, "step": 23680 }, { "epoch": 43.79, "grad_norm": 6.994606971740723, "learning_rate": 2.4916820702402962e-06, "loss": 1.1432, "step": 23690 }, { "epoch": 43.81, "grad_norm": 7.303675651550293, "learning_rate": 2.4842883548983364e-06, "loss": 1.199, "step": 23700 }, { "epoch": 43.83, "grad_norm": 10.007107734680176, "learning_rate": 2.476894639556377e-06, "loss": 0.9863, "step": 23710 }, { "epoch": 43.84, "grad_norm": 10.816102027893066, "learning_rate": 2.469500924214418e-06, "loss": 0.7131, "step": 23720 }, { "epoch": 43.86, "grad_norm": 10.212610244750977, "learning_rate": 2.4621072088724588e-06, "loss": 1.1726, "step": 23730 }, { "epoch": 43.88, "grad_norm": 3.2680602073669434, "learning_rate": 2.4547134935304994e-06, "loss": 1.0264, "step": 23740 }, { "epoch": 43.9, "grad_norm": 12.915078163146973, "learning_rate": 2.44731977818854e-06, "loss": 1.2129, "step": 23750 }, { "epoch": 43.92, "grad_norm": 4.946972370147705, "learning_rate": 2.4399260628465807e-06, "loss": 0.958, "step": 23760 }, { "epoch": 43.94, "grad_norm": 6.468936920166016, "learning_rate": 2.4325323475046213e-06, "loss": 1.2393, "step": 23770 }, { "epoch": 43.96, "grad_norm": 5.527215957641602, "learning_rate": 2.425138632162662e-06, "loss": 1.2316, "step": 23780 }, { "epoch": 43.97, "grad_norm": 7.687479496002197, "learning_rate": 2.4177449168207025e-06, "loss": 0.8435, "step": 23790 }, { "epoch": 43.99, "grad_norm": 6.82960844039917, "learning_rate": 2.410351201478743e-06, "loss": 1.0066, "step": 23800 }, { "epoch": 44.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.6635833382606506, "eval_runtime": 1.5226, "eval_samples_per_second": 315.908, "eval_steps_per_second": 40.063, "step": 23804 }, { "epoch": 44.01, "grad_norm": 7.3728508949279785, "learning_rate": 2.402957486136784e-06, "loss": 1.2433, "step": 23810 }, { "epoch": 44.03, "grad_norm": 11.778696060180664, "learning_rate": 2.3955637707948244e-06, "loss": 1.0628, "step": 23820 }, { "epoch": 44.05, "grad_norm": 6.90411376953125, "learning_rate": 2.3881700554528655e-06, "loss": 0.775, "step": 23830 }, { "epoch": 44.07, "grad_norm": 3.3797664642333984, "learning_rate": 2.380776340110906e-06, "loss": 0.9491, "step": 23840 }, { "epoch": 44.09, "grad_norm": 2.678579568862915, "learning_rate": 2.3733826247689463e-06, "loss": 1.0524, "step": 23850 }, { "epoch": 44.1, "grad_norm": 7.436522483825684, "learning_rate": 2.3659889094269874e-06, "loss": 1.3245, "step": 23860 }, { "epoch": 44.12, "grad_norm": 10.529536247253418, "learning_rate": 2.358595194085028e-06, "loss": 1.4934, "step": 23870 }, { "epoch": 44.14, "grad_norm": 9.76175594329834, "learning_rate": 2.3512014787430687e-06, "loss": 1.2664, "step": 23880 }, { "epoch": 44.16, "grad_norm": 5.443526744842529, "learning_rate": 2.3438077634011093e-06, "loss": 1.1094, "step": 23890 }, { "epoch": 44.18, "grad_norm": 5.521455764770508, "learning_rate": 2.33641404805915e-06, "loss": 1.0088, "step": 23900 }, { "epoch": 44.2, "grad_norm": 12.80307388305664, "learning_rate": 2.3290203327171905e-06, "loss": 1.2794, "step": 23910 }, { "epoch": 44.21, "grad_norm": 9.795181274414062, "learning_rate": 2.321626617375231e-06, "loss": 1.0913, "step": 23920 }, { "epoch": 44.23, "grad_norm": 9.864699363708496, "learning_rate": 2.314232902033272e-06, "loss": 1.0124, "step": 23930 }, { "epoch": 44.25, "grad_norm": 9.094242095947266, "learning_rate": 2.3068391866913124e-06, "loss": 0.9844, "step": 23940 }, { "epoch": 44.27, "grad_norm": 10.510156631469727, "learning_rate": 2.2994454713493535e-06, "loss": 1.1751, "step": 23950 }, { "epoch": 44.29, "grad_norm": 7.642958164215088, "learning_rate": 2.2920517560073937e-06, "loss": 1.0351, "step": 23960 }, { "epoch": 44.31, "grad_norm": 10.634533882141113, "learning_rate": 2.2846580406654343e-06, "loss": 1.4359, "step": 23970 }, { "epoch": 44.33, "grad_norm": 15.84269905090332, "learning_rate": 2.2772643253234754e-06, "loss": 1.1215, "step": 23980 }, { "epoch": 44.34, "grad_norm": 10.358230590820312, "learning_rate": 2.269870609981516e-06, "loss": 1.2551, "step": 23990 }, { "epoch": 44.36, "grad_norm": 5.3606085777282715, "learning_rate": 2.2624768946395567e-06, "loss": 1.151, "step": 24000 }, { "epoch": 44.38, "grad_norm": 10.22650146484375, "learning_rate": 2.2550831792975973e-06, "loss": 1.158, "step": 24010 }, { "epoch": 44.4, "grad_norm": 12.180791854858398, "learning_rate": 2.247689463955638e-06, "loss": 1.2293, "step": 24020 }, { "epoch": 44.42, "grad_norm": 10.044556617736816, "learning_rate": 2.2402957486136785e-06, "loss": 1.1697, "step": 24030 }, { "epoch": 44.44, "grad_norm": 11.277567863464355, "learning_rate": 2.232902033271719e-06, "loss": 1.3498, "step": 24040 }, { "epoch": 44.45, "grad_norm": 10.25592041015625, "learning_rate": 2.22550831792976e-06, "loss": 0.9857, "step": 24050 }, { "epoch": 44.47, "grad_norm": 4.847921371459961, "learning_rate": 2.2181146025878004e-06, "loss": 1.023, "step": 24060 }, { "epoch": 44.49, "grad_norm": 5.861019611358643, "learning_rate": 2.2107208872458415e-06, "loss": 1.3074, "step": 24070 }, { "epoch": 44.51, "grad_norm": 7.366247177124023, "learning_rate": 2.2033271719038817e-06, "loss": 1.3503, "step": 24080 }, { "epoch": 44.53, "grad_norm": 8.607295036315918, "learning_rate": 2.1959334565619223e-06, "loss": 0.9846, "step": 24090 }, { "epoch": 44.55, "grad_norm": 5.850520133972168, "learning_rate": 2.1885397412199634e-06, "loss": 1.0233, "step": 24100 }, { "epoch": 44.57, "grad_norm": 14.20161247253418, "learning_rate": 2.181146025878004e-06, "loss": 0.8266, "step": 24110 }, { "epoch": 44.58, "grad_norm": 8.35457706451416, "learning_rate": 2.1737523105360446e-06, "loss": 0.8753, "step": 24120 }, { "epoch": 44.6, "grad_norm": 8.116217613220215, "learning_rate": 2.1663585951940853e-06, "loss": 1.0943, "step": 24130 }, { "epoch": 44.62, "grad_norm": 11.989364624023438, "learning_rate": 2.158964879852126e-06, "loss": 1.1246, "step": 24140 }, { "epoch": 44.64, "grad_norm": 10.06429386138916, "learning_rate": 2.1515711645101665e-06, "loss": 1.2177, "step": 24150 }, { "epoch": 44.66, "grad_norm": 8.4439697265625, "learning_rate": 2.144177449168207e-06, "loss": 1.2007, "step": 24160 }, { "epoch": 44.68, "grad_norm": 8.916829109191895, "learning_rate": 2.136783733826248e-06, "loss": 1.225, "step": 24170 }, { "epoch": 44.7, "grad_norm": 7.395580768585205, "learning_rate": 2.1293900184842884e-06, "loss": 1.0743, "step": 24180 }, { "epoch": 44.71, "grad_norm": 10.461799621582031, "learning_rate": 2.121996303142329e-06, "loss": 1.2177, "step": 24190 }, { "epoch": 44.73, "grad_norm": 10.332401275634766, "learning_rate": 2.1146025878003697e-06, "loss": 1.186, "step": 24200 }, { "epoch": 44.75, "grad_norm": 5.019920349121094, "learning_rate": 2.1072088724584108e-06, "loss": 1.3574, "step": 24210 }, { "epoch": 44.77, "grad_norm": 7.739672660827637, "learning_rate": 2.0998151571164514e-06, "loss": 1.1587, "step": 24220 }, { "epoch": 44.79, "grad_norm": 6.620255470275879, "learning_rate": 2.0924214417744916e-06, "loss": 1.067, "step": 24230 }, { "epoch": 44.81, "grad_norm": 5.736428737640381, "learning_rate": 2.0850277264325326e-06, "loss": 1.3412, "step": 24240 }, { "epoch": 44.82, "grad_norm": 7.865865230560303, "learning_rate": 2.0776340110905733e-06, "loss": 1.3205, "step": 24250 }, { "epoch": 44.84, "grad_norm": 4.6581034660339355, "learning_rate": 2.070240295748614e-06, "loss": 0.9925, "step": 24260 }, { "epoch": 44.86, "grad_norm": 6.45067024230957, "learning_rate": 2.0628465804066545e-06, "loss": 1.0723, "step": 24270 }, { "epoch": 44.88, "grad_norm": 15.864325523376465, "learning_rate": 2.055452865064695e-06, "loss": 1.225, "step": 24280 }, { "epoch": 44.9, "grad_norm": 8.786124229431152, "learning_rate": 2.048059149722736e-06, "loss": 1.2467, "step": 24290 }, { "epoch": 44.92, "grad_norm": 10.726322174072266, "learning_rate": 2.0406654343807764e-06, "loss": 1.2169, "step": 24300 }, { "epoch": 44.94, "grad_norm": 9.022387504577637, "learning_rate": 2.033271719038817e-06, "loss": 1.2806, "step": 24310 }, { "epoch": 44.95, "grad_norm": 7.745621204376221, "learning_rate": 2.0258780036968577e-06, "loss": 1.1522, "step": 24320 }, { "epoch": 44.97, "grad_norm": 3.969712495803833, "learning_rate": 2.0184842883548988e-06, "loss": 1.1323, "step": 24330 }, { "epoch": 44.99, "grad_norm": 6.5329909324646, "learning_rate": 2.011090573012939e-06, "loss": 1.0856, "step": 24340 }, { "epoch": 45.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.6530020833015442, "eval_runtime": 1.5523, "eval_samples_per_second": 309.86, "eval_steps_per_second": 39.296, "step": 24345 }, { "epoch": 45.01, "grad_norm": 13.900078773498535, "learning_rate": 2.0036968576709796e-06, "loss": 1.3151, "step": 24350 }, { "epoch": 45.03, "grad_norm": 8.027884483337402, "learning_rate": 1.9963031423290206e-06, "loss": 0.9515, "step": 24360 }, { "epoch": 45.05, "grad_norm": 7.037506103515625, "learning_rate": 1.9889094269870613e-06, "loss": 1.2651, "step": 24370 }, { "epoch": 45.06, "grad_norm": 5.122119903564453, "learning_rate": 1.981515711645102e-06, "loss": 1.173, "step": 24380 }, { "epoch": 45.08, "grad_norm": 6.578670501708984, "learning_rate": 1.9741219963031425e-06, "loss": 1.3271, "step": 24390 }, { "epoch": 45.1, "grad_norm": 6.045128345489502, "learning_rate": 1.966728280961183e-06, "loss": 1.3551, "step": 24400 }, { "epoch": 45.12, "grad_norm": 12.207243919372559, "learning_rate": 1.959334565619224e-06, "loss": 1.1957, "step": 24410 }, { "epoch": 45.14, "grad_norm": 9.588749885559082, "learning_rate": 1.9519408502772644e-06, "loss": 1.1642, "step": 24420 }, { "epoch": 45.16, "grad_norm": 7.760073661804199, "learning_rate": 1.944547134935305e-06, "loss": 1.4653, "step": 24430 }, { "epoch": 45.18, "grad_norm": 3.4997811317443848, "learning_rate": 1.9371534195933457e-06, "loss": 1.0624, "step": 24440 }, { "epoch": 45.19, "grad_norm": 7.831880569458008, "learning_rate": 1.9297597042513868e-06, "loss": 1.3099, "step": 24450 }, { "epoch": 45.21, "grad_norm": 7.326499938964844, "learning_rate": 1.922365988909427e-06, "loss": 1.1218, "step": 24460 }, { "epoch": 45.23, "grad_norm": 8.108443260192871, "learning_rate": 1.9149722735674676e-06, "loss": 0.9998, "step": 24470 }, { "epoch": 45.25, "grad_norm": 5.27816104888916, "learning_rate": 1.9075785582255086e-06, "loss": 1.1967, "step": 24480 }, { "epoch": 45.27, "grad_norm": 11.589706420898438, "learning_rate": 1.9001848428835493e-06, "loss": 1.1061, "step": 24490 }, { "epoch": 45.29, "grad_norm": 7.894682884216309, "learning_rate": 1.8927911275415897e-06, "loss": 1.084, "step": 24500 }, { "epoch": 45.3, "grad_norm": 6.19917631149292, "learning_rate": 1.8853974121996305e-06, "loss": 1.4451, "step": 24510 }, { "epoch": 45.32, "grad_norm": 11.908498764038086, "learning_rate": 1.8780036968576712e-06, "loss": 1.132, "step": 24520 }, { "epoch": 45.34, "grad_norm": 6.548595905303955, "learning_rate": 1.870609981515712e-06, "loss": 1.2676, "step": 24530 }, { "epoch": 45.36, "grad_norm": 5.26450252532959, "learning_rate": 1.8632162661737524e-06, "loss": 1.0283, "step": 24540 }, { "epoch": 45.38, "grad_norm": 4.299957752227783, "learning_rate": 1.855822550831793e-06, "loss": 1.0751, "step": 24550 }, { "epoch": 45.4, "grad_norm": 4.160711765289307, "learning_rate": 1.848428835489834e-06, "loss": 1.2791, "step": 24560 }, { "epoch": 45.42, "grad_norm": 13.297599792480469, "learning_rate": 1.8410351201478743e-06, "loss": 0.8448, "step": 24570 }, { "epoch": 45.43, "grad_norm": 10.280519485473633, "learning_rate": 1.8336414048059152e-06, "loss": 1.2721, "step": 24580 }, { "epoch": 45.45, "grad_norm": 4.971632480621338, "learning_rate": 1.8262476894639558e-06, "loss": 1.0396, "step": 24590 }, { "epoch": 45.47, "grad_norm": 18.82908821105957, "learning_rate": 1.8188539741219966e-06, "loss": 0.896, "step": 24600 }, { "epoch": 45.49, "grad_norm": 1.70675528049469, "learning_rate": 1.811460258780037e-06, "loss": 1.0524, "step": 24610 }, { "epoch": 45.51, "grad_norm": 7.04319429397583, "learning_rate": 1.8040665434380777e-06, "loss": 0.9377, "step": 24620 }, { "epoch": 45.53, "grad_norm": 14.152902603149414, "learning_rate": 1.7966728280961185e-06, "loss": 1.1071, "step": 24630 }, { "epoch": 45.55, "grad_norm": 12.810236930847168, "learning_rate": 1.7892791127541592e-06, "loss": 1.2618, "step": 24640 }, { "epoch": 45.56, "grad_norm": 4.832306385040283, "learning_rate": 1.7818853974121996e-06, "loss": 1.2824, "step": 24650 }, { "epoch": 45.58, "grad_norm": 15.986501693725586, "learning_rate": 1.7744916820702404e-06, "loss": 1.3297, "step": 24660 }, { "epoch": 45.6, "grad_norm": 12.308773040771484, "learning_rate": 1.767097966728281e-06, "loss": 0.9508, "step": 24670 }, { "epoch": 45.62, "grad_norm": 7.619829177856445, "learning_rate": 1.759704251386322e-06, "loss": 1.0268, "step": 24680 }, { "epoch": 45.64, "grad_norm": 10.901267051696777, "learning_rate": 1.7523105360443623e-06, "loss": 1.3203, "step": 24690 }, { "epoch": 45.66, "grad_norm": 10.945263862609863, "learning_rate": 1.7449168207024032e-06, "loss": 1.135, "step": 24700 }, { "epoch": 45.67, "grad_norm": 10.969324111938477, "learning_rate": 1.7375231053604438e-06, "loss": 1.2676, "step": 24710 }, { "epoch": 45.69, "grad_norm": 6.668211460113525, "learning_rate": 1.7301293900184846e-06, "loss": 1.1207, "step": 24720 }, { "epoch": 45.71, "grad_norm": 6.793056488037109, "learning_rate": 1.722735674676525e-06, "loss": 1.1015, "step": 24730 }, { "epoch": 45.73, "grad_norm": 9.75919246673584, "learning_rate": 1.7153419593345657e-06, "loss": 1.3127, "step": 24740 }, { "epoch": 45.75, "grad_norm": 11.099854469299316, "learning_rate": 1.7079482439926065e-06, "loss": 1.0439, "step": 24750 }, { "epoch": 45.77, "grad_norm": 5.757495880126953, "learning_rate": 1.700554528650647e-06, "loss": 0.8722, "step": 24760 }, { "epoch": 45.79, "grad_norm": 7.795022010803223, "learning_rate": 1.6931608133086878e-06, "loss": 1.1847, "step": 24770 }, { "epoch": 45.8, "grad_norm": 4.379203796386719, "learning_rate": 1.6857670979667284e-06, "loss": 1.107, "step": 24780 }, { "epoch": 45.82, "grad_norm": 5.859982490539551, "learning_rate": 1.6783733826247693e-06, "loss": 0.9548, "step": 24790 }, { "epoch": 45.84, "grad_norm": 4.482446670532227, "learning_rate": 1.6709796672828097e-06, "loss": 1.2252, "step": 24800 }, { "epoch": 45.86, "grad_norm": 8.972475051879883, "learning_rate": 1.6635859519408503e-06, "loss": 1.3335, "step": 24810 }, { "epoch": 45.88, "grad_norm": 16.8807373046875, "learning_rate": 1.6561922365988912e-06, "loss": 1.2489, "step": 24820 }, { "epoch": 45.9, "grad_norm": 5.766531467437744, "learning_rate": 1.6487985212569318e-06, "loss": 0.9587, "step": 24830 }, { "epoch": 45.91, "grad_norm": 6.8137640953063965, "learning_rate": 1.6414048059149722e-06, "loss": 0.8794, "step": 24840 }, { "epoch": 45.93, "grad_norm": 9.849954605102539, "learning_rate": 1.634011090573013e-06, "loss": 0.9888, "step": 24850 }, { "epoch": 45.95, "grad_norm": 5.245386600494385, "learning_rate": 1.6266173752310537e-06, "loss": 1.2363, "step": 24860 }, { "epoch": 45.97, "grad_norm": 8.245514869689941, "learning_rate": 1.6192236598890945e-06, "loss": 1.2641, "step": 24870 }, { "epoch": 45.99, "grad_norm": 8.490116119384766, "learning_rate": 1.611829944547135e-06, "loss": 1.0128, "step": 24880 }, { "epoch": 46.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.6505705118179321, "eval_runtime": 1.5147, "eval_samples_per_second": 317.556, "eval_steps_per_second": 40.272, "step": 24886 }, { "epoch": 46.01, "grad_norm": 9.106378555297852, "learning_rate": 1.6044362292051758e-06, "loss": 0.9799, "step": 24890 }, { "epoch": 46.03, "grad_norm": 8.267436981201172, "learning_rate": 1.5970425138632164e-06, "loss": 1.2092, "step": 24900 }, { "epoch": 46.04, "grad_norm": 6.7226881980896, "learning_rate": 1.5896487985212573e-06, "loss": 1.0339, "step": 24910 }, { "epoch": 46.06, "grad_norm": 4.484898567199707, "learning_rate": 1.5822550831792977e-06, "loss": 0.8668, "step": 24920 }, { "epoch": 46.08, "grad_norm": 12.360088348388672, "learning_rate": 1.5748613678373383e-06, "loss": 1.2097, "step": 24930 }, { "epoch": 46.1, "grad_norm": 12.479482650756836, "learning_rate": 1.5674676524953792e-06, "loss": 1.3778, "step": 24940 }, { "epoch": 46.12, "grad_norm": 4.835532188415527, "learning_rate": 1.5600739371534196e-06, "loss": 1.161, "step": 24950 }, { "epoch": 46.14, "grad_norm": 5.981575012207031, "learning_rate": 1.5526802218114604e-06, "loss": 1.3092, "step": 24960 }, { "epoch": 46.16, "grad_norm": 8.3271484375, "learning_rate": 1.545286506469501e-06, "loss": 1.1145, "step": 24970 }, { "epoch": 46.17, "grad_norm": 12.015401840209961, "learning_rate": 1.537892791127542e-06, "loss": 1.2173, "step": 24980 }, { "epoch": 46.19, "grad_norm": 12.327820777893066, "learning_rate": 1.5304990757855823e-06, "loss": 1.316, "step": 24990 }, { "epoch": 46.21, "grad_norm": 6.557477951049805, "learning_rate": 1.523105360443623e-06, "loss": 1.3063, "step": 25000 }, { "epoch": 46.23, "grad_norm": 7.568830490112305, "learning_rate": 1.5157116451016638e-06, "loss": 1.3031, "step": 25010 }, { "epoch": 46.25, "grad_norm": 10.225282669067383, "learning_rate": 1.5083179297597044e-06, "loss": 0.9823, "step": 25020 }, { "epoch": 46.27, "grad_norm": 7.323040962219238, "learning_rate": 1.5009242144177449e-06, "loss": 1.1514, "step": 25030 }, { "epoch": 46.28, "grad_norm": 11.954227447509766, "learning_rate": 1.4935304990757857e-06, "loss": 1.4643, "step": 25040 }, { "epoch": 46.3, "grad_norm": 8.947208404541016, "learning_rate": 1.4861367837338263e-06, "loss": 1.4722, "step": 25050 }, { "epoch": 46.32, "grad_norm": 5.338852405548096, "learning_rate": 1.4787430683918672e-06, "loss": 1.0676, "step": 25060 }, { "epoch": 46.34, "grad_norm": 13.824284553527832, "learning_rate": 1.4713493530499076e-06, "loss": 1.3013, "step": 25070 }, { "epoch": 46.36, "grad_norm": 9.68040943145752, "learning_rate": 1.4639556377079484e-06, "loss": 1.2775, "step": 25080 }, { "epoch": 46.38, "grad_norm": 9.596774101257324, "learning_rate": 1.456561922365989e-06, "loss": 1.2514, "step": 25090 }, { "epoch": 46.4, "grad_norm": 13.064347267150879, "learning_rate": 1.44916820702403e-06, "loss": 1.3144, "step": 25100 }, { "epoch": 46.41, "grad_norm": 9.035834312438965, "learning_rate": 1.4417744916820703e-06, "loss": 0.8881, "step": 25110 }, { "epoch": 46.43, "grad_norm": 6.3635687828063965, "learning_rate": 1.434380776340111e-06, "loss": 1.0417, "step": 25120 }, { "epoch": 46.45, "grad_norm": 7.851380825042725, "learning_rate": 1.4269870609981518e-06, "loss": 1.4945, "step": 25130 }, { "epoch": 46.47, "grad_norm": 9.586305618286133, "learning_rate": 1.4195933456561922e-06, "loss": 1.1737, "step": 25140 }, { "epoch": 46.49, "grad_norm": 9.973650932312012, "learning_rate": 1.412199630314233e-06, "loss": 1.2712, "step": 25150 }, { "epoch": 46.51, "grad_norm": 5.164646625518799, "learning_rate": 1.4048059149722737e-06, "loss": 1.1407, "step": 25160 }, { "epoch": 46.52, "grad_norm": 7.688935279846191, "learning_rate": 1.3974121996303145e-06, "loss": 1.074, "step": 25170 }, { "epoch": 46.54, "grad_norm": 9.623371124267578, "learning_rate": 1.390018484288355e-06, "loss": 1.2002, "step": 25180 }, { "epoch": 46.56, "grad_norm": 3.9396157264709473, "learning_rate": 1.3826247689463956e-06, "loss": 1.1512, "step": 25190 }, { "epoch": 46.58, "grad_norm": 13.917947769165039, "learning_rate": 1.3752310536044364e-06, "loss": 1.3819, "step": 25200 }, { "epoch": 46.6, "grad_norm": 12.70738697052002, "learning_rate": 1.367837338262477e-06, "loss": 0.857, "step": 25210 }, { "epoch": 46.62, "grad_norm": 5.021570205688477, "learning_rate": 1.3604436229205175e-06, "loss": 1.0649, "step": 25220 }, { "epoch": 46.64, "grad_norm": 4.878916263580322, "learning_rate": 1.3530499075785583e-06, "loss": 1.167, "step": 25230 }, { "epoch": 46.65, "grad_norm": 5.995421409606934, "learning_rate": 1.345656192236599e-06, "loss": 1.2064, "step": 25240 }, { "epoch": 46.67, "grad_norm": 9.798457145690918, "learning_rate": 1.3382624768946398e-06, "loss": 0.7412, "step": 25250 }, { "epoch": 46.69, "grad_norm": 14.2097806930542, "learning_rate": 1.3308687615526802e-06, "loss": 0.9547, "step": 25260 }, { "epoch": 46.71, "grad_norm": 9.59878158569336, "learning_rate": 1.323475046210721e-06, "loss": 1.0801, "step": 25270 }, { "epoch": 46.73, "grad_norm": 6.4320878982543945, "learning_rate": 1.3160813308687617e-06, "loss": 1.2118, "step": 25280 }, { "epoch": 46.75, "grad_norm": 10.050814628601074, "learning_rate": 1.3086876155268025e-06, "loss": 1.3272, "step": 25290 }, { "epoch": 46.77, "grad_norm": 3.3388493061065674, "learning_rate": 1.301293900184843e-06, "loss": 1.1763, "step": 25300 }, { "epoch": 46.78, "grad_norm": 10.04194450378418, "learning_rate": 1.2939001848428836e-06, "loss": 1.531, "step": 25310 }, { "epoch": 46.8, "grad_norm": 7.142560958862305, "learning_rate": 1.2865064695009244e-06, "loss": 1.2134, "step": 25320 }, { "epoch": 46.82, "grad_norm": 8.94771671295166, "learning_rate": 1.2791127541589649e-06, "loss": 1.3941, "step": 25330 }, { "epoch": 46.84, "grad_norm": 8.732101440429688, "learning_rate": 1.2717190388170057e-06, "loss": 1.1549, "step": 25340 }, { "epoch": 46.86, "grad_norm": 5.314905166625977, "learning_rate": 1.2643253234750463e-06, "loss": 1.2186, "step": 25350 }, { "epoch": 46.88, "grad_norm": 9.160057067871094, "learning_rate": 1.2569316081330872e-06, "loss": 1.0492, "step": 25360 }, { "epoch": 46.89, "grad_norm": 8.355467796325684, "learning_rate": 1.2495378927911276e-06, "loss": 1.4237, "step": 25370 }, { "epoch": 46.91, "grad_norm": 9.165495872497559, "learning_rate": 1.2421441774491682e-06, "loss": 0.9104, "step": 25380 }, { "epoch": 46.93, "grad_norm": 9.154586791992188, "learning_rate": 1.234750462107209e-06, "loss": 0.9952, "step": 25390 }, { "epoch": 46.95, "grad_norm": 4.512294292449951, "learning_rate": 1.2273567467652497e-06, "loss": 1.1877, "step": 25400 }, { "epoch": 46.97, "grad_norm": 7.809790134429932, "learning_rate": 1.2199630314232903e-06, "loss": 1.4074, "step": 25410 }, { "epoch": 46.99, "grad_norm": 4.81723690032959, "learning_rate": 1.212569316081331e-06, "loss": 1.0369, "step": 25420 }, { "epoch": 47.0, "eval_accuracy": 0.8024948024948025, "eval_loss": 0.6616687774658203, "eval_runtime": 1.5209, "eval_samples_per_second": 316.26, "eval_steps_per_second": 40.108, "step": 25427 }, { "epoch": 47.01, "grad_norm": 11.809417724609375, "learning_rate": 1.2051756007393716e-06, "loss": 1.1519, "step": 25430 }, { "epoch": 47.02, "grad_norm": 7.254668235778809, "learning_rate": 1.1977818853974122e-06, "loss": 1.1637, "step": 25440 }, { "epoch": 47.04, "grad_norm": 8.461350440979004, "learning_rate": 1.190388170055453e-06, "loss": 1.1895, "step": 25450 }, { "epoch": 47.06, "grad_norm": 6.701322555541992, "learning_rate": 1.1829944547134937e-06, "loss": 0.993, "step": 25460 }, { "epoch": 47.08, "grad_norm": 8.063401222229004, "learning_rate": 1.1756007393715343e-06, "loss": 1.5799, "step": 25470 }, { "epoch": 47.1, "grad_norm": 8.461128234863281, "learning_rate": 1.168207024029575e-06, "loss": 1.0671, "step": 25480 }, { "epoch": 47.12, "grad_norm": 2.356477737426758, "learning_rate": 1.1608133086876156e-06, "loss": 0.8526, "step": 25490 }, { "epoch": 47.13, "grad_norm": 7.614037036895752, "learning_rate": 1.1534195933456562e-06, "loss": 1.1273, "step": 25500 }, { "epoch": 47.15, "grad_norm": 9.326582908630371, "learning_rate": 1.1460258780036969e-06, "loss": 1.0874, "step": 25510 }, { "epoch": 47.17, "grad_norm": 5.714335918426514, "learning_rate": 1.1386321626617377e-06, "loss": 1.1641, "step": 25520 }, { "epoch": 47.19, "grad_norm": 8.105392456054688, "learning_rate": 1.1312384473197783e-06, "loss": 1.4208, "step": 25530 }, { "epoch": 47.21, "grad_norm": 4.956079006195068, "learning_rate": 1.123844731977819e-06, "loss": 1.1808, "step": 25540 }, { "epoch": 47.23, "grad_norm": 15.034546852111816, "learning_rate": 1.1164510166358596e-06, "loss": 1.0115, "step": 25550 }, { "epoch": 47.25, "grad_norm": 3.9976143836975098, "learning_rate": 1.1090573012939002e-06, "loss": 1.0577, "step": 25560 }, { "epoch": 47.26, "grad_norm": 12.10183334350586, "learning_rate": 1.1016635859519409e-06, "loss": 1.7225, "step": 25570 }, { "epoch": 47.28, "grad_norm": 12.743776321411133, "learning_rate": 1.0942698706099817e-06, "loss": 0.9839, "step": 25580 }, { "epoch": 47.3, "grad_norm": 3.295074462890625, "learning_rate": 1.0868761552680223e-06, "loss": 1.0558, "step": 25590 }, { "epoch": 47.32, "grad_norm": 10.155179023742676, "learning_rate": 1.079482439926063e-06, "loss": 1.1342, "step": 25600 }, { "epoch": 47.34, "grad_norm": 9.61672592163086, "learning_rate": 1.0720887245841036e-06, "loss": 0.8307, "step": 25610 }, { "epoch": 47.36, "grad_norm": 6.472414970397949, "learning_rate": 1.0646950092421442e-06, "loss": 1.4239, "step": 25620 }, { "epoch": 47.38, "grad_norm": 3.3001365661621094, "learning_rate": 1.0573012939001849e-06, "loss": 0.921, "step": 25630 }, { "epoch": 47.39, "grad_norm": 9.591170310974121, "learning_rate": 1.0499075785582257e-06, "loss": 0.9654, "step": 25640 }, { "epoch": 47.41, "grad_norm": 5.349375247955322, "learning_rate": 1.0425138632162663e-06, "loss": 0.9078, "step": 25650 }, { "epoch": 47.43, "grad_norm": 3.4508564472198486, "learning_rate": 1.035120147874307e-06, "loss": 1.1387, "step": 25660 }, { "epoch": 47.45, "grad_norm": 8.482918739318848, "learning_rate": 1.0277264325323476e-06, "loss": 1.1229, "step": 25670 }, { "epoch": 47.47, "grad_norm": 7.3166303634643555, "learning_rate": 1.0203327171903882e-06, "loss": 0.9385, "step": 25680 }, { "epoch": 47.49, "grad_norm": 4.8514814376831055, "learning_rate": 1.0129390018484288e-06, "loss": 0.8777, "step": 25690 }, { "epoch": 47.5, "grad_norm": 9.43614673614502, "learning_rate": 1.0055452865064695e-06, "loss": 1.1728, "step": 25700 }, { "epoch": 47.52, "grad_norm": 5.5064311027526855, "learning_rate": 9.981515711645103e-07, "loss": 1.1436, "step": 25710 }, { "epoch": 47.54, "grad_norm": 11.893933296203613, "learning_rate": 9.90757855822551e-07, "loss": 1.0284, "step": 25720 }, { "epoch": 47.56, "grad_norm": 6.794284343719482, "learning_rate": 9.833641404805916e-07, "loss": 1.0445, "step": 25730 }, { "epoch": 47.58, "grad_norm": 9.099145889282227, "learning_rate": 9.759704251386322e-07, "loss": 1.091, "step": 25740 }, { "epoch": 47.6, "grad_norm": 12.045644760131836, "learning_rate": 9.685767097966728e-07, "loss": 1.2829, "step": 25750 }, { "epoch": 47.62, "grad_norm": 7.65464973449707, "learning_rate": 9.611829944547135e-07, "loss": 1.2048, "step": 25760 }, { "epoch": 47.63, "grad_norm": 7.1381754875183105, "learning_rate": 9.537892791127543e-07, "loss": 1.2533, "step": 25770 }, { "epoch": 47.65, "grad_norm": 11.174083709716797, "learning_rate": 9.463955637707948e-07, "loss": 1.3027, "step": 25780 }, { "epoch": 47.67, "grad_norm": 12.549125671386719, "learning_rate": 9.390018484288356e-07, "loss": 0.647, "step": 25790 }, { "epoch": 47.69, "grad_norm": 7.212826728820801, "learning_rate": 9.316081330868762e-07, "loss": 1.3757, "step": 25800 }, { "epoch": 47.71, "grad_norm": 7.7667555809021, "learning_rate": 9.24214417744917e-07, "loss": 1.179, "step": 25810 }, { "epoch": 47.73, "grad_norm": 3.0937676429748535, "learning_rate": 9.168207024029576e-07, "loss": 1.17, "step": 25820 }, { "epoch": 47.74, "grad_norm": 6.499346733093262, "learning_rate": 9.094269870609983e-07, "loss": 1.2704, "step": 25830 }, { "epoch": 47.76, "grad_norm": 3.6330244541168213, "learning_rate": 9.020332717190388e-07, "loss": 1.1169, "step": 25840 }, { "epoch": 47.78, "grad_norm": 5.004092216491699, "learning_rate": 8.946395563770796e-07, "loss": 0.9723, "step": 25850 }, { "epoch": 47.8, "grad_norm": 12.980995178222656, "learning_rate": 8.872458410351202e-07, "loss": 1.1771, "step": 25860 }, { "epoch": 47.82, "grad_norm": 15.25153636932373, "learning_rate": 8.79852125693161e-07, "loss": 1.2753, "step": 25870 }, { "epoch": 47.84, "grad_norm": 5.679342269897461, "learning_rate": 8.724584103512016e-07, "loss": 1.1396, "step": 25880 }, { "epoch": 47.86, "grad_norm": 2.6761786937713623, "learning_rate": 8.650646950092423e-07, "loss": 0.9708, "step": 25890 }, { "epoch": 47.87, "grad_norm": 7.30656099319458, "learning_rate": 8.576709796672828e-07, "loss": 1.296, "step": 25900 }, { "epoch": 47.89, "grad_norm": 10.576569557189941, "learning_rate": 8.502772643253235e-07, "loss": 1.0876, "step": 25910 }, { "epoch": 47.91, "grad_norm": 6.223654270172119, "learning_rate": 8.428835489833642e-07, "loss": 1.162, "step": 25920 }, { "epoch": 47.93, "grad_norm": 4.597068786621094, "learning_rate": 8.354898336414048e-07, "loss": 1.1912, "step": 25930 }, { "epoch": 47.95, "grad_norm": 4.93589448928833, "learning_rate": 8.280961182994456e-07, "loss": 1.2802, "step": 25940 }, { "epoch": 47.97, "grad_norm": 8.372135162353516, "learning_rate": 8.207024029574861e-07, "loss": 1.0758, "step": 25950 }, { "epoch": 47.99, "grad_norm": 5.324300289154053, "learning_rate": 8.133086876155268e-07, "loss": 1.1458, "step": 25960 }, { "epoch": 48.0, "eval_accuracy": 0.8004158004158004, "eval_loss": 0.6545633673667908, "eval_runtime": 1.6495, "eval_samples_per_second": 291.596, "eval_steps_per_second": 36.98, "step": 25968 }, { "epoch": 48.0, "grad_norm": 9.546926498413086, "learning_rate": 8.059149722735675e-07, "loss": 1.318, "step": 25970 }, { "epoch": 48.02, "grad_norm": 6.292444229125977, "learning_rate": 7.985212569316082e-07, "loss": 1.0767, "step": 25980 }, { "epoch": 48.04, "grad_norm": 8.22965145111084, "learning_rate": 7.911275415896488e-07, "loss": 1.3299, "step": 25990 }, { "epoch": 48.06, "grad_norm": 20.15998649597168, "learning_rate": 7.837338262476896e-07, "loss": 1.2134, "step": 26000 }, { "epoch": 48.08, "grad_norm": 9.135931015014648, "learning_rate": 7.763401109057302e-07, "loss": 0.931, "step": 26010 }, { "epoch": 48.1, "grad_norm": 7.852985858917236, "learning_rate": 7.68946395563771e-07, "loss": 0.9875, "step": 26020 }, { "epoch": 48.11, "grad_norm": 8.769057273864746, "learning_rate": 7.615526802218115e-07, "loss": 1.1973, "step": 26030 }, { "epoch": 48.13, "grad_norm": 9.066899299621582, "learning_rate": 7.541589648798522e-07, "loss": 1.1586, "step": 26040 }, { "epoch": 48.15, "grad_norm": 2.0728862285614014, "learning_rate": 7.467652495378928e-07, "loss": 1.1851, "step": 26050 }, { "epoch": 48.17, "grad_norm": 8.796727180480957, "learning_rate": 7.393715341959336e-07, "loss": 1.0893, "step": 26060 }, { "epoch": 48.19, "grad_norm": 5.686365604400635, "learning_rate": 7.319778188539742e-07, "loss": 1.5171, "step": 26070 }, { "epoch": 48.21, "grad_norm": 16.936376571655273, "learning_rate": 7.24584103512015e-07, "loss": 1.0493, "step": 26080 }, { "epoch": 48.23, "grad_norm": 8.512947082519531, "learning_rate": 7.171903881700555e-07, "loss": 1.2209, "step": 26090 }, { "epoch": 48.24, "grad_norm": 8.630998611450195, "learning_rate": 7.097966728280961e-07, "loss": 1.248, "step": 26100 }, { "epoch": 48.26, "grad_norm": 9.566816329956055, "learning_rate": 7.024029574861368e-07, "loss": 1.4417, "step": 26110 }, { "epoch": 48.28, "grad_norm": 13.585439682006836, "learning_rate": 6.950092421441775e-07, "loss": 1.3415, "step": 26120 }, { "epoch": 48.3, "grad_norm": 10.696357727050781, "learning_rate": 6.876155268022182e-07, "loss": 0.9299, "step": 26130 }, { "epoch": 48.32, "grad_norm": 7.104404449462891, "learning_rate": 6.802218114602587e-07, "loss": 0.8418, "step": 26140 }, { "epoch": 48.34, "grad_norm": 8.752631187438965, "learning_rate": 6.728280961182995e-07, "loss": 1.082, "step": 26150 }, { "epoch": 48.35, "grad_norm": 6.5805206298828125, "learning_rate": 6.654343807763401e-07, "loss": 1.0181, "step": 26160 }, { "epoch": 48.37, "grad_norm": 4.857325553894043, "learning_rate": 6.580406654343808e-07, "loss": 1.1546, "step": 26170 }, { "epoch": 48.39, "grad_norm": 6.648013114929199, "learning_rate": 6.506469500924215e-07, "loss": 1.1192, "step": 26180 }, { "epoch": 48.41, "grad_norm": 6.483686447143555, "learning_rate": 6.432532347504622e-07, "loss": 1.2068, "step": 26190 }, { "epoch": 48.43, "grad_norm": 7.90614128112793, "learning_rate": 6.358595194085028e-07, "loss": 1.0433, "step": 26200 }, { "epoch": 48.45, "grad_norm": 7.202399253845215, "learning_rate": 6.284658040665436e-07, "loss": 1.3084, "step": 26210 }, { "epoch": 48.47, "grad_norm": 5.477099895477295, "learning_rate": 6.210720887245841e-07, "loss": 1.1511, "step": 26220 }, { "epoch": 48.48, "grad_norm": 3.5082626342773438, "learning_rate": 6.136783733826248e-07, "loss": 1.3665, "step": 26230 }, { "epoch": 48.5, "grad_norm": 6.579675674438477, "learning_rate": 6.062846580406655e-07, "loss": 1.0565, "step": 26240 }, { "epoch": 48.52, "grad_norm": 6.19356107711792, "learning_rate": 5.988909426987061e-07, "loss": 1.0953, "step": 26250 }, { "epoch": 48.54, "grad_norm": 12.52212905883789, "learning_rate": 5.914972273567468e-07, "loss": 0.9355, "step": 26260 }, { "epoch": 48.56, "grad_norm": 5.499223709106445, "learning_rate": 5.841035120147875e-07, "loss": 1.2761, "step": 26270 }, { "epoch": 48.58, "grad_norm": 9.806011199951172, "learning_rate": 5.767097966728281e-07, "loss": 1.0831, "step": 26280 }, { "epoch": 48.6, "grad_norm": 4.650641441345215, "learning_rate": 5.693160813308688e-07, "loss": 0.8785, "step": 26290 }, { "epoch": 48.61, "grad_norm": 8.748270988464355, "learning_rate": 5.619223659889095e-07, "loss": 0.9523, "step": 26300 }, { "epoch": 48.63, "grad_norm": 8.6466064453125, "learning_rate": 5.545286506469501e-07, "loss": 0.7753, "step": 26310 }, { "epoch": 48.65, "grad_norm": 8.633035659790039, "learning_rate": 5.471349353049908e-07, "loss": 1.3581, "step": 26320 }, { "epoch": 48.67, "grad_norm": 4.340029716491699, "learning_rate": 5.397412199630315e-07, "loss": 0.8604, "step": 26330 }, { "epoch": 48.69, "grad_norm": 5.168616771697998, "learning_rate": 5.323475046210721e-07, "loss": 1.3559, "step": 26340 }, { "epoch": 48.71, "grad_norm": 4.818678379058838, "learning_rate": 5.249537892791128e-07, "loss": 1.0953, "step": 26350 }, { "epoch": 48.72, "grad_norm": 7.105437278747559, "learning_rate": 5.175600739371535e-07, "loss": 1.1286, "step": 26360 }, { "epoch": 48.74, "grad_norm": 6.749919891357422, "learning_rate": 5.101663585951941e-07, "loss": 1.3842, "step": 26370 }, { "epoch": 48.76, "grad_norm": 6.941017150878906, "learning_rate": 5.027726432532347e-07, "loss": 1.2206, "step": 26380 }, { "epoch": 48.78, "grad_norm": 5.35729455947876, "learning_rate": 4.953789279112755e-07, "loss": 1.1874, "step": 26390 }, { "epoch": 48.8, "grad_norm": 5.522649765014648, "learning_rate": 4.879852125693161e-07, "loss": 0.963, "step": 26400 }, { "epoch": 48.82, "grad_norm": 7.575677871704102, "learning_rate": 4.805914972273567e-07, "loss": 1.1431, "step": 26410 }, { "epoch": 48.84, "grad_norm": 6.7216315269470215, "learning_rate": 4.731977818853974e-07, "loss": 0.9781, "step": 26420 }, { "epoch": 48.85, "grad_norm": 3.700978994369507, "learning_rate": 4.658040665434381e-07, "loss": 0.8636, "step": 26430 }, { "epoch": 48.87, "grad_norm": 9.584163665771484, "learning_rate": 4.584103512014788e-07, "loss": 1.3432, "step": 26440 }, { "epoch": 48.89, "grad_norm": 5.176293849945068, "learning_rate": 4.510166358595194e-07, "loss": 1.0088, "step": 26450 }, { "epoch": 48.91, "grad_norm": 8.102872848510742, "learning_rate": 4.436229205175601e-07, "loss": 1.286, "step": 26460 }, { "epoch": 48.93, "grad_norm": 5.2418131828308105, "learning_rate": 4.362292051756008e-07, "loss": 1.2945, "step": 26470 }, { "epoch": 48.95, "grad_norm": 5.132104396820068, "learning_rate": 4.288354898336414e-07, "loss": 1.1293, "step": 26480 }, { "epoch": 48.96, "grad_norm": 6.273956298828125, "learning_rate": 4.214417744916821e-07, "loss": 1.3887, "step": 26490 }, { "epoch": 48.98, "grad_norm": 3.940556287765503, "learning_rate": 4.140480591497228e-07, "loss": 1.0696, "step": 26500 }, { "epoch": 49.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6597396731376648, "eval_runtime": 1.52, "eval_samples_per_second": 316.45, "eval_steps_per_second": 40.132, "step": 26509 }, { "epoch": 49.0, "grad_norm": 5.106778144836426, "learning_rate": 4.066543438077634e-07, "loss": 0.9525, "step": 26510 }, { "epoch": 49.02, "grad_norm": 7.699993133544922, "learning_rate": 3.992606284658041e-07, "loss": 1.3987, "step": 26520 }, { "epoch": 49.04, "grad_norm": 8.818982124328613, "learning_rate": 3.918669131238448e-07, "loss": 1.1766, "step": 26530 }, { "epoch": 49.06, "grad_norm": 6.350173473358154, "learning_rate": 3.844731977818855e-07, "loss": 1.3984, "step": 26540 }, { "epoch": 49.08, "grad_norm": 11.532821655273438, "learning_rate": 3.770794824399261e-07, "loss": 0.9814, "step": 26550 }, { "epoch": 49.09, "grad_norm": 8.71923828125, "learning_rate": 3.696857670979668e-07, "loss": 1.083, "step": 26560 }, { "epoch": 49.11, "grad_norm": 9.045463562011719, "learning_rate": 3.622920517560075e-07, "loss": 1.1762, "step": 26570 }, { "epoch": 49.13, "grad_norm": 12.317120552062988, "learning_rate": 3.5489833641404806e-07, "loss": 1.2938, "step": 26580 }, { "epoch": 49.15, "grad_norm": 6.455867290496826, "learning_rate": 3.4750462107208874e-07, "loss": 0.8731, "step": 26590 }, { "epoch": 49.17, "grad_norm": 9.183475494384766, "learning_rate": 3.4011090573012937e-07, "loss": 1.3945, "step": 26600 }, { "epoch": 49.19, "grad_norm": 5.1193060874938965, "learning_rate": 3.3271719038817006e-07, "loss": 1.0471, "step": 26610 }, { "epoch": 49.21, "grad_norm": 2.903475761413574, "learning_rate": 3.2532347504621074e-07, "loss": 0.8886, "step": 26620 }, { "epoch": 49.22, "grad_norm": 8.063397407531738, "learning_rate": 3.179297597042514e-07, "loss": 1.0096, "step": 26630 }, { "epoch": 49.24, "grad_norm": 9.067961692810059, "learning_rate": 3.1053604436229206e-07, "loss": 1.48, "step": 26640 }, { "epoch": 49.26, "grad_norm": 6.46680212020874, "learning_rate": 3.0314232902033274e-07, "loss": 1.0706, "step": 26650 }, { "epoch": 49.28, "grad_norm": 9.694252967834473, "learning_rate": 2.957486136783734e-07, "loss": 0.9821, "step": 26660 }, { "epoch": 49.3, "grad_norm": 9.900237083435059, "learning_rate": 2.8835489833641405e-07, "loss": 0.8678, "step": 26670 }, { "epoch": 49.32, "grad_norm": 8.050165176391602, "learning_rate": 2.8096118299445474e-07, "loss": 1.0247, "step": 26680 }, { "epoch": 49.33, "grad_norm": 6.939967632293701, "learning_rate": 2.735674676524954e-07, "loss": 1.1739, "step": 26690 }, { "epoch": 49.35, "grad_norm": 7.258787631988525, "learning_rate": 2.6617375231053605e-07, "loss": 1.056, "step": 26700 }, { "epoch": 49.37, "grad_norm": 10.481571197509766, "learning_rate": 2.5878003696857674e-07, "loss": 0.8537, "step": 26710 }, { "epoch": 49.39, "grad_norm": 10.89072036743164, "learning_rate": 2.5138632162661737e-07, "loss": 1.0851, "step": 26720 }, { "epoch": 49.41, "grad_norm": 15.408546447753906, "learning_rate": 2.4399260628465805e-07, "loss": 1.0074, "step": 26730 }, { "epoch": 49.43, "grad_norm": 8.42940616607666, "learning_rate": 2.365988909426987e-07, "loss": 1.3865, "step": 26740 }, { "epoch": 49.45, "grad_norm": 7.883341312408447, "learning_rate": 2.292051756007394e-07, "loss": 0.9751, "step": 26750 }, { "epoch": 49.46, "grad_norm": 7.906477451324463, "learning_rate": 2.2181146025878005e-07, "loss": 0.7271, "step": 26760 }, { "epoch": 49.48, "grad_norm": 8.690512657165527, "learning_rate": 2.144177449168207e-07, "loss": 1.0948, "step": 26770 }, { "epoch": 49.5, "grad_norm": 9.37038803100586, "learning_rate": 2.070240295748614e-07, "loss": 1.1521, "step": 26780 }, { "epoch": 49.52, "grad_norm": 4.444186687469482, "learning_rate": 1.9963031423290205e-07, "loss": 0.9589, "step": 26790 }, { "epoch": 49.54, "grad_norm": 8.415474891662598, "learning_rate": 1.9223659889094274e-07, "loss": 0.9307, "step": 26800 }, { "epoch": 49.56, "grad_norm": 6.987151145935059, "learning_rate": 1.848428835489834e-07, "loss": 1.0604, "step": 26810 }, { "epoch": 49.57, "grad_norm": 4.934401035308838, "learning_rate": 1.7744916820702403e-07, "loss": 0.9969, "step": 26820 }, { "epoch": 49.59, "grad_norm": 11.109991073608398, "learning_rate": 1.7005545286506469e-07, "loss": 1.1276, "step": 26830 }, { "epoch": 49.61, "grad_norm": 12.764841079711914, "learning_rate": 1.6266173752310537e-07, "loss": 1.2847, "step": 26840 }, { "epoch": 49.63, "grad_norm": 6.022518634796143, "learning_rate": 1.5526802218114603e-07, "loss": 1.1392, "step": 26850 }, { "epoch": 49.65, "grad_norm": 13.0466947555542, "learning_rate": 1.478743068391867e-07, "loss": 1.1146, "step": 26860 }, { "epoch": 49.67, "grad_norm": 7.32473087310791, "learning_rate": 1.4048059149722737e-07, "loss": 1.201, "step": 26870 }, { "epoch": 49.69, "grad_norm": 10.20928955078125, "learning_rate": 1.3308687615526803e-07, "loss": 1.206, "step": 26880 }, { "epoch": 49.7, "grad_norm": 5.064608097076416, "learning_rate": 1.2569316081330869e-07, "loss": 1.122, "step": 26890 }, { "epoch": 49.72, "grad_norm": 4.568586349487305, "learning_rate": 1.1829944547134936e-07, "loss": 0.9548, "step": 26900 }, { "epoch": 49.74, "grad_norm": 8.220465660095215, "learning_rate": 1.1090573012939003e-07, "loss": 0.9033, "step": 26910 }, { "epoch": 49.76, "grad_norm": 12.3377685546875, "learning_rate": 1.035120147874307e-07, "loss": 1.2458, "step": 26920 }, { "epoch": 49.78, "grad_norm": 6.277747631072998, "learning_rate": 9.611829944547137e-08, "loss": 1.2087, "step": 26930 }, { "epoch": 49.8, "grad_norm": 7.732276439666748, "learning_rate": 8.872458410351201e-08, "loss": 1.0294, "step": 26940 }, { "epoch": 49.82, "grad_norm": 12.166303634643555, "learning_rate": 8.133086876155268e-08, "loss": 1.0855, "step": 26950 }, { "epoch": 49.83, "grad_norm": 9.935192108154297, "learning_rate": 7.393715341959336e-08, "loss": 1.1723, "step": 26960 }, { "epoch": 49.85, "grad_norm": 10.68835163116455, "learning_rate": 6.654343807763401e-08, "loss": 1.2271, "step": 26970 }, { "epoch": 49.87, "grad_norm": 5.8904242515563965, "learning_rate": 5.914972273567468e-08, "loss": 1.1628, "step": 26980 }, { "epoch": 49.89, "grad_norm": 9.259088516235352, "learning_rate": 5.175600739371535e-08, "loss": 1.0315, "step": 26990 }, { "epoch": 49.91, "grad_norm": 5.863243579864502, "learning_rate": 4.436229205175601e-08, "loss": 0.8127, "step": 27000 }, { "epoch": 49.93, "grad_norm": 5.639997482299805, "learning_rate": 3.696857670979668e-08, "loss": 1.2895, "step": 27010 }, { "epoch": 49.94, "grad_norm": 9.816082000732422, "learning_rate": 2.957486136783734e-08, "loss": 1.3429, "step": 27020 }, { "epoch": 49.96, "grad_norm": 4.471237659454346, "learning_rate": 2.2181146025878003e-08, "loss": 0.8657, "step": 27030 }, { "epoch": 49.98, "grad_norm": 10.234234809875488, "learning_rate": 1.478743068391867e-08, "loss": 1.3488, "step": 27040 }, { "epoch": 50.0, "grad_norm": 4.39223575592041, "learning_rate": 7.393715341959335e-09, "loss": 1.2227, "step": 27050 }, { "epoch": 50.0, "eval_accuracy": 0.7941787941787942, "eval_loss": 0.6566402316093445, "eval_runtime": 1.5913, "eval_samples_per_second": 302.266, "eval_steps_per_second": 38.333, "step": 27050 }, { "epoch": 50.0, "step": 27050, "total_flos": 1.677596635524096e+19, "train_loss": 1.4967251671528419, "train_runtime": 2039.0451, "train_samples_per_second": 106.055, "train_steps_per_second": 13.266 } ], "logging_steps": 10, "max_steps": 27050, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "total_flos": 1.677596635524096e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }