{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998824589585864, "eval_steps": 500, "global_step": 6380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "ETA": 0.0, "epoch": 0.0015672138855150257, "fp16_scale": 512.0, "global_step": 10, "grad_norm": 44.605, "learning_rate": 2.0833333333333333e-07, "loss": 2.2146, "step": 10 }, { "ETA": 29.58, "epoch": 0.0031344277710300514, "fp16_scale": 512.0, "global_step": 20, "grad_norm": 18.403, "learning_rate": 1.25e-06, "loss": 1.7049, "step": 20 }, { "ETA": 29.6, "epoch": 0.004701641656545077, "fp16_scale": 512.0, "global_step": 30, "grad_norm": 7.563, "learning_rate": 2.2916666666666666e-06, "loss": 1.0428, "step": 30 }, { "ETA": 29.56, "epoch": 0.006268855542060103, "fp16_scale": 512.0, "global_step": 40, "grad_norm": 6.731, "learning_rate": 3.3333333333333333e-06, "loss": 1.0004, "step": 40 }, { "ETA": 29.5, "epoch": 0.00783606942757513, "fp16_scale": 512.0, "global_step": 50, "grad_norm": 8.091, "learning_rate": 4.3750000000000005e-06, "loss": 0.93, "step": 50 }, { "ETA": 29.47, "epoch": 0.009403283313090155, "fp16_scale": 512.0, "global_step": 60, "grad_norm": 6.857, "learning_rate": 5.416666666666667e-06, "loss": 0.9099, "step": 60 }, { "ETA": 29.41, "epoch": 0.01097049719860518, "fp16_scale": 512.0, "global_step": 70, "grad_norm": 4.815, "learning_rate": 6.458333333333334e-06, "loss": 0.8915, "step": 70 }, { "ETA": 29.36, "epoch": 0.012537711084120206, "fp16_scale": 512.0, "global_step": 80, "grad_norm": 4.539, "learning_rate": 7.500000000000001e-06, "loss": 0.8344, "step": 80 }, { "ETA": 29.3, "epoch": 0.014104924969635231, "fp16_scale": 512.0, "global_step": 90, "grad_norm": 5.123, "learning_rate": 8.541666666666666e-06, "loss": 0.8418, "step": 90 }, { "ETA": 29.25, "epoch": 0.01567213885515026, "fp16_scale": 512.0, "global_step": 100, "grad_norm": 5.271, "learning_rate": 9.583333333333335e-06, "loss": 0.7887, "step": 100 }, { "ETA": 29.2, "epoch": 0.017239352740665282, "fp16_scale": 512.0, "global_step": 110, "grad_norm": 5.762, "learning_rate": 1.0625e-05, "loss": 0.8702, "step": 110 }, { "ETA": 29.15, "epoch": 0.01880656662618031, "fp16_scale": 512.0, "global_step": 120, "grad_norm": 5.592, "learning_rate": 1.1666666666666668e-05, "loss": 0.8226, "step": 120 }, { "ETA": 29.1, "epoch": 0.020373780511695333, "fp16_scale": 512.0, "global_step": 130, "grad_norm": 4.187, "learning_rate": 1.2708333333333333e-05, "loss": 0.7926, "step": 130 }, { "ETA": 29.06, "epoch": 0.02194099439721036, "fp16_scale": 512.0, "global_step": 140, "grad_norm": 3.79, "learning_rate": 1.375e-05, "loss": 0.8084, "step": 140 }, { "ETA": 29.02, "epoch": 0.023508208282725384, "fp16_scale": 512.0, "global_step": 150, "grad_norm": 6.431, "learning_rate": 1.479166666666667e-05, "loss": 0.8406, "step": 150 }, { "ETA": 28.96, "epoch": 0.02507542216824041, "fp16_scale": 512.0, "global_step": 160, "grad_norm": 5.383, "learning_rate": 1.5833333333333333e-05, "loss": 0.8033, "step": 160 }, { "ETA": 28.93, "epoch": 0.026642636053755435, "fp16_scale": 512.0, "global_step": 170, "grad_norm": 3.825, "learning_rate": 1.6875e-05, "loss": 0.7992, "step": 170 }, { "ETA": 28.88, "epoch": 0.028209849939270462, "fp16_scale": 512.0, "global_step": 180, "grad_norm": 6.132, "learning_rate": 1.7916666666666667e-05, "loss": 0.831, "step": 180 }, { "ETA": 28.83, "epoch": 0.029777063824785486, "fp16_scale": 512.0, "global_step": 190, "grad_norm": 4.835, "learning_rate": 1.8958333333333334e-05, "loss": 0.8184, "step": 190 }, { "ETA": 28.79, "epoch": 0.03134427771030052, "fp16_scale": 512.0, "global_step": 200, "grad_norm": 6.233, "learning_rate": 2e-05, "loss": 0.8741, "step": 200 }, { "ETA": 28.74, "epoch": 0.03291149159581554, "fp16_scale": 512.0, "global_step": 210, "grad_norm": 3.753, "learning_rate": 1.99998711251399e-05, "loss": 0.8059, "step": 210 }, { "ETA": 28.69, "epoch": 0.034478705481330564, "fp16_scale": 512.0, "global_step": 220, "grad_norm": 3.802, "learning_rate": 1.9999484503881354e-05, "loss": 0.7927, "step": 220 }, { "ETA": 28.68, "epoch": 0.03604591936684559, "fp16_scale": 512.0, "global_step": 230, "grad_norm": 4.375, "learning_rate": 1.99988401461895e-05, "loss": 0.8292, "step": 230 }, { "ETA": 29.04, "epoch": 0.03761313325236062, "fp16_scale": 512.0, "global_step": 240, "grad_norm": 3.616, "learning_rate": 1.9997938068672652e-05, "loss": 0.822, "step": 240 }, { "ETA": 28.97, "epoch": 0.03918034713787564, "fp16_scale": 512.0, "global_step": 250, "grad_norm": 5.363, "learning_rate": 1.9996778294581828e-05, "loss": 0.8432, "step": 250 }, { "ETA": 28.91, "epoch": 0.040747561023390666, "fp16_scale": 512.0, "global_step": 260, "grad_norm": 4.838, "learning_rate": 1.9995360853810172e-05, "loss": 0.8458, "step": 260 }, { "ETA": 28.84, "epoch": 0.04231477490890569, "fp16_scale": 512.0, "global_step": 270, "grad_norm": 4.622, "learning_rate": 1.9993685782892184e-05, "loss": 0.8552, "step": 270 }, { "ETA": 28.78, "epoch": 0.04388198879442072, "fp16_scale": 512.0, "global_step": 280, "grad_norm": 4.467, "learning_rate": 1.9991753125002766e-05, "loss": 0.8134, "step": 280 }, { "ETA": 28.72, "epoch": 0.04544920267993575, "fp16_scale": 512.0, "global_step": 290, "grad_norm": 4.902, "learning_rate": 1.998956292995612e-05, "loss": 0.888, "step": 290 }, { "ETA": 28.66, "epoch": 0.04701641656545077, "fp16_scale": 512.0, "global_step": 300, "grad_norm": 8.023, "learning_rate": 1.998711525420447e-05, "loss": 0.8485, "step": 300 }, { "ETA": 28.6, "epoch": 0.048583630450965795, "fp16_scale": 512.0, "global_step": 310, "grad_norm": 6.401, "learning_rate": 1.998441016083658e-05, "loss": 0.8511, "step": 310 }, { "ETA": 28.54, "epoch": 0.05015084433648082, "fp16_scale": 512.0, "global_step": 320, "grad_norm": 4.692, "learning_rate": 1.9981447719576163e-05, "loss": 0.8344, "step": 320 }, { "ETA": 28.48, "epoch": 0.05171805822199585, "fp16_scale": 512.0, "global_step": 330, "grad_norm": 3.886, "learning_rate": 1.9978228006780056e-05, "loss": 0.7998, "step": 330 }, { "ETA": 28.42, "epoch": 0.05328527210751087, "fp16_scale": 512.0, "global_step": 340, "grad_norm": 3.093, "learning_rate": 1.9974751105436266e-05, "loss": 0.8319, "step": 340 }, { "ETA": 28.37, "epoch": 0.0548524859930259, "fp16_scale": 512.0, "global_step": 350, "grad_norm": 5.02, "learning_rate": 1.9971017105161833e-05, "loss": 0.8256, "step": 350 }, { "ETA": 28.31, "epoch": 0.056419699878540924, "fp16_scale": 512.0, "global_step": 360, "grad_norm": 3.542, "learning_rate": 1.9967026102200503e-05, "loss": 0.7992, "step": 360 }, { "ETA": 28.26, "epoch": 0.05798691376405595, "fp16_scale": 512.0, "global_step": 370, "grad_norm": 3.29, "learning_rate": 1.9962778199420265e-05, "loss": 0.8275, "step": 370 }, { "ETA": 28.2, "epoch": 0.05955412764957097, "fp16_scale": 512.0, "global_step": 380, "grad_norm": 4.119, "learning_rate": 1.9958273506310703e-05, "loss": 0.7866, "step": 380 }, { "ETA": 28.15, "epoch": 0.061121341535086, "fp16_scale": 512.0, "global_step": 390, "grad_norm": 3.126, "learning_rate": 1.995351213898015e-05, "loss": 0.7829, "step": 390 }, { "ETA": 28.09, "epoch": 0.06268855542060103, "fp16_scale": 512.0, "global_step": 400, "grad_norm": 4.012, "learning_rate": 1.9948494220152714e-05, "loss": 0.8086, "step": 400 }, { "ETA": 28.04, "epoch": 0.06425576930611605, "fp16_scale": 512.0, "global_step": 410, "grad_norm": 5.155, "learning_rate": 1.9943219879165113e-05, "loss": 0.791, "step": 410 }, { "ETA": 27.99, "epoch": 0.06582298319163107, "fp16_scale": 512.0, "global_step": 420, "grad_norm": 5.031, "learning_rate": 1.9937689251963347e-05, "loss": 0.7987, "step": 420 }, { "ETA": 27.94, "epoch": 0.06739019707714611, "fp16_scale": 512.0, "global_step": 430, "grad_norm": 5.236, "learning_rate": 1.9931902481099163e-05, "loss": 0.8278, "step": 430 }, { "ETA": 27.88, "epoch": 0.06895741096266113, "fp16_scale": 512.0, "global_step": 440, "grad_norm": 4.538, "learning_rate": 1.992585971572643e-05, "loss": 0.8123, "step": 440 }, { "ETA": 27.83, "epoch": 0.07052462484817615, "fp16_scale": 512.0, "global_step": 450, "grad_norm": 3.488, "learning_rate": 1.9919561111597246e-05, "loss": 0.7873, "step": 450 }, { "ETA": 27.78, "epoch": 0.07209183873369118, "fp16_scale": 512.0, "global_step": 460, "grad_norm": 4.643, "learning_rate": 1.9913006831057967e-05, "loss": 0.8107, "step": 460 }, { "ETA": 27.72, "epoch": 0.0736590526192062, "fp16_scale": 512.0, "global_step": 470, "grad_norm": 3.725, "learning_rate": 1.990619704304499e-05, "loss": 0.7758, "step": 470 }, { "ETA": 27.67, "epoch": 0.07522626650472124, "fp16_scale": 512.0, "global_step": 480, "grad_norm": 5.386, "learning_rate": 1.98991319230804e-05, "loss": 0.8376, "step": 480 }, { "ETA": 27.62, "epoch": 0.07679348039023626, "fp16_scale": 512.0, "global_step": 490, "grad_norm": 3.911, "learning_rate": 1.989181165326747e-05, "loss": 0.8483, "step": 490 }, { "ETA": 27.57, "epoch": 0.07836069427575128, "fp16_scale": 512.0, "global_step": 500, "grad_norm": 4.428, "learning_rate": 1.988423642228596e-05, "loss": 0.8042, "step": 500 }, { "ETA": 27.88, "epoch": 0.07992790816126631, "fp16_scale": 512.0, "global_step": 510, "grad_norm": 3.18, "learning_rate": 1.9876406425387222e-05, "loss": 0.7861, "step": 510 }, { "ETA": 27.82, "epoch": 0.08149512204678133, "fp16_scale": 512.0, "global_step": 520, "grad_norm": 4.472, "learning_rate": 1.9868321864389216e-05, "loss": 0.8244, "step": 520 }, { "ETA": 27.76, "epoch": 0.08306233593229637, "fp16_scale": 512.0, "global_step": 530, "grad_norm": 4.459, "learning_rate": 1.9859982947671273e-05, "loss": 0.8077, "step": 530 }, { "ETA": 27.7, "epoch": 0.08462954981781139, "fp16_scale": 512.0, "global_step": 540, "grad_norm": 3.43, "learning_rate": 1.9851389890168738e-05, "loss": 0.8235, "step": 540 }, { "ETA": 27.64, "epoch": 0.0861967637033264, "fp16_scale": 512.0, "global_step": 550, "grad_norm": 4.116, "learning_rate": 1.984254291336743e-05, "loss": 0.8453, "step": 550 }, { "ETA": 27.59, "epoch": 0.08776397758884144, "fp16_scale": 512.0, "global_step": 560, "grad_norm": 3.874, "learning_rate": 1.9833442245297923e-05, "loss": 0.8315, "step": 560 }, { "ETA": 27.53, "epoch": 0.08933119147435646, "fp16_scale": 512.0, "global_step": 570, "grad_norm": 4.0, "learning_rate": 1.982408812052969e-05, "loss": 0.8089, "step": 570 }, { "ETA": 27.47, "epoch": 0.0908984053598715, "fp16_scale": 512.0, "global_step": 580, "grad_norm": 3.54, "learning_rate": 1.9814480780165026e-05, "loss": 0.8185, "step": 580 }, { "ETA": 27.42, "epoch": 0.09246561924538652, "fp16_scale": 512.0, "global_step": 590, "grad_norm": 4.836, "learning_rate": 1.980462047183287e-05, "loss": 0.8105, "step": 590 }, { "ETA": 27.36, "epoch": 0.09403283313090154, "fp16_scale": 512.0, "global_step": 600, "grad_norm": 4.281, "learning_rate": 1.9794507449682383e-05, "loss": 0.778, "step": 600 }, { "ETA": 27.31, "epoch": 0.09560004701641657, "fp16_scale": 512.0, "global_step": 610, "grad_norm": 3.683, "learning_rate": 1.9784141974376434e-05, "loss": 0.7742, "step": 610 }, { "ETA": 27.25, "epoch": 0.09716726090193159, "fp16_scale": 512.0, "global_step": 620, "grad_norm": 4.075, "learning_rate": 1.9773524313084857e-05, "loss": 0.8509, "step": 620 }, { "ETA": 27.2, "epoch": 0.09873447478744661, "fp16_scale": 512.0, "global_step": 630, "grad_norm": 3.46, "learning_rate": 1.9762654739477578e-05, "loss": 0.7818, "step": 630 }, { "ETA": 27.14, "epoch": 0.10030168867296164, "fp16_scale": 512.0, "global_step": 640, "grad_norm": 3.695, "learning_rate": 1.975153353371755e-05, "loss": 0.8249, "step": 640 }, { "ETA": 27.09, "epoch": 0.10186890255847666, "fp16_scale": 512.0, "global_step": 650, "grad_norm": 4.314, "learning_rate": 1.974016098245354e-05, "loss": 0.7981, "step": 650 }, { "ETA": 27.04, "epoch": 0.1034361164439917, "fp16_scale": 512.0, "global_step": 660, "grad_norm": 3.079, "learning_rate": 1.9728537378812738e-05, "loss": 0.8327, "step": 660 }, { "ETA": 26.98, "epoch": 0.10500333032950672, "fp16_scale": 512.0, "global_step": 670, "grad_norm": 3.434, "learning_rate": 1.9716663022393202e-05, "loss": 0.7954, "step": 670 }, { "ETA": 26.93, "epoch": 0.10657054421502174, "fp16_scale": 512.0, "global_step": 680, "grad_norm": 5.473, "learning_rate": 1.9704538219256143e-05, "loss": 0.7967, "step": 680 }, { "ETA": 26.87, "epoch": 0.10813775810053677, "fp16_scale": 512.0, "global_step": 690, "grad_norm": 3.194, "learning_rate": 1.9692163281918016e-05, "loss": 0.8253, "step": 690 }, { "ETA": 26.82, "epoch": 0.1097049719860518, "fp16_scale": 512.0, "global_step": 700, "grad_norm": 3.881, "learning_rate": 1.9679538529342487e-05, "loss": 0.8237, "step": 700 }, { "ETA": 26.77, "epoch": 0.11127218587156683, "fp16_scale": 512.0, "global_step": 710, "grad_norm": 4.069, "learning_rate": 1.9666664286932198e-05, "loss": 0.8276, "step": 710 }, { "ETA": 26.72, "epoch": 0.11283939975708185, "fp16_scale": 512.0, "global_step": 720, "grad_norm": 3.983, "learning_rate": 1.9653540886520387e-05, "loss": 0.8102, "step": 720 }, { "ETA": 26.66, "epoch": 0.11440661364259687, "fp16_scale": 512.0, "global_step": 730, "grad_norm": 3.559, "learning_rate": 1.964016866636234e-05, "loss": 0.7812, "step": 730 }, { "ETA": 26.61, "epoch": 0.1159738275281119, "fp16_scale": 512.0, "global_step": 740, "grad_norm": 3.652, "learning_rate": 1.9626547971126646e-05, "loss": 0.7735, "step": 740 }, { "ETA": 26.56, "epoch": 0.11754104141362692, "fp16_scale": 512.0, "global_step": 750, "grad_norm": 4.565, "learning_rate": 1.9612679151886352e-05, "loss": 0.7897, "step": 750 }, { "ETA": 26.51, "epoch": 0.11910825529914194, "fp16_scale": 512.0, "global_step": 760, "grad_norm": 3.597, "learning_rate": 1.959856256610988e-05, "loss": 0.813, "step": 760 }, { "ETA": 26.46, "epoch": 0.12067546918465698, "fp16_scale": 512.0, "global_step": 770, "grad_norm": 3.112, "learning_rate": 1.958419857765184e-05, "loss": 0.8063, "step": 770 }, { "ETA": 26.43, "epoch": 0.122242683070172, "fp16_scale": 512.0, "global_step": 780, "grad_norm": 3.769, "learning_rate": 1.9569587556743627e-05, "loss": 0.7885, "step": 780 }, { "ETA": 26.51, "epoch": 0.12380989695568703, "fp16_scale": 512.0, "global_step": 790, "grad_norm": 4.049, "learning_rate": 1.95547298799839e-05, "loss": 0.7815, "step": 790 }, { "ETA": 26.46, "epoch": 0.12537711084120207, "fp16_scale": 512.0, "global_step": 800, "grad_norm": 3.712, "learning_rate": 1.953962593032886e-05, "loss": 0.7915, "step": 800 }, { "ETA": 26.41, "epoch": 0.12694432472671707, "fp16_scale": 512.0, "global_step": 810, "grad_norm": 3.061, "learning_rate": 1.9524276097082383e-05, "loss": 0.7629, "step": 810 }, { "ETA": 26.36, "epoch": 0.1285115386122321, "fp16_scale": 512.0, "global_step": 820, "grad_norm": 3.61, "learning_rate": 1.9508680775886e-05, "loss": 0.824, "step": 820 }, { "ETA": 26.3, "epoch": 0.13007875249774714, "fp16_scale": 512.0, "global_step": 830, "grad_norm": 3.23, "learning_rate": 1.9492840368708668e-05, "loss": 0.8129, "step": 830 }, { "ETA": 26.25, "epoch": 0.13164596638326215, "fp16_scale": 512.0, "global_step": 840, "grad_norm": 3.774, "learning_rate": 1.9476755283836448e-05, "loss": 0.7996, "step": 840 }, { "ETA": 26.2, "epoch": 0.13321318026877718, "fp16_scale": 512.0, "global_step": 850, "grad_norm": 3.48, "learning_rate": 1.946042593586195e-05, "loss": 0.8317, "step": 850 }, { "ETA": 26.14, "epoch": 0.13478039415429222, "fp16_scale": 512.0, "global_step": 860, "grad_norm": 4.229, "learning_rate": 1.944385274567366e-05, "loss": 0.8057, "step": 860 }, { "ETA": 26.09, "epoch": 0.13634760803980722, "fp16_scale": 512.0, "global_step": 870, "grad_norm": 3.373, "learning_rate": 1.9427036140445087e-05, "loss": 0.7926, "step": 870 }, { "ETA": 26.04, "epoch": 0.13791482192532226, "fp16_scale": 512.0, "global_step": 880, "grad_norm": 2.887, "learning_rate": 1.9409976553623767e-05, "loss": 0.7527, "step": 880 }, { "ETA": 25.99, "epoch": 0.1394820358108373, "fp16_scale": 512.0, "global_step": 890, "grad_norm": 3.633, "learning_rate": 1.939267442492007e-05, "loss": 0.8175, "step": 890 }, { "ETA": 25.93, "epoch": 0.1410492496963523, "fp16_scale": 512.0, "global_step": 900, "grad_norm": 3.6, "learning_rate": 1.937513020029588e-05, "loss": 0.7589, "step": 900 }, { "ETA": 25.88, "epoch": 0.14261646358186733, "fp16_scale": 512.0, "global_step": 910, "grad_norm": 9.442, "learning_rate": 1.9357344331953095e-05, "loss": 0.8359, "step": 910 }, { "ETA": 25.83, "epoch": 0.14418367746738237, "fp16_scale": 512.0, "global_step": 920, "grad_norm": 3.371, "learning_rate": 1.9339317278321975e-05, "loss": 0.7368, "step": 920 }, { "ETA": 25.78, "epoch": 0.1457508913528974, "fp16_scale": 512.0, "global_step": 930, "grad_norm": 3.693, "learning_rate": 1.9321049504049325e-05, "loss": 0.7861, "step": 930 }, { "ETA": 25.73, "epoch": 0.1473181052384124, "fp16_scale": 512.0, "global_step": 940, "grad_norm": 5.654, "learning_rate": 1.930254147998651e-05, "loss": 0.7698, "step": 940 }, { "ETA": 25.68, "epoch": 0.14888531912392744, "fp16_scale": 512.0, "global_step": 950, "grad_norm": 3.002, "learning_rate": 1.9283793683177335e-05, "loss": 0.802, "step": 950 }, { "ETA": 25.62, "epoch": 0.15045253300944247, "fp16_scale": 512.0, "global_step": 960, "grad_norm": 3.35, "learning_rate": 1.9264806596845742e-05, "loss": 0.7779, "step": 960 }, { "ETA": 25.57, "epoch": 0.15201974689495748, "fp16_scale": 512.0, "global_step": 970, "grad_norm": 2.751, "learning_rate": 1.9245580710383344e-05, "loss": 0.7804, "step": 970 }, { "ETA": 25.52, "epoch": 0.15358696078047251, "fp16_scale": 512.0, "global_step": 980, "grad_norm": 3.66, "learning_rate": 1.922611651933683e-05, "loss": 0.8006, "step": 980 }, { "ETA": 25.47, "epoch": 0.15515417466598755, "fp16_scale": 512.0, "global_step": 990, "grad_norm": 3.725, "learning_rate": 1.920641452539518e-05, "loss": 0.7695, "step": 990 }, { "ETA": 25.42, "epoch": 0.15672138855150256, "fp16_scale": 512.0, "global_step": 1000, "grad_norm": 3.159, "learning_rate": 1.9186475236376733e-05, "loss": 0.7952, "step": 1000 }, { "ETA": 25.53, "epoch": 0.1582886024370176, "fp16_scale": 1024.0, "global_step": 1010, "grad_norm": 1.366, "learning_rate": 1.916629916621611e-05, "loss": 0.7893, "step": 1010 }, { "ETA": 25.48, "epoch": 0.15985581632253262, "fp16_scale": 1024.0, "global_step": 1020, "grad_norm": 3.977, "learning_rate": 1.914588683495095e-05, "loss": 0.8, "step": 1020 }, { "ETA": 25.42, "epoch": 0.16142303020804763, "fp16_scale": 1024.0, "global_step": 1030, "grad_norm": 2.984, "learning_rate": 1.9125238768708527e-05, "loss": 0.7915, "step": 1030 }, { "ETA": 25.37, "epoch": 0.16299024409356266, "fp16_scale": 1024.0, "global_step": 1040, "grad_norm": 3.751, "learning_rate": 1.9104355499692166e-05, "loss": 0.7921, "step": 1040 }, { "ETA": 25.32, "epoch": 0.1645574579790777, "fp16_scale": 1024.0, "global_step": 1050, "grad_norm": 4.355, "learning_rate": 1.908323756616754e-05, "loss": 0.8083, "step": 1050 }, { "ETA": 25.27, "epoch": 0.16612467186459273, "fp16_scale": 1024.0, "global_step": 1060, "grad_norm": 3.678, "learning_rate": 1.9061885512448797e-05, "loss": 0.8066, "step": 1060 }, { "ETA": 25.21, "epoch": 0.16769188575010774, "fp16_scale": 1024.0, "global_step": 1070, "grad_norm": 3.258, "learning_rate": 1.904029988888453e-05, "loss": 0.7434, "step": 1070 }, { "ETA": 25.16, "epoch": 0.16925909963562277, "fp16_scale": 1024.0, "global_step": 1080, "grad_norm": 3.67, "learning_rate": 1.901848125184357e-05, "loss": 0.7847, "step": 1080 }, { "ETA": 25.11, "epoch": 0.1708263135211378, "fp16_scale": 1024.0, "global_step": 1090, "grad_norm": 3.202, "learning_rate": 1.8996430163700686e-05, "loss": 0.7697, "step": 1090 }, { "ETA": 25.06, "epoch": 0.1723935274066528, "fp16_scale": 1024.0, "global_step": 1100, "grad_norm": 3.825, "learning_rate": 1.8974147192822053e-05, "loss": 0.7835, "step": 1100 }, { "ETA": 25.01, "epoch": 0.17396074129216785, "fp16_scale": 1024.0, "global_step": 1110, "grad_norm": 3.314, "learning_rate": 1.8951632913550625e-05, "loss": 0.7981, "step": 1110 }, { "ETA": 24.96, "epoch": 0.17552795517768288, "fp16_scale": 1024.0, "global_step": 1120, "grad_norm": 5.387, "learning_rate": 1.892888790619132e-05, "loss": 0.7963, "step": 1120 }, { "ETA": 24.9, "epoch": 0.1770951690631979, "fp16_scale": 1024.0, "global_step": 1130, "grad_norm": 3.251, "learning_rate": 1.890591275699606e-05, "loss": 0.8005, "step": 1130 }, { "ETA": 24.85, "epoch": 0.17866238294871292, "fp16_scale": 1024.0, "global_step": 1140, "grad_norm": 4.961, "learning_rate": 1.8882708058148683e-05, "loss": 0.7651, "step": 1140 }, { "ETA": 24.8, "epoch": 0.18022959683422796, "fp16_scale": 1024.0, "global_step": 1150, "grad_norm": 3.24, "learning_rate": 1.8859274407749646e-05, "loss": 0.8029, "step": 1150 }, { "ETA": 24.75, "epoch": 0.181796810719743, "fp16_scale": 1024.0, "global_step": 1160, "grad_norm": 4.009, "learning_rate": 1.8835612409800634e-05, "loss": 0.7887, "step": 1160 }, { "ETA": 24.7, "epoch": 0.183364024605258, "fp16_scale": 1024.0, "global_step": 1170, "grad_norm": 3.594, "learning_rate": 1.881172267418898e-05, "loss": 0.7834, "step": 1170 }, { "ETA": 24.65, "epoch": 0.18493123849077303, "fp16_scale": 1024.0, "global_step": 1180, "grad_norm": 2.744, "learning_rate": 1.8787605816671956e-05, "loss": 0.7694, "step": 1180 }, { "ETA": 24.59, "epoch": 0.18649845237628807, "fp16_scale": 1024.0, "global_step": 1190, "grad_norm": 3.546, "learning_rate": 1.876326245886088e-05, "loss": 0.791, "step": 1190 }, { "ETA": 24.54, "epoch": 0.18806566626180307, "fp16_scale": 1024.0, "global_step": 1200, "grad_norm": 3.565, "learning_rate": 1.873869322820513e-05, "loss": 0.7762, "step": 1200 }, { "ETA": 24.49, "epoch": 0.1896328801473181, "fp16_scale": 1024.0, "global_step": 1210, "grad_norm": 2.926, "learning_rate": 1.8713898757975935e-05, "loss": 0.7669, "step": 1210 }, { "ETA": 24.44, "epoch": 0.19120009403283314, "fp16_scale": 1024.0, "global_step": 1220, "grad_norm": 4.165, "learning_rate": 1.8688879687250067e-05, "loss": 0.7813, "step": 1220 }, { "ETA": 24.39, "epoch": 0.19276730791834815, "fp16_scale": 1024.0, "global_step": 1230, "grad_norm": 2.299, "learning_rate": 1.8663636660893378e-05, "loss": 0.7547, "step": 1230 }, { "ETA": 24.34, "epoch": 0.19433452180386318, "fp16_scale": 1024.0, "global_step": 1240, "grad_norm": 3.092, "learning_rate": 1.8638170329544164e-05, "loss": 0.7791, "step": 1240 }, { "ETA": 24.29, "epoch": 0.19590173568937821, "fp16_scale": 1024.0, "global_step": 1250, "grad_norm": 3.193, "learning_rate": 1.8612481349596406e-05, "loss": 0.7505, "step": 1250 }, { "ETA": 24.24, "epoch": 0.19746894957489322, "fp16_scale": 1024.0, "global_step": 1260, "grad_norm": 3.13, "learning_rate": 1.858657038318284e-05, "loss": 0.7813, "step": 1260 }, { "ETA": 24.19, "epoch": 0.19903616346040826, "fp16_scale": 1024.0, "global_step": 1270, "grad_norm": 3.3, "learning_rate": 1.85604380981579e-05, "loss": 0.793, "step": 1270 }, { "ETA": 24.14, "epoch": 0.2006033773459233, "fp16_scale": 1024.0, "global_step": 1280, "grad_norm": 2.819, "learning_rate": 1.8534085168080503e-05, "loss": 0.7282, "step": 1280 }, { "ETA": 24.09, "epoch": 0.20217059123143832, "fp16_scale": 1024.0, "global_step": 1290, "grad_norm": 3.958, "learning_rate": 1.850751227219669e-05, "loss": 0.7718, "step": 1290 }, { "ETA": 24.04, "epoch": 0.20373780511695333, "fp16_scale": 1024.0, "global_step": 1300, "grad_norm": 3.133, "learning_rate": 1.8480720095422096e-05, "loss": 0.7958, "step": 1300 }, { "ETA": 23.99, "epoch": 0.20530501900246836, "fp16_scale": 1024.0, "global_step": 1310, "grad_norm": 3.184, "learning_rate": 1.8453709328324337e-05, "loss": 0.7885, "step": 1310 }, { "ETA": 23.94, "epoch": 0.2068722328879834, "fp16_scale": 1024.0, "global_step": 1320, "grad_norm": 2.849, "learning_rate": 1.8426480667105178e-05, "loss": 0.7554, "step": 1320 }, { "ETA": 23.9, "epoch": 0.2084394467734984, "fp16_scale": 1024.0, "global_step": 1330, "grad_norm": 3.621, "learning_rate": 1.83990348135826e-05, "loss": 0.7513, "step": 1330 }, { "ETA": 23.91, "epoch": 0.21000666065901344, "fp16_scale": 1024.0, "global_step": 1340, "grad_norm": 2.766, "learning_rate": 1.8371372475172705e-05, "loss": 0.7806, "step": 1340 }, { "ETA": 23.87, "epoch": 0.21157387454452847, "fp16_scale": 1024.0, "global_step": 1350, "grad_norm": 2.369, "learning_rate": 1.8343494364871502e-05, "loss": 0.7946, "step": 1350 }, { "ETA": 23.82, "epoch": 0.21314108843004348, "fp16_scale": 1024.0, "global_step": 1360, "grad_norm": 3.318, "learning_rate": 1.8315401201236492e-05, "loss": 0.7845, "step": 1360 }, { "ETA": 23.77, "epoch": 0.2147083023155585, "fp16_scale": 1024.0, "global_step": 1370, "grad_norm": 3.702, "learning_rate": 1.828709370836819e-05, "loss": 0.7898, "step": 1370 }, { "ETA": 23.72, "epoch": 0.21627551620107355, "fp16_scale": 1024.0, "global_step": 1380, "grad_norm": 2.894, "learning_rate": 1.8258572615891427e-05, "loss": 0.7749, "step": 1380 }, { "ETA": 23.67, "epoch": 0.21784273008658855, "fp16_scale": 1024.0, "global_step": 1390, "grad_norm": 3.881, "learning_rate": 1.8229838658936566e-05, "loss": 0.7623, "step": 1390 }, { "ETA": 23.62, "epoch": 0.2194099439721036, "fp16_scale": 1024.0, "global_step": 1400, "grad_norm": 3.222, "learning_rate": 1.8200892578120544e-05, "loss": 0.7537, "step": 1400 }, { "ETA": 23.57, "epoch": 0.22097715785761862, "fp16_scale": 1024.0, "global_step": 1410, "grad_norm": 2.809, "learning_rate": 1.8171735119527784e-05, "loss": 0.7634, "step": 1410 }, { "ETA": 23.52, "epoch": 0.22254437174313366, "fp16_scale": 1024.0, "global_step": 1420, "grad_norm": 2.92, "learning_rate": 1.8142367034690967e-05, "loss": 0.7852, "step": 1420 }, { "ETA": 23.47, "epoch": 0.22411158562864866, "fp16_scale": 1024.0, "global_step": 1430, "grad_norm": 2.293, "learning_rate": 1.8112789080571655e-05, "loss": 0.7808, "step": 1430 }, { "ETA": 23.42, "epoch": 0.2256787995141637, "fp16_scale": 1024.0, "global_step": 1440, "grad_norm": 3.209, "learning_rate": 1.8083002019540784e-05, "loss": 0.7904, "step": 1440 }, { "ETA": 23.37, "epoch": 0.22724601339967873, "fp16_scale": 1024.0, "global_step": 1450, "grad_norm": 2.592, "learning_rate": 1.805300661935903e-05, "loss": 0.7706, "step": 1450 }, { "ETA": 23.32, "epoch": 0.22881322728519374, "fp16_scale": 1024.0, "global_step": 1460, "grad_norm": 2.165, "learning_rate": 1.8022803653156983e-05, "loss": 0.7777, "step": 1460 }, { "ETA": 23.27, "epoch": 0.23038044117070877, "fp16_scale": 1024.0, "global_step": 1470, "grad_norm": 3.106, "learning_rate": 1.799239389941526e-05, "loss": 0.7475, "step": 1470 }, { "ETA": 23.22, "epoch": 0.2319476550562238, "fp16_scale": 1024.0, "global_step": 1480, "grad_norm": 3.043, "learning_rate": 1.7961778141944407e-05, "loss": 0.7672, "step": 1480 }, { "ETA": 23.17, "epoch": 0.2335148689417388, "fp16_scale": 1024.0, "global_step": 1490, "grad_norm": 2.615, "learning_rate": 1.7930957169864713e-05, "loss": 0.753, "step": 1490 }, { "ETA": 23.12, "epoch": 0.23508208282725385, "fp16_scale": 1024.0, "global_step": 1500, "grad_norm": 3.227, "learning_rate": 1.789993177758588e-05, "loss": 0.7833, "step": 1500 }, { "ETA": 23.17, "epoch": 0.23664929671276888, "fp16_scale": 1024.0, "global_step": 1510, "grad_norm": 3.618, "learning_rate": 1.7868702764786522e-05, "loss": 0.7465, "step": 1510 }, { "ETA": 23.12, "epoch": 0.2382165105982839, "fp16_scale": 1024.0, "global_step": 1520, "grad_norm": 3.637, "learning_rate": 1.783727093639357e-05, "loss": 0.7727, "step": 1520 }, { "ETA": 23.07, "epoch": 0.23978372448379892, "fp16_scale": 1024.0, "global_step": 1530, "grad_norm": 2.791, "learning_rate": 1.7805637102561516e-05, "loss": 0.7736, "step": 1530 }, { "ETA": 23.02, "epoch": 0.24135093836931396, "fp16_scale": 1024.0, "global_step": 1540, "grad_norm": 4.011, "learning_rate": 1.777380207865155e-05, "loss": 0.7794, "step": 1540 }, { "ETA": 22.97, "epoch": 0.242918152254829, "fp16_scale": 1024.0, "global_step": 1550, "grad_norm": 3.216, "learning_rate": 1.7741766685210522e-05, "loss": 0.7829, "step": 1550 }, { "ETA": 22.92, "epoch": 0.244485366140344, "fp16_scale": 1024.0, "global_step": 1560, "grad_norm": 3.06, "learning_rate": 1.7709531747949796e-05, "loss": 0.7472, "step": 1560 }, { "ETA": 22.87, "epoch": 0.24605258002585903, "fp16_scale": 1024.0, "global_step": 1570, "grad_norm": 2.834, "learning_rate": 1.7677098097723985e-05, "loss": 0.7557, "step": 1570 }, { "ETA": 22.82, "epoch": 0.24761979391137406, "fp16_scale": 1024.0, "global_step": 1580, "grad_norm": 2.833, "learning_rate": 1.7644466570509508e-05, "loss": 0.7649, "step": 1580 }, { "ETA": 22.76, "epoch": 0.24918700779688907, "fp16_scale": 1024.0, "global_step": 1590, "grad_norm": 2.454, "learning_rate": 1.761163800738307e-05, "loss": 0.7683, "step": 1590 }, { "ETA": 22.71, "epoch": 0.25075422168240413, "fp16_scale": 1024.0, "global_step": 1600, "grad_norm": 2.78, "learning_rate": 1.757861325449997e-05, "loss": 0.7333, "step": 1600 }, { "ETA": 22.66, "epoch": 0.25232143556791914, "fp16_scale": 1024.0, "global_step": 1610, "grad_norm": 2.601, "learning_rate": 1.7545393163072285e-05, "loss": 0.7917, "step": 1610 }, { "ETA": 22.61, "epoch": 0.25388864945343415, "fp16_scale": 1024.0, "global_step": 1620, "grad_norm": 3.483, "learning_rate": 1.751197858934694e-05, "loss": 0.77, "step": 1620 }, { "ETA": 22.56, "epoch": 0.2554558633389492, "fp16_scale": 1024.0, "global_step": 1630, "grad_norm": 2.619, "learning_rate": 1.7478370394583647e-05, "loss": 0.7957, "step": 1630 }, { "ETA": 22.51, "epoch": 0.2570230772244642, "fp16_scale": 1024.0, "global_step": 1640, "grad_norm": 2.546, "learning_rate": 1.7444569445032677e-05, "loss": 0.7811, "step": 1640 }, { "ETA": 22.46, "epoch": 0.2585902911099792, "fp16_scale": 1024.0, "global_step": 1650, "grad_norm": 4.994, "learning_rate": 1.7410576611912563e-05, "loss": 0.7777, "step": 1650 }, { "ETA": 22.41, "epoch": 0.2601575049954943, "fp16_scale": 1024.0, "global_step": 1660, "grad_norm": 2.53, "learning_rate": 1.7376392771387623e-05, "loss": 0.7764, "step": 1660 }, { "ETA": 22.36, "epoch": 0.2617247188810093, "fp16_scale": 1024.0, "global_step": 1670, "grad_norm": 3.218, "learning_rate": 1.73420188045454e-05, "loss": 0.7549, "step": 1670 }, { "ETA": 22.31, "epoch": 0.2632919327665243, "fp16_scale": 1024.0, "global_step": 1680, "grad_norm": 2.76, "learning_rate": 1.7307455597373916e-05, "loss": 0.7576, "step": 1680 }, { "ETA": 22.26, "epoch": 0.26485914665203936, "fp16_scale": 1024.0, "global_step": 1690, "grad_norm": 3.132, "learning_rate": 1.7272704040738875e-05, "loss": 0.7537, "step": 1690 }, { "ETA": 22.21, "epoch": 0.26642636053755436, "fp16_scale": 1024.0, "global_step": 1700, "grad_norm": 3.045, "learning_rate": 1.723776503036068e-05, "loss": 0.7826, "step": 1700 }, { "ETA": 22.16, "epoch": 0.26799357442306937, "fp16_scale": 1024.0, "global_step": 1710, "grad_norm": 4.358, "learning_rate": 1.7202639466791336e-05, "loss": 0.7511, "step": 1710 }, { "ETA": 22.11, "epoch": 0.26956078830858443, "fp16_scale": 1024.0, "global_step": 1720, "grad_norm": 2.452, "learning_rate": 1.716732825539127e-05, "loss": 0.7512, "step": 1720 }, { "ETA": 22.06, "epoch": 0.27112800219409944, "fp16_scale": 1024.0, "global_step": 1730, "grad_norm": 3.543, "learning_rate": 1.7131832306305964e-05, "loss": 0.76, "step": 1730 }, { "ETA": 22.01, "epoch": 0.27269521607961444, "fp16_scale": 1024.0, "global_step": 1740, "grad_norm": 3.003, "learning_rate": 1.7096152534442515e-05, "loss": 0.7576, "step": 1740 }, { "ETA": 21.96, "epoch": 0.2742624299651295, "fp16_scale": 1024.0, "global_step": 1750, "grad_norm": 3.431, "learning_rate": 1.706028985944604e-05, "loss": 0.7494, "step": 1750 }, { "ETA": 21.91, "epoch": 0.2758296438506445, "fp16_scale": 1024.0, "global_step": 1760, "grad_norm": 5.632, "learning_rate": 1.7024245205675986e-05, "loss": 0.7741, "step": 1760 }, { "ETA": 21.86, "epoch": 0.2773968577361595, "fp16_scale": 1024.0, "global_step": 1770, "grad_norm": 3.417, "learning_rate": 1.6988019502182296e-05, "loss": 0.7829, "step": 1770 }, { "ETA": 21.81, "epoch": 0.2789640716216746, "fp16_scale": 1024.0, "global_step": 1780, "grad_norm": 2.723, "learning_rate": 1.6951613682681465e-05, "loss": 0.7317, "step": 1780 }, { "ETA": 21.77, "epoch": 0.2805312855071896, "fp16_scale": 1024.0, "global_step": 1790, "grad_norm": 2.638, "learning_rate": 1.691502868553247e-05, "loss": 0.7567, "step": 1790 }, { "ETA": 21.72, "epoch": 0.2820984993927046, "fp16_scale": 1024.0, "global_step": 1800, "grad_norm": 4.379, "learning_rate": 1.6878265453712587e-05, "loss": 0.7854, "step": 1800 }, { "ETA": 21.67, "epoch": 0.28366571327821966, "fp16_scale": 1024.0, "global_step": 1810, "grad_norm": 2.736, "learning_rate": 1.6841324934793096e-05, "loss": 0.766, "step": 1810 }, { "ETA": 21.62, "epoch": 0.28523292716373466, "fp16_scale": 1024.0, "global_step": 1820, "grad_norm": 3.732, "learning_rate": 1.6804208080914824e-05, "loss": 0.7589, "step": 1820 }, { "ETA": 21.57, "epoch": 0.2868001410492497, "fp16_scale": 1024.0, "global_step": 1830, "grad_norm": 2.209, "learning_rate": 1.6766915848763657e-05, "loss": 0.7454, "step": 1830 }, { "ETA": 21.52, "epoch": 0.28836735493476473, "fp16_scale": 1024.0, "global_step": 1840, "grad_norm": 2.98, "learning_rate": 1.6729449199545828e-05, "loss": 0.7828, "step": 1840 }, { "ETA": 21.47, "epoch": 0.28993456882027974, "fp16_scale": 1024.0, "global_step": 1850, "grad_norm": 3.104, "learning_rate": 1.669180909896317e-05, "loss": 0.7484, "step": 1850 }, { "ETA": 21.42, "epoch": 0.2915017827057948, "fp16_scale": 1024.0, "global_step": 1860, "grad_norm": 2.728, "learning_rate": 1.6653996517188224e-05, "loss": 0.7652, "step": 1860 }, { "ETA": 21.37, "epoch": 0.2930689965913098, "fp16_scale": 1024.0, "global_step": 1870, "grad_norm": 3.01, "learning_rate": 1.6616012428839226e-05, "loss": 0.7734, "step": 1870 }, { "ETA": 21.34, "epoch": 0.2946362104768248, "fp16_scale": 1024.0, "global_step": 1880, "grad_norm": 3.439, "learning_rate": 1.6577857812954994e-05, "loss": 0.7893, "step": 1880 }, { "ETA": 21.32, "epoch": 0.2962034243623399, "fp16_scale": 1024.0, "global_step": 1890, "grad_norm": 2.866, "learning_rate": 1.6539533652969683e-05, "loss": 0.7249, "step": 1890 }, { "ETA": 21.27, "epoch": 0.2977706382478549, "fp16_scale": 1024.0, "global_step": 1900, "grad_norm": 2.747, "learning_rate": 1.6501040936687444e-05, "loss": 0.7319, "step": 1900 }, { "ETA": 21.22, "epoch": 0.2993378521333699, "fp16_scale": 1024.0, "global_step": 1910, "grad_norm": 2.916, "learning_rate": 1.6462380656256962e-05, "loss": 0.7661, "step": 1910 }, { "ETA": 21.17, "epoch": 0.30090506601888495, "fp16_scale": 1024.0, "global_step": 1920, "grad_norm": 3.348, "learning_rate": 1.6423553808145886e-05, "loss": 0.7494, "step": 1920 }, { "ETA": 21.12, "epoch": 0.30247227990439995, "fp16_scale": 1024.0, "global_step": 1930, "grad_norm": 2.77, "learning_rate": 1.6384561393115135e-05, "loss": 0.7675, "step": 1930 }, { "ETA": 21.07, "epoch": 0.30403949378991496, "fp16_scale": 1024.0, "global_step": 1940, "grad_norm": 3.482, "learning_rate": 1.6345404416193117e-05, "loss": 0.7661, "step": 1940 }, { "ETA": 21.02, "epoch": 0.30560670767543, "fp16_scale": 1024.0, "global_step": 1950, "grad_norm": 3.32, "learning_rate": 1.6306083886649823e-05, "loss": 0.7464, "step": 1950 }, { "ETA": 20.98, "epoch": 0.30717392156094503, "fp16_scale": 1024.0, "global_step": 1960, "grad_norm": 2.803, "learning_rate": 1.6266600817970794e-05, "loss": 0.7393, "step": 1960 }, { "ETA": 20.93, "epoch": 0.30874113544646004, "fp16_scale": 1024.0, "global_step": 1970, "grad_norm": 3.484, "learning_rate": 1.6226956227831018e-05, "loss": 0.7559, "step": 1970 }, { "ETA": 20.88, "epoch": 0.3103083493319751, "fp16_scale": 1024.0, "global_step": 1980, "grad_norm": 3.771, "learning_rate": 1.6187151138068707e-05, "loss": 0.7579, "step": 1980 }, { "ETA": 20.83, "epoch": 0.3118755632174901, "fp16_scale": 1024.0, "global_step": 1990, "grad_norm": 3.84, "learning_rate": 1.6147186574658924e-05, "loss": 0.7631, "step": 1990 }, { "ETA": 20.78, "epoch": 0.3134427771030051, "fp16_scale": 1024.0, "global_step": 2000, "grad_norm": 2.884, "learning_rate": 1.6107063567687183e-05, "loss": 0.7502, "step": 2000 }, { "ETA": 20.79, "epoch": 0.31500999098852017, "fp16_scale": 2048.0, "global_step": 2010, "grad_norm": 1.481, "learning_rate": 1.6066783151322863e-05, "loss": 0.7295, "step": 2010 }, { "ETA": 20.74, "epoch": 0.3165772048740352, "fp16_scale": 2048.0, "global_step": 2020, "grad_norm": 2.545, "learning_rate": 1.6026346363792565e-05, "loss": 0.7341, "step": 2020 }, { "ETA": 20.7, "epoch": 0.3181444187595502, "fp16_scale": 2048.0, "global_step": 2030, "grad_norm": 2.371, "learning_rate": 1.598575424735336e-05, "loss": 0.7362, "step": 2030 }, { "ETA": 20.65, "epoch": 0.31971163264506525, "fp16_scale": 2048.0, "global_step": 2040, "grad_norm": 2.798, "learning_rate": 1.5945007848265912e-05, "loss": 0.7554, "step": 2040 }, { "ETA": 20.6, "epoch": 0.32127884653058025, "fp16_scale": 2048.0, "global_step": 2050, "grad_norm": 2.313, "learning_rate": 1.5904108216767516e-05, "loss": 0.741, "step": 2050 }, { "ETA": 20.55, "epoch": 0.32284606041609526, "fp16_scale": 2048.0, "global_step": 2060, "grad_norm": 3.077, "learning_rate": 1.5863056407045034e-05, "loss": 0.7597, "step": 2060 }, { "ETA": 20.5, "epoch": 0.3244132743016103, "fp16_scale": 2048.0, "global_step": 2070, "grad_norm": 3.112, "learning_rate": 1.582185347720771e-05, "loss": 0.7317, "step": 2070 }, { "ETA": 20.45, "epoch": 0.32598048818712533, "fp16_scale": 2048.0, "global_step": 2080, "grad_norm": 2.469, "learning_rate": 1.5780500489259907e-05, "loss": 0.7093, "step": 2080 }, { "ETA": 20.4, "epoch": 0.3275477020726404, "fp16_scale": 2048.0, "global_step": 2090, "grad_norm": 2.714, "learning_rate": 1.573899850907373e-05, "loss": 0.7303, "step": 2090 }, { "ETA": 20.35, "epoch": 0.3291149159581554, "fp16_scale": 2048.0, "global_step": 2100, "grad_norm": 3.24, "learning_rate": 1.5697348606361564e-05, "loss": 0.7306, "step": 2100 }, { "ETA": 20.3, "epoch": 0.3306821298436704, "fp16_scale": 2048.0, "global_step": 2110, "grad_norm": 3.271, "learning_rate": 1.5655551854648477e-05, "loss": 0.7364, "step": 2110 }, { "ETA": 20.25, "epoch": 0.33224934372918546, "fp16_scale": 2048.0, "global_step": 2120, "grad_norm": 2.683, "learning_rate": 1.5613609331244584e-05, "loss": 0.7405, "step": 2120 }, { "ETA": 20.2, "epoch": 0.33381655761470047, "fp16_scale": 2048.0, "global_step": 2130, "grad_norm": 3.218, "learning_rate": 1.557152211721725e-05, "loss": 0.7515, "step": 2130 }, { "ETA": 20.15, "epoch": 0.3353837715002155, "fp16_scale": 2048.0, "global_step": 2140, "grad_norm": 3.1, "learning_rate": 1.5529291297363235e-05, "loss": 0.7298, "step": 2140 }, { "ETA": 20.1, "epoch": 0.33695098538573054, "fp16_scale": 2048.0, "global_step": 2150, "grad_norm": 3.992, "learning_rate": 1.5486917960180742e-05, "loss": 0.7228, "step": 2150 }, { "ETA": 20.05, "epoch": 0.33851819927124555, "fp16_scale": 2048.0, "global_step": 2160, "grad_norm": 2.571, "learning_rate": 1.5444403197841345e-05, "loss": 0.7533, "step": 2160 }, { "ETA": 20.0, "epoch": 0.34008541315676055, "fp16_scale": 2048.0, "global_step": 2170, "grad_norm": 3.083, "learning_rate": 1.5401748106161868e-05, "loss": 0.7298, "step": 2170 }, { "ETA": 19.95, "epoch": 0.3416526270422756, "fp16_scale": 2048.0, "global_step": 2180, "grad_norm": 2.534, "learning_rate": 1.5358953784576093e-05, "loss": 0.7323, "step": 2180 }, { "ETA": 19.9, "epoch": 0.3432198409277906, "fp16_scale": 2048.0, "global_step": 2190, "grad_norm": 2.848, "learning_rate": 1.5316021336106463e-05, "loss": 0.7388, "step": 2190 }, { "ETA": 19.85, "epoch": 0.3447870548133056, "fp16_scale": 2048.0, "global_step": 2200, "grad_norm": 2.76, "learning_rate": 1.527295186733564e-05, "loss": 0.723, "step": 2200 }, { "ETA": 19.81, "epoch": 0.3463542686988207, "fp16_scale": 2048.0, "global_step": 2210, "grad_norm": 3.625, "learning_rate": 1.5229746488377974e-05, "loss": 0.7092, "step": 2210 }, { "ETA": 19.76, "epoch": 0.3479214825843357, "fp16_scale": 2048.0, "global_step": 2220, "grad_norm": 3.283, "learning_rate": 1.5186406312850901e-05, "loss": 0.7439, "step": 2220 }, { "ETA": 19.71, "epoch": 0.3494886964698507, "fp16_scale": 2048.0, "global_step": 2230, "grad_norm": 2.736, "learning_rate": 1.514293245784623e-05, "loss": 0.7088, "step": 2230 }, { "ETA": 19.66, "epoch": 0.35105591035536576, "fp16_scale": 2048.0, "global_step": 2240, "grad_norm": 2.318, "learning_rate": 1.5099326043901361e-05, "loss": 0.7293, "step": 2240 }, { "ETA": 19.61, "epoch": 0.35262312424088077, "fp16_scale": 2048.0, "global_step": 2250, "grad_norm": 3.371, "learning_rate": 1.505558819497039e-05, "loss": 0.7234, "step": 2250 }, { "ETA": 19.56, "epoch": 0.3541903381263958, "fp16_scale": 2048.0, "global_step": 2260, "grad_norm": 2.881, "learning_rate": 1.5011720038395145e-05, "loss": 0.7456, "step": 2260 }, { "ETA": 19.51, "epoch": 0.35575755201191084, "fp16_scale": 2048.0, "global_step": 2270, "grad_norm": 2.577, "learning_rate": 1.4967722704876147e-05, "loss": 0.7459, "step": 2270 }, { "ETA": 19.46, "epoch": 0.35732476589742584, "fp16_scale": 2048.0, "global_step": 2280, "grad_norm": 2.892, "learning_rate": 1.4923597328443423e-05, "loss": 0.7415, "step": 2280 }, { "ETA": 19.41, "epoch": 0.35889197978294085, "fp16_scale": 2048.0, "global_step": 2290, "grad_norm": 2.461, "learning_rate": 1.4879345046427322e-05, "loss": 0.7487, "step": 2290 }, { "ETA": 19.36, "epoch": 0.3604591936684559, "fp16_scale": 2048.0, "global_step": 2300, "grad_norm": 2.269, "learning_rate": 1.4834966999429179e-05, "loss": 0.7339, "step": 2300 }, { "ETA": 19.31, "epoch": 0.3620264075539709, "fp16_scale": 2048.0, "global_step": 2310, "grad_norm": 2.551, "learning_rate": 1.4790464331291906e-05, "loss": 0.7049, "step": 2310 }, { "ETA": 19.26, "epoch": 0.363593621439486, "fp16_scale": 2048.0, "global_step": 2320, "grad_norm": 3.06, "learning_rate": 1.4745838189070531e-05, "loss": 0.7555, "step": 2320 }, { "ETA": 19.22, "epoch": 0.365160835325001, "fp16_scale": 2048.0, "global_step": 2330, "grad_norm": 3.508, "learning_rate": 1.4701089723002623e-05, "loss": 0.7164, "step": 2330 }, { "ETA": 19.17, "epoch": 0.366728049210516, "fp16_scale": 2048.0, "global_step": 2340, "grad_norm": 2.925, "learning_rate": 1.4656220086478645e-05, "loss": 0.7491, "step": 2340 }, { "ETA": 19.12, "epoch": 0.36829526309603106, "fp16_scale": 2048.0, "global_step": 2350, "grad_norm": 2.25, "learning_rate": 1.4611230436012217e-05, "loss": 0.7173, "step": 2350 }, { "ETA": 19.07, "epoch": 0.36986247698154606, "fp16_scale": 2048.0, "global_step": 2360, "grad_norm": 3.265, "learning_rate": 1.4566121931210326e-05, "loss": 0.7516, "step": 2360 }, { "ETA": 19.02, "epoch": 0.37142969086706107, "fp16_scale": 2048.0, "global_step": 2370, "grad_norm": 2.661, "learning_rate": 1.4520895734743419e-05, "loss": 0.7159, "step": 2370 }, { "ETA": 18.97, "epoch": 0.37299690475257613, "fp16_scale": 2048.0, "global_step": 2380, "grad_norm": 3.14, "learning_rate": 1.4475553012315441e-05, "loss": 0.7037, "step": 2380 }, { "ETA": 18.92, "epoch": 0.37456411863809114, "fp16_scale": 2048.0, "global_step": 2390, "grad_norm": 2.859, "learning_rate": 1.44300949326338e-05, "loss": 0.7347, "step": 2390 }, { "ETA": 18.87, "epoch": 0.37613133252360614, "fp16_scale": 2048.0, "global_step": 2400, "grad_norm": 4.006, "learning_rate": 1.4384522667379229e-05, "loss": 0.729, "step": 2400 }, { "ETA": 18.83, "epoch": 0.3776985464091212, "fp16_scale": 2048.0, "global_step": 2410, "grad_norm": 3.044, "learning_rate": 1.4338837391175582e-05, "loss": 0.741, "step": 2410 }, { "ETA": 18.78, "epoch": 0.3792657602946362, "fp16_scale": 2048.0, "global_step": 2420, "grad_norm": 2.674, "learning_rate": 1.4297624991405243e-05, "loss": 0.7268, "step": 2420 }, { "ETA": 18.74, "epoch": 0.3808329741801512, "fp16_scale": 2048.0, "global_step": 2430, "grad_norm": 2.866, "learning_rate": 1.4251728240895031e-05, "loss": 0.7816, "step": 2430 }, { "ETA": 18.71, "epoch": 0.3824001880656663, "fp16_scale": 2048.0, "global_step": 2440, "grad_norm": 2.794, "learning_rate": 1.4205721902208371e-05, "loss": 0.7469, "step": 2440 }, { "ETA": 18.66, "epoch": 0.3839674019511813, "fp16_scale": 2048.0, "global_step": 2450, "grad_norm": 2.944, "learning_rate": 1.4159607161157363e-05, "loss": 0.7123, "step": 2450 }, { "ETA": 18.62, "epoch": 0.3855346158366963, "fp16_scale": 2048.0, "global_step": 2460, "grad_norm": 3.239, "learning_rate": 1.411338520634816e-05, "loss": 0.7221, "step": 2460 }, { "ETA": 18.57, "epoch": 0.38710182972221135, "fp16_scale": 2048.0, "global_step": 2470, "grad_norm": 2.86, "learning_rate": 1.4067057229150358e-05, "loss": 0.7189, "step": 2470 }, { "ETA": 18.52, "epoch": 0.38866904360772636, "fp16_scale": 2048.0, "global_step": 2480, "grad_norm": 2.896, "learning_rate": 1.402062442366627e-05, "loss": 0.7206, "step": 2480 }, { "ETA": 18.47, "epoch": 0.39023625749324137, "fp16_scale": 2048.0, "global_step": 2490, "grad_norm": 2.715, "learning_rate": 1.3974087986700163e-05, "loss": 0.7439, "step": 2490 }, { "ETA": 18.42, "epoch": 0.39180347137875643, "fp16_scale": 2048.0, "global_step": 2500, "grad_norm": 2.595, "learning_rate": 1.3927449117727392e-05, "loss": 0.7346, "step": 2500 }, { "ETA": 18.42, "epoch": 0.39337068526427144, "fp16_scale": 2048.0, "global_step": 2510, "grad_norm": 2.875, "learning_rate": 1.3880709018863504e-05, "loss": 0.7652, "step": 2510 }, { "ETA": 18.37, "epoch": 0.39493789914978644, "fp16_scale": 2048.0, "global_step": 2520, "grad_norm": 3.692, "learning_rate": 1.3833868894833238e-05, "loss": 0.7012, "step": 2520 }, { "ETA": 18.32, "epoch": 0.3965051130353015, "fp16_scale": 2048.0, "global_step": 2530, "grad_norm": 3.356, "learning_rate": 1.3786929952939478e-05, "loss": 0.7451, "step": 2530 }, { "ETA": 18.27, "epoch": 0.3980723269208165, "fp16_scale": 2048.0, "global_step": 2540, "grad_norm": 2.35, "learning_rate": 1.373989340303214e-05, "loss": 0.7325, "step": 2540 }, { "ETA": 18.22, "epoch": 0.3996395408063315, "fp16_scale": 2048.0, "global_step": 2550, "grad_norm": 3.077, "learning_rate": 1.3692760457476985e-05, "loss": 0.7446, "step": 2550 }, { "ETA": 18.17, "epoch": 0.4012067546918466, "fp16_scale": 2048.0, "global_step": 2560, "grad_norm": 3.121, "learning_rate": 1.3645532331124362e-05, "loss": 0.7164, "step": 2560 }, { "ETA": 18.12, "epoch": 0.4027739685773616, "fp16_scale": 2048.0, "global_step": 2570, "grad_norm": 2.211, "learning_rate": 1.3598210241277905e-05, "loss": 0.7411, "step": 2570 }, { "ETA": 18.08, "epoch": 0.40434118246287665, "fp16_scale": 2048.0, "global_step": 2580, "grad_norm": 2.489, "learning_rate": 1.3550795407663158e-05, "loss": 0.7431, "step": 2580 }, { "ETA": 18.03, "epoch": 0.40590839634839165, "fp16_scale": 2048.0, "global_step": 2590, "grad_norm": 2.241, "learning_rate": 1.3503289052396133e-05, "loss": 0.7127, "step": 2590 }, { "ETA": 17.98, "epoch": 0.40747561023390666, "fp16_scale": 1024.0, "global_step": 2600, "grad_norm": 3.6, "learning_rate": 1.3460456093623443e-05, "loss": 0.7245, "step": 2600 }, { "ETA": 17.93, "epoch": 0.4090428241194217, "fp16_scale": 1024.0, "global_step": 2610, "grad_norm": 2.75, "learning_rate": 1.3412779222569907e-05, "loss": 0.7512, "step": 2610 }, { "ETA": 17.88, "epoch": 0.41061003800493673, "fp16_scale": 1024.0, "global_step": 2620, "grad_norm": 2.475, "learning_rate": 1.3365014387227393e-05, "loss": 0.7082, "step": 2620 }, { "ETA": 17.83, "epoch": 0.41217725189045173, "fp16_scale": 1024.0, "global_step": 2630, "grad_norm": 3.246, "learning_rate": 1.3317162818733205e-05, "loss": 0.7197, "step": 2630 }, { "ETA": 17.78, "epoch": 0.4137444657759668, "fp16_scale": 1024.0, "global_step": 2640, "grad_norm": 3.182, "learning_rate": 1.326922575046018e-05, "loss": 0.7903, "step": 2640 }, { "ETA": 17.73, "epoch": 0.4153116796614818, "fp16_scale": 512.0, "global_step": 2650, "grad_norm": 35.311, "learning_rate": 1.3226010307862582e-05, "loss": 0.7686, "step": 2650 }, { "ETA": 17.68, "epoch": 0.4168788935469968, "fp16_scale": 512.0, "global_step": 2660, "grad_norm": 2.301, "learning_rate": 1.3177914195819018e-05, "loss": 0.7842, "step": 2660 }, { "ETA": 17.63, "epoch": 0.41844610743251187, "fp16_scale": 512.0, "global_step": 2670, "grad_norm": 2.258, "learning_rate": 1.3129736173125975e-05, "loss": 0.7565, "step": 2670 }, { "ETA": 17.59, "epoch": 0.4200133213180269, "fp16_scale": 512.0, "global_step": 2680, "grad_norm": 2.581, "learning_rate": 1.3081477481570642e-05, "loss": 0.7429, "step": 2680 }, { "ETA": 17.54, "epoch": 0.4215805352035419, "fp16_scale": 512.0, "global_step": 2690, "grad_norm": 2.537, "learning_rate": 1.303313936501944e-05, "loss": 0.7517, "step": 2690 }, { "ETA": 17.49, "epoch": 0.42314774908905695, "fp16_scale": 512.0, "global_step": 2700, "grad_norm": 5.254, "learning_rate": 1.2984723069385975e-05, "loss": 0.7414, "step": 2700 }, { "ETA": 17.44, "epoch": 0.42471496297457195, "fp16_scale": 512.0, "global_step": 2710, "grad_norm": 2.486, "learning_rate": 1.293622984259891e-05, "loss": 0.7547, "step": 2710 }, { "ETA": 17.39, "epoch": 0.42628217686008696, "fp16_scale": 512.0, "global_step": 2720, "grad_norm": 2.129, "learning_rate": 1.2887660934569809e-05, "loss": 0.7298, "step": 2720 }, { "ETA": 17.34, "epoch": 0.427849390745602, "fp16_scale": 512.0, "global_step": 2730, "grad_norm": 3.215, "learning_rate": 1.2839017597160916e-05, "loss": 0.7346, "step": 2730 }, { "ETA": 17.29, "epoch": 0.429416604631117, "fp16_scale": 512.0, "global_step": 2740, "grad_norm": 2.886, "learning_rate": 1.2790301084152889e-05, "loss": 0.7197, "step": 2740 }, { "ETA": 17.24, "epoch": 0.43098381851663203, "fp16_scale": 512.0, "global_step": 2750, "grad_norm": 2.857, "learning_rate": 1.2741512651212495e-05, "loss": 0.7417, "step": 2750 }, { "ETA": 17.2, "epoch": 0.4325510324021471, "fp16_scale": 512.0, "global_step": 2760, "grad_norm": 2.604, "learning_rate": 1.2692653555860224e-05, "loss": 0.7411, "step": 2760 }, { "ETA": 17.15, "epoch": 0.4341182462876621, "fp16_scale": 512.0, "global_step": 2770, "grad_norm": 2.792, "learning_rate": 1.264372505743789e-05, "loss": 0.7263, "step": 2770 }, { "ETA": 17.1, "epoch": 0.4356854601731771, "fp16_scale": 512.0, "global_step": 2780, "grad_norm": 3.684, "learning_rate": 1.2594728417076172e-05, "loss": 0.7712, "step": 2780 }, { "ETA": 17.05, "epoch": 0.43725267405869217, "fp16_scale": 512.0, "global_step": 2790, "grad_norm": 2.179, "learning_rate": 1.2545664897662108e-05, "loss": 0.7155, "step": 2790 }, { "ETA": 17.0, "epoch": 0.4388198879442072, "fp16_scale": 512.0, "global_step": 2800, "grad_norm": 2.914, "learning_rate": 1.2496535763806535e-05, "loss": 0.728, "step": 2800 }, { "ETA": 16.95, "epoch": 0.44038710182972224, "fp16_scale": 512.0, "global_step": 2810, "grad_norm": 3.342, "learning_rate": 1.2447342281811499e-05, "loss": 0.7383, "step": 2810 }, { "ETA": 16.9, "epoch": 0.44195431571523724, "fp16_scale": 512.0, "global_step": 2820, "grad_norm": 2.831, "learning_rate": 1.239808571963763e-05, "loss": 0.7351, "step": 2820 }, { "ETA": 16.86, "epoch": 0.44352152960075225, "fp16_scale": 512.0, "global_step": 2830, "grad_norm": 2.444, "learning_rate": 1.2348767346871438e-05, "loss": 0.7054, "step": 2830 }, { "ETA": 16.81, "epoch": 0.4450887434862673, "fp16_scale": 512.0, "global_step": 2840, "grad_norm": 2.822, "learning_rate": 1.2299388434692596e-05, "loss": 0.7212, "step": 2840 }, { "ETA": 16.76, "epoch": 0.4466559573717823, "fp16_scale": 512.0, "global_step": 2850, "grad_norm": 2.994, "learning_rate": 1.2249950255841187e-05, "loss": 0.7232, "step": 2850 }, { "ETA": 16.71, "epoch": 0.4482231712572973, "fp16_scale": 512.0, "global_step": 2860, "grad_norm": 2.159, "learning_rate": 1.220045408458489e-05, "loss": 0.721, "step": 2860 }, { "ETA": 16.66, "epoch": 0.4497903851428124, "fp16_scale": 512.0, "global_step": 2870, "grad_norm": 2.573, "learning_rate": 1.215090119668613e-05, "loss": 0.7249, "step": 2870 }, { "ETA": 16.61, "epoch": 0.4513575990283274, "fp16_scale": 512.0, "global_step": 2880, "grad_norm": 2.2, "learning_rate": 1.2101292869369208e-05, "loss": 0.6849, "step": 2880 }, { "ETA": 16.56, "epoch": 0.4529248129138424, "fp16_scale": 512.0, "global_step": 2890, "grad_norm": 2.541, "learning_rate": 1.2051630381287375e-05, "loss": 0.7097, "step": 2890 }, { "ETA": 16.52, "epoch": 0.45449202679935746, "fp16_scale": 512.0, "global_step": 2900, "grad_norm": 2.185, "learning_rate": 1.2001915012489869e-05, "loss": 0.7336, "step": 2900 }, { "ETA": 16.47, "epoch": 0.45605924068487247, "fp16_scale": 512.0, "global_step": 2910, "grad_norm": 3.047, "learning_rate": 1.195214804438893e-05, "loss": 0.7295, "step": 2910 }, { "ETA": 16.42, "epoch": 0.4576264545703875, "fp16_scale": 512.0, "global_step": 2920, "grad_norm": 2.518, "learning_rate": 1.1902330759726766e-05, "loss": 0.7207, "step": 2920 }, { "ETA": 16.37, "epoch": 0.45919366845590254, "fp16_scale": 512.0, "global_step": 2930, "grad_norm": 2.452, "learning_rate": 1.1852464442542501e-05, "loss": 0.7031, "step": 2930 }, { "ETA": 16.32, "epoch": 0.46076088234141754, "fp16_scale": 512.0, "global_step": 2940, "grad_norm": 2.328, "learning_rate": 1.180255037813906e-05, "loss": 0.7034, "step": 2940 }, { "ETA": 16.27, "epoch": 0.46232809622693255, "fp16_scale": 512.0, "global_step": 2950, "grad_norm": 2.481, "learning_rate": 1.1752589853050057e-05, "loss": 0.712, "step": 2950 }, { "ETA": 16.23, "epoch": 0.4638953101124476, "fp16_scale": 512.0, "global_step": 2960, "grad_norm": 2.881, "learning_rate": 1.1702584155006631e-05, "loss": 0.7308, "step": 2960 }, { "ETA": 16.18, "epoch": 0.4654625239979626, "fp16_scale": 512.0, "global_step": 2970, "grad_norm": 2.929, "learning_rate": 1.1652534572904248e-05, "loss": 0.7251, "step": 2970 }, { "ETA": 16.14, "epoch": 0.4670297378834776, "fp16_scale": 512.0, "global_step": 2980, "grad_norm": 2.239, "learning_rate": 1.1602442396769486e-05, "loss": 0.6962, "step": 2980 }, { "ETA": 16.1, "epoch": 0.4685969517689927, "fp16_scale": 512.0, "global_step": 2990, "grad_norm": 2.487, "learning_rate": 1.1552308917726786e-05, "loss": 0.7391, "step": 2990 }, { "ETA": 16.06, "epoch": 0.4701641656545077, "fp16_scale": 512.0, "global_step": 3000, "grad_norm": 3.831, "learning_rate": 1.1502135427965167e-05, "loss": 0.715, "step": 3000 }, { "ETA": 16.04, "epoch": 0.4717313795400227, "fp16_scale": 512.0, "global_step": 3010, "grad_norm": 2.421, "learning_rate": 1.1451923220704916e-05, "loss": 0.7511, "step": 3010 }, { "ETA": 15.99, "epoch": 0.47329859342553776, "fp16_scale": 512.0, "global_step": 3020, "grad_norm": 2.23, "learning_rate": 1.1401673590164281e-05, "loss": 0.6914, "step": 3020 }, { "ETA": 15.94, "epoch": 0.47486580731105277, "fp16_scale": 512.0, "global_step": 3030, "grad_norm": 2.211, "learning_rate": 1.135138783152608e-05, "loss": 0.7333, "step": 3030 }, { "ETA": 15.9, "epoch": 0.4764330211965678, "fp16_scale": 512.0, "global_step": 3040, "grad_norm": 4.443, "learning_rate": 1.1301067240904331e-05, "loss": 0.7227, "step": 3040 }, { "ETA": 15.85, "epoch": 0.47800023508208284, "fp16_scale": 512.0, "global_step": 3050, "grad_norm": 3.145, "learning_rate": 1.1250713115310852e-05, "loss": 0.7343, "step": 3050 }, { "ETA": 15.8, "epoch": 0.47956744896759784, "fp16_scale": 512.0, "global_step": 3060, "grad_norm": 2.237, "learning_rate": 1.1200326752621821e-05, "loss": 0.7027, "step": 3060 }, { "ETA": 15.75, "epoch": 0.4811346628531129, "fp16_scale": 512.0, "global_step": 3070, "grad_norm": 3.649, "learning_rate": 1.1149909451544327e-05, "loss": 0.711, "step": 3070 }, { "ETA": 15.7, "epoch": 0.4827018767386279, "fp16_scale": 512.0, "global_step": 3080, "grad_norm": 6.593, "learning_rate": 1.1099462511582893e-05, "loss": 0.6981, "step": 3080 }, { "ETA": 15.65, "epoch": 0.4842690906241429, "fp16_scale": 512.0, "global_step": 3090, "grad_norm": 2.193, "learning_rate": 1.104898723300599e-05, "loss": 0.6855, "step": 3090 }, { "ETA": 15.6, "epoch": 0.485836304509658, "fp16_scale": 512.0, "global_step": 3100, "grad_norm": 2.081, "learning_rate": 1.0998484916812503e-05, "loss": 0.7143, "step": 3100 }, { "ETA": 15.55, "epoch": 0.487403518395173, "fp16_scale": 512.0, "global_step": 3110, "grad_norm": 3.085, "learning_rate": 1.0947956864698223e-05, "loss": 0.7145, "step": 3110 }, { "ETA": 15.51, "epoch": 0.488970732280688, "fp16_scale": 512.0, "global_step": 3120, "grad_norm": 2.29, "learning_rate": 1.0897404379022281e-05, "loss": 0.6886, "step": 3120 }, { "ETA": 15.46, "epoch": 0.49053794616620305, "fp16_scale": 512.0, "global_step": 3130, "grad_norm": 2.721, "learning_rate": 1.0846828762773582e-05, "loss": 0.6601, "step": 3130 }, { "ETA": 15.41, "epoch": 0.49210516005171806, "fp16_scale": 512.0, "global_step": 3140, "grad_norm": 3.726, "learning_rate": 1.0796231319537214e-05, "loss": 0.694, "step": 3140 }, { "ETA": 15.36, "epoch": 0.49367237393723307, "fp16_scale": 512.0, "global_step": 3150, "grad_norm": 2.391, "learning_rate": 1.0745613353460862e-05, "loss": 0.7002, "step": 3150 }, { "ETA": 15.31, "epoch": 0.49523958782274813, "fp16_scale": 512.0, "global_step": 3160, "grad_norm": 2.541, "learning_rate": 1.069497616922119e-05, "loss": 0.7195, "step": 3160 }, { "ETA": 15.26, "epoch": 0.49680680170826313, "fp16_scale": 512.0, "global_step": 3170, "grad_norm": 2.212, "learning_rate": 1.0644321071990197e-05, "loss": 0.7227, "step": 3170 }, { "ETA": 15.21, "epoch": 0.49837401559377814, "fp16_scale": 512.0, "global_step": 3180, "grad_norm": 2.843, "learning_rate": 1.0593649367401607e-05, "loss": 0.7215, "step": 3180 }, { "ETA": 15.17, "epoch": 0.4999412294792932, "fp16_scale": 512.0, "global_step": 3190, "grad_norm": 2.622, "learning_rate": 1.0542962361517182e-05, "loss": 0.6857, "step": 3190 }, { "ETA": 15.12, "epoch": 0.5015084433648083, "fp16_scale": 512.0, "global_step": 3200, "grad_norm": 4.081, "learning_rate": 1.0492261360793082e-05, "loss": 0.6919, "step": 3200 }, { "ETA": 15.07, "epoch": 0.5030756572503232, "fp16_scale": 512.0, "global_step": 3210, "grad_norm": 1.986, "learning_rate": 1.0441547672046178e-05, "loss": 0.7229, "step": 3210 }, { "ETA": 15.02, "epoch": 0.5046428711358383, "fp16_scale": 512.0, "global_step": 3220, "grad_norm": 2.561, "learning_rate": 1.039082260242038e-05, "loss": 0.692, "step": 3220 }, { "ETA": 14.97, "epoch": 0.5062100850213533, "fp16_scale": 512.0, "global_step": 3230, "grad_norm": 2.619, "learning_rate": 1.0340087459352946e-05, "loss": 0.72, "step": 3230 }, { "ETA": 14.92, "epoch": 0.5077772989068683, "fp16_scale": 512.0, "global_step": 3240, "grad_norm": 2.845, "learning_rate": 1.028934355054076e-05, "loss": 0.7005, "step": 3240 }, { "ETA": 14.87, "epoch": 0.5093445127923834, "fp16_scale": 512.0, "global_step": 3250, "grad_norm": 2.274, "learning_rate": 1.023859218390665e-05, "loss": 0.6779, "step": 3250 }, { "ETA": 14.83, "epoch": 0.5109117266778984, "fp16_scale": 512.0, "global_step": 3260, "grad_norm": 2.636, "learning_rate": 1.0187834667565684e-05, "loss": 0.7082, "step": 3260 }, { "ETA": 14.78, "epoch": 0.5124789405634134, "fp16_scale": 512.0, "global_step": 3270, "grad_norm": 2.666, "learning_rate": 1.0137072309791411e-05, "loss": 0.699, "step": 3270 }, { "ETA": 14.73, "epoch": 0.5140461544489284, "fp16_scale": 512.0, "global_step": 3280, "grad_norm": 2.36, "learning_rate": 1.008630641898219e-05, "loss": 0.7035, "step": 3280 }, { "ETA": 14.68, "epoch": 0.5156133683344435, "fp16_scale": 512.0, "global_step": 3290, "grad_norm": 2.33, "learning_rate": 1.0035538303627437e-05, "loss": 0.6767, "step": 3290 }, { "ETA": 14.63, "epoch": 0.5171805822199584, "fp16_scale": 512.0, "global_step": 3300, "grad_norm": 2.257, "learning_rate": 9.9847692722739e-06, "loss": 0.6767, "step": 3300 }, { "ETA": 14.58, "epoch": 0.5187477961054735, "fp16_scale": 512.0, "global_step": 3310, "grad_norm": 2.678, "learning_rate": 9.934000633491945e-06, "loss": 0.7028, "step": 3310 }, { "ETA": 14.54, "epoch": 0.5203150099909886, "fp16_scale": 512.0, "global_step": 3320, "grad_norm": 3.457, "learning_rate": 9.883233695841814e-06, "loss": 0.6642, "step": 3320 }, { "ETA": 14.49, "epoch": 0.5218822238765035, "fp16_scale": 512.0, "global_step": 3330, "grad_norm": 3.567, "learning_rate": 9.832469767839908e-06, "loss": 0.7094, "step": 3330 }, { "ETA": 14.44, "epoch": 0.5234494377620186, "fp16_scale": 512.0, "global_step": 3340, "grad_norm": 2.891, "learning_rate": 9.781710157925047e-06, "loss": 0.6836, "step": 3340 }, { "ETA": 14.39, "epoch": 0.5250166516475336, "fp16_scale": 512.0, "global_step": 3350, "grad_norm": 2.194, "learning_rate": 9.730956174424757e-06, "loss": 0.6913, "step": 3350 }, { "ETA": 14.34, "epoch": 0.5265838655330486, "fp16_scale": 512.0, "global_step": 3360, "grad_norm": 2.086, "learning_rate": 9.680209125521545e-06, "loss": 0.6788, "step": 3360 }, { "ETA": 14.29, "epoch": 0.5281510794185637, "fp16_scale": 512.0, "global_step": 3370, "grad_norm": 2.488, "learning_rate": 9.629470319219173e-06, "loss": 0.7221, "step": 3370 }, { "ETA": 14.25, "epoch": 0.5297182933040787, "fp16_scale": 512.0, "global_step": 3380, "grad_norm": 2.049, "learning_rate": 9.578741063308954e-06, "loss": 0.6611, "step": 3380 }, { "ETA": 14.2, "epoch": 0.5312855071895937, "fp16_scale": 512.0, "global_step": 3390, "grad_norm": 2.858, "learning_rate": 9.528022665336045e-06, "loss": 0.6552, "step": 3390 }, { "ETA": 14.15, "epoch": 0.5328527210751087, "fp16_scale": 512.0, "global_step": 3400, "grad_norm": 2.535, "learning_rate": 9.477316432565727e-06, "loss": 0.6672, "step": 3400 }, { "ETA": 14.1, "epoch": 0.5344199349606238, "fp16_scale": 512.0, "global_step": 3410, "grad_norm": 3.078, "learning_rate": 9.426623671949735e-06, "loss": 0.6831, "step": 3410 }, { "ETA": 14.05, "epoch": 0.5359871488461387, "fp16_scale": 512.0, "global_step": 3420, "grad_norm": 3.191, "learning_rate": 9.37594569009256e-06, "loss": 0.7036, "step": 3420 }, { "ETA": 14.0, "epoch": 0.5375543627316538, "fp16_scale": 512.0, "global_step": 3430, "grad_norm": 2.237, "learning_rate": 9.325283793217758e-06, "loss": 0.6793, "step": 3430 }, { "ETA": 13.96, "epoch": 0.5391215766171689, "fp16_scale": 512.0, "global_step": 3440, "grad_norm": 3.739, "learning_rate": 9.274639287134309e-06, "loss": 0.6925, "step": 3440 }, { "ETA": 13.91, "epoch": 0.5406887905026838, "fp16_scale": 512.0, "global_step": 3450, "grad_norm": 3.498, "learning_rate": 9.224013477202939e-06, "loss": 0.7115, "step": 3450 }, { "ETA": 13.86, "epoch": 0.5422560043881989, "fp16_scale": 512.0, "global_step": 3460, "grad_norm": 2.282, "learning_rate": 9.17340766830248e-06, "loss": 0.6965, "step": 3460 }, { "ETA": 13.81, "epoch": 0.5438232182737139, "fp16_scale": 512.0, "global_step": 3470, "grad_norm": 2.453, "learning_rate": 9.122823164796241e-06, "loss": 0.6978, "step": 3470 }, { "ETA": 13.76, "epoch": 0.5453904321592289, "fp16_scale": 512.0, "global_step": 3480, "grad_norm": 2.101, "learning_rate": 9.072261270498389e-06, "loss": 0.6903, "step": 3480 }, { "ETA": 13.71, "epoch": 0.546957646044744, "fp16_scale": 512.0, "global_step": 3490, "grad_norm": 2.317, "learning_rate": 9.02172328864033e-06, "loss": 0.6787, "step": 3490 }, { "ETA": 13.67, "epoch": 0.548524859930259, "fp16_scale": 512.0, "global_step": 3500, "grad_norm": 3.384, "learning_rate": 8.971210521837133e-06, "loss": 0.694, "step": 3500 }, { "ETA": 13.64, "epoch": 0.550092073815774, "fp16_scale": 512.0, "global_step": 3510, "grad_norm": 2.945, "learning_rate": 8.920724272053952e-06, "loss": 0.7047, "step": 3510 }, { "ETA": 13.6, "epoch": 0.551659287701289, "fp16_scale": 512.0, "global_step": 3520, "grad_norm": 2.045, "learning_rate": 8.870265840572459e-06, "loss": 0.6963, "step": 3520 }, { "ETA": 13.56, "epoch": 0.5532265015868041, "fp16_scale": 512.0, "global_step": 3530, "grad_norm": 3.728, "learning_rate": 8.819836527957317e-06, "loss": 0.7034, "step": 3530 }, { "ETA": 13.51, "epoch": 0.554793715472319, "fp16_scale": 512.0, "global_step": 3540, "grad_norm": 2.796, "learning_rate": 8.769437634022642e-06, "loss": 0.6766, "step": 3540 }, { "ETA": 13.47, "epoch": 0.5563609293578341, "fp16_scale": 512.0, "global_step": 3550, "grad_norm": 2.562, "learning_rate": 8.71907045779852e-06, "loss": 0.6392, "step": 3550 }, { "ETA": 13.42, "epoch": 0.5579281432433492, "fp16_scale": 512.0, "global_step": 3560, "grad_norm": 2.453, "learning_rate": 8.668736297497511e-06, "loss": 0.6982, "step": 3560 }, { "ETA": 13.37, "epoch": 0.5594953571288641, "fp16_scale": 512.0, "global_step": 3570, "grad_norm": 2.169, "learning_rate": 8.618436450481182e-06, "loss": 0.6625, "step": 3570 }, { "ETA": 13.32, "epoch": 0.5610625710143792, "fp16_scale": 512.0, "global_step": 3580, "grad_norm": 1.961, "learning_rate": 8.568172213226684e-06, "loss": 0.6839, "step": 3580 }, { "ETA": 13.27, "epoch": 0.5626297848998942, "fp16_scale": 512.0, "global_step": 3590, "grad_norm": 2.06, "learning_rate": 8.517944881293327e-06, "loss": 0.6616, "step": 3590 }, { "ETA": 13.23, "epoch": 0.5641969987854092, "fp16_scale": 512.0, "global_step": 3600, "grad_norm": 2.749, "learning_rate": 8.467755749289186e-06, "loss": 0.7188, "step": 3600 }, { "ETA": 13.18, "epoch": 0.5657642126709242, "fp16_scale": 512.0, "global_step": 3610, "grad_norm": 2.239, "learning_rate": 8.417606110837738e-06, "loss": 0.6778, "step": 3610 }, { "ETA": 13.13, "epoch": 0.5673314265564393, "fp16_scale": 512.0, "global_step": 3620, "grad_norm": 2.227, "learning_rate": 8.367497258544507e-06, "loss": 0.6858, "step": 3620 }, { "ETA": 13.08, "epoch": 0.5688986404419543, "fp16_scale": 512.0, "global_step": 3630, "grad_norm": 2.627, "learning_rate": 8.317430483963758e-06, "loss": 0.6977, "step": 3630 }, { "ETA": 13.03, "epoch": 0.5704658543274693, "fp16_scale": 512.0, "global_step": 3640, "grad_norm": 2.336, "learning_rate": 8.267407077565206e-06, "loss": 0.7021, "step": 3640 }, { "ETA": 12.98, "epoch": 0.5720330682129844, "fp16_scale": 1024.0, "global_step": 3650, "grad_norm": 2.043, "learning_rate": 8.217428328700754e-06, "loss": 0.6843, "step": 3650 }, { "ETA": 12.94, "epoch": 0.5736002820984994, "fp16_scale": 1024.0, "global_step": 3660, "grad_norm": 2.503, "learning_rate": 8.167495525571249e-06, "loss": 0.6849, "step": 3660 }, { "ETA": 12.89, "epoch": 0.5751674959840144, "fp16_scale": 1024.0, "global_step": 3670, "grad_norm": 8.048, "learning_rate": 8.117609955193301e-06, "loss": 0.7003, "step": 3670 }, { "ETA": 12.84, "epoch": 0.5767347098695295, "fp16_scale": 1024.0, "global_step": 3680, "grad_norm": 2.675, "learning_rate": 8.067772903366087e-06, "loss": 0.6441, "step": 3680 }, { "ETA": 12.79, "epoch": 0.5783019237550445, "fp16_scale": 1024.0, "global_step": 3690, "grad_norm": 2.125, "learning_rate": 8.017985654638227e-06, "loss": 0.6881, "step": 3690 }, { "ETA": 12.74, "epoch": 0.5798691376405595, "fp16_scale": 1024.0, "global_step": 3700, "grad_norm": 2.286, "learning_rate": 7.96824949227466e-06, "loss": 0.6574, "step": 3700 }, { "ETA": 12.69, "epoch": 0.5814363515260745, "fp16_scale": 1024.0, "global_step": 3710, "grad_norm": 2.344, "learning_rate": 7.91856569822358e-06, "loss": 0.6805, "step": 3710 }, { "ETA": 12.65, "epoch": 0.5830035654115896, "fp16_scale": 1024.0, "global_step": 3720, "grad_norm": 2.325, "learning_rate": 7.868935553083391e-06, "loss": 0.6865, "step": 3720 }, { "ETA": 12.6, "epoch": 0.5845707792971045, "fp16_scale": 1024.0, "global_step": 3730, "grad_norm": 3.179, "learning_rate": 7.819360336069692e-06, "loss": 0.6936, "step": 3730 }, { "ETA": 12.55, "epoch": 0.5861379931826196, "fp16_scale": 1024.0, "global_step": 3740, "grad_norm": 2.321, "learning_rate": 7.769841324982315e-06, "loss": 0.6764, "step": 3740 }, { "ETA": 12.5, "epoch": 0.5877052070681347, "fp16_scale": 1024.0, "global_step": 3750, "grad_norm": 2.531, "learning_rate": 7.720379796172389e-06, "loss": 0.6855, "step": 3750 }, { "ETA": 12.45, "epoch": 0.5892724209536496, "fp16_scale": 1024.0, "global_step": 3760, "grad_norm": 2.386, "learning_rate": 7.670977024509432e-06, "loss": 0.6872, "step": 3760 }, { "ETA": 12.41, "epoch": 0.5908396348391647, "fp16_scale": 1024.0, "global_step": 3770, "grad_norm": 2.79, "learning_rate": 7.6216342833485e-06, "loss": 0.6927, "step": 3770 }, { "ETA": 12.36, "epoch": 0.5924068487246797, "fp16_scale": 1024.0, "global_step": 3780, "grad_norm": 2.2, "learning_rate": 7.572352844497369e-06, "loss": 0.6648, "step": 3780 }, { "ETA": 12.31, "epoch": 0.5939740626101947, "fp16_scale": 1024.0, "global_step": 3790, "grad_norm": 2.125, "learning_rate": 7.523133978183741e-06, "loss": 0.6809, "step": 3790 }, { "ETA": 12.26, "epoch": 0.5955412764957098, "fp16_scale": 1024.0, "global_step": 3800, "grad_norm": 2.236, "learning_rate": 7.4739789530225244e-06, "loss": 0.6812, "step": 3800 }, { "ETA": 12.21, "epoch": 0.5971084903812248, "fp16_scale": 1024.0, "global_step": 3810, "grad_norm": 1.985, "learning_rate": 7.424889035983116e-06, "loss": 0.6879, "step": 3810 }, { "ETA": 12.16, "epoch": 0.5986757042667398, "fp16_scale": 1024.0, "global_step": 3820, "grad_norm": 3.346, "learning_rate": 7.375865492356756e-06, "loss": 0.6665, "step": 3820 }, { "ETA": 12.12, "epoch": 0.6002429181522548, "fp16_scale": 1024.0, "global_step": 3830, "grad_norm": 2.453, "learning_rate": 7.326909585723901e-06, "loss": 0.6788, "step": 3830 }, { "ETA": 12.07, "epoch": 0.6018101320377699, "fp16_scale": 1024.0, "global_step": 3840, "grad_norm": 2.547, "learning_rate": 7.2780225779216825e-06, "loss": 0.664, "step": 3840 }, { "ETA": 12.02, "epoch": 0.6033773459232848, "fp16_scale": 1024.0, "global_step": 3850, "grad_norm": 3.057, "learning_rate": 7.22920572901136e-06, "loss": 0.6709, "step": 3850 }, { "ETA": 11.97, "epoch": 0.6049445598087999, "fp16_scale": 1024.0, "global_step": 3860, "grad_norm": 2.438, "learning_rate": 7.180460297245841e-06, "loss": 0.668, "step": 3860 }, { "ETA": 11.92, "epoch": 0.606511773694315, "fp16_scale": 1024.0, "global_step": 3870, "grad_norm": 2.914, "learning_rate": 7.131787539037269e-06, "loss": 0.6619, "step": 3870 }, { "ETA": 11.88, "epoch": 0.6080789875798299, "fp16_scale": 1024.0, "global_step": 3880, "grad_norm": 3.034, "learning_rate": 7.083188708924624e-06, "loss": 0.6824, "step": 3880 }, { "ETA": 11.83, "epoch": 0.609646201465345, "fp16_scale": 1024.0, "global_step": 3890, "grad_norm": 2.359, "learning_rate": 7.034665059541398e-06, "loss": 0.665, "step": 3890 }, { "ETA": 11.78, "epoch": 0.61121341535086, "fp16_scale": 1024.0, "global_step": 3900, "grad_norm": 2.126, "learning_rate": 6.98621784158329e-06, "loss": 0.6999, "step": 3900 }, { "ETA": 11.73, "epoch": 0.612780629236375, "fp16_scale": 1024.0, "global_step": 3910, "grad_norm": 2.697, "learning_rate": 6.937848303775986e-06, "loss": 0.664, "step": 3910 }, { "ETA": 11.68, "epoch": 0.6143478431218901, "fp16_scale": 1024.0, "global_step": 3920, "grad_norm": 3.055, "learning_rate": 6.889557692842976e-06, "loss": 0.6968, "step": 3920 }, { "ETA": 11.64, "epoch": 0.6159150570074051, "fp16_scale": 1024.0, "global_step": 3930, "grad_norm": 2.679, "learning_rate": 6.8413472534734e-06, "loss": 0.6734, "step": 3930 }, { "ETA": 11.59, "epoch": 0.6174822708929201, "fp16_scale": 1024.0, "global_step": 3940, "grad_norm": 2.791, "learning_rate": 6.793218228289986e-06, "loss": 0.6812, "step": 3940 }, { "ETA": 11.54, "epoch": 0.6190494847784351, "fp16_scale": 1024.0, "global_step": 3950, "grad_norm": 2.579, "learning_rate": 6.745171857817011e-06, "loss": 0.663, "step": 3950 }, { "ETA": 11.49, "epoch": 0.6206166986639502, "fp16_scale": 1024.0, "global_step": 3960, "grad_norm": 2.878, "learning_rate": 6.697209380448333e-06, "loss": 0.6792, "step": 3960 }, { "ETA": 11.44, "epoch": 0.6221839125494651, "fp16_scale": 1024.0, "global_step": 3970, "grad_norm": 2.977, "learning_rate": 6.649332032415459e-06, "loss": 0.6796, "step": 3970 }, { "ETA": 11.39, "epoch": 0.6237511264349802, "fp16_scale": 1024.0, "global_step": 3980, "grad_norm": 2.891, "learning_rate": 6.601541047755693e-06, "loss": 0.6659, "step": 3980 }, { "ETA": 11.35, "epoch": 0.6253183403204953, "fp16_scale": 1024.0, "global_step": 3990, "grad_norm": 3.476, "learning_rate": 6.5538376582803356e-06, "loss": 0.694, "step": 3990 }, { "ETA": 11.3, "epoch": 0.6268855542060102, "fp16_scale": 1024.0, "global_step": 4000, "grad_norm": 2.474, "learning_rate": 6.50622309354291e-06, "loss": 0.6885, "step": 4000 }, { "ETA": 11.27, "epoch": 0.6284527680915253, "fp16_scale": 1024.0, "global_step": 4010, "grad_norm": 1.907, "learning_rate": 6.458698580807495e-06, "loss": 0.6842, "step": 4010 }, { "ETA": 11.22, "epoch": 0.6300199819770403, "fp16_scale": 1024.0, "global_step": 4020, "grad_norm": 3.952, "learning_rate": 6.41126534501707e-06, "loss": 0.694, "step": 4020 }, { "ETA": 11.17, "epoch": 0.6315871958625553, "fp16_scale": 1024.0, "global_step": 4030, "grad_norm": 1.945, "learning_rate": 6.363924608761965e-06, "loss": 0.6571, "step": 4030 }, { "ETA": 11.12, "epoch": 0.6331544097480704, "fp16_scale": 1024.0, "global_step": 4040, "grad_norm": 3.677, "learning_rate": 6.316677592248331e-06, "loss": 0.6579, "step": 4040 }, { "ETA": 11.08, "epoch": 0.6347216236335854, "fp16_scale": 1024.0, "global_step": 4050, "grad_norm": 2.402, "learning_rate": 6.269525513266699e-06, "loss": 0.6634, "step": 4050 }, { "ETA": 11.03, "epoch": 0.6362888375191004, "fp16_scale": 1024.0, "global_step": 4060, "grad_norm": 2.67, "learning_rate": 6.22246958716058e-06, "loss": 0.6879, "step": 4060 }, { "ETA": 10.98, "epoch": 0.6378560514046154, "fp16_scale": 1024.0, "global_step": 4070, "grad_norm": 2.808, "learning_rate": 6.175511026795156e-06, "loss": 0.6498, "step": 4070 }, { "ETA": 10.94, "epoch": 0.6394232652901305, "fp16_scale": 1024.0, "global_step": 4080, "grad_norm": 2.579, "learning_rate": 6.128651042526006e-06, "loss": 0.6925, "step": 4080 }, { "ETA": 10.89, "epoch": 0.6409904791756454, "fp16_scale": 1024.0, "global_step": 4090, "grad_norm": 2.52, "learning_rate": 6.081890842167916e-06, "loss": 0.6746, "step": 4090 }, { "ETA": 10.85, "epoch": 0.6425576930611605, "fp16_scale": 1024.0, "global_step": 4100, "grad_norm": 1.858, "learning_rate": 6.035231630963737e-06, "loss": 0.6818, "step": 4100 }, { "ETA": 10.8, "epoch": 0.6441249069466756, "fp16_scale": 1024.0, "global_step": 4110, "grad_norm": 2.397, "learning_rate": 5.9886746115533356e-06, "loss": 0.6525, "step": 4110 }, { "ETA": 10.75, "epoch": 0.6456921208321905, "fp16_scale": 1024.0, "global_step": 4120, "grad_norm": 2.411, "learning_rate": 5.942220983942586e-06, "loss": 0.6718, "step": 4120 }, { "ETA": 10.71, "epoch": 0.6472593347177056, "fp16_scale": 1024.0, "global_step": 4130, "grad_norm": 2.88, "learning_rate": 5.895871945472434e-06, "loss": 0.6455, "step": 4130 }, { "ETA": 10.66, "epoch": 0.6488265486032206, "fp16_scale": 1024.0, "global_step": 4140, "grad_norm": 2.438, "learning_rate": 5.849628690788054e-06, "loss": 0.6786, "step": 4140 }, { "ETA": 10.61, "epoch": 0.6503937624887357, "fp16_scale": 1024.0, "global_step": 4150, "grad_norm": 1.985, "learning_rate": 5.803492411808042e-06, "loss": 0.6437, "step": 4150 }, { "ETA": 10.56, "epoch": 0.6519609763742507, "fp16_scale": 1024.0, "global_step": 4160, "grad_norm": 2.398, "learning_rate": 5.7574642976936945e-06, "loss": 0.6379, "step": 4160 }, { "ETA": 10.51, "epoch": 0.6535281902597657, "fp16_scale": 1024.0, "global_step": 4170, "grad_norm": 2.936, "learning_rate": 5.711545534818368e-06, "loss": 0.6781, "step": 4170 }, { "ETA": 10.46, "epoch": 0.6550954041452808, "fp16_scale": 1024.0, "global_step": 4180, "grad_norm": 2.284, "learning_rate": 5.665737306736889e-06, "loss": 0.6458, "step": 4180 }, { "ETA": 10.42, "epoch": 0.6566626180307957, "fp16_scale": 1024.0, "global_step": 4190, "grad_norm": 2.256, "learning_rate": 5.620040794155055e-06, "loss": 0.6894, "step": 4190 }, { "ETA": 10.37, "epoch": 0.6582298319163108, "fp16_scale": 1024.0, "global_step": 4200, "grad_norm": 1.972, "learning_rate": 5.5744571748992005e-06, "loss": 0.6468, "step": 4200 }, { "ETA": 10.32, "epoch": 0.6597970458018259, "fp16_scale": 1024.0, "global_step": 4210, "grad_norm": 2.206, "learning_rate": 5.528987623885834e-06, "loss": 0.6404, "step": 4210 }, { "ETA": 10.27, "epoch": 0.6613642596873408, "fp16_scale": 1024.0, "global_step": 4220, "grad_norm": 1.952, "learning_rate": 5.483633313091363e-06, "loss": 0.6462, "step": 4220 }, { "ETA": 10.22, "epoch": 0.6629314735728559, "fp16_scale": 1024.0, "global_step": 4230, "grad_norm": 2.58, "learning_rate": 5.4383954115218814e-06, "loss": 0.6463, "step": 4230 }, { "ETA": 10.18, "epoch": 0.6644986874583709, "fp16_scale": 1024.0, "global_step": 4240, "grad_norm": 1.728, "learning_rate": 5.393275085183029e-06, "loss": 0.6645, "step": 4240 }, { "ETA": 10.13, "epoch": 0.6660659013438859, "fp16_scale": 1024.0, "global_step": 4250, "grad_norm": 2.223, "learning_rate": 5.3482734970499605e-06, "loss": 0.6531, "step": 4250 }, { "ETA": 10.08, "epoch": 0.6676331152294009, "fp16_scale": 1024.0, "global_step": 4260, "grad_norm": 2.054, "learning_rate": 5.303391807037348e-06, "loss": 0.6768, "step": 4260 }, { "ETA": 10.03, "epoch": 0.669200329114916, "fp16_scale": 1024.0, "global_step": 4270, "grad_norm": 2.35, "learning_rate": 5.2586311719694974e-06, "loss": 0.6359, "step": 4270 }, { "ETA": 9.98, "epoch": 0.670767543000431, "fp16_scale": 1024.0, "global_step": 4280, "grad_norm": 3.027, "learning_rate": 5.213992745550524e-06, "loss": 0.6412, "step": 4280 }, { "ETA": 9.94, "epoch": 0.672334756885946, "fp16_scale": 1024.0, "global_step": 4290, "grad_norm": 2.138, "learning_rate": 5.16947767833462e-06, "loss": 0.6668, "step": 4290 }, { "ETA": 9.89, "epoch": 0.6739019707714611, "fp16_scale": 1024.0, "global_step": 4300, "grad_norm": 2.281, "learning_rate": 5.125087117696403e-06, "loss": 0.6533, "step": 4300 }, { "ETA": 9.84, "epoch": 0.675469184656976, "fp16_scale": 1024.0, "global_step": 4310, "grad_norm": 2.512, "learning_rate": 5.0808222078013214e-06, "loss": 0.6543, "step": 4310 }, { "ETA": 9.79, "epoch": 0.6770363985424911, "fp16_scale": 1024.0, "global_step": 4320, "grad_norm": 2.208, "learning_rate": 5.036684089576195e-06, "loss": 0.6574, "step": 4320 }, { "ETA": 9.74, "epoch": 0.6786036124280062, "fp16_scale": 1024.0, "global_step": 4330, "grad_norm": 2.38, "learning_rate": 4.992673900679784e-06, "loss": 0.6942, "step": 4330 }, { "ETA": 9.7, "epoch": 0.6801708263135211, "fp16_scale": 1024.0, "global_step": 4340, "grad_norm": 1.937, "learning_rate": 4.9487927754734775e-06, "loss": 0.67, "step": 4340 }, { "ETA": 9.65, "epoch": 0.6817380401990362, "fp16_scale": 1024.0, "global_step": 4350, "grad_norm": 2.332, "learning_rate": 4.90504184499205e-06, "loss": 0.6347, "step": 4350 }, { "ETA": 9.6, "epoch": 0.6833052540845512, "fp16_scale": 1024.0, "global_step": 4360, "grad_norm": 3.316, "learning_rate": 4.861422236914509e-06, "loss": 0.6759, "step": 4360 }, { "ETA": 9.55, "epoch": 0.6848724679700662, "fp16_scale": 1024.0, "global_step": 4370, "grad_norm": 1.95, "learning_rate": 4.817935075535033e-06, "loss": 0.6925, "step": 4370 }, { "ETA": 9.5, "epoch": 0.6864396818555812, "fp16_scale": 1024.0, "global_step": 4380, "grad_norm": 2.019, "learning_rate": 4.77458148173399e-06, "loss": 0.6656, "step": 4380 }, { "ETA": 9.46, "epoch": 0.6880068957410963, "fp16_scale": 1024.0, "global_step": 4390, "grad_norm": 2.305, "learning_rate": 4.7313625729490465e-06, "loss": 0.6552, "step": 4390 }, { "ETA": 9.41, "epoch": 0.6895741096266113, "fp16_scale": 1024.0, "global_step": 4400, "grad_norm": 2.275, "learning_rate": 4.688279463146368e-06, "loss": 0.6474, "step": 4400 }, { "ETA": 9.36, "epoch": 0.6911413235121263, "fp16_scale": 1024.0, "global_step": 4410, "grad_norm": 2.844, "learning_rate": 4.645333262791904e-06, "loss": 0.6487, "step": 4410 }, { "ETA": 9.31, "epoch": 0.6927085373976414, "fp16_scale": 1024.0, "global_step": 4420, "grad_norm": 2.353, "learning_rate": 4.602525078822766e-06, "loss": 0.6221, "step": 4420 }, { "ETA": 9.26, "epoch": 0.6942757512831563, "fp16_scale": 1024.0, "global_step": 4430, "grad_norm": 2.665, "learning_rate": 4.559856014618703e-06, "loss": 0.6526, "step": 4430 }, { "ETA": 9.22, "epoch": 0.6958429651686714, "fp16_scale": 1024.0, "global_step": 4440, "grad_norm": 2.108, "learning_rate": 4.517327169973643e-06, "loss": 0.6532, "step": 4440 }, { "ETA": 9.17, "epoch": 0.6974101790541865, "fp16_scale": 1024.0, "global_step": 4450, "grad_norm": 2.701, "learning_rate": 4.474939641067371e-06, "loss": 0.6335, "step": 4450 }, { "ETA": 9.12, "epoch": 0.6989773929397014, "fp16_scale": 1024.0, "global_step": 4460, "grad_norm": 2.679, "learning_rate": 4.432694520437257e-06, "loss": 0.6579, "step": 4460 }, { "ETA": 9.07, "epoch": 0.7005446068252165, "fp16_scale": 1024.0, "global_step": 4470, "grad_norm": 1.97, "learning_rate": 4.3905928969501054e-06, "loss": 0.6379, "step": 4470 }, { "ETA": 9.02, "epoch": 0.7021118207107315, "fp16_scale": 1024.0, "global_step": 4480, "grad_norm": 3.684, "learning_rate": 4.348635855774082e-06, "loss": 0.6785, "step": 4480 }, { "ETA": 8.98, "epoch": 0.7036790345962465, "fp16_scale": 1024.0, "global_step": 4490, "grad_norm": 2.484, "learning_rate": 4.30682447835075e-06, "loss": 0.6561, "step": 4490 }, { "ETA": 8.93, "epoch": 0.7052462484817615, "fp16_scale": 1024.0, "global_step": 4500, "grad_norm": 3.453, "learning_rate": 4.265159842367195e-06, "loss": 0.6366, "step": 4500 }, { "ETA": 8.89, "epoch": 0.7068134623672766, "fp16_scale": 1024.0, "global_step": 4510, "grad_norm": 2.174, "learning_rate": 4.223643021728237e-06, "loss": 0.6589, "step": 4510 }, { "ETA": 8.84, "epoch": 0.7083806762527916, "fp16_scale": 1024.0, "global_step": 4520, "grad_norm": 2.416, "learning_rate": 4.182275086528771e-06, "loss": 0.6656, "step": 4520 }, { "ETA": 8.8, "epoch": 0.7099478901383066, "fp16_scale": 1024.0, "global_step": 4530, "grad_norm": 1.731, "learning_rate": 4.141057103026168e-06, "loss": 0.6437, "step": 4530 }, { "ETA": 8.75, "epoch": 0.7115151040238217, "fp16_scale": 1024.0, "global_step": 4540, "grad_norm": 2.158, "learning_rate": 4.0999901336128015e-06, "loss": 0.6332, "step": 4540 }, { "ETA": 8.7, "epoch": 0.7130823179093366, "fp16_scale": 1024.0, "global_step": 4550, "grad_norm": 2.984, "learning_rate": 4.059075236788656e-06, "loss": 0.6338, "step": 4550 }, { "ETA": 8.65, "epoch": 0.7146495317948517, "fp16_scale": 1024.0, "global_step": 4560, "grad_norm": 2.391, "learning_rate": 4.018313467134054e-06, "loss": 0.643, "step": 4560 }, { "ETA": 8.61, "epoch": 0.7162167456803668, "fp16_scale": 1024.0, "global_step": 4570, "grad_norm": 2.143, "learning_rate": 3.977705875282468e-06, "loss": 0.6483, "step": 4570 }, { "ETA": 8.56, "epoch": 0.7177839595658817, "fp16_scale": 1024.0, "global_step": 4580, "grad_norm": 2.292, "learning_rate": 3.9372535078934415e-06, "loss": 0.6624, "step": 4580 }, { "ETA": 8.51, "epoch": 0.7193511734513968, "fp16_scale": 1024.0, "global_step": 4590, "grad_norm": 2.035, "learning_rate": 3.896957407625612e-06, "loss": 0.6628, "step": 4590 }, { "ETA": 8.46, "epoch": 0.7209183873369118, "fp16_scale": 1024.0, "global_step": 4600, "grad_norm": 2.281, "learning_rate": 3.856818613109836e-06, "loss": 0.6493, "step": 4600 }, { "ETA": 8.41, "epoch": 0.7224856012224268, "fp16_scale": 1024.0, "global_step": 4610, "grad_norm": 2.059, "learning_rate": 3.81683815892242e-06, "loss": 0.6674, "step": 4610 }, { "ETA": 8.37, "epoch": 0.7240528151079418, "fp16_scale": 1024.0, "global_step": 4620, "grad_norm": 2.233, "learning_rate": 3.7770170755584524e-06, "loss": 0.6551, "step": 4620 }, { "ETA": 8.32, "epoch": 0.7256200289934569, "fp16_scale": 1024.0, "global_step": 4630, "grad_norm": 2.489, "learning_rate": 3.7373563894052444e-06, "loss": 0.6574, "step": 4630 }, { "ETA": 8.28, "epoch": 0.727187242878972, "fp16_scale": 1024.0, "global_step": 4640, "grad_norm": 2.365, "learning_rate": 3.6978571227158655e-06, "loss": 0.6672, "step": 4640 }, { "ETA": 8.23, "epoch": 0.7287544567644869, "fp16_scale": 2048.0, "global_step": 4650, "grad_norm": 2.451, "learning_rate": 3.6585202935828133e-06, "loss": 0.6598, "step": 4650 }, { "ETA": 8.18, "epoch": 0.730321670650002, "fp16_scale": 2048.0, "global_step": 4660, "grad_norm": 2.212, "learning_rate": 3.619346915911759e-06, "loss": 0.6387, "step": 4660 }, { "ETA": 8.13, "epoch": 0.731888884535517, "fp16_scale": 2048.0, "global_step": 4670, "grad_norm": 2.04, "learning_rate": 3.5803379993954146e-06, "loss": 0.6797, "step": 4670 }, { "ETA": 8.09, "epoch": 0.733456098421032, "fp16_scale": 2048.0, "global_step": 4680, "grad_norm": 2.082, "learning_rate": 3.541494549487512e-06, "loss": 0.6459, "step": 4680 }, { "ETA": 8.04, "epoch": 0.735023312306547, "fp16_scale": 2048.0, "global_step": 4690, "grad_norm": 2.181, "learning_rate": 3.502817567376886e-06, "loss": 0.6517, "step": 4690 }, { "ETA": 7.99, "epoch": 0.7365905261920621, "fp16_scale": 2048.0, "global_step": 4700, "grad_norm": 1.914, "learning_rate": 3.464308049961671e-06, "loss": 0.6311, "step": 4700 }, { "ETA": 7.94, "epoch": 0.7381577400775771, "fp16_scale": 2048.0, "global_step": 4710, "grad_norm": 2.169, "learning_rate": 3.425966989823596e-06, "loss": 0.6503, "step": 4710 }, { "ETA": 7.89, "epoch": 0.7397249539630921, "fp16_scale": 2048.0, "global_step": 4720, "grad_norm": 1.835, "learning_rate": 3.387795375202414e-06, "loss": 0.6289, "step": 4720 }, { "ETA": 7.85, "epoch": 0.7412921678486072, "fp16_scale": 2048.0, "global_step": 4730, "grad_norm": 2.463, "learning_rate": 3.3497941899704267e-06, "loss": 0.6407, "step": 4730 }, { "ETA": 7.8, "epoch": 0.7428593817341221, "fp16_scale": 2048.0, "global_step": 4740, "grad_norm": 3.038, "learning_rate": 3.311964413607117e-06, "loss": 0.6588, "step": 4740 }, { "ETA": 7.75, "epoch": 0.7444265956196372, "fp16_scale": 2048.0, "global_step": 4750, "grad_norm": 2.629, "learning_rate": 3.274307021173911e-06, "loss": 0.624, "step": 4750 }, { "ETA": 7.7, "epoch": 0.7459938095051523, "fp16_scale": 2048.0, "global_step": 4760, "grad_norm": 2.266, "learning_rate": 3.2368229832890565e-06, "loss": 0.6704, "step": 4760 }, { "ETA": 7.66, "epoch": 0.7475610233906672, "fp16_scale": 2048.0, "global_step": 4770, "grad_norm": 2.528, "learning_rate": 3.1995132661025673e-06, "loss": 0.6352, "step": 4770 }, { "ETA": 7.61, "epoch": 0.7491282372761823, "fp16_scale": 2048.0, "global_step": 4780, "grad_norm": 2.439, "learning_rate": 3.1623788312713656e-06, "loss": 0.6619, "step": 4780 }, { "ETA": 7.56, "epoch": 0.7506954511616973, "fp16_scale": 2048.0, "global_step": 4790, "grad_norm": 2.292, "learning_rate": 3.12542063593447e-06, "loss": 0.6274, "step": 4790 }, { "ETA": 7.51, "epoch": 0.7522626650472123, "fp16_scale": 2048.0, "global_step": 4800, "grad_norm": 3.361, "learning_rate": 3.0886396326883305e-06, "loss": 0.6196, "step": 4800 }, { "ETA": 7.46, "epoch": 0.7538298789327273, "fp16_scale": 2048.0, "global_step": 4810, "grad_norm": 10.937, "learning_rate": 3.0520367695622766e-06, "loss": 0.6547, "step": 4810 }, { "ETA": 7.42, "epoch": 0.7553970928182424, "fp16_scale": 2048.0, "global_step": 4820, "grad_norm": 3.288, "learning_rate": 3.015612989994082e-06, "loss": 0.6345, "step": 4820 }, { "ETA": 7.37, "epoch": 0.7569643067037574, "fp16_scale": 2048.0, "global_step": 4830, "grad_norm": 2.648, "learning_rate": 2.97936923280565e-06, "loss": 0.6281, "step": 4830 }, { "ETA": 7.32, "epoch": 0.7585315205892724, "fp16_scale": 2048.0, "global_step": 4840, "grad_norm": 2.144, "learning_rate": 2.9433064321787996e-06, "loss": 0.6253, "step": 4840 }, { "ETA": 7.27, "epoch": 0.7600987344747875, "fp16_scale": 2048.0, "global_step": 4850, "grad_norm": 3.614, "learning_rate": 2.9074255176312115e-06, "loss": 0.6505, "step": 4850 }, { "ETA": 7.22, "epoch": 0.7616659483603024, "fp16_scale": 2048.0, "global_step": 4860, "grad_norm": 2.036, "learning_rate": 2.8717274139924556e-06, "loss": 0.6299, "step": 4860 }, { "ETA": 7.18, "epoch": 0.7632331622458175, "fp16_scale": 2048.0, "global_step": 4870, "grad_norm": 3.092, "learning_rate": 2.8362130413801524e-06, "loss": 0.6549, "step": 4870 }, { "ETA": 7.13, "epoch": 0.7648003761313326, "fp16_scale": 2048.0, "global_step": 4880, "grad_norm": 2.695, "learning_rate": 2.8008833151762636e-06, "loss": 0.6202, "step": 4880 }, { "ETA": 7.08, "epoch": 0.7663675900168475, "fp16_scale": 2048.0, "global_step": 4890, "grad_norm": 2.407, "learning_rate": 2.765739146003493e-06, "loss": 0.6084, "step": 4890 }, { "ETA": 7.03, "epoch": 0.7679348039023626, "fp16_scale": 2048.0, "global_step": 4900, "grad_norm": 2.132, "learning_rate": 2.7307814397018217e-06, "loss": 0.6688, "step": 4900 }, { "ETA": 6.99, "epoch": 0.7695020177878776, "fp16_scale": 2048.0, "global_step": 4910, "grad_norm": 2.43, "learning_rate": 2.6960110973051445e-06, "loss": 0.6316, "step": 4910 }, { "ETA": 6.94, "epoch": 0.7710692316733926, "fp16_scale": 2048.0, "global_step": 4920, "grad_norm": 2.563, "learning_rate": 2.6614290150180667e-06, "loss": 0.6349, "step": 4920 }, { "ETA": 6.89, "epoch": 0.7726364455589076, "fp16_scale": 2048.0, "global_step": 4930, "grad_norm": 2.146, "learning_rate": 2.6270360841927924e-06, "loss": 0.6226, "step": 4930 }, { "ETA": 6.84, "epoch": 0.7742036594444227, "fp16_scale": 2048.0, "global_step": 4940, "grad_norm": 2.0, "learning_rate": 2.592833191306151e-06, "loss": 0.6323, "step": 4940 }, { "ETA": 6.79, "epoch": 0.7757708733299377, "fp16_scale": 2048.0, "global_step": 4950, "grad_norm": 4.297, "learning_rate": 2.5588212179367487e-06, "loss": 0.6356, "step": 4950 }, { "ETA": 6.75, "epoch": 0.7773380872154527, "fp16_scale": 2048.0, "global_step": 4960, "grad_norm": 2.479, "learning_rate": 2.5250010407422485e-06, "loss": 0.6219, "step": 4960 }, { "ETA": 6.7, "epoch": 0.7789053011009678, "fp16_scale": 2048.0, "global_step": 4970, "grad_norm": 2.719, "learning_rate": 2.4913735314367715e-06, "loss": 0.6485, "step": 4970 }, { "ETA": 6.65, "epoch": 0.7804725149864827, "fp16_scale": 2048.0, "global_step": 4980, "grad_norm": 2.164, "learning_rate": 2.4579395567684284e-06, "loss": 0.646, "step": 4980 }, { "ETA": 6.6, "epoch": 0.7820397288719978, "fp16_scale": 2048.0, "global_step": 4990, "grad_norm": 2.672, "learning_rate": 2.4246999784969817e-06, "loss": 0.6229, "step": 4990 }, { "ETA": 6.56, "epoch": 0.7836069427575129, "fp16_scale": 2048.0, "global_step": 5000, "grad_norm": 2.134, "learning_rate": 2.3916556533716296e-06, "loss": 0.62, "step": 5000 }, { "ETA": 6.52, "epoch": 0.7851741566430278, "fp16_scale": 2048.0, "global_step": 5010, "grad_norm": 2.442, "learning_rate": 2.3588074331089296e-06, "loss": 0.6711, "step": 5010 }, { "ETA": 6.47, "epoch": 0.7867413705285429, "fp16_scale": 2048.0, "global_step": 5020, "grad_norm": 2.502, "learning_rate": 2.3261561643708387e-06, "loss": 0.635, "step": 5020 }, { "ETA": 6.42, "epoch": 0.7883085844140579, "fp16_scale": 2048.0, "global_step": 5030, "grad_norm": 2.263, "learning_rate": 2.293702688742898e-06, "loss": 0.619, "step": 5030 }, { "ETA": 6.37, "epoch": 0.7898757982995729, "fp16_scale": 2048.0, "global_step": 5040, "grad_norm": 2.295, "learning_rate": 2.2614478427125285e-06, "loss": 0.6496, "step": 5040 }, { "ETA": 6.32, "epoch": 0.791443012185088, "fp16_scale": 2048.0, "global_step": 5050, "grad_norm": 2.363, "learning_rate": 2.229392457647486e-06, "loss": 0.6396, "step": 5050 }, { "ETA": 6.28, "epoch": 0.793010226070603, "fp16_scale": 2048.0, "global_step": 5060, "grad_norm": 2.289, "learning_rate": 2.1975373597744242e-06, "loss": 0.6366, "step": 5060 }, { "ETA": 6.23, "epoch": 0.794577439956118, "fp16_scale": 2048.0, "global_step": 5070, "grad_norm": 2.387, "learning_rate": 2.165883370157601e-06, "loss": 0.6459, "step": 5070 }, { "ETA": 6.18, "epoch": 0.796144653841633, "fp16_scale": 2048.0, "global_step": 5080, "grad_norm": 2.12, "learning_rate": 2.1344313046777097e-06, "loss": 0.6552, "step": 5080 }, { "ETA": 6.13, "epoch": 0.7977118677271481, "fp16_scale": 2048.0, "global_step": 5090, "grad_norm": 1.951, "learning_rate": 2.1031819740108616e-06, "loss": 0.6027, "step": 5090 }, { "ETA": 6.09, "epoch": 0.799279081612663, "fp16_scale": 2048.0, "global_step": 5100, "grad_norm": 1.878, "learning_rate": 2.0721361836076814e-06, "loss": 0.644, "step": 5100 }, { "ETA": 6.04, "epoch": 0.8008462954981781, "fp16_scale": 2048.0, "global_step": 5110, "grad_norm": 2.002, "learning_rate": 2.041294733672542e-06, "loss": 0.6535, "step": 5110 }, { "ETA": 5.99, "epoch": 0.8024135093836932, "fp16_scale": 2048.0, "global_step": 5120, "grad_norm": 2.605, "learning_rate": 2.0106584191429556e-06, "loss": 0.658, "step": 5120 }, { "ETA": 5.94, "epoch": 0.8039807232692082, "fp16_scale": 2048.0, "global_step": 5130, "grad_norm": 3.932, "learning_rate": 1.9802280296690722e-06, "loss": 0.6403, "step": 5130 }, { "ETA": 5.89, "epoch": 0.8055479371547232, "fp16_scale": 2048.0, "global_step": 5140, "grad_norm": 1.911, "learning_rate": 1.9500043495933275e-06, "loss": 0.6264, "step": 5140 }, { "ETA": 5.85, "epoch": 0.8071151510402382, "fp16_scale": 2048.0, "global_step": 5150, "grad_norm": 2.622, "learning_rate": 1.919988157930236e-06, "loss": 0.6415, "step": 5150 }, { "ETA": 5.8, "epoch": 0.8086823649257533, "fp16_scale": 2048.0, "global_step": 5160, "grad_norm": 3.352, "learning_rate": 1.89018022834629e-06, "loss": 0.6679, "step": 5160 }, { "ETA": 5.75, "epoch": 0.8102495788112682, "fp16_scale": 2048.0, "global_step": 5170, "grad_norm": 2.792, "learning_rate": 1.8605813291400444e-06, "loss": 0.6317, "step": 5170 }, { "ETA": 5.71, "epoch": 0.8118167926967833, "fp16_scale": 2048.0, "global_step": 5180, "grad_norm": 2.251, "learning_rate": 1.8311922232222979e-06, "loss": 0.6364, "step": 5180 }, { "ETA": 5.66, "epoch": 0.8133840065822984, "fp16_scale": 2048.0, "global_step": 5190, "grad_norm": 2.3, "learning_rate": 1.8020136680964329e-06, "loss": 0.6312, "step": 5190 }, { "ETA": 5.61, "epoch": 0.8149512204678133, "fp16_scale": 2048.0, "global_step": 5200, "grad_norm": 2.282, "learning_rate": 1.7730464158388906e-06, "loss": 0.6105, "step": 5200 }, { "ETA": 5.56, "epoch": 0.8165184343533284, "fp16_scale": 2048.0, "global_step": 5210, "grad_norm": 2.271, "learning_rate": 1.7442912130797883e-06, "loss": 0.6232, "step": 5210 }, { "ETA": 5.52, "epoch": 0.8180856482388434, "fp16_scale": 2048.0, "global_step": 5220, "grad_norm": 2.368, "learning_rate": 1.715748800983672e-06, "loss": 0.6293, "step": 5220 }, { "ETA": 5.47, "epoch": 0.8196528621243584, "fp16_scale": 2048.0, "global_step": 5230, "grad_norm": 2.348, "learning_rate": 1.6874199152304182e-06, "loss": 0.6553, "step": 5230 }, { "ETA": 5.42, "epoch": 0.8212200760098735, "fp16_scale": 2048.0, "global_step": 5240, "grad_norm": 1.922, "learning_rate": 1.659305285996259e-06, "loss": 0.6178, "step": 5240 }, { "ETA": 5.37, "epoch": 0.8227872898953885, "fp16_scale": 2048.0, "global_step": 5250, "grad_norm": 2.16, "learning_rate": 1.6314056379349784e-06, "loss": 0.641, "step": 5250 }, { "ETA": 5.33, "epoch": 0.8243545037809035, "fp16_scale": 2048.0, "global_step": 5260, "grad_norm": 1.997, "learning_rate": 1.6037216901592245e-06, "loss": 0.6361, "step": 5260 }, { "ETA": 5.28, "epoch": 0.8259217176664185, "fp16_scale": 2048.0, "global_step": 5270, "grad_norm": 2.167, "learning_rate": 1.5762541562219779e-06, "loss": 0.6505, "step": 5270 }, { "ETA": 5.23, "epoch": 0.8274889315519336, "fp16_scale": 2048.0, "global_step": 5280, "grad_norm": 2.206, "learning_rate": 1.5490037440981564e-06, "loss": 0.624, "step": 5280 }, { "ETA": 5.18, "epoch": 0.8290561454374485, "fp16_scale": 2048.0, "global_step": 5290, "grad_norm": 2.148, "learning_rate": 1.5219711561663697e-06, "loss": 0.6093, "step": 5290 }, { "ETA": 5.13, "epoch": 0.8306233593229636, "fp16_scale": 2048.0, "global_step": 5300, "grad_norm": 2.247, "learning_rate": 1.4951570891908174e-06, "loss": 0.6228, "step": 5300 }, { "ETA": 5.09, "epoch": 0.8321905732084787, "fp16_scale": 2048.0, "global_step": 5310, "grad_norm": 3.03, "learning_rate": 1.4685622343033223e-06, "loss": 0.6093, "step": 5310 }, { "ETA": 5.04, "epoch": 0.8337577870939936, "fp16_scale": 2048.0, "global_step": 5320, "grad_norm": 2.311, "learning_rate": 1.4421872769855262e-06, "loss": 0.6365, "step": 5320 }, { "ETA": 4.99, "epoch": 0.8353250009795087, "fp16_scale": 2048.0, "global_step": 5330, "grad_norm": 1.918, "learning_rate": 1.4160328970512149e-06, "loss": 0.6482, "step": 5330 }, { "ETA": 4.94, "epoch": 0.8368922148650237, "fp16_scale": 2048.0, "global_step": 5340, "grad_norm": 2.109, "learning_rate": 1.390099768628802e-06, "loss": 0.6297, "step": 5340 }, { "ETA": 4.9, "epoch": 0.8384594287505387, "fp16_scale": 2048.0, "global_step": 5350, "grad_norm": 2.198, "learning_rate": 1.3643885601439488e-06, "loss": 0.6396, "step": 5350 }, { "ETA": 4.85, "epoch": 0.8400266426360538, "fp16_scale": 2048.0, "global_step": 5360, "grad_norm": 2.682, "learning_rate": 1.3388999343023278e-06, "loss": 0.6397, "step": 5360 }, { "ETA": 4.8, "epoch": 0.8415938565215688, "fp16_scale": 2048.0, "global_step": 5370, "grad_norm": 2.607, "learning_rate": 1.3136345480725621e-06, "loss": 0.6441, "step": 5370 }, { "ETA": 4.75, "epoch": 0.8431610704070838, "fp16_scale": 2048.0, "global_step": 5380, "grad_norm": 2.553, "learning_rate": 1.2885930526692736e-06, "loss": 0.659, "step": 5380 }, { "ETA": 4.71, "epoch": 0.8447282842925988, "fp16_scale": 2048.0, "global_step": 5390, "grad_norm": 2.285, "learning_rate": 1.2637760935363053e-06, "loss": 0.6154, "step": 5390 }, { "ETA": 4.66, "epoch": 0.8462954981781139, "fp16_scale": 2048.0, "global_step": 5400, "grad_norm": 2.19, "learning_rate": 1.2391843103300838e-06, "loss": 0.6418, "step": 5400 }, { "ETA": 4.61, "epoch": 0.8478627120636288, "fp16_scale": 2048.0, "global_step": 5410, "grad_norm": 2.141, "learning_rate": 1.214818336903134e-06, "loss": 0.6329, "step": 5410 }, { "ETA": 4.56, "epoch": 0.8494299259491439, "fp16_scale": 2048.0, "global_step": 5420, "grad_norm": 2.279, "learning_rate": 1.1906788012877423e-06, "loss": 0.6217, "step": 5420 }, { "ETA": 4.51, "epoch": 0.850997139834659, "fp16_scale": 2048.0, "global_step": 5430, "grad_norm": 2.33, "learning_rate": 1.1667663256797578e-06, "loss": 0.6205, "step": 5430 }, { "ETA": 4.47, "epoch": 0.8525643537201739, "fp16_scale": 2048.0, "global_step": 5440, "grad_norm": 2.216, "learning_rate": 1.1430815264225737e-06, "loss": 0.6635, "step": 5440 }, { "ETA": 4.42, "epoch": 0.854131567605689, "fp16_scale": 2048.0, "global_step": 5450, "grad_norm": 2.259, "learning_rate": 1.1196250139912268e-06, "loss": 0.6828, "step": 5450 }, { "ETA": 4.37, "epoch": 0.855698781491204, "fp16_scale": 2048.0, "global_step": 5460, "grad_norm": 2.969, "learning_rate": 1.096397392976669e-06, "loss": 0.6305, "step": 5460 }, { "ETA": 4.32, "epoch": 0.857265995376719, "fp16_scale": 2048.0, "global_step": 5470, "grad_norm": 2.021, "learning_rate": 1.073399262070184e-06, "loss": 0.6284, "step": 5470 }, { "ETA": 4.28, "epoch": 0.858833209262234, "fp16_scale": 2048.0, "global_step": 5480, "grad_norm": 2.051, "learning_rate": 1.0506312140479502e-06, "loss": 0.6396, "step": 5480 }, { "ETA": 4.23, "epoch": 0.8604004231477491, "fp16_scale": 2048.0, "global_step": 5490, "grad_norm": 2.332, "learning_rate": 1.028093835755769e-06, "loss": 0.6275, "step": 5490 }, { "ETA": 4.18, "epoch": 0.8619676370332641, "fp16_scale": 2048.0, "global_step": 5500, "grad_norm": 3.388, "learning_rate": 1.005787708093937e-06, "loss": 0.6016, "step": 5500 }, { "ETA": 4.14, "epoch": 0.8635348509187791, "fp16_scale": 2048.0, "global_step": 5510, "grad_norm": 2.186, "learning_rate": 9.837134060022668e-07, "loss": 0.6251, "step": 5510 }, { "ETA": 4.09, "epoch": 0.8651020648042942, "fp16_scale": 2048.0, "global_step": 5520, "grad_norm": 2.527, "learning_rate": 9.618714984452793e-07, "loss": 0.6447, "step": 5520 }, { "ETA": 4.04, "epoch": 0.8666692786898091, "fp16_scale": 2048.0, "global_step": 5530, "grad_norm": 3.872, "learning_rate": 9.40262548397528e-07, "loss": 0.6198, "step": 5530 }, { "ETA": 3.99, "epoch": 0.8682364925753242, "fp16_scale": 2048.0, "global_step": 5540, "grad_norm": 1.967, "learning_rate": 9.18887112829101e-07, "loss": 0.6496, "step": 5540 }, { "ETA": 3.95, "epoch": 0.8698037064608393, "fp16_scale": 2048.0, "global_step": 5550, "grad_norm": 3.834, "learning_rate": 8.977457426912517e-07, "loss": 0.6364, "step": 5550 }, { "ETA": 3.9, "epoch": 0.8713709203463542, "fp16_scale": 2048.0, "global_step": 5560, "grad_norm": 2.293, "learning_rate": 8.768389829021984e-07, "loss": 0.6268, "step": 5560 }, { "ETA": 3.85, "epoch": 0.8729381342318693, "fp16_scale": 2048.0, "global_step": 5570, "grad_norm": 2.842, "learning_rate": 8.561673723330932e-07, "loss": 0.6419, "step": 5570 }, { "ETA": 3.8, "epoch": 0.8745053481173843, "fp16_scale": 2048.0, "global_step": 5580, "grad_norm": 2.526, "learning_rate": 8.3573144379412e-07, "loss": 0.6225, "step": 5580 }, { "ETA": 3.76, "epoch": 0.8760725620028993, "fp16_scale": 2048.0, "global_step": 5590, "grad_norm": 2.434, "learning_rate": 8.15531724020765e-07, "loss": 0.6463, "step": 5590 }, { "ETA": 3.71, "epoch": 0.8776397758884144, "fp16_scale": 2048.0, "global_step": 5600, "grad_norm": 2.044, "learning_rate": 7.955687336602391e-07, "loss": 0.64, "step": 5600 }, { "ETA": 3.66, "epoch": 0.8792069897739294, "fp16_scale": 2048.0, "global_step": 5610, "grad_norm": 4.355, "learning_rate": 7.758429872580608e-07, "loss": 0.6305, "step": 5610 }, { "ETA": 3.61, "epoch": 0.8807742036594445, "fp16_scale": 2048.0, "global_step": 5620, "grad_norm": 2.274, "learning_rate": 7.563549932447944e-07, "loss": 0.6042, "step": 5620 }, { "ETA": 3.57, "epoch": 0.8823414175449594, "fp16_scale": 2048.0, "global_step": 5630, "grad_norm": 2.577, "learning_rate": 7.371052539229362e-07, "loss": 0.6229, "step": 5630 }, { "ETA": 3.52, "epoch": 0.8839086314304745, "fp16_scale": 2048.0, "global_step": 5640, "grad_norm": 2.202, "learning_rate": 7.180942654539802e-07, "loss": 0.6616, "step": 5640 }, { "ETA": 3.47, "epoch": 0.8854758453159896, "fp16_scale": 4096.0, "global_step": 5650, "grad_norm": 3.963, "learning_rate": 6.99322517845622e-07, "loss": 0.6466, "step": 5650 }, { "ETA": 3.42, "epoch": 0.8870430592015045, "fp16_scale": 4096.0, "global_step": 5660, "grad_norm": 2.242, "learning_rate": 6.807904949391319e-07, "loss": 0.6464, "step": 5660 }, { "ETA": 3.37, "epoch": 0.8886102730870196, "fp16_scale": 4096.0, "global_step": 5670, "grad_norm": 2.595, "learning_rate": 6.624986743968809e-07, "loss": 0.6401, "step": 5670 }, { "ETA": 3.33, "epoch": 0.8901774869725346, "fp16_scale": 4096.0, "global_step": 5680, "grad_norm": 2.086, "learning_rate": 6.44447527690032e-07, "loss": 0.6126, "step": 5680 }, { "ETA": 3.28, "epoch": 0.8917447008580496, "fp16_scale": 4096.0, "global_step": 5690, "grad_norm": 2.494, "learning_rate": 6.266375200863884e-07, "loss": 0.613, "step": 5690 }, { "ETA": 3.23, "epoch": 0.8933119147435646, "fp16_scale": 4096.0, "global_step": 5700, "grad_norm": 2.06, "learning_rate": 6.090691106383939e-07, "loss": 0.6201, "step": 5700 }, { "ETA": 3.18, "epoch": 0.8948791286290797, "fp16_scale": 4096.0, "global_step": 5710, "grad_norm": 2.088, "learning_rate": 5.917427521713115e-07, "loss": 0.6254, "step": 5710 }, { "ETA": 3.14, "epoch": 0.8964463425145947, "fp16_scale": 4096.0, "global_step": 5720, "grad_norm": 2.035, "learning_rate": 5.746588912715467e-07, "loss": 0.6323, "step": 5720 }, { "ETA": 3.09, "epoch": 0.8980135564001097, "fp16_scale": 4096.0, "global_step": 5730, "grad_norm": 2.48, "learning_rate": 5.578179682751372e-07, "loss": 0.6391, "step": 5730 }, { "ETA": 3.04, "epoch": 0.8995807702856248, "fp16_scale": 4096.0, "global_step": 5740, "grad_norm": 2.13, "learning_rate": 5.412204172564006e-07, "loss": 0.6491, "step": 5740 }, { "ETA": 3.0, "epoch": 0.9011479841711397, "fp16_scale": 4096.0, "global_step": 5750, "grad_norm": 2.237, "learning_rate": 5.248666660167523e-07, "loss": 0.623, "step": 5750 }, { "ETA": 2.95, "epoch": 0.9027151980566548, "fp16_scale": 4096.0, "global_step": 5760, "grad_norm": 2.283, "learning_rate": 5.087571360736698e-07, "loss": 0.6169, "step": 5760 }, { "ETA": 2.9, "epoch": 0.9042824119421699, "fp16_scale": 4096.0, "global_step": 5770, "grad_norm": 1.802, "learning_rate": 4.928922426498384e-07, "loss": 0.6286, "step": 5770 }, { "ETA": 2.85, "epoch": 0.9058496258276848, "fp16_scale": 4096.0, "global_step": 5780, "grad_norm": 2.055, "learning_rate": 4.772723946624414e-07, "loss": 0.6375, "step": 5780 }, { "ETA": 2.81, "epoch": 0.9074168397131999, "fp16_scale": 4096.0, "global_step": 5790, "grad_norm": 2.604, "learning_rate": 4.6189799471262343e-07, "loss": 0.6312, "step": 5790 }, { "ETA": 2.76, "epoch": 0.9089840535987149, "fp16_scale": 4096.0, "global_step": 5800, "grad_norm": 2.046, "learning_rate": 4.467694390751132e-07, "loss": 0.619, "step": 5800 }, { "ETA": 2.71, "epoch": 0.9105512674842299, "fp16_scale": 4096.0, "global_step": 5810, "grad_norm": 2.427, "learning_rate": 4.318871176880102e-07, "loss": 0.6094, "step": 5810 }, { "ETA": 2.66, "epoch": 0.9121184813697449, "fp16_scale": 4096.0, "global_step": 5820, "grad_norm": 2.188, "learning_rate": 4.1725141414273194e-07, "loss": 0.6301, "step": 5820 }, { "ETA": 2.62, "epoch": 0.91368569525526, "fp16_scale": 4096.0, "global_step": 5830, "grad_norm": 2.529, "learning_rate": 4.028627056741252e-07, "loss": 0.6323, "step": 5830 }, { "ETA": 2.57, "epoch": 0.915252909140775, "fp16_scale": 4096.0, "global_step": 5840, "grad_norm": 2.326, "learning_rate": 3.887213631507503e-07, "loss": 0.6415, "step": 5840 }, { "ETA": 2.52, "epoch": 0.91682012302629, "fp16_scale": 4096.0, "global_step": 5850, "grad_norm": 2.328, "learning_rate": 3.748277510653142e-07, "loss": 0.6457, "step": 5850 }, { "ETA": 2.47, "epoch": 0.9183873369118051, "fp16_scale": 4096.0, "global_step": 5860, "grad_norm": 4.232, "learning_rate": 3.6118222752527963e-07, "loss": 0.625, "step": 5860 }, { "ETA": 2.42, "epoch": 0.91995455079732, "fp16_scale": 4096.0, "global_step": 5870, "grad_norm": 2.001, "learning_rate": 3.477851442436342e-07, "loss": 0.6162, "step": 5870 }, { "ETA": 2.38, "epoch": 0.9215217646828351, "fp16_scale": 4096.0, "global_step": 5880, "grad_norm": 1.914, "learning_rate": 3.3463684652982555e-07, "loss": 0.6074, "step": 5880 }, { "ETA": 2.33, "epoch": 0.9230889785683501, "fp16_scale": 4096.0, "global_step": 5890, "grad_norm": 1.723, "learning_rate": 3.217376732808586e-07, "loss": 0.6085, "step": 5890 }, { "ETA": 2.28, "epoch": 0.9246561924538651, "fp16_scale": 4096.0, "global_step": 5900, "grad_norm": 2.135, "learning_rate": 3.0908795697256246e-07, "loss": 0.6303, "step": 5900 }, { "ETA": 2.23, "epoch": 0.9262234063393802, "fp16_scale": 4096.0, "global_step": 5910, "grad_norm": 2.142, "learning_rate": 2.9668802365102054e-07, "loss": 0.6107, "step": 5910 }, { "ETA": 2.19, "epoch": 0.9277906202248952, "fp16_scale": 4096.0, "global_step": 5920, "grad_norm": 2.271, "learning_rate": 2.8453819292416973e-07, "loss": 0.6003, "step": 5920 }, { "ETA": 2.14, "epoch": 0.9293578341104102, "fp16_scale": 4096.0, "global_step": 5930, "grad_norm": 2.507, "learning_rate": 2.7263877795355443e-07, "loss": 0.6426, "step": 5930 }, { "ETA": 2.09, "epoch": 0.9309250479959252, "fp16_scale": 4096.0, "global_step": 5940, "grad_norm": 2.27, "learning_rate": 2.6099008544626434e-07, "loss": 0.6432, "step": 5940 }, { "ETA": 2.04, "epoch": 0.9324922618814403, "fp16_scale": 4096.0, "global_step": 5950, "grad_norm": 1.876, "learning_rate": 2.495924156470231e-07, "loss": 0.5796, "step": 5950 }, { "ETA": 2.0, "epoch": 0.9340594757669552, "fp16_scale": 4096.0, "global_step": 5960, "grad_norm": 2.29, "learning_rate": 2.3844606233044855e-07, "loss": 0.6381, "step": 5960 }, { "ETA": 1.95, "epoch": 0.9356266896524703, "fp16_scale": 4096.0, "global_step": 5970, "grad_norm": 1.93, "learning_rate": 2.2755131279348807e-07, "loss": 0.6265, "step": 5970 }, { "ETA": 1.9, "epoch": 0.9371939035379854, "fp16_scale": 4096.0, "global_step": 5980, "grad_norm": 2.107, "learning_rate": 2.1690844784800437e-07, "loss": 0.6197, "step": 5980 }, { "ETA": 1.85, "epoch": 0.9387611174235003, "fp16_scale": 4096.0, "global_step": 5990, "grad_norm": 2.435, "learning_rate": 2.0651774181354445e-07, "loss": 0.6203, "step": 5990 }, { "ETA": 1.81, "epoch": 0.9403283313090154, "fp16_scale": 4096.0, "global_step": 6000, "grad_norm": 2.107, "learning_rate": 1.963794625102655e-07, "loss": 0.6072, "step": 6000 }, { "ETA": 1.76, "epoch": 0.9418955451945304, "fp16_scale": 4096.0, "global_step": 6010, "grad_norm": 2.7, "learning_rate": 1.8649387125203255e-07, "loss": 0.6501, "step": 6010 }, { "ETA": 1.71, "epoch": 0.9434627590800454, "fp16_scale": 4096.0, "global_step": 6020, "grad_norm": 2.167, "learning_rate": 1.7686122283968488e-07, "loss": 0.6264, "step": 6020 }, { "ETA": 1.66, "epoch": 0.9450299729655605, "fp16_scale": 4096.0, "global_step": 6030, "grad_norm": 2.122, "learning_rate": 1.674817655544647e-07, "loss": 0.6247, "step": 6030 }, { "ETA": 1.62, "epoch": 0.9465971868510755, "fp16_scale": 4096.0, "global_step": 6040, "grad_norm": 2.076, "learning_rate": 1.5835574115162123e-07, "loss": 0.6271, "step": 6040 }, { "ETA": 1.57, "epoch": 0.9481644007365905, "fp16_scale": 4096.0, "global_step": 6050, "grad_norm": 2.815, "learning_rate": 1.4948338485417768e-07, "loss": 0.6186, "step": 6050 }, { "ETA": 1.52, "epoch": 0.9497316146221055, "fp16_scale": 4096.0, "global_step": 6060, "grad_norm": 2.341, "learning_rate": 1.4086492534686968e-07, "loss": 0.6258, "step": 6060 }, { "ETA": 1.47, "epoch": 0.9512988285076206, "fp16_scale": 4096.0, "global_step": 6070, "grad_norm": 2.646, "learning_rate": 1.3250058477025095e-07, "loss": 0.6234, "step": 6070 }, { "ETA": 1.43, "epoch": 0.9528660423931355, "fp16_scale": 4096.0, "global_step": 6080, "grad_norm": 2.8, "learning_rate": 1.2439057871496463e-07, "loss": 0.6148, "step": 6080 }, { "ETA": 1.38, "epoch": 0.9544332562786506, "fp16_scale": 4096.0, "global_step": 6090, "grad_norm": 1.999, "learning_rate": 1.1653511621618985e-07, "loss": 0.5933, "step": 6090 }, { "ETA": 1.33, "epoch": 0.9560004701641657, "fp16_scale": 4096.0, "global_step": 6100, "grad_norm": 2.628, "learning_rate": 1.0893439974825393e-07, "loss": 0.6104, "step": 6100 }, { "ETA": 1.28, "epoch": 0.9575676840496807, "fp16_scale": 4096.0, "global_step": 6110, "grad_norm": 2.091, "learning_rate": 1.015886252194087e-07, "loss": 0.6291, "step": 6110 }, { "ETA": 1.24, "epoch": 0.9591348979351957, "fp16_scale": 4096.0, "global_step": 6120, "grad_norm": 2.404, "learning_rate": 9.449798196679016e-08, "loss": 0.6221, "step": 6120 }, { "ETA": 1.19, "epoch": 0.9607021118207107, "fp16_scale": 4096.0, "global_step": 6130, "grad_norm": 2.17, "learning_rate": 8.766265275152786e-08, "loss": 0.6213, "step": 6130 }, { "ETA": 1.14, "epoch": 0.9622693257062258, "fp16_scale": 4096.0, "global_step": 6140, "grad_norm": 1.804, "learning_rate": 8.108281375404093e-08, "loss": 0.6415, "step": 6140 }, { "ETA": 1.09, "epoch": 0.9638365395917408, "fp16_scale": 4096.0, "global_step": 6150, "grad_norm": 1.722, "learning_rate": 7.475863456949728e-08, "loss": 0.6276, "step": 6150 }, { "ETA": 1.05, "epoch": 0.9654037534772558, "fp16_scale": 4096.0, "global_step": 6160, "grad_norm": 1.697, "learning_rate": 6.869027820343598e-08, "loss": 0.6077, "step": 6160 }, { "ETA": 1.0, "epoch": 0.9669709673627709, "fp16_scale": 4096.0, "global_step": 6170, "grad_norm": 2.37, "learning_rate": 6.287790106757396e-08, "loss": 0.6046, "step": 6170 }, { "ETA": 0.95, "epoch": 0.9685381812482858, "fp16_scale": 4096.0, "global_step": 6180, "grad_norm": 2.879, "learning_rate": 5.7321652975769194e-08, "loss": 0.6358, "step": 6180 }, { "ETA": 0.9, "epoch": 0.9701053951338009, "fp16_scale": 4096.0, "global_step": 6190, "grad_norm": 2.486, "learning_rate": 5.20216771401616e-08, "loss": 0.6194, "step": 6190 }, { "ETA": 0.86, "epoch": 0.971672609019316, "fp16_scale": 4096.0, "global_step": 6200, "grad_norm": 2.44, "learning_rate": 4.697811016747711e-08, "loss": 0.6041, "step": 6200 }, { "ETA": 0.81, "epoch": 0.9732398229048309, "fp16_scale": 4096.0, "global_step": 6210, "grad_norm": 2.378, "learning_rate": 4.2191082055517143e-08, "loss": 0.6194, "step": 6210 }, { "ETA": 0.76, "epoch": 0.974807036790346, "fp16_scale": 4096.0, "global_step": 6220, "grad_norm": 2.339, "learning_rate": 3.766071618979461e-08, "loss": 0.6172, "step": 6220 }, { "ETA": 0.71, "epoch": 0.976374250675861, "fp16_scale": 4096.0, "global_step": 6230, "grad_norm": 2.287, "learning_rate": 3.338712934036425e-08, "loss": 0.6208, "step": 6230 }, { "ETA": 0.67, "epoch": 0.977941464561376, "fp16_scale": 4096.0, "global_step": 6240, "grad_norm": 2.324, "learning_rate": 2.9370431658806152e-08, "loss": 0.6339, "step": 6240 }, { "ETA": 0.62, "epoch": 0.979508678446891, "fp16_scale": 4096.0, "global_step": 6250, "grad_norm": 2.071, "learning_rate": 2.561072667539244e-08, "loss": 0.6279, "step": 6250 }, { "ETA": 0.57, "epoch": 0.9810758923324061, "fp16_scale": 4096.0, "global_step": 6260, "grad_norm": 2.58, "learning_rate": 2.2108111296412772e-08, "loss": 0.6292, "step": 6260 }, { "ETA": 0.52, "epoch": 0.9826431062179211, "fp16_scale": 4096.0, "global_step": 6270, "grad_norm": 2.091, "learning_rate": 1.8862675801681884e-08, "loss": 0.6033, "step": 6270 }, { "ETA": 0.48, "epoch": 0.9842103201034361, "fp16_scale": 4096.0, "global_step": 6280, "grad_norm": 2.559, "learning_rate": 1.58745038422059e-08, "loss": 0.635, "step": 6280 }, { "ETA": 0.43, "epoch": 0.9857775339889512, "fp16_scale": 4096.0, "global_step": 6290, "grad_norm": 2.044, "learning_rate": 1.3143672438037381e-08, "loss": 0.6041, "step": 6290 }, { "ETA": 0.38, "epoch": 0.9873447478744661, "fp16_scale": 4096.0, "global_step": 6300, "grad_norm": 2.064, "learning_rate": 1.0670251976275803e-08, "loss": 0.6467, "step": 6300 }, { "ETA": 0.33, "epoch": 0.9889119617599812, "fp16_scale": 4096.0, "global_step": 6310, "grad_norm": 2.35, "learning_rate": 8.454306209265684e-09, "loss": 0.6252, "step": 6310 }, { "ETA": 0.29, "epoch": 0.9904791756454963, "fp16_scale": 4096.0, "global_step": 6320, "grad_norm": 2.157, "learning_rate": 6.495892252947888e-09, "loss": 0.639, "step": 6320 }, { "ETA": 0.24, "epoch": 0.9920463895310112, "fp16_scale": 4096.0, "global_step": 6330, "grad_norm": 2.505, "learning_rate": 4.7950605853874785e-09, "loss": 0.6403, "step": 6330 }, { "ETA": 0.19, "epoch": 0.9936136034165263, "fp16_scale": 4096.0, "global_step": 6340, "grad_norm": 2.051, "learning_rate": 3.3518550454714195e-09, "loss": 0.6109, "step": 6340 }, { "ETA": 0.14, "epoch": 0.9951808173020413, "fp16_scale": 4096.0, "global_step": 6350, "grad_norm": 1.954, "learning_rate": 2.166312831783923e-09, "loss": 0.602, "step": 6350 }, { "ETA": 0.1, "epoch": 0.9967480311875563, "fp16_scale": 4096.0, "global_step": 6360, "grad_norm": 1.921, "learning_rate": 1.2384645016416674e-09, "loss": 0.6085, "step": 6360 }, { "ETA": 0.05, "epoch": 0.9983152450730713, "fp16_scale": 4096.0, "global_step": 6370, "grad_norm": 3.47, "learning_rate": 5.683339703088653e-10, "loss": 0.6294, "step": 6370 }, { "ETA": 0.0, "epoch": 0.9998824589585864, "fp16_scale": 4096.0, "global_step": 6380, "grad_norm": 2.934, "learning_rate": 1.5593851038109287e-10, "loss": 0.6272, "step": 6380 }, { "epoch": 0.9998824589585864, "step": 6380, "total_flos": 6434699981881344.0, "train_loss": 0.7134642638382866, "train_runtime": 109318.7053, "train_samples_per_second": 7.471, "train_steps_per_second": 0.058 } ], "logging_steps": 10, "max_steps": 6380, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6434699981881344.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }