{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4460093896713615, "eval_steps": 1000, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.4084507042253522e-07, "loss": 0.6105, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.4084507042253521e-06, "loss": 0.7205, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.8169014084507042e-06, "loss": 0.694, "step": 20 }, { "epoch": 0.01, "learning_rate": 4.225352112676057e-06, "loss": 0.6913, "step": 30 }, { "epoch": 0.02, "learning_rate": 5.6338028169014084e-06, "loss": 0.5948, "step": 40 }, { "epoch": 0.02, "learning_rate": 7.042253521126761e-06, "loss": 0.5666, "step": 50 }, { "epoch": 0.03, "learning_rate": 8.450704225352114e-06, "loss": 0.4994, "step": 60 }, { "epoch": 0.03, "learning_rate": 9.859154929577466e-06, "loss": 0.4599, "step": 70 }, { "epoch": 0.04, "learning_rate": 1.1267605633802817e-05, "loss": 0.4061, "step": 80 }, { "epoch": 0.04, "learning_rate": 1.267605633802817e-05, "loss": 0.4111, "step": 90 }, { "epoch": 0.05, "learning_rate": 1.4084507042253522e-05, "loss": 0.3984, "step": 100 }, { "epoch": 0.05, "learning_rate": 1.5492957746478876e-05, "loss": 0.3893, "step": 110 }, { "epoch": 0.06, "learning_rate": 1.6901408450704228e-05, "loss": 0.4009, "step": 120 }, { "epoch": 0.06, "learning_rate": 1.830985915492958e-05, "loss": 0.4021, "step": 130 }, { "epoch": 0.07, "learning_rate": 1.9718309859154933e-05, "loss": 0.4013, "step": 140 }, { "epoch": 0.07, "learning_rate": 2.112676056338028e-05, "loss": 0.3468, "step": 150 }, { "epoch": 0.08, "learning_rate": 2.2535211267605634e-05, "loss": 0.3506, "step": 160 }, { "epoch": 0.08, "learning_rate": 2.3943661971830986e-05, "loss": 0.3765, "step": 170 }, { "epoch": 0.08, "learning_rate": 2.535211267605634e-05, "loss": 0.3766, "step": 180 }, { "epoch": 0.09, "learning_rate": 2.676056338028169e-05, "loss": 0.3583, "step": 190 }, { "epoch": 0.09, "learning_rate": 2.8169014084507043e-05, "loss": 0.3613, "step": 200 }, { "epoch": 0.1, "learning_rate": 2.9577464788732395e-05, "loss": 0.3604, "step": 210 }, { "epoch": 0.1, "learning_rate": 2.9999778542898527e-05, "loss": 0.3396, "step": 220 }, { "epoch": 0.11, "learning_rate": 2.9998693870796316e-05, "loss": 0.3655, "step": 230 }, { "epoch": 0.11, "learning_rate": 2.9996705373180166e-05, "loss": 0.3373, "step": 240 }, { "epoch": 0.12, "learning_rate": 2.9993813169877495e-05, "loss": 0.3397, "step": 250 }, { "epoch": 0.12, "learning_rate": 2.9990017435173293e-05, "loss": 0.3497, "step": 260 }, { "epoch": 0.13, "learning_rate": 2.9985318397799606e-05, "loss": 0.3548, "step": 270 }, { "epoch": 0.13, "learning_rate": 2.9979716340921736e-05, "loss": 0.3584, "step": 280 }, { "epoch": 0.14, "learning_rate": 2.997321160212122e-05, "loss": 0.3477, "step": 290 }, { "epoch": 0.14, "learning_rate": 2.996580457337544e-05, "loss": 0.3731, "step": 300 }, { "epoch": 0.15, "learning_rate": 2.9957495701034037e-05, "loss": 0.3688, "step": 310 }, { "epoch": 0.15, "learning_rate": 2.9948285485792e-05, "loss": 0.3399, "step": 320 }, { "epoch": 0.15, "learning_rate": 2.993817448265948e-05, "loss": 0.3419, "step": 330 }, { "epoch": 0.16, "learning_rate": 2.992716330092839e-05, "loss": 0.3663, "step": 340 }, { "epoch": 0.16, "learning_rate": 2.9915252604135618e-05, "loss": 0.358, "step": 350 }, { "epoch": 0.17, "learning_rate": 2.9902443110023127e-05, "loss": 0.3332, "step": 360 }, { "epoch": 0.17, "learning_rate": 2.9888735590494616e-05, "loss": 0.3315, "step": 370 }, { "epoch": 0.18, "learning_rate": 2.9874130871569087e-05, "loss": 0.3519, "step": 380 }, { "epoch": 0.18, "learning_rate": 2.9858629833331002e-05, "loss": 0.3876, "step": 390 }, { "epoch": 0.19, "learning_rate": 2.9842233409877296e-05, "loss": 0.3219, "step": 400 }, { "epoch": 0.19, "learning_rate": 2.9824942589261053e-05, "loss": 0.3465, "step": 410 }, { "epoch": 0.2, "learning_rate": 2.9806758413431997e-05, "loss": 0.3564, "step": 420 }, { "epoch": 0.2, "learning_rate": 2.978768197817368e-05, "loss": 0.3719, "step": 430 }, { "epoch": 0.21, "learning_rate": 2.976771443303745e-05, "loss": 0.345, "step": 440 }, { "epoch": 0.21, "learning_rate": 2.974685698127321e-05, "loss": 0.325, "step": 450 }, { "epoch": 0.22, "learning_rate": 2.9725110879756868e-05, "loss": 0.3461, "step": 460 }, { "epoch": 0.22, "learning_rate": 2.9702477438914617e-05, "loss": 0.3338, "step": 470 }, { "epoch": 0.23, "learning_rate": 2.9678958022643983e-05, "loss": 0.3533, "step": 480 }, { "epoch": 0.23, "learning_rate": 2.9654554048231597e-05, "loss": 0.3274, "step": 490 }, { "epoch": 0.23, "learning_rate": 2.9629266986267835e-05, "loss": 0.3423, "step": 500 }, { "epoch": 0.24, "learning_rate": 2.9603098360558167e-05, "loss": 0.3256, "step": 510 }, { "epoch": 0.24, "learning_rate": 2.957604974803134e-05, "loss": 0.33, "step": 520 }, { "epoch": 0.25, "learning_rate": 2.9548122778644357e-05, "loss": 0.3108, "step": 530 }, { "epoch": 0.25, "learning_rate": 2.9519319135284252e-05, "loss": 0.3473, "step": 540 }, { "epoch": 0.26, "learning_rate": 2.9489640553666687e-05, "loss": 0.3242, "step": 550 }, { "epoch": 0.26, "learning_rate": 2.945908882223134e-05, "loss": 0.3455, "step": 560 }, { "epoch": 0.27, "learning_rate": 2.9427665782034143e-05, "loss": 0.3417, "step": 570 }, { "epoch": 0.27, "learning_rate": 2.9395373326636344e-05, "loss": 0.3262, "step": 580 }, { "epoch": 0.28, "learning_rate": 2.9362213401990395e-05, "loss": 0.3507, "step": 590 }, { "epoch": 0.28, "learning_rate": 2.9328188006322693e-05, "loss": 0.3396, "step": 600 }, { "epoch": 0.29, "learning_rate": 2.9293299190013143e-05, "loss": 0.3283, "step": 610 }, { "epoch": 0.29, "learning_rate": 2.9257549055471645e-05, "loss": 0.3119, "step": 620 }, { "epoch": 0.3, "learning_rate": 2.9220939757011366e-05, "loss": 0.338, "step": 630 }, { "epoch": 0.3, "learning_rate": 2.9183473500718938e-05, "loss": 0.3285, "step": 640 }, { "epoch": 0.31, "learning_rate": 2.9145152544321504e-05, "loss": 0.3293, "step": 650 }, { "epoch": 0.31, "learning_rate": 2.9105979197050683e-05, "loss": 0.3528, "step": 660 }, { "epoch": 0.31, "learning_rate": 2.906595581950341e-05, "loss": 0.3183, "step": 670 }, { "epoch": 0.32, "learning_rate": 2.902508482349968e-05, "loss": 0.3593, "step": 680 }, { "epoch": 0.32, "learning_rate": 2.898336867193721e-05, "loss": 0.3371, "step": 690 }, { "epoch": 0.33, "learning_rate": 2.8940809878643038e-05, "loss": 0.3313, "step": 700 }, { "epoch": 0.33, "learning_rate": 2.8897411008222026e-05, "loss": 0.3298, "step": 710 }, { "epoch": 0.34, "learning_rate": 2.8853174675902323e-05, "loss": 0.3366, "step": 720 }, { "epoch": 0.34, "learning_rate": 2.8808103547377754e-05, "loss": 0.3408, "step": 730 }, { "epoch": 0.35, "learning_rate": 2.8762200338647222e-05, "loss": 0.3328, "step": 740 }, { "epoch": 0.35, "learning_rate": 2.8715467815850994e-05, "loss": 0.3235, "step": 750 }, { "epoch": 0.36, "learning_rate": 2.8667908795104053e-05, "loss": 0.3455, "step": 760 }, { "epoch": 0.36, "learning_rate": 2.8619526142326367e-05, "loss": 0.322, "step": 770 }, { "epoch": 0.37, "learning_rate": 2.8570322773070217e-05, "loss": 0.3367, "step": 780 }, { "epoch": 0.37, "learning_rate": 2.8520301652344476e-05, "loss": 0.3576, "step": 790 }, { "epoch": 0.38, "learning_rate": 2.8469465794435965e-05, "loss": 0.332, "step": 800 }, { "epoch": 0.38, "learning_rate": 2.8417818262727784e-05, "loss": 0.3156, "step": 810 }, { "epoch": 0.38, "learning_rate": 2.8365362169514726e-05, "loss": 0.3305, "step": 820 }, { "epoch": 0.39, "learning_rate": 2.8312100675815736e-05, "loss": 0.3238, "step": 830 }, { "epoch": 0.39, "learning_rate": 2.8258036991183414e-05, "loss": 0.3092, "step": 840 }, { "epoch": 0.4, "learning_rate": 2.8203174373510617e-05, "loss": 0.3503, "step": 850 }, { "epoch": 0.4, "learning_rate": 2.8147516128834116e-05, "loss": 0.3112, "step": 860 }, { "epoch": 0.41, "learning_rate": 2.809106561113541e-05, "loss": 0.3307, "step": 870 }, { "epoch": 0.41, "learning_rate": 2.803382622213857e-05, "loss": 0.3317, "step": 880 }, { "epoch": 0.42, "learning_rate": 2.7975801411105307e-05, "loss": 0.3328, "step": 890 }, { "epoch": 0.42, "learning_rate": 2.7916994674627045e-05, "loss": 0.3301, "step": 900 }, { "epoch": 0.43, "learning_rate": 2.7857409556414283e-05, "loss": 0.3271, "step": 910 }, { "epoch": 0.43, "learning_rate": 2.7797049647083016e-05, "loss": 0.3154, "step": 920 }, { "epoch": 0.44, "learning_rate": 2.7735918583938363e-05, "loss": 0.328, "step": 930 }, { "epoch": 0.44, "learning_rate": 2.76740200507554e-05, "loss": 0.3193, "step": 940 }, { "epoch": 0.45, "learning_rate": 2.761135777755715e-05, "loss": 0.3222, "step": 950 }, { "epoch": 1.0, "learning_rate": 2.7547935540389843e-05, "loss": 0.3022, "step": 960 }, { "epoch": 1.01, "learning_rate": 2.748375716109533e-05, "loss": 0.3119, "step": 970 }, { "epoch": 1.01, "learning_rate": 2.7418826507080818e-05, "loss": 0.3019, "step": 980 }, { "epoch": 1.02, "learning_rate": 2.7353147491085785e-05, "loss": 0.3181, "step": 990 }, { "epoch": 1.02, "learning_rate": 2.728672407094622e-05, "loss": 0.312, "step": 1000 }, { "epoch": 1.02, "eval_loss": 0.5085553526878357, "eval_runtime": 6.8207, "eval_samples_per_second": 20.526, "eval_steps_per_second": 5.131, "step": 1000 }, { "epoch": 1.03, "learning_rate": 2.7219560249356125e-05, "loss": 0.2941, "step": 1010 }, { "epoch": 1.03, "learning_rate": 2.7151660073626283e-05, "loss": 0.2852, "step": 1020 }, { "epoch": 1.04, "learning_rate": 2.7083027635440392e-05, "loss": 0.3113, "step": 1030 }, { "epoch": 1.04, "learning_rate": 2.7013667070608502e-05, "loss": 0.2969, "step": 1040 }, { "epoch": 1.05, "learning_rate": 2.6943582558817764e-05, "loss": 0.2912, "step": 1050 }, { "epoch": 1.05, "learning_rate": 2.6872778323380585e-05, "loss": 0.2851, "step": 1060 }, { "epoch": 1.06, "learning_rate": 2.6801258630980117e-05, "loss": 0.3045, "step": 1070 }, { "epoch": 1.06, "learning_rate": 2.6729027791413154e-05, "loss": 0.3157, "step": 1080 }, { "epoch": 1.07, "learning_rate": 2.6656090157330424e-05, "loss": 0.2968, "step": 1090 }, { "epoch": 1.07, "learning_rate": 2.6582450123974278e-05, "loss": 0.2832, "step": 1100 }, { "epoch": 1.08, "learning_rate": 2.650811212891385e-05, "loss": 0.32, "step": 1110 }, { "epoch": 1.08, "learning_rate": 2.6433080651777655e-05, "loss": 0.2936, "step": 1120 }, { "epoch": 1.08, "learning_rate": 2.635736021398361e-05, "loss": 0.3094, "step": 1130 }, { "epoch": 1.09, "learning_rate": 2.628095537846661e-05, "loss": 0.3073, "step": 1140 }, { "epoch": 1.09, "learning_rate": 2.6203870749403553e-05, "loss": 0.3067, "step": 1150 }, { "epoch": 1.1, "learning_rate": 2.6126110971935878e-05, "loss": 0.309, "step": 1160 }, { "epoch": 1.1, "learning_rate": 2.604768073188966e-05, "loss": 0.2851, "step": 1170 }, { "epoch": 1.11, "learning_rate": 2.5968584755493233e-05, "loss": 0.3074, "step": 1180 }, { "epoch": 1.11, "learning_rate": 2.5888827809092406e-05, "loss": 0.3012, "step": 1190 }, { "epoch": 1.12, "learning_rate": 2.5808414698863205e-05, "loss": 0.3042, "step": 1200 }, { "epoch": 1.12, "learning_rate": 2.5727350270522293e-05, "loss": 0.3072, "step": 1210 }, { "epoch": 1.13, "learning_rate": 2.5645639409034935e-05, "loss": 0.2948, "step": 1220 }, { "epoch": 1.13, "learning_rate": 2.5563287038320635e-05, "loss": 0.3042, "step": 1230 }, { "epoch": 1.14, "learning_rate": 2.548029812095644e-05, "loss": 0.3112, "step": 1240 }, { "epoch": 1.14, "learning_rate": 2.539667765787786e-05, "loss": 0.3213, "step": 1250 }, { "epoch": 1.15, "learning_rate": 2.531243068807754e-05, "loss": 0.2931, "step": 1260 }, { "epoch": 1.15, "learning_rate": 2.522756228830158e-05, "loss": 0.2802, "step": 1270 }, { "epoch": 1.15, "learning_rate": 2.5142077572743643e-05, "loss": 0.3049, "step": 1280 }, { "epoch": 1.16, "learning_rate": 2.5055981692736758e-05, "loss": 0.3234, "step": 1290 }, { "epoch": 1.16, "learning_rate": 2.4969279836442868e-05, "loss": 0.286, "step": 1300 }, { "epoch": 1.17, "learning_rate": 2.4881977228540243e-05, "loss": 0.3099, "step": 1310 }, { "epoch": 1.17, "learning_rate": 2.4794079129908606e-05, "loss": 0.2811, "step": 1320 }, { "epoch": 1.18, "learning_rate": 2.470559083731212e-05, "loss": 0.3202, "step": 1330 }, { "epoch": 1.18, "learning_rate": 2.4616517683080197e-05, "loss": 0.3031, "step": 1340 }, { "epoch": 1.19, "learning_rate": 2.4526865034786184e-05, "loss": 0.2663, "step": 1350 }, { "epoch": 1.19, "learning_rate": 2.4436638294923902e-05, "loss": 0.2946, "step": 1360 }, { "epoch": 1.2, "learning_rate": 2.4345842900582084e-05, "loss": 0.2625, "step": 1370 }, { "epoch": 1.2, "learning_rate": 2.4254484323116746e-05, "loss": 0.2953, "step": 1380 }, { "epoch": 1.21, "learning_rate": 2.4162568067821478e-05, "loss": 0.3124, "step": 1390 }, { "epoch": 1.21, "learning_rate": 2.4070099673595696e-05, "loss": 0.3166, "step": 1400 }, { "epoch": 1.22, "learning_rate": 2.3977084712610862e-05, "loss": 0.3096, "step": 1410 }, { "epoch": 1.22, "learning_rate": 2.3883528789974703e-05, "loss": 0.3054, "step": 1420 }, { "epoch": 1.23, "learning_rate": 2.3789437543393446e-05, "loss": 0.3024, "step": 1430 }, { "epoch": 1.23, "learning_rate": 2.3694816642832087e-05, "loss": 0.2855, "step": 1440 }, { "epoch": 1.23, "learning_rate": 2.3599671790172738e-05, "loss": 0.2768, "step": 1450 }, { "epoch": 1.24, "learning_rate": 2.3504008718870983e-05, "loss": 0.289, "step": 1460 }, { "epoch": 1.24, "learning_rate": 2.3407833193610427e-05, "loss": 0.2805, "step": 1470 }, { "epoch": 1.25, "learning_rate": 2.3311151009955297e-05, "loss": 0.2729, "step": 1480 }, { "epoch": 1.25, "learning_rate": 2.3213967994001185e-05, "loss": 0.2649, "step": 1490 }, { "epoch": 1.26, "learning_rate": 2.3116290002023982e-05, "loss": 0.2858, "step": 1500 }, { "epoch": 1.26, "learning_rate": 2.301812292012698e-05, "loss": 0.2785, "step": 1510 }, { "epoch": 1.27, "learning_rate": 2.291947266388616e-05, "loss": 0.2951, "step": 1520 }, { "epoch": 1.27, "learning_rate": 2.2820345177993727e-05, "loss": 0.2612, "step": 1530 }, { "epoch": 1.28, "learning_rate": 2.272074643589988e-05, "loss": 0.2873, "step": 1540 }, { "epoch": 1.28, "learning_rate": 2.262068243945285e-05, "loss": 0.276, "step": 1550 }, { "epoch": 1.29, "learning_rate": 2.252015921853723e-05, "loss": 0.2888, "step": 1560 }, { "epoch": 1.29, "learning_rate": 2.2419182830710593e-05, "loss": 0.2721, "step": 1570 }, { "epoch": 1.3, "learning_rate": 2.23177593608385e-05, "loss": 0.2805, "step": 1580 }, { "epoch": 1.3, "learning_rate": 2.221589492072778e-05, "loss": 0.2719, "step": 1590 }, { "epoch": 1.31, "learning_rate": 2.2113595648758273e-05, "loss": 0.2703, "step": 1600 }, { "epoch": 1.31, "learning_rate": 2.2010867709512895e-05, "loss": 0.2661, "step": 1610 }, { "epoch": 1.31, "learning_rate": 2.1907717293406175e-05, "loss": 0.2665, "step": 1620 }, { "epoch": 1.32, "learning_rate": 2.1804150616311222e-05, "loss": 0.2791, "step": 1630 }, { "epoch": 1.32, "learning_rate": 2.1700173919185144e-05, "loss": 0.2549, "step": 1640 }, { "epoch": 1.33, "learning_rate": 2.1595793467692967e-05, "loss": 0.2934, "step": 1650 }, { "epoch": 1.33, "learning_rate": 2.149101555183009e-05, "loss": 0.2666, "step": 1660 }, { "epoch": 1.34, "learning_rate": 2.1385846485543202e-05, "loss": 0.3041, "step": 1670 }, { "epoch": 1.34, "learning_rate": 2.1280292606349838e-05, "loss": 0.2651, "step": 1680 }, { "epoch": 1.35, "learning_rate": 2.117436027495647e-05, "loss": 0.2718, "step": 1690 }, { "epoch": 1.35, "learning_rate": 2.106805587487519e-05, "loss": 0.2625, "step": 1700 }, { "epoch": 1.36, "learning_rate": 2.096138581203908e-05, "loss": 0.284, "step": 1710 }, { "epoch": 1.36, "learning_rate": 2.0854356514416144e-05, "loss": 0.2865, "step": 1720 }, { "epoch": 1.37, "learning_rate": 2.0746974431621968e-05, "loss": 0.288, "step": 1730 }, { "epoch": 1.37, "learning_rate": 2.06392460345311e-05, "loss": 0.2704, "step": 1740 }, { "epoch": 1.38, "learning_rate": 2.053117781488706e-05, "loss": 0.2896, "step": 1750 }, { "epoch": 1.38, "learning_rate": 2.0422776284911175e-05, "loss": 0.2813, "step": 1760 }, { "epoch": 1.38, "learning_rate": 2.031404797691016e-05, "loss": 0.2857, "step": 1770 }, { "epoch": 1.39, "learning_rate": 2.0204999442882447e-05, "loss": 0.3063, "step": 1780 }, { "epoch": 1.39, "learning_rate": 2.0095637254123392e-05, "loss": 0.2837, "step": 1790 }, { "epoch": 1.4, "learning_rate": 1.998596800082927e-05, "loss": 0.2851, "step": 1800 }, { "epoch": 1.4, "learning_rate": 1.9875998291700148e-05, "loss": 0.2852, "step": 1810 }, { "epoch": 1.41, "learning_rate": 1.976573475354165e-05, "loss": 0.2651, "step": 1820 }, { "epoch": 1.41, "learning_rate": 1.9655184030865617e-05, "loss": 0.2655, "step": 1830 }, { "epoch": 1.42, "learning_rate": 1.9544352785489706e-05, "loss": 0.2758, "step": 1840 }, { "epoch": 1.42, "learning_rate": 1.9433247696135967e-05, "loss": 0.2698, "step": 1850 }, { "epoch": 1.43, "learning_rate": 1.9321875458028347e-05, "loss": 0.2988, "step": 1860 }, { "epoch": 1.43, "learning_rate": 1.9210242782489266e-05, "loss": 0.2723, "step": 1870 }, { "epoch": 1.44, "learning_rate": 1.9098356396535167e-05, "loss": 0.2726, "step": 1880 }, { "epoch": 1.44, "learning_rate": 1.8986223042471144e-05, "loss": 0.2541, "step": 1890 }, { "epoch": 1.45, "learning_rate": 1.8873849477484696e-05, "loss": 0.2822, "step": 1900 }, { "epoch": 1.45, "step": 1900, "total_flos": 3.4203213408659046e+17, "train_loss": 0.32689529290324765, "train_runtime": 3915.3355, "train_samples_per_second": 4.351, "train_steps_per_second": 1.088 } ], "logging_steps": 10, "max_steps": 4260, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 3.4203213408659046e+17, "trial_name": null, "trial_params": null }