{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 1000, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 3.1578947368421055e-07, "loss": 0.7568, "step": 1 }, { "epoch": 0.01, "learning_rate": 3.157894736842105e-06, "loss": 0.7081, "step": 10 }, { "epoch": 0.02, "learning_rate": 6.31578947368421e-06, "loss": 0.6806, "step": 20 }, { "epoch": 0.03, "learning_rate": 9.473684210526315e-06, "loss": 0.6123, "step": 30 }, { "epoch": 0.04, "learning_rate": 1.263157894736842e-05, "loss": 0.4982, "step": 40 }, { "epoch": 0.05, "learning_rate": 1.5789473684210526e-05, "loss": 0.4448, "step": 50 }, { "epoch": 0.06, "learning_rate": 1.894736842105263e-05, "loss": 0.4243, "step": 60 }, { "epoch": 0.07, "learning_rate": 2.2105263157894736e-05, "loss": 0.3961, "step": 70 }, { "epoch": 0.08, "learning_rate": 2.526315789473684e-05, "loss": 0.3963, "step": 80 }, { "epoch": 0.09, "learning_rate": 2.8421052631578946e-05, "loss": 0.3795, "step": 90 }, { "epoch": 0.11, "learning_rate": 2.9999432005848255e-05, "loss": 0.3786, "step": 100 }, { "epoch": 0.12, "learning_rate": 2.99948883107249e-05, "loss": 0.3962, "step": 110 }, { "epoch": 0.13, "learning_rate": 2.9985802296874666e-05, "loss": 0.3566, "step": 120 }, { "epoch": 0.14, "learning_rate": 2.9972176716673562e-05, "loss": 0.3817, "step": 130 }, { "epoch": 0.15, "learning_rate": 2.9954015697643372e-05, "loss": 0.3644, "step": 140 }, { "epoch": 0.16, "learning_rate": 2.9931324741201325e-05, "loss": 0.3497, "step": 150 }, { "epoch": 0.17, "learning_rate": 2.9904110720993565e-05, "loss": 0.3601, "step": 160 }, { "epoch": 0.18, "learning_rate": 2.987238188081299e-05, "loss": 0.3877, "step": 170 }, { "epoch": 0.19, "learning_rate": 2.983614783210197e-05, "loss": 0.3585, "step": 180 }, { "epoch": 0.2, "learning_rate": 2.9795419551040836e-05, "loss": 0.3498, "step": 190 }, { "epoch": 0.21, "learning_rate": 2.9750209375222893e-05, "loss": 0.3618, "step": 200 }, { "epoch": 0.22, "learning_rate": 2.97005309999171e-05, "loss": 0.3496, "step": 210 }, { "epoch": 0.23, "learning_rate": 2.964639947391939e-05, "loss": 0.3483, "step": 220 }, { "epoch": 0.24, "learning_rate": 2.958783119499408e-05, "loss": 0.3488, "step": 230 }, { "epoch": 0.25, "learning_rate": 2.9524843904906528e-05, "loss": 0.3405, "step": 240 }, { "epoch": 0.26, "learning_rate": 2.9457456684048772e-05, "loss": 0.3441, "step": 250 }, { "epoch": 0.27, "learning_rate": 2.938568994565956e-05, "loss": 0.3638, "step": 260 }, { "epoch": 0.28, "learning_rate": 2.9309565429640724e-05, "loss": 0.3695, "step": 270 }, { "epoch": 0.29, "learning_rate": 2.9229106195971603e-05, "loss": 0.3372, "step": 280 }, { "epoch": 0.31, "learning_rate": 2.9144336617723625e-05, "loss": 0.354, "step": 290 }, { "epoch": 0.32, "learning_rate": 2.90552823736771e-05, "loss": 0.358, "step": 300 }, { "epoch": 0.33, "learning_rate": 2.8961970440542496e-05, "loss": 0.3473, "step": 310 }, { "epoch": 0.34, "learning_rate": 2.8864429084788534e-05, "loss": 0.3379, "step": 320 }, { "epoch": 0.35, "learning_rate": 2.8762687854079563e-05, "loss": 0.3794, "step": 330 }, { "epoch": 0.36, "learning_rate": 2.8656777568324878e-05, "loss": 0.3475, "step": 340 }, { "epoch": 0.37, "learning_rate": 2.8546730310342593e-05, "loss": 0.3655, "step": 350 }, { "epoch": 0.38, "learning_rate": 2.8432579416140984e-05, "loss": 0.3354, "step": 360 }, { "epoch": 0.39, "learning_rate": 2.8314359464820184e-05, "loss": 0.3448, "step": 370 }, { "epoch": 0.4, "learning_rate": 2.8192106268097336e-05, "loss": 0.3565, "step": 380 }, { "epoch": 0.41, "learning_rate": 2.8065856859458346e-05, "loss": 0.3436, "step": 390 }, { "epoch": 0.42, "learning_rate": 2.7935649482939533e-05, "loss": 0.3509, "step": 400 }, { "epoch": 0.43, "learning_rate": 2.7801523581542563e-05, "loss": 0.3312, "step": 410 }, { "epoch": 0.44, "learning_rate": 2.766351978528622e-05, "loss": 0.3445, "step": 420 }, { "epoch": 0.45, "learning_rate": 2.7521679898898567e-05, "loss": 0.3374, "step": 430 }, { "epoch": 0.46, "learning_rate": 2.737604688915327e-05, "loss": 0.3438, "step": 440 }, { "epoch": 0.47, "learning_rate": 2.72266648718539e-05, "loss": 0.3307, "step": 450 }, { "epoch": 0.48, "learning_rate": 2.7073579098470196e-05, "loss": 0.3344, "step": 460 }, { "epoch": 0.49, "learning_rate": 2.6916835942430292e-05, "loss": 0.325, "step": 470 }, { "epoch": 0.51, "learning_rate": 2.6756482885073032e-05, "loss": 0.3296, "step": 480 }, { "epoch": 0.52, "learning_rate": 2.6592568501264746e-05, "loss": 0.3536, "step": 490 }, { "epoch": 0.53, "learning_rate": 2.6425142444684735e-05, "loss": 0.3272, "step": 500 }, { "epoch": 0.54, "learning_rate": 2.6254255432783933e-05, "loss": 0.3547, "step": 510 }, { "epoch": 0.55, "learning_rate": 2.6079959231421347e-05, "loss": 0.3313, "step": 520 }, { "epoch": 0.56, "learning_rate": 2.5902306639182952e-05, "loss": 0.343, "step": 530 }, { "epoch": 0.57, "learning_rate": 2.5721351471387666e-05, "loss": 0.3129, "step": 540 }, { "epoch": 0.58, "learning_rate": 2.5537148543785385e-05, "loss": 0.3505, "step": 550 }, { "epoch": 0.59, "learning_rate": 2.534975365595196e-05, "loss": 0.3523, "step": 560 }, { "epoch": 0.6, "learning_rate": 2.5159223574386117e-05, "loss": 0.3699, "step": 570 }, { "epoch": 0.61, "learning_rate": 2.496561601531353e-05, "loss": 0.315, "step": 580 }, { "epoch": 0.62, "learning_rate": 2.4768989627203123e-05, "loss": 0.3382, "step": 590 }, { "epoch": 0.63, "learning_rate": 2.4569403973001045e-05, "loss": 0.3387, "step": 600 }, { "epoch": 0.64, "learning_rate": 2.436691951208758e-05, "loss": 0.3286, "step": 610 }, { "epoch": 0.65, "learning_rate": 2.4161597581962526e-05, "loss": 0.3096, "step": 620 }, { "epoch": 0.66, "learning_rate": 2.395350037966456e-05, "loss": 0.3384, "step": 630 }, { "epoch": 0.67, "learning_rate": 2.3742690942930235e-05, "loss": 0.3284, "step": 640 }, { "epoch": 0.68, "learning_rate": 2.3529233131098313e-05, "loss": 0.3295, "step": 650 }, { "epoch": 0.69, "learning_rate": 2.33131916057652e-05, "loss": 0.3164, "step": 660 }, { "epoch": 0.71, "learning_rate": 2.309463181119736e-05, "loss": 0.3597, "step": 670 }, { "epoch": 0.72, "learning_rate": 2.287361995450667e-05, "loss": 0.3239, "step": 680 }, { "epoch": 0.73, "learning_rate": 2.2650222985594634e-05, "loss": 0.3298, "step": 690 }, { "epoch": 0.74, "learning_rate": 2.2424508576871623e-05, "loss": 0.3176, "step": 700 }, { "epoch": 0.75, "learning_rate": 2.219654510275728e-05, "loss": 0.3344, "step": 710 }, { "epoch": 0.76, "learning_rate": 2.1966401618968194e-05, "loss": 0.3518, "step": 720 }, { "epoch": 0.77, "learning_rate": 2.173414784159925e-05, "loss": 0.3323, "step": 730 }, { "epoch": 0.78, "learning_rate": 2.149985412600492e-05, "loss": 0.3343, "step": 740 }, { "epoch": 0.79, "learning_rate": 2.1263591445486895e-05, "loss": 0.3383, "step": 750 }, { "epoch": 0.8, "learning_rate": 2.1025431369794546e-05, "loss": 0.3135, "step": 760 }, { "epoch": 0.81, "learning_rate": 2.0785446043444677e-05, "loss": 0.3278, "step": 770 }, { "epoch": 0.82, "learning_rate": 2.0543708163867204e-05, "loss": 0.3148, "step": 780 }, { "epoch": 0.83, "learning_rate": 2.0300290959383318e-05, "loss": 0.3511, "step": 790 }, { "epoch": 0.84, "learning_rate": 2.0055268167022835e-05, "loss": 0.3146, "step": 800 }, { "epoch": 0.85, "learning_rate": 1.9808714010187425e-05, "loss": 0.3321, "step": 810 }, { "epoch": 0.86, "learning_rate": 1.9560703176166565e-05, "loss": 0.3373, "step": 820 }, { "epoch": 0.87, "learning_rate": 1.931131079351289e-05, "loss": 0.32, "step": 830 }, { "epoch": 0.88, "learning_rate": 1.9060612409283946e-05, "loss": 0.3106, "step": 840 }, { "epoch": 0.89, "learning_rate": 1.8808683966157132e-05, "loss": 0.3167, "step": 850 }, { "epoch": 0.91, "learning_rate": 1.8555601779424778e-05, "loss": 0.2993, "step": 860 }, { "epoch": 0.92, "learning_rate": 1.8301442513876406e-05, "loss": 0.3405, "step": 870 }, { "epoch": 0.93, "learning_rate": 1.804628316057508e-05, "loss": 0.3346, "step": 880 }, { "epoch": 0.94, "learning_rate": 1.779020101353492e-05, "loss": 0.3319, "step": 890 }, { "epoch": 0.95, "learning_rate": 1.7533273646306857e-05, "loss": 0.3087, "step": 900 }, { "epoch": 0.96, "learning_rate": 1.7275578888479714e-05, "loss": 0.316, "step": 910 }, { "epoch": 0.97, "learning_rate": 1.7017194802103705e-05, "loss": 0.3054, "step": 920 }, { "epoch": 0.98, "learning_rate": 1.6758199658043538e-05, "loss": 0.3255, "step": 930 }, { "epoch": 0.99, "learning_rate": 1.6498671912268256e-05, "loss": 0.3175, "step": 940 }, { "epoch": 1.0, "learning_rate": 1.623869018208499e-05, "loss": 0.3424, "step": 950 }, { "epoch": 1.01, "learning_rate": 1.5978333222323858e-05, "loss": 0.2825, "step": 960 }, { "epoch": 1.02, "learning_rate": 1.571767990148122e-05, "loss": 0.2886, "step": 970 }, { "epoch": 1.03, "learning_rate": 1.5456809177828444e-05, "loss": 0.3196, "step": 980 }, { "epoch": 1.04, "learning_rate": 1.5195800075493542e-05, "loss": 0.3178, "step": 990 }, { "epoch": 1.05, "learning_rate": 1.4934731660522817e-05, "loss": 0.2676, "step": 1000 }, { "epoch": 1.05, "eval_loss": 0.508576512336731, "eval_runtime": 12.2399, "eval_samples_per_second": 3.595, "eval_steps_per_second": 0.899, "step": 1000 }, { "epoch": 1.06, "learning_rate": 1.4673683016929805e-05, "loss": 0.2803, "step": 1010 }, { "epoch": 1.07, "learning_rate": 1.441273322273884e-05, "loss": 0.319, "step": 1020 }, { "epoch": 1.08, "learning_rate": 1.4151961326030314e-05, "loss": 0.2965, "step": 1030 }, { "epoch": 1.09, "learning_rate": 1.3891446320995143e-05, "loss": 0.3067, "step": 1040 }, { "epoch": 1.11, "learning_rate": 1.3631267124005453e-05, "loss": 0.3072, "step": 1050 }, { "epoch": 1.12, "learning_rate": 1.337150254970891e-05, "loss": 0.2963, "step": 1060 }, { "epoch": 1.13, "learning_rate": 1.3112231287153798e-05, "loss": 0.3063, "step": 1070 }, { "epoch": 1.14, "learning_rate": 1.28535318759522e-05, "loss": 0.2732, "step": 1080 }, { "epoch": 1.15, "learning_rate": 1.2595482682488443e-05, "loss": 0.2874, "step": 1090 }, { "epoch": 1.16, "learning_rate": 1.2338161876179964e-05, "loss": 0.2998, "step": 1100 }, { "epoch": 1.17, "learning_rate": 1.2081647405797923e-05, "loss": 0.2849, "step": 1110 }, { "epoch": 1.18, "learning_rate": 1.1826016975854563e-05, "loss": 0.2735, "step": 1120 }, { "epoch": 1.19, "learning_rate": 1.1571348023064662e-05, "loss": 0.2937, "step": 1130 }, { "epoch": 1.2, "learning_rate": 1.1317717692888014e-05, "loss": 0.2993, "step": 1140 }, { "epoch": 1.21, "learning_rate": 1.1065202816160213e-05, "loss": 0.2933, "step": 1150 }, { "epoch": 1.22, "learning_rate": 1.081387988581869e-05, "loss": 0.285, "step": 1160 }, { "epoch": 1.23, "learning_rate": 1.0563825033731146e-05, "loss": 0.2896, "step": 1170 }, { "epoch": 1.24, "learning_rate": 1.031511400763332e-05, "loss": 0.2874, "step": 1180 }, { "epoch": 1.25, "learning_rate": 1.0067822148183194e-05, "loss": 0.2851, "step": 1190 }, { "epoch": 1.26, "learning_rate": 9.822024366138397e-06, "loss": 0.2925, "step": 1200 }, { "epoch": 1.27, "learning_rate": 9.577795119663966e-06, "loss": 0.2843, "step": 1210 }, { "epoch": 1.28, "learning_rate": 9.335208391777106e-06, "loss": 0.2884, "step": 1220 }, { "epoch": 1.29, "learning_rate": 9.094337667935942e-06, "loss": 0.269, "step": 1230 }, { "epoch": 1.31, "learning_rate": 8.855255913778949e-06, "loss": 0.2849, "step": 1240 }, { "epoch": 1.32, "learning_rate": 8.618035553021925e-06, "loss": 0.304, "step": 1250 }, { "epoch": 1.33, "learning_rate": 8.382748445519008e-06, "loss": 0.3049, "step": 1260 }, { "epoch": 1.34, "learning_rate": 8.149465865494633e-06, "loss": 0.2999, "step": 1270 }, { "epoch": 1.35, "learning_rate": 7.918258479952763e-06, "loss": 0.2835, "step": 1280 }, { "epoch": 1.36, "learning_rate": 7.689196327270171e-06, "loss": 0.3167, "step": 1290 }, { "epoch": 1.37, "learning_rate": 7.462348795980088e-06, "loss": 0.2842, "step": 1300 }, { "epoch": 1.38, "learning_rate": 7.237784603752705e-06, "loss": 0.2909, "step": 1310 }, { "epoch": 1.39, "learning_rate": 7.015571776578922e-06, "loss": 0.2881, "step": 1320 }, { "epoch": 1.4, "learning_rate": 6.795777628163599e-06, "loss": 0.2796, "step": 1330 }, { "epoch": 1.41, "learning_rate": 6.578468739534602e-06, "loss": 0.3056, "step": 1340 }, { "epoch": 1.42, "learning_rate": 6.363710938873759e-06, "loss": 0.2987, "step": 1350 }, { "epoch": 1.43, "learning_rate": 6.151569281575925e-06, "loss": 0.288, "step": 1360 }, { "epoch": 1.44, "learning_rate": 5.942108030542074e-06, "loss": 0.2954, "step": 1370 }, { "epoch": 1.45, "learning_rate": 5.735390636712514e-06, "loss": 0.2837, "step": 1380 }, { "epoch": 1.46, "learning_rate": 5.531479719846038e-06, "loss": 0.3055, "step": 1390 }, { "epoch": 1.47, "learning_rate": 5.330437049550868e-06, "loss": 0.313, "step": 1400 }, { "epoch": 1.48, "learning_rate": 5.132323526573126e-06, "loss": 0.2966, "step": 1410 }, { "epoch": 1.49, "learning_rate": 4.937199164348521e-06, "loss": 0.2741, "step": 1420 }, { "epoch": 1.51, "learning_rate": 4.745123070822786e-06, "loss": 0.2973, "step": 1430 }, { "epoch": 1.52, "learning_rate": 4.556153430546451e-06, "loss": 0.281, "step": 1440 }, { "epoch": 1.53, "learning_rate": 4.370347487049313e-06, "loss": 0.2905, "step": 1450 }, { "epoch": 1.54, "learning_rate": 4.187761525499973e-06, "loss": 0.2806, "step": 1460 }, { "epoch": 1.55, "learning_rate": 4.008450855655675e-06, "loss": 0.2716, "step": 1470 }, { "epoch": 1.56, "learning_rate": 3.83246979510764e-06, "loss": 0.2834, "step": 1480 }, { "epoch": 1.57, "learning_rate": 3.676977737529078e-06, "loss": 0.2809, "step": 1490 }, { "epoch": 1.58, "learning_rate": 3.5074689542164895e-06, "loss": 0.2845, "step": 1500 }, { "epoch": 1.59, "learning_rate": 3.341441539881574e-06, "loss": 0.3001, "step": 1510 }, { "epoch": 1.6, "learning_rate": 3.1789457882922753e-06, "loss": 0.2941, "step": 1520 }, { "epoch": 1.61, "learning_rate": 3.020030923389471e-06, "loss": 0.2917, "step": 1530 }, { "epoch": 1.62, "learning_rate": 2.86474508437579e-06, "loss": 0.293, "step": 1540 }, { "epoch": 1.63, "learning_rate": 2.7131353111330843e-06, "loss": 0.2941, "step": 1550 }, { "epoch": 1.64, "learning_rate": 2.565247529972901e-06, "loss": 0.2787, "step": 1560 }, { "epoch": 1.65, "learning_rate": 2.4211265397242854e-06, "loss": 0.2899, "step": 1570 }, { "epoch": 1.66, "learning_rate": 2.280815998163083e-06, "loss": 0.2814, "step": 1580 }, { "epoch": 1.67, "learning_rate": 2.144358408786986e-06, "loss": 0.2786, "step": 1590 }, { "epoch": 1.68, "learning_rate": 2.011795107940138e-06, "loss": 0.2916, "step": 1600 }, { "epoch": 1.69, "learning_rate": 1.8831662522913594e-06, "loss": 0.2834, "step": 1610 }, { "epoch": 1.71, "learning_rate": 1.7585108066697136e-06, "loss": 0.2735, "step": 1620 }, { "epoch": 1.72, "learning_rate": 1.6378665322611002e-06, "loss": 0.3039, "step": 1630 }, { "epoch": 1.73, "learning_rate": 1.521269975169471e-06, "loss": 0.2872, "step": 1640 }, { "epoch": 1.74, "learning_rate": 1.408756455346114e-06, "loss": 0.2863, "step": 1650 }, { "epoch": 1.75, "learning_rate": 1.3003600558903927e-06, "loss": 0.2854, "step": 1660 }, { "epoch": 1.76, "learning_rate": 1.196113612725116e-06, "loss": 0.2727, "step": 1670 }, { "epoch": 1.77, "learning_rate": 1.0960487046497524e-06, "loss": 0.2763, "step": 1680 }, { "epoch": 1.78, "learning_rate": 1.000195643774431e-06, "loss": 0.2962, "step": 1690 }, { "epoch": 1.79, "learning_rate": 9.085834663376629e-07, "loss": 0.2888, "step": 1700 }, { "epoch": 1.8, "learning_rate": 8.212399239105534e-07, "loss": 0.2839, "step": 1710 }, { "epoch": 1.81, "learning_rate": 7.381914749901752e-07, "loss": 0.2789, "step": 1720 }, { "epoch": 1.82, "learning_rate": 6.594632769846353e-07, "loss": 0.2772, "step": 1730 }, { "epoch": 1.83, "learning_rate": 5.850791785922849e-07, "loss": 0.278, "step": 1740 }, { "epoch": 1.84, "learning_rate": 5.150617125773633e-07, "loss": 0.2878, "step": 1750 }, { "epoch": 1.85, "learning_rate": 4.494320889442749e-07, "loss": 0.2734, "step": 1760 }, { "epoch": 1.86, "learning_rate": 3.882101885125539e-07, "loss": 0.2826, "step": 1770 }, { "epoch": 1.87, "learning_rate": 3.3141455689448266e-07, "loss": 0.2875, "step": 1780 }, { "epoch": 1.88, "learning_rate": 2.790623988771712e-07, "loss": 0.2898, "step": 1790 }, { "epoch": 1.89, "learning_rate": 2.3116957321080102e-07, "loss": 0.2919, "step": 1800 }, { "epoch": 1.91, "learning_rate": 1.8775058780463094e-07, "loss": 0.2778, "step": 1810 }, { "epoch": 1.92, "learning_rate": 1.4881859533218466e-07, "loss": 0.3179, "step": 1820 }, { "epoch": 1.93, "learning_rate": 1.1438538924699094e-07, "loss": 0.2771, "step": 1830 }, { "epoch": 1.94, "learning_rate": 8.446140021006132e-08, "loss": 0.2817, "step": 1840 }, { "epoch": 1.95, "learning_rate": 5.9055692930179426e-08, "loss": 0.2903, "step": 1850 }, { "epoch": 1.96, "learning_rate": 3.8175963417980685e-08, "loss": 0.2934, "step": 1860 }, { "epoch": 1.97, "learning_rate": 2.1828536654647235e-08, "loss": 0.2952, "step": 1870 }, { "epoch": 1.98, "learning_rate": 1.0018364675912217e-08, "loss": 0.2881, "step": 1880 }, { "epoch": 1.99, "learning_rate": 2.7490250719663933e-09, "loss": 0.2701, "step": 1890 }, { "epoch": 2.0, "learning_rate": 2.2719903721712954e-11, "loss": 0.2955, "step": 1900 }, { "epoch": 2.0, "step": 1900, "total_flos": 3.4203213408659046e+17, "train_loss": 0.3227183536165639, "train_runtime": 6710.5312, "train_samples_per_second": 1.132, "train_steps_per_second": 0.283 } ], "logging_steps": 10, "max_steps": 1900, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "total_flos": 3.4203213408659046e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }