{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1365, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.18978102189781e-06, "loss": 13.9701, "step": 1 }, { "epoch": 0.04, "learning_rate": 2.1897810218978098e-05, "loss": 9.8829, "step": 10 }, { "epoch": 0.07, "learning_rate": 4.3795620437956196e-05, "loss": 7.6246, "step": 20 }, { "epoch": 0.11, "learning_rate": 6.56934306569343e-05, "loss": 7.2381, "step": 30 }, { "epoch": 0.15, "learning_rate": 8.759124087591239e-05, "loss": 7.2, "step": 40 }, { "epoch": 0.18, "learning_rate": 0.00010948905109489051, "loss": 7.1787, "step": 50 }, { "epoch": 0.22, "learning_rate": 0.0001313868613138686, "loss": 7.1182, "step": 60 }, { "epoch": 0.26, "learning_rate": 0.00015328467153284672, "loss": 7.1756, "step": 70 }, { "epoch": 0.29, "learning_rate": 0.00017518248175182478, "loss": 6.9204, "step": 80 }, { "epoch": 0.33, "learning_rate": 0.0001970802919708029, "loss": 6.739, "step": 90 }, { "epoch": 0.37, "learning_rate": 0.00021897810218978101, "loss": 6.6124, "step": 100 }, { "epoch": 0.4, "learning_rate": 0.0002408759124087591, "loss": 6.5668, "step": 110 }, { "epoch": 0.44, "learning_rate": 0.0002627737226277372, "loss": 6.4359, "step": 120 }, { "epoch": 0.48, "learning_rate": 0.0002846715328467153, "loss": 6.5331, "step": 130 }, { "epoch": 0.51, "learning_rate": 0.00029999558221422155, "loss": 6.4523, "step": 140 }, { "epoch": 0.55, "learning_rate": 0.00029991705103933765, "loss": 6.2601, "step": 150 }, { "epoch": 0.59, "learning_rate": 0.00029974040600590614, "loss": 6.1484, "step": 160 }, { "epoch": 0.62, "learning_rate": 0.0002994657627200285, "loss": 6.0018, "step": 170 }, { "epoch": 0.66, "learning_rate": 0.0002990933009231839, "loss": 5.8596, "step": 180 }, { "epoch": 0.7, "learning_rate": 0.0002986232643745964, "loss": 5.7249, "step": 190 }, { "epoch": 0.73, "learning_rate": 0.000298055960691706, "loss": 5.5685, "step": 200 }, { "epoch": 0.77, "learning_rate": 0.0002973917611488469, "loss": 5.4467, "step": 210 }, { "epoch": 0.81, "learning_rate": 0.0002966311004342651, "loss": 5.3091, "step": 220 }, { "epoch": 0.84, "learning_rate": 0.0002957744763656356, "loss": 5.1977, "step": 230 }, { "epoch": 0.88, "learning_rate": 0.00029482244956426253, "loss": 5.0377, "step": 240 }, { "epoch": 0.92, "learning_rate": 0.0002937756430881789, "loss": 4.9283, "step": 250 }, { "epoch": 0.95, "learning_rate": 0.0002926347420243833, "loss": 5.091, "step": 260 }, { "epoch": 0.99, "learning_rate": 0.0002914004930404816, "loss": 4.8342, "step": 270 }, { "epoch": 1.0, "eval_loss": 4.737916946411133, "eval_runtime": 30.7851, "eval_samples_per_second": 501.248, "eval_steps_per_second": 1.981, "step": 273 }, { "epoch": 1.03, "learning_rate": 0.00029007370389602736, "loss": 4.7528, "step": 280 }, { "epoch": 1.06, "learning_rate": 0.00028865524291388006, "loss": 4.6222, "step": 290 }, { "epoch": 1.1, "learning_rate": 0.0002871460384119274, "loss": 4.4689, "step": 300 }, { "epoch": 1.14, "learning_rate": 0.00028554707809554385, "loss": 4.3523, "step": 310 }, { "epoch": 1.17, "learning_rate": 0.0002838594084111824, "loss": 4.3172, "step": 320 }, { "epoch": 1.21, "learning_rate": 0.00028208413386152326, "loss": 4.1787, "step": 330 }, { "epoch": 1.25, "learning_rate": 0.00028022241628262735, "loss": 4.1082, "step": 340 }, { "epoch": 1.28, "learning_rate": 0.00027827547408356773, "loss": 3.9914, "step": 350 }, { "epoch": 1.32, "learning_rate": 0.00027624458144903663, "loss": 4.0363, "step": 360 }, { "epoch": 1.36, "learning_rate": 0.0002741310675054493, "loss": 5.2644, "step": 370 }, { "epoch": 1.39, "learning_rate": 0.0002719363154510924, "loss": 4.8735, "step": 380 }, { "epoch": 1.43, "learning_rate": 0.000269661761650883, "loss": 4.4014, "step": 390 }, { "epoch": 1.47, "learning_rate": 0.00026730889469633406, "loss": 4.1602, "step": 400 }, { "epoch": 1.5, "learning_rate": 0.0002648792544313389, "loss": 4.0027, "step": 410 }, { "epoch": 1.54, "learning_rate": 0.0002623744309444141, "loss": 3.9095, "step": 420 }, { "epoch": 1.58, "learning_rate": 0.0002597960635280588, "loss": 3.8225, "step": 430 }, { "epoch": 1.61, "learning_rate": 0.00025714583960591324, "loss": 3.7638, "step": 440 }, { "epoch": 1.65, "learning_rate": 0.0002544254936284164, "loss": 3.7038, "step": 450 }, { "epoch": 1.68, "learning_rate": 0.0002516368059376883, "loss": 3.6738, "step": 460 }, { "epoch": 1.72, "learning_rate": 0.00024878160160237653, "loss": 3.6142, "step": 470 }, { "epoch": 1.76, "learning_rate": 0.00024586174922323293, "loss": 3.5565, "step": 480 }, { "epoch": 1.79, "learning_rate": 0.0002428791597101996, "loss": 3.5071, "step": 490 }, { "epoch": 1.83, "learning_rate": 0.00023983578503180541, "loss": 3.4761, "step": 500 }, { "epoch": 1.87, "learning_rate": 0.00023673361693769216, "loss": 3.4575, "step": 510 }, { "epoch": 1.9, "learning_rate": 0.00023357468565510535, "loss": 3.4062, "step": 520 }, { "epoch": 1.94, "learning_rate": 0.00023036105856020315, "loss": 3.3653, "step": 530 }, { "epoch": 1.98, "learning_rate": 0.00022709483882505315, "loss": 3.3301, "step": 540 }, { "epoch": 2.0, "eval_loss": 3.2846388816833496, "eval_runtime": 30.5512, "eval_samples_per_second": 505.086, "eval_steps_per_second": 1.997, "step": 546 }, { "epoch": 2.01, "learning_rate": 0.00022377816404120263, "loss": 3.2758, "step": 550 }, { "epoch": 2.05, "learning_rate": 0.00022041320482072218, "loss": 3.2522, "step": 560 }, { "epoch": 2.09, "learning_rate": 0.00021700216337563975, "loss": 3.1993, "step": 570 }, { "epoch": 2.12, "learning_rate": 0.00021354727207669315, "loss": 3.147, "step": 580 }, { "epoch": 2.16, "learning_rate": 0.00021005079199234558, "loss": 3.1192, "step": 590 }, { "epoch": 2.2, "learning_rate": 0.00020651501140901961, "loss": 3.0901, "step": 600 }, { "epoch": 2.23, "learning_rate": 0.0002029422443335184, "loss": 3.0812, "step": 610 }, { "epoch": 2.27, "learning_rate": 0.00019933482897861385, "loss": 3.0369, "step": 620 }, { "epoch": 2.31, "learning_rate": 0.00019569512623279333, "loss": 2.9916, "step": 630 }, { "epoch": 2.34, "learning_rate": 0.00019202551811516592, "loss": 2.9367, "step": 640 }, { "epoch": 2.38, "learning_rate": 0.00018832840621653993, "loss": 2.9235, "step": 650 }, { "epoch": 2.42, "learning_rate": 0.00018460621012769126, "loss": 3.0402, "step": 660 }, { "epoch": 2.45, "learning_rate": 0.0001808613658558521, "loss": 2.9328, "step": 670 }, { "epoch": 2.49, "learning_rate": 0.00017709632423045527, "loss": 2.8384, "step": 680 }, { "epoch": 2.53, "learning_rate": 0.0001733135492991784, "loss": 2.7372, "step": 690 }, { "epoch": 2.56, "learning_rate": 0.00016951551671533753, "loss": 2.7189, "step": 700 }, { "epoch": 2.6, "learning_rate": 0.00016570471211768486, "loss": 2.6697, "step": 710 }, { "epoch": 2.64, "learning_rate": 0.00016188362950367204, "loss": 2.6319, "step": 720 }, { "epoch": 2.67, "learning_rate": 0.00015805476959724273, "loss": 2.5963, "step": 730 }, { "epoch": 2.71, "learning_rate": 0.00015422063821222292, "loss": 2.5732, "step": 740 }, { "epoch": 2.75, "learning_rate": 0.00015038374461238062, "loss": 2.5426, "step": 750 }, { "epoch": 2.78, "learning_rate": 0.00014654659986922697, "loss": 2.5217, "step": 760 }, { "epoch": 2.82, "learning_rate": 0.00014271171521863514, "loss": 2.4971, "step": 770 }, { "epoch": 2.86, "learning_rate": 0.00013888160041735086, "loss": 2.4917, "step": 780 }, { "epoch": 2.89, "learning_rate": 0.0001350587621004716, "loss": 2.4795, "step": 790 }, { "epoch": 2.93, "learning_rate": 0.00013124570214096816, "loss": 2.4464, "step": 800 }, { "epoch": 2.97, "learning_rate": 0.00012744491601232355, "loss": 2.4158, "step": 810 }, { "epoch": 3.0, "eval_loss": 2.413381576538086, "eval_runtime": 30.6479, "eval_samples_per_second": 503.492, "eval_steps_per_second": 1.99, "step": 819 }, { "epoch": 3.0, "learning_rate": 0.00012365889115535916, "loss": 2.402, "step": 820 }, { "epoch": 3.04, "learning_rate": 0.00011989010535031889, "loss": 2.3491, "step": 830 }, { "epoch": 3.08, "learning_rate": 0.00011614102509527481, "loss": 2.3247, "step": 840 }, { "epoch": 3.11, "learning_rate": 0.00011241410399191728, "loss": 2.3179, "step": 850 }, { "epoch": 3.15, "learning_rate": 0.00010871178113978432, "loss": 2.3006, "step": 860 }, { "epoch": 3.19, "learning_rate": 0.00010503647953998295, "loss": 2.305, "step": 870 }, { "epoch": 3.22, "learning_rate": 0.00010139060450944528, "loss": 2.2922, "step": 880 }, { "epoch": 3.26, "learning_rate": 9.777654210675867e-05, "loss": 2.2766, "step": 890 }, { "epoch": 3.3, "learning_rate": 9.419665757059952e-05, "loss": 2.2732, "step": 900 }, { "epoch": 3.33, "learning_rate": 9.065329377179248e-05, "loss": 2.2591, "step": 910 }, { "epoch": 3.37, "learning_rate": 8.714876968000853e-05, "loss": 2.2477, "step": 920 }, { "epoch": 3.41, "learning_rate": 8.368537884610555e-05, "loss": 2.243, "step": 930 }, { "epoch": 3.44, "learning_rate": 8.026538790110405e-05, "loss": 2.2341, "step": 940 }, { "epoch": 3.48, "learning_rate": 7.689103507278047e-05, "loss": 2.2249, "step": 950 }, { "epoch": 3.52, "learning_rate": 7.356452872084971e-05, "loss": 2.236, "step": 960 }, { "epoch": 3.55, "learning_rate": 7.028804589169443e-05, "loss": 2.2097, "step": 970 }, { "epoch": 3.59, "learning_rate": 6.706373089358791e-05, "loss": 2.1968, "step": 980 }, { "epoch": 3.63, "learning_rate": 6.389369389334193e-05, "loss": 2.187, "step": 990 }, { "epoch": 3.66, "learning_rate": 6.0780009535299393e-05, "loss": 2.1865, "step": 1000 }, { "epoch": 3.7, "learning_rate": 5.772471558357407e-05, "loss": 2.1732, "step": 1010 }, { "epoch": 3.74, "learning_rate": 5.4729811588427536e-05, "loss": 2.1648, "step": 1020 }, { "epoch": 3.77, "learning_rate": 5.179725757765449e-05, "loss": 2.1696, "step": 1030 }, { "epoch": 3.81, "learning_rate": 4.892897277383434e-05, "loss": 2.1591, "step": 1040 }, { "epoch": 3.85, "learning_rate": 4.6126834338287713e-05, "loss": 2.1536, "step": 1050 }, { "epoch": 3.88, "learning_rate": 4.339267614256027e-05, "loss": 2.1536, "step": 1060 }, { "epoch": 3.92, "learning_rate": 4.07282875682373e-05, "loss": 2.1404, "step": 1070 }, { "epoch": 3.96, "learning_rate": 3.813541233587552e-05, "loss": 2.1403, "step": 1080 }, { "epoch": 3.99, "learning_rate": 3.561574736381752e-05, "loss": 2.1322, "step": 1090 }, { "epoch": 4.0, "eval_loss": 2.1637322902679443, "eval_runtime": 30.5067, "eval_samples_per_second": 505.824, "eval_steps_per_second": 2.0, "step": 1092 }, { "epoch": 4.03, "learning_rate": 3.317094165763639e-05, "loss": 2.0822, "step": 1100 }, { "epoch": 4.07, "learning_rate": 3.080259523093675e-05, "loss": 2.0771, "step": 1110 }, { "epoch": 4.1, "learning_rate": 2.8512258058219112e-05, "loss": 2.0782, "step": 1120 }, { "epoch": 4.14, "learning_rate": 2.6301429060492306e-05, "loss": 2.0688, "step": 1130 }, { "epoch": 4.18, "learning_rate": 2.417155512429832e-05, "loss": 2.0603, "step": 1140 }, { "epoch": 4.21, "learning_rate": 2.2124030154791035e-05, "loss": 2.0602, "step": 1150 }, { "epoch": 4.25, "learning_rate": 2.0160194163489062e-05, "loss": 2.0603, "step": 1160 }, { "epoch": 4.29, "learning_rate": 1.828133239129944e-05, "loss": 2.0617, "step": 1170 }, { "epoch": 4.32, "learning_rate": 1.6488674467386278e-05, "loss": 2.065, "step": 1180 }, { "epoch": 4.36, "learning_rate": 1.47833936044345e-05, "loss": 2.0479, "step": 1190 }, { "epoch": 4.4, "learning_rate": 1.3166605830835903e-05, "loss": 2.0553, "step": 1200 }, { "epoch": 4.43, "learning_rate": 1.1639369260299463e-05, "loss": 2.044, "step": 1210 }, { "epoch": 4.47, "learning_rate": 1.0202683399364469e-05, "loss": 2.0539, "step": 1220 }, { "epoch": 4.51, "learning_rate": 8.857488493268839e-06, "loss": 2.0471, "step": 1230 }, { "epoch": 4.54, "learning_rate": 7.604664910601915e-06, "loss": 2.0548, "step": 1240 }, { "epoch": 4.58, "learning_rate": 6.445032567143238e-06, "loss": 2.0447, "step": 1250 }, { "epoch": 4.62, "learning_rate": 5.379350389265319e-06, "loss": 2.0379, "step": 1260 }, { "epoch": 4.65, "learning_rate": 4.408315817250818e-06, "loss": 2.0351, "step": 1270 }, { "epoch": 4.69, "learning_rate": 3.5325643488498757e-06, "loss": 2.0463, "step": 1280 }, { "epoch": 4.73, "learning_rate": 2.7526691233758334e-06, "loss": 2.0436, "step": 1290 }, { "epoch": 4.76, "learning_rate": 2.0691405466118307e-06, "loss": 2.0491, "step": 1300 }, { "epoch": 4.8, "learning_rate": 1.4824259567733698e-06, "loss": 2.0461, "step": 1310 }, { "epoch": 4.84, "learning_rate": 9.929093317461057e-07, "loss": 2.041, "step": 1320 }, { "epoch": 4.87, "learning_rate": 6.009110377897086e-07, "loss": 2.04, "step": 1330 }, { "epoch": 4.91, "learning_rate": 3.066876198728474e-07, "loss": 2.0415, "step": 1340 }, { "epoch": 4.95, "learning_rate": 1.1043163377627562e-07, "loss": 2.0418, "step": 1350 }, { "epoch": 4.98, "learning_rate": 1.2271520073786623e-08, "loss": 2.0369, "step": 1360 }, { "epoch": 5.0, "eval_loss": 2.118281126022339, "eval_runtime": 30.6172, "eval_samples_per_second": 503.999, "eval_steps_per_second": 1.992, "step": 1365 }, { "epoch": 5.0, "step": 1365, "total_flos": 457285168005120.0, "train_loss": 3.477488596011431, "train_runtime": 5141.5129, "train_samples_per_second": 135.588, "train_steps_per_second": 0.265 } ], "logging_steps": 10, "max_steps": 1365, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 457285168005120.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }