{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 972, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030864197530864196, "grad_norm": 5.566057508694927, "learning_rate": 1.0204081632653063e-06, "loss": 0.9078, "step": 10 }, { "epoch": 0.06172839506172839, "grad_norm": 2.402969993178742, "learning_rate": 2.0408163265306125e-06, "loss": 0.4908, "step": 20 }, { "epoch": 0.09259259259259259, "grad_norm": 1.5973191159110185, "learning_rate": 3.0612244897959185e-06, "loss": 0.3604, "step": 30 }, { "epoch": 0.12345679012345678, "grad_norm": 1.5212680542283326, "learning_rate": 4.081632653061225e-06, "loss": 0.2917, "step": 40 }, { "epoch": 0.15432098765432098, "grad_norm": 1.327762286536679, "learning_rate": 5.1020408163265315e-06, "loss": 0.2483, "step": 50 }, { "epoch": 0.18518518518518517, "grad_norm": 1.497773959020813, "learning_rate": 6.122448979591837e-06, "loss": 0.3335, "step": 60 }, { "epoch": 0.21604938271604937, "grad_norm": 1.452406056897556, "learning_rate": 7.1428571428571436e-06, "loss": 0.2373, "step": 70 }, { "epoch": 0.24691358024691357, "grad_norm": 1.2333736335084606, "learning_rate": 8.16326530612245e-06, "loss": 0.2827, "step": 80 }, { "epoch": 0.2777777777777778, "grad_norm": 1.977571583115981, "learning_rate": 9.183673469387756e-06, "loss": 0.3493, "step": 90 }, { "epoch": 0.30864197530864196, "grad_norm": 0.43529393896151314, "learning_rate": 9.999870796282452e-06, "loss": 0.2379, "step": 100 }, { "epoch": 0.3395061728395062, "grad_norm": 1.1728868827035934, "learning_rate": 9.995349367260807e-06, "loss": 0.2865, "step": 110 }, { "epoch": 0.37037037037037035, "grad_norm": 0.8560454934036049, "learning_rate": 9.984374428250894e-06, "loss": 0.2797, "step": 120 }, { "epoch": 0.4012345679012346, "grad_norm": 0.8669441816278666, "learning_rate": 9.966960157816279e-06, "loss": 0.2909, "step": 130 }, { "epoch": 0.43209876543209874, "grad_norm": 1.1122798966875698, "learning_rate": 9.943129053516176e-06, "loss": 0.2224, "step": 140 }, { "epoch": 0.46296296296296297, "grad_norm": 1.3783001550147502, "learning_rate": 9.912911902840771e-06, "loss": 0.3475, "step": 150 }, { "epoch": 0.49382716049382713, "grad_norm": 1.1129606902235367, "learning_rate": 9.876347743436758e-06, "loss": 0.2754, "step": 160 }, { "epoch": 0.5246913580246914, "grad_norm": 0.5982987847884026, "learning_rate": 9.833483812674453e-06, "loss": 0.2668, "step": 170 }, { "epoch": 0.5555555555555556, "grad_norm": 1.151190335163041, "learning_rate": 9.78437548662167e-06, "loss": 0.304, "step": 180 }, { "epoch": 0.5864197530864198, "grad_norm": 0.9873399404098623, "learning_rate": 9.729086208503174e-06, "loss": 0.2839, "step": 190 }, { "epoch": 0.6172839506172839, "grad_norm": 1.0916371259601432, "learning_rate": 9.66768740673815e-06, "loss": 0.2536, "step": 200 }, { "epoch": 0.6481481481481481, "grad_norm": 0.8725822152638094, "learning_rate": 9.60025840266157e-06, "loss": 0.2941, "step": 210 }, { "epoch": 0.6790123456790124, "grad_norm": 0.9851282803801447, "learning_rate": 9.52688630804867e-06, "loss": 0.2636, "step": 220 }, { "epoch": 0.7098765432098766, "grad_norm": 1.2682899477638283, "learning_rate": 9.44766591257493e-06, "loss": 0.3098, "step": 230 }, { "epoch": 0.7407407407407407, "grad_norm": 0.9753967370936533, "learning_rate": 9.362699561356957e-06, "loss": 0.2596, "step": 240 }, { "epoch": 0.7716049382716049, "grad_norm": 1.2536107665402, "learning_rate": 9.272097022732444e-06, "loss": 0.3364, "step": 250 }, { "epoch": 0.8024691358024691, "grad_norm": 1.0948963090455852, "learning_rate": 9.175975346450063e-06, "loss": 0.2546, "step": 260 }, { "epoch": 0.8333333333333334, "grad_norm": 0.761843240861204, "learning_rate": 9.074458712452476e-06, "loss": 0.2366, "step": 270 }, { "epoch": 0.8641975308641975, "grad_norm": 1.183127668103192, "learning_rate": 8.9676782704478e-06, "loss": 0.2917, "step": 280 }, { "epoch": 0.8950617283950617, "grad_norm": 1.26099529846567, "learning_rate": 8.855771970476834e-06, "loss": 0.2808, "step": 290 }, { "epoch": 0.9259259259259259, "grad_norm": 1.2030992581122257, "learning_rate": 8.738884384694905e-06, "loss": 0.2532, "step": 300 }, { "epoch": 0.9567901234567902, "grad_norm": 0.9236541991188757, "learning_rate": 8.617166520598563e-06, "loss": 0.2135, "step": 310 }, { "epoch": 0.9876543209876543, "grad_norm": 1.142242691308657, "learning_rate": 8.490775625938452e-06, "loss": 0.248, "step": 320 }, { "epoch": 1.0185185185185186, "grad_norm": 0.9358677809806952, "learning_rate": 8.359874985570378e-06, "loss": 0.156, "step": 330 }, { "epoch": 1.0493827160493827, "grad_norm": 0.9685063215738038, "learning_rate": 8.224633710506997e-06, "loss": 0.1483, "step": 340 }, { "epoch": 1.0802469135802468, "grad_norm": 0.9972139407992048, "learning_rate": 8.085226519442697e-06, "loss": 0.1773, "step": 350 }, { "epoch": 1.1111111111111112, "grad_norm": 0.7866994795730093, "learning_rate": 7.941833513033873e-06, "loss": 0.1674, "step": 360 }, { "epoch": 1.1419753086419753, "grad_norm": 1.3594591232427207, "learning_rate": 7.794639941226238e-06, "loss": 0.1996, "step": 370 }, { "epoch": 1.1728395061728394, "grad_norm": 1.336485058939724, "learning_rate": 7.643835963929747e-06, "loss": 0.1992, "step": 380 }, { "epoch": 1.2037037037037037, "grad_norm": 1.0894387345805672, "learning_rate": 7.489616405350319e-06, "loss": 0.2009, "step": 390 }, { "epoch": 1.2345679012345678, "grad_norm": 0.8521396408327715, "learning_rate": 7.332180502295729e-06, "loss": 0.1551, "step": 400 }, { "epoch": 1.2654320987654322, "grad_norm": 1.280235377415998, "learning_rate": 7.171731646780867e-06, "loss": 0.1903, "step": 410 }, { "epoch": 1.2962962962962963, "grad_norm": 1.0177820006481466, "learning_rate": 7.008477123264849e-06, "loss": 0.1919, "step": 420 }, { "epoch": 1.3271604938271606, "grad_norm": 1.3792449227303458, "learning_rate": 6.842627840859461e-06, "loss": 0.1653, "step": 430 }, { "epoch": 1.3580246913580247, "grad_norm": 0.9886652944346411, "learning_rate": 6.674398060854931e-06, "loss": 0.1528, "step": 440 }, { "epoch": 1.3888888888888888, "grad_norm": 0.9738749120684131, "learning_rate": 6.5040051199149755e-06, "loss": 0.1859, "step": 450 }, { "epoch": 1.4197530864197532, "grad_norm": 1.0499735220491768, "learning_rate": 6.331669149298781e-06, "loss": 0.2089, "step": 460 }, { "epoch": 1.4506172839506173, "grad_norm": 1.3564424657138714, "learning_rate": 6.157612790472626e-06, "loss": 0.1956, "step": 470 }, { "epoch": 1.4814814814814814, "grad_norm": 0.7943373995297914, "learning_rate": 5.982060907478568e-06, "loss": 0.1462, "step": 480 }, { "epoch": 1.5123456790123457, "grad_norm": 1.5557938987689983, "learning_rate": 5.805240296431765e-06, "loss": 0.2092, "step": 490 }, { "epoch": 1.5432098765432098, "grad_norm": 0.9522144954857849, "learning_rate": 5.627379392521758e-06, "loss": 0.1653, "step": 500 }, { "epoch": 1.5432098765432098, "eval_loss": 0.23522016406059265, "eval_runtime": 72.3023, "eval_samples_per_second": 7.967, "eval_steps_per_second": 7.967, "step": 500 }, { "epoch": 1.574074074074074, "grad_norm": 1.0277772820113307, "learning_rate": 5.448707974896214e-06, "loss": 0.1934, "step": 510 }, { "epoch": 1.6049382716049383, "grad_norm": 1.498458469156437, "learning_rate": 5.2694568698084085e-06, "loss": 0.2338, "step": 520 }, { "epoch": 1.6358024691358026, "grad_norm": 1.2380870016139613, "learning_rate": 5.089857652411961e-06, "loss": 0.1551, "step": 530 }, { "epoch": 1.6666666666666665, "grad_norm": 1.2431560615526187, "learning_rate": 4.910142347588041e-06, "loss": 0.211, "step": 540 }, { "epoch": 1.6975308641975309, "grad_norm": 0.8069111004848413, "learning_rate": 4.730543130191594e-06, "loss": 0.1598, "step": 550 }, { "epoch": 1.7283950617283952, "grad_norm": 0.748202151488271, "learning_rate": 4.551292025103789e-06, "loss": 0.1465, "step": 560 }, { "epoch": 1.7592592592592593, "grad_norm": 0.775409645399768, "learning_rate": 4.372620607478242e-06, "loss": 0.152, "step": 570 }, { "epoch": 1.7901234567901234, "grad_norm": 0.919987356247785, "learning_rate": 4.1947597035682355e-06, "loss": 0.2149, "step": 580 }, { "epoch": 1.8209876543209877, "grad_norm": 1.1312605755339111, "learning_rate": 4.017939092521434e-06, "loss": 0.1777, "step": 590 }, { "epoch": 1.8518518518518519, "grad_norm": 1.3626643893639394, "learning_rate": 3.842387209527374e-06, "loss": 0.1829, "step": 600 }, { "epoch": 1.882716049382716, "grad_norm": 1.227575534766062, "learning_rate": 3.6683308507012196e-06, "loss": 0.1799, "step": 610 }, { "epoch": 1.9135802469135803, "grad_norm": 1.2174251384264219, "learning_rate": 3.4959948800850253e-06, "loss": 0.1362, "step": 620 }, { "epoch": 1.9444444444444444, "grad_norm": 1.1078151820356543, "learning_rate": 3.3256019391450696e-06, "loss": 0.1562, "step": 630 }, { "epoch": 1.9753086419753085, "grad_norm": 1.0667445790104497, "learning_rate": 3.1573721591405405e-06, "loss": 0.1792, "step": 640 }, { "epoch": 2.006172839506173, "grad_norm": 0.9627653971639197, "learning_rate": 2.991522876735154e-06, "loss": 0.1473, "step": 650 }, { "epoch": 2.037037037037037, "grad_norm": 0.8420822083166241, "learning_rate": 2.8282683532191333e-06, "loss": 0.0994, "step": 660 }, { "epoch": 2.067901234567901, "grad_norm": 1.001599199308316, "learning_rate": 2.6678194977042727e-06, "loss": 0.0865, "step": 670 }, { "epoch": 2.0987654320987654, "grad_norm": 0.9784608307244814, "learning_rate": 2.5103835946496846e-06, "loss": 0.1042, "step": 680 }, { "epoch": 2.1296296296296298, "grad_norm": 1.258037980079001, "learning_rate": 2.3561640360702525e-06, "loss": 0.0687, "step": 690 }, { "epoch": 2.1604938271604937, "grad_norm": 1.0158830256095577, "learning_rate": 2.205360058773764e-06, "loss": 0.0784, "step": 700 }, { "epoch": 2.191358024691358, "grad_norm": 0.9515473229873379, "learning_rate": 2.058166486966128e-06, "loss": 0.1027, "step": 710 }, { "epoch": 2.2222222222222223, "grad_norm": 0.9896704595537134, "learning_rate": 1.914773480557304e-06, "loss": 0.0921, "step": 720 }, { "epoch": 2.253086419753086, "grad_norm": 1.200715350686628, "learning_rate": 1.775366289493003e-06, "loss": 0.0772, "step": 730 }, { "epoch": 2.2839506172839505, "grad_norm": 1.2889212786691133, "learning_rate": 1.6401250144296239e-06, "loss": 0.0951, "step": 740 }, { "epoch": 2.314814814814815, "grad_norm": 1.364590190580036, "learning_rate": 1.5092243740615486e-06, "loss": 0.0871, "step": 750 }, { "epoch": 2.3456790123456788, "grad_norm": 1.0124869207352833, "learning_rate": 1.382833479401438e-06, "loss": 0.099, "step": 760 }, { "epoch": 2.376543209876543, "grad_norm": 1.0379306151771268, "learning_rate": 1.2611156153050963e-06, "loss": 0.1116, "step": 770 }, { "epoch": 2.4074074074074074, "grad_norm": 0.9907248139219855, "learning_rate": 1.1442280295231656e-06, "loss": 0.0753, "step": 780 }, { "epoch": 2.4382716049382718, "grad_norm": 1.1280634693251803, "learning_rate": 1.0323217295522026e-06, "loss": 0.0875, "step": 790 }, { "epoch": 2.4691358024691357, "grad_norm": 0.9944383353033676, "learning_rate": 9.255412875475256e-07, "loss": 0.0926, "step": 800 }, { "epoch": 2.5, "grad_norm": 1.2340366986358158, "learning_rate": 8.240246535499369e-07, "loss": 0.0856, "step": 810 }, { "epoch": 2.5308641975308643, "grad_norm": 0.7578751117138942, "learning_rate": 7.279029772675572e-07, "loss": 0.0995, "step": 820 }, { "epoch": 2.5617283950617287, "grad_norm": 0.7485029876002154, "learning_rate": 6.373004386430442e-07, "loss": 0.0832, "step": 830 }, { "epoch": 2.5925925925925926, "grad_norm": 1.236625972178977, "learning_rate": 5.523340874250704e-07, "loss": 0.0927, "step": 840 }, { "epoch": 2.623456790123457, "grad_norm": 1.2710363913669391, "learning_rate": 4.7311369195133127e-07, "loss": 0.0865, "step": 850 }, { "epoch": 2.6543209876543212, "grad_norm": 0.7442084812404209, "learning_rate": 3.997415973384311e-07, "loss": 0.079, "step": 860 }, { "epoch": 2.685185185185185, "grad_norm": 1.0024538750298015, "learning_rate": 3.3231259326184983e-07, "loss": 0.0988, "step": 870 }, { "epoch": 2.7160493827160495, "grad_norm": 0.8295790721782684, "learning_rate": 2.7091379149682683e-07, "loss": 0.0809, "step": 880 }, { "epoch": 2.746913580246914, "grad_norm": 1.0737158797857456, "learning_rate": 2.156245133783308e-07, "loss": 0.0787, "step": 890 }, { "epoch": 2.7777777777777777, "grad_norm": 0.9634211271913754, "learning_rate": 1.6651618732554774e-07, "loss": 0.0692, "step": 900 }, { "epoch": 2.808641975308642, "grad_norm": 1.0387449050019724, "learning_rate": 1.2365225656324308e-07, "loss": 0.0831, "step": 910 }, { "epoch": 2.8395061728395063, "grad_norm": 1.284046717570844, "learning_rate": 8.708809715922973e-08, "loss": 0.0868, "step": 920 }, { "epoch": 2.8703703703703702, "grad_norm": 1.3674065078147544, "learning_rate": 5.687094648382518e-08, "loss": 0.1005, "step": 930 }, { "epoch": 2.9012345679012346, "grad_norm": 1.1040914867606508, "learning_rate": 3.303984218372136e-08, "loss": 0.1167, "step": 940 }, { "epoch": 2.932098765432099, "grad_norm": 1.093365577556806, "learning_rate": 1.562557174910606e-08, "loss": 0.0875, "step": 950 }, { "epoch": 2.962962962962963, "grad_norm": 1.034221852693663, "learning_rate": 4.650632739194305e-09, "loss": 0.0802, "step": 960 }, { "epoch": 2.993827160493827, "grad_norm": 1.1039194966989203, "learning_rate": 1.2920371754931994e-10, "loss": 0.0794, "step": 970 }, { "epoch": 3.0, "step": 972, "total_flos": 20295904002048.0, "train_loss": 0.19039062806117682, "train_runtime": 4309.0281, "train_samples_per_second": 3.607, "train_steps_per_second": 0.226 } ], "logging_steps": 10, "max_steps": 972, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 20295904002048.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }