{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998166819431714, "eval_steps": 500, "global_step": 2727, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 1.4652014652014653e-05, "loss": 0.0, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.0, "learning_rate": 2.9304029304029305e-05, "loss": 0.0, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.0, "learning_rate": 4.3956043956043955e-05, "loss": 0.0, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.0, "learning_rate": 5.860805860805861e-05, "loss": 0.0, "step": 80 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 7.326007326007326e-05, "loss": 0.0, "step": 100 }, { "epoch": 0.04, "grad_norm": 0.0, "learning_rate": 8.791208791208791e-05, "loss": 0.0, "step": 120 }, { "epoch": 0.05, "grad_norm": 0.0, "learning_rate": 0.00010256410256410256, "loss": 0.0, "step": 140 }, { "epoch": 0.06, "grad_norm": 0.0, "learning_rate": 0.00011721611721611722, "loss": 0.0, "step": 160 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00013186813186813188, "loss": 0.0, "step": 180 }, { "epoch": 0.07, "grad_norm": 0.0, "learning_rate": 0.00014652014652014652, "loss": 0.0, "step": 200 }, { "epoch": 0.08, "grad_norm": 0.0, "learning_rate": 0.00016117216117216118, "loss": 0.0, "step": 220 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 0.00017582417582417582, "loss": 0.0, "step": 240 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.00019047619047619048, "loss": 0.0, "step": 260 }, { "epoch": 0.1, "grad_norm": 0.0, "learning_rate": 0.00019942950285248574, "loss": 0.0, "step": 280 }, { "epoch": 0.11, "grad_norm": 0.0, "learning_rate": 0.000197799511002445, "loss": 0.0, "step": 300 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.00019616951915240425, "loss": 0.0, "step": 320 }, { "epoch": 0.12, "grad_norm": 0.0, "learning_rate": 0.00019453952730236348, "loss": 0.0, "step": 340 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 0.00019290953545232276, "loss": 0.0, "step": 360 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 0.00019127954360228199, "loss": 0.0, "step": 380 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.00018964955175224124, "loss": 0.0, "step": 400 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 0.0001880195599022005, "loss": 0.0, "step": 420 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 0.00018638956805215975, "loss": 0.0, "step": 440 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 0.000184759576202119, "loss": 0.0, "step": 460 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.00018312958435207826, "loss": 0.0, "step": 480 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 0.0001814995925020375, "loss": 0.0, "step": 500 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 0.00017986960065199674, "loss": 0.0, "step": 520 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 0.000178239608801956, "loss": 0.0, "step": 540 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.00017660961695191524, "loss": 0.0, "step": 560 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 0.0001749796251018745, "loss": 0.0, "step": 580 }, { "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 0.00017334963325183375, "loss": 0.0, "step": 600 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.000171719641401793, "loss": 0.0, "step": 620 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 0.00017008964955175223, "loss": 0.0, "step": 640 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 0.00016845965770171151, "loss": 0.0, "step": 660 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 0.00016682966585167074, "loss": 0.0, "step": 680 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.00016519967400163, "loss": 0.0, "step": 700 }, { "epoch": 0.26, "grad_norm": 0.0, "learning_rate": 0.00016356968215158925, "loss": 0.0, "step": 720 }, { "epoch": 0.27, "grad_norm": 0.0, "learning_rate": 0.0001619396903015485, "loss": 0.0, "step": 740 }, { "epoch": 0.28, "grad_norm": 0.0, "learning_rate": 0.00016030969845150773, "loss": 0.0, "step": 760 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.000158679706601467, "loss": 0.0, "step": 780 }, { "epoch": 0.29, "grad_norm": 0.0, "learning_rate": 0.00015704971475142624, "loss": 0.0, "step": 800 }, { "epoch": 0.3, "grad_norm": 0.0, "learning_rate": 0.00015541972290138552, "loss": 0.0, "step": 820 }, { "epoch": 0.31, "grad_norm": 0.0, "learning_rate": 0.00015378973105134475, "loss": 0.0, "step": 840 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.000152159739201304, "loss": 0.0, "step": 860 }, { "epoch": 0.32, "grad_norm": 0.0, "learning_rate": 0.00015052974735126325, "loss": 0.0, "step": 880 }, { "epoch": 0.33, "grad_norm": 0.0, "learning_rate": 0.0001488997555012225, "loss": 0.0, "step": 900 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.00014726976365118173, "loss": 0.0, "step": 920 }, { "epoch": 0.34, "grad_norm": 0.0, "learning_rate": 0.00014563977180114102, "loss": 0.0, "step": 940 }, { "epoch": 0.35, "grad_norm": 0.0, "learning_rate": 0.00014400977995110024, "loss": 0.0, "step": 960 }, { "epoch": 0.36, "grad_norm": 0.0, "learning_rate": 0.0001423797881010595, "loss": 0.0, "step": 980 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.00014074979625101875, "loss": 0.0, "step": 1000 }, { "epoch": 0.37, "grad_norm": 0.0, "learning_rate": 0.000139119804400978, "loss": 0.0, "step": 1020 }, { "epoch": 0.38, "grad_norm": 0.0, "learning_rate": 0.00013748981255093726, "loss": 0.0, "step": 1040 }, { "epoch": 0.39, "grad_norm": 0.0, "learning_rate": 0.0001358598207008965, "loss": 0.0, "step": 1060 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.00013422982885085577, "loss": 0.0, "step": 1080 }, { "epoch": 0.4, "grad_norm": 0.0, "learning_rate": 0.000132599837000815, "loss": 0.0, "step": 1100 }, { "epoch": 0.41, "grad_norm": 0.0, "learning_rate": 0.00013096984515077427, "loss": 0.0, "step": 1120 }, { "epoch": 0.42, "grad_norm": 0.0, "learning_rate": 0.0001293398533007335, "loss": 0.0, "step": 1140 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.00012770986145069276, "loss": 0.0, "step": 1160 }, { "epoch": 0.43, "grad_norm": 0.0, "learning_rate": 0.000126079869600652, "loss": 0.0, "step": 1180 }, { "epoch": 0.44, "grad_norm": 0.0, "learning_rate": 0.00012444987775061126, "loss": 0.0, "step": 1200 }, { "epoch": 0.45, "grad_norm": 0.0, "learning_rate": 0.0001228198859005705, "loss": 0.0, "step": 1220 }, { "epoch": 0.45, "grad_norm": 0.0, "learning_rate": 0.00012118989405052976, "loss": 0.0, "step": 1240 }, { "epoch": 0.46, "grad_norm": 0.0, "learning_rate": 0.000119559902200489, "loss": 0.0, "step": 1260 }, { "epoch": 0.47, "grad_norm": 0.0, "learning_rate": 0.00011792991035044825, "loss": 0.0, "step": 1280 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.0001162999185004075, "loss": 0.0, "step": 1300 }, { "epoch": 0.48, "grad_norm": 0.0, "learning_rate": 0.00011466992665036676, "loss": 0.0, "step": 1320 }, { "epoch": 0.49, "grad_norm": 0.0, "learning_rate": 0.000113039934800326, "loss": 0.0, "step": 1340 }, { "epoch": 0.5, "grad_norm": 0.0, "learning_rate": 0.00011140994295028527, "loss": 0.0, "step": 1360 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.00010977995110024451, "loss": 0.0, "step": 1380 }, { "epoch": 0.51, "grad_norm": 0.0, "learning_rate": 0.00010814995925020375, "loss": 0.0, "step": 1400 }, { "epoch": 0.52, "grad_norm": 0.0, "learning_rate": 0.00010651996740016302, "loss": 0.0, "step": 1420 }, { "epoch": 0.53, "grad_norm": 0.0, "learning_rate": 0.00010488997555012226, "loss": 0.0, "step": 1440 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.0001032599837000815, "loss": 0.0, "step": 1460 }, { "epoch": 0.54, "grad_norm": 0.0, "learning_rate": 0.00010162999185004076, "loss": 0.0, "step": 1480 }, { "epoch": 0.55, "grad_norm": 0.0, "learning_rate": 0.0001, "loss": 0.0, "step": 1500 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.837000814995926e-05, "loss": 0.0, "step": 1520 }, { "epoch": 0.56, "grad_norm": 0.0, "learning_rate": 9.67400162999185e-05, "loss": 0.0, "step": 1540 }, { "epoch": 0.57, "grad_norm": 0.0, "learning_rate": 9.511002444987775e-05, "loss": 0.0, "step": 1560 }, { "epoch": 0.58, "grad_norm": 0.0, "learning_rate": 9.348003259983701e-05, "loss": 0.0, "step": 1580 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 9.185004074979625e-05, "loss": 0.0, "step": 1600 }, { "epoch": 0.59, "grad_norm": 0.0, "learning_rate": 9.02200488997555e-05, "loss": 0.0, "step": 1620 }, { "epoch": 0.6, "grad_norm": 0.0, "learning_rate": 8.859005704971476e-05, "loss": 0.0, "step": 1640 }, { "epoch": 0.61, "grad_norm": 0.0, "learning_rate": 8.6960065199674e-05, "loss": 0.0, "step": 1660 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 8.533007334963325e-05, "loss": 0.0, "step": 1680 }, { "epoch": 0.62, "grad_norm": 0.0, "learning_rate": 8.37000814995925e-05, "loss": 0.0, "step": 1700 }, { "epoch": 0.63, "grad_norm": 0.0, "learning_rate": 8.207008964955176e-05, "loss": 0.0, "step": 1720 }, { "epoch": 0.64, "grad_norm": 0.0, "learning_rate": 8.044009779951101e-05, "loss": 0.0, "step": 1740 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 7.881010594947025e-05, "loss": 0.0, "step": 1760 }, { "epoch": 0.65, "grad_norm": 0.0, "learning_rate": 7.71801140994295e-05, "loss": 0.0, "step": 1780 }, { "epoch": 0.66, "grad_norm": 0.0, "learning_rate": 7.555012224938876e-05, "loss": 0.0, "step": 1800 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 7.392013039934801e-05, "loss": 0.0, "step": 1820 }, { "epoch": 0.67, "grad_norm": 0.0, "learning_rate": 7.229013854930725e-05, "loss": 0.0, "step": 1840 }, { "epoch": 0.68, "grad_norm": 0.0, "learning_rate": 7.066014669926651e-05, "loss": 0.0, "step": 1860 }, { "epoch": 0.69, "grad_norm": 0.0, "learning_rate": 6.903015484922576e-05, "loss": 0.0, "step": 1880 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 6.740016299918502e-05, "loss": 0.0, "step": 1900 }, { "epoch": 0.7, "grad_norm": 0.0, "learning_rate": 6.577017114914426e-05, "loss": 0.0, "step": 1920 }, { "epoch": 0.71, "grad_norm": 0.0, "learning_rate": 6.414017929910351e-05, "loss": 0.0, "step": 1940 }, { "epoch": 0.72, "grad_norm": 0.0, "learning_rate": 6.251018744906276e-05, "loss": 0.0, "step": 1960 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 6.0880195599022005e-05, "loss": 0.0, "step": 1980 }, { "epoch": 0.73, "grad_norm": 0.0, "learning_rate": 5.925020374898126e-05, "loss": 0.0, "step": 2000 }, { "epoch": 0.74, "grad_norm": 0.0, "learning_rate": 5.762021189894051e-05, "loss": 0.0, "step": 2020 }, { "epoch": 0.75, "grad_norm": 0.0, "learning_rate": 5.5990220048899754e-05, "loss": 0.0, "step": 2040 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 5.436022819885901e-05, "loss": 0.0, "step": 2060 }, { "epoch": 0.76, "grad_norm": 0.0, "learning_rate": 5.273023634881826e-05, "loss": 0.0, "step": 2080 }, { "epoch": 0.77, "grad_norm": 0.0, "learning_rate": 5.110024449877751e-05, "loss": 0.0, "step": 2100 }, { "epoch": 0.78, "grad_norm": 0.0, "learning_rate": 4.9470252648736756e-05, "loss": 0.0, "step": 2120 }, { "epoch": 0.78, "grad_norm": 0.0, "learning_rate": 4.784026079869601e-05, "loss": 0.0, "step": 2140 }, { "epoch": 0.79, "grad_norm": 0.0, "learning_rate": 4.6210268948655264e-05, "loss": 0.0, "step": 2160 }, { "epoch": 0.8, "grad_norm": 0.0, "learning_rate": 4.458027709861451e-05, "loss": 0.0, "step": 2180 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 4.295028524857376e-05, "loss": 0.0, "step": 2200 }, { "epoch": 0.81, "grad_norm": 0.0, "learning_rate": 4.132029339853301e-05, "loss": 0.0, "step": 2220 }, { "epoch": 0.82, "grad_norm": 0.0, "learning_rate": 3.969030154849226e-05, "loss": 0.0, "step": 2240 }, { "epoch": 0.83, "grad_norm": 0.0, "learning_rate": 3.8060309698451507e-05, "loss": 0.0, "step": 2260 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 3.643031784841076e-05, "loss": 0.0, "step": 2280 }, { "epoch": 0.84, "grad_norm": 0.0, "learning_rate": 3.480032599837001e-05, "loss": 0.0, "step": 2300 }, { "epoch": 0.85, "grad_norm": 0.0, "learning_rate": 3.3170334148329255e-05, "loss": 0.0, "step": 2320 }, { "epoch": 0.86, "grad_norm": 0.0, "learning_rate": 3.154034229828851e-05, "loss": 0.0, "step": 2340 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 2.991035044824776e-05, "loss": 0.0, "step": 2360 }, { "epoch": 0.87, "grad_norm": 0.0, "learning_rate": 2.8280358598207013e-05, "loss": 0.0, "step": 2380 }, { "epoch": 0.88, "grad_norm": 0.0, "learning_rate": 2.665036674816626e-05, "loss": 0.0, "step": 2400 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 2.5020374898125508e-05, "loss": 0.0, "step": 2420 }, { "epoch": 0.89, "grad_norm": 0.0, "learning_rate": 2.3390383048084762e-05, "loss": 0.0, "step": 2440 }, { "epoch": 0.9, "grad_norm": 0.0, "learning_rate": 2.1760391198044012e-05, "loss": 0.0, "step": 2460 }, { "epoch": 0.91, "grad_norm": 0.0, "learning_rate": 2.0130399348003263e-05, "loss": 0.0, "step": 2480 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 1.850040749796251e-05, "loss": 0.0, "step": 2500 }, { "epoch": 0.92, "grad_norm": 0.0, "learning_rate": 1.687041564792176e-05, "loss": 0.0, "step": 2520 }, { "epoch": 0.93, "grad_norm": 0.0, "learning_rate": 1.5240423797881013e-05, "loss": 0.0, "step": 2540 }, { "epoch": 0.94, "grad_norm": 0.0, "learning_rate": 1.361043194784026e-05, "loss": 0.0, "step": 2560 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 1.198044009779951e-05, "loss": 0.0, "step": 2580 }, { "epoch": 0.95, "grad_norm": 0.0, "learning_rate": 1.0350448247758763e-05, "loss": 0.0, "step": 2600 }, { "epoch": 0.96, "grad_norm": 0.0, "learning_rate": 8.720456397718012e-06, "loss": 0.0, "step": 2620 }, { "epoch": 0.97, "grad_norm": 0.0, "learning_rate": 7.090464547677262e-06, "loss": 0.0, "step": 2640 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 5.460472697636512e-06, "loss": 0.0, "step": 2660 }, { "epoch": 0.98, "grad_norm": 0.0, "learning_rate": 3.830480847595763e-06, "loss": 0.0, "step": 2680 }, { "epoch": 0.99, "grad_norm": 0.0, "learning_rate": 2.2004889975550126e-06, "loss": 0.0, "step": 2700 }, { "epoch": 1.0, "grad_norm": 0.0, "learning_rate": 5.704971475142625e-07, "loss": 0.0, "step": 2720 } ], "logging_steps": 20, "max_steps": 2727, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 67854207762432.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }