{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.46490004649000466, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004649000464900047, "grad_norm": 0.32953426241874695, "learning_rate": 6.666666666666667e-05, "loss": 1.2004, "step": 1 }, { "epoch": 0.009298000929800094, "grad_norm": 0.3239539563655853, "learning_rate": 0.00013333333333333334, "loss": 1.2289, "step": 2 }, { "epoch": 0.01394700139470014, "grad_norm": 0.3198412358760834, "learning_rate": 0.0002, "loss": 1.2456, "step": 3 }, { "epoch": 0.018596001859600187, "grad_norm": 0.3325466513633728, "learning_rate": 0.00019995280121409636, "loss": 1.2665, "step": 4 }, { "epoch": 0.023245002324500233, "grad_norm": 0.24871456623077393, "learning_rate": 0.00019981125436139405, "loss": 1.2468, "step": 5 }, { "epoch": 0.02789400278940028, "grad_norm": 0.2815144658088684, "learning_rate": 0.00019957550790499526, "loss": 1.3639, "step": 6 }, { "epoch": 0.032543003254300325, "grad_norm": 0.32101693749427795, "learning_rate": 0.00019924580911037827, "loss": 1.3038, "step": 7 }, { "epoch": 0.037192003719200374, "grad_norm": 0.3729206919670105, "learning_rate": 0.00019882250378605015, "loss": 1.2968, "step": 8 }, { "epoch": 0.04184100418410042, "grad_norm": 0.6571431159973145, "learning_rate": 0.0001983060359208415, "loss": 1.3009, "step": 9 }, { "epoch": 0.046490004649000466, "grad_norm": 0.7671701908111572, "learning_rate": 0.00019769694721822337, "loss": 1.3118, "step": 10 }, { "epoch": 0.05113900511390051, "grad_norm": 1.0644092559814453, "learning_rate": 0.00019699587652813503, "loss": 1.1329, "step": 11 }, { "epoch": 0.05578800557880056, "grad_norm": 1.1345399618148804, "learning_rate": 0.00019620355917691884, "loss": 0.9978, "step": 12 }, { "epoch": 0.06043700604370061, "grad_norm": 0.7588826417922974, "learning_rate": 0.00019532082619606436, "loss": 1.0705, "step": 13 }, { "epoch": 0.06508600650860065, "grad_norm": 0.48238641023635864, "learning_rate": 0.00019434860345057096, "loss": 1.1014, "step": 14 }, { "epoch": 0.0697350069735007, "grad_norm": 0.4875181317329407, "learning_rate": 0.0001932879106678434, "loss": 1.1474, "step": 15 }, { "epoch": 0.07438400743840075, "grad_norm": 0.4862312972545624, "learning_rate": 0.00019213986036813863, "loss": 1.1379, "step": 16 }, { "epoch": 0.07903300790330078, "grad_norm": 0.3827863931655884, "learning_rate": 0.0001909056566976856, "loss": 1.184, "step": 17 }, { "epoch": 0.08368200836820083, "grad_norm": 0.27560049295425415, "learning_rate": 0.00018958659416570212, "loss": 1.2394, "step": 18 }, { "epoch": 0.08833100883310088, "grad_norm": 0.2506018877029419, "learning_rate": 0.0001881840562866336, "loss": 1.2103, "step": 19 }, { "epoch": 0.09298000929800093, "grad_norm": 0.27463746070861816, "learning_rate": 0.00018669951412903725, "loss": 1.1667, "step": 20 }, { "epoch": 0.09762900976290098, "grad_norm": 0.39848339557647705, "learning_rate": 0.0001851345247726344, "loss": 1.199, "step": 21 }, { "epoch": 0.10227801022780102, "grad_norm": 0.4772193729877472, "learning_rate": 0.00018349072967514896, "loss": 1.0961, "step": 22 }, { "epoch": 0.10692701069270107, "grad_norm": 0.5459783673286438, "learning_rate": 0.00018176985295064487, "loss": 1.0476, "step": 23 }, { "epoch": 0.11157601115760112, "grad_norm": 0.4945487082004547, "learning_rate": 0.00017997369956116845, "loss": 1.0457, "step": 24 }, { "epoch": 0.11622501162250116, "grad_norm": 0.6709153652191162, "learning_rate": 0.00017810415342359257, "loss": 0.9617, "step": 25 }, { "epoch": 0.12087401208740121, "grad_norm": 0.11191853880882263, "learning_rate": 0.00017616317543364804, "loss": 1.0668, "step": 26 }, { "epoch": 0.12552301255230125, "grad_norm": 0.10844499617815018, "learning_rate": 0.00017415280140921463, "loss": 1.0769, "step": 27 }, { "epoch": 0.1301720130172013, "grad_norm": 0.10678815096616745, "learning_rate": 0.00017207513995502939, "loss": 1.1198, "step": 28 }, { "epoch": 0.13482101348210135, "grad_norm": 0.12248999625444412, "learning_rate": 0.0001699323702510513, "loss": 1.1554, "step": 29 }, { "epoch": 0.1394700139470014, "grad_norm": 0.17909854650497437, "learning_rate": 0.0001677267397668026, "loss": 1.1657, "step": 30 }, { "epoch": 0.14411901441190145, "grad_norm": 0.268684446811676, "learning_rate": 0.0001654605619040835, "loss": 1.1791, "step": 31 }, { "epoch": 0.1487680148768015, "grad_norm": 0.3540749251842499, "learning_rate": 0.00016313621357053306, "loss": 1.1892, "step": 32 }, { "epoch": 0.15341701534170155, "grad_norm": 0.3607785999774933, "learning_rate": 0.00016075613268658157, "loss": 1.1991, "step": 33 }, { "epoch": 0.15806601580660157, "grad_norm": 0.35358941555023193, "learning_rate": 0.00015832281562840856, "loss": 1.0899, "step": 34 }, { "epoch": 0.16271501627150162, "grad_norm": 0.40458086133003235, "learning_rate": 0.00015583881460958868, "loss": 1.0584, "step": 35 }, { "epoch": 0.16736401673640167, "grad_norm": 0.4372604191303253, "learning_rate": 0.0001533067350041725, "loss": 0.9016, "step": 36 }, { "epoch": 0.17201301720130172, "grad_norm": 0.6177683472633362, "learning_rate": 0.0001507292326140085, "loss": 0.9031, "step": 37 }, { "epoch": 0.17666201766620176, "grad_norm": 0.6434624791145325, "learning_rate": 0.00014810901088317414, "loss": 1.0235, "step": 38 }, { "epoch": 0.18131101813110181, "grad_norm": 0.09565520286560059, "learning_rate": 0.00014544881806243583, "loss": 1.095, "step": 39 }, { "epoch": 0.18596001859600186, "grad_norm": 0.10863650590181351, "learning_rate": 0.0001427514443267139, "loss": 1.0956, "step": 40 }, { "epoch": 0.1906090190609019, "grad_norm": 0.09909647703170776, "learning_rate": 0.0001400197188485739, "loss": 1.1065, "step": 41 }, { "epoch": 0.19525801952580196, "grad_norm": 0.09885207563638687, "learning_rate": 0.00013725650683081556, "loss": 1.1516, "step": 42 }, { "epoch": 0.199907019990702, "grad_norm": 0.10642173141241074, "learning_rate": 0.0001344647065012709, "loss": 1.1673, "step": 43 }, { "epoch": 0.20455602045560203, "grad_norm": 0.14314843714237213, "learning_rate": 0.00013164724607296285, "loss": 1.1498, "step": 44 }, { "epoch": 0.20920502092050208, "grad_norm": 0.17045779526233673, "learning_rate": 0.00012880708067281477, "loss": 1.1511, "step": 45 }, { "epoch": 0.21385402138540213, "grad_norm": 0.2381921112537384, "learning_rate": 0.00012594718924213008, "loss": 1.0835, "step": 46 }, { "epoch": 0.21850302185030218, "grad_norm": 0.3426450192928314, "learning_rate": 0.00012307057141209415, "loss": 1.0434, "step": 47 }, { "epoch": 0.22315202231520223, "grad_norm": 0.4425382912158966, "learning_rate": 0.0001201802443575756, "loss": 0.9782, "step": 48 }, { "epoch": 0.22780102278010228, "grad_norm": 0.47294408082962036, "learning_rate": 0.0001172792396325264, "loss": 0.8836, "step": 49 }, { "epoch": 0.23245002324500233, "grad_norm": 0.5693187713623047, "learning_rate": 0.00011437059999030035, "loss": 0.9676, "step": 50 }, { "epoch": 0.23709902370990238, "grad_norm": 0.14201653003692627, "learning_rate": 0.00011145737619222516, "loss": 1.0578, "step": 51 }, { "epoch": 0.24174802417480243, "grad_norm": 0.15487229824066162, "learning_rate": 0.00010854262380777486, "loss": 1.0633, "step": 52 }, { "epoch": 0.24639702463970248, "grad_norm": 0.14283418655395508, "learning_rate": 0.0001056294000096997, "loss": 1.0856, "step": 53 }, { "epoch": 0.2510460251046025, "grad_norm": 0.14647558331489563, "learning_rate": 0.00010272076036747365, "loss": 1.123, "step": 54 }, { "epoch": 0.2556950255695026, "grad_norm": 0.16010554134845734, "learning_rate": 9.981975564242443e-05, "loss": 1.1116, "step": 55 }, { "epoch": 0.2603440260344026, "grad_norm": 0.14617392420768738, "learning_rate": 9.692942858790591e-05, "loss": 1.1356, "step": 56 }, { "epoch": 0.2649930264993027, "grad_norm": 0.17232324182987213, "learning_rate": 9.405281075786995e-05, "loss": 1.112, "step": 57 }, { "epoch": 0.2696420269642027, "grad_norm": 0.18721264600753784, "learning_rate": 9.119291932718525e-05, "loss": 1.1253, "step": 58 }, { "epoch": 0.2742910274291027, "grad_norm": 0.23640522360801697, "learning_rate": 8.835275392703721e-05, "loss": 1.0711, "step": 59 }, { "epoch": 0.2789400278940028, "grad_norm": 0.29225069284439087, "learning_rate": 8.553529349872916e-05, "loss": 0.9971, "step": 60 }, { "epoch": 0.2835890283589028, "grad_norm": 0.36253005266189575, "learning_rate": 8.274349316918446e-05, "loss": 0.9234, "step": 61 }, { "epoch": 0.2882380288238029, "grad_norm": 0.38477060198783875, "learning_rate": 7.998028115142617e-05, "loss": 0.8265, "step": 62 }, { "epoch": 0.2928870292887029, "grad_norm": 0.49141454696655273, "learning_rate": 7.724855567328613e-05, "loss": 0.9709, "step": 63 }, { "epoch": 0.297536029753603, "grad_norm": 0.09308861941099167, "learning_rate": 7.455118193756419e-05, "loss": 1.0642, "step": 64 }, { "epoch": 0.302185030218503, "grad_norm": 0.11423233896493912, "learning_rate": 7.189098911682592e-05, "loss": 1.0881, "step": 65 }, { "epoch": 0.3068340306834031, "grad_norm": 0.11385729163885117, "learning_rate": 6.927076738599152e-05, "loss": 1.0958, "step": 66 }, { "epoch": 0.3114830311483031, "grad_norm": 0.16382458806037903, "learning_rate": 6.669326499582755e-05, "loss": 1.1432, "step": 67 }, { "epoch": 0.31613203161320313, "grad_norm": 0.13493482768535614, "learning_rate": 6.416118539041135e-05, "loss": 1.1444, "step": 68 }, { "epoch": 0.3207810320781032, "grad_norm": 0.18623846769332886, "learning_rate": 6.167718437159147e-05, "loss": 1.1724, "step": 69 }, { "epoch": 0.32543003254300323, "grad_norm": 0.16176700592041016, "learning_rate": 5.924386731341842e-05, "loss": 1.0617, "step": 70 }, { "epoch": 0.3300790330079033, "grad_norm": 0.20845215022563934, "learning_rate": 5.686378642946699e-05, "loss": 1.0322, "step": 71 }, { "epoch": 0.33472803347280333, "grad_norm": 0.22166885435581207, "learning_rate": 5.453943809591654e-05, "loss": 0.9453, "step": 72 }, { "epoch": 0.3393770339377034, "grad_norm": 0.349749356508255, "learning_rate": 5.227326023319743e-05, "loss": 0.9271, "step": 73 }, { "epoch": 0.34402603440260343, "grad_norm": 0.3437642753124237, "learning_rate": 5.006762974894872e-05, "loss": 0.874, "step": 74 }, { "epoch": 0.3486750348675035, "grad_norm": 0.4277116656303406, "learning_rate": 4.7924860044970615e-05, "loss": 0.8426, "step": 75 }, { "epoch": 0.35332403533240353, "grad_norm": 0.0775565356016159, "learning_rate": 4.5847198590785394e-05, "loss": 1.0357, "step": 76 }, { "epoch": 0.3579730357973036, "grad_norm": 0.07587376981973648, "learning_rate": 4.383682456635199e-05, "loss": 1.0816, "step": 77 }, { "epoch": 0.36262203626220363, "grad_norm": 0.0855608880519867, "learning_rate": 4.1895846576407424e-05, "loss": 1.095, "step": 78 }, { "epoch": 0.36727103672710365, "grad_norm": 0.08497363328933716, "learning_rate": 4.002630043883159e-05, "loss": 1.1268, "step": 79 }, { "epoch": 0.3719200371920037, "grad_norm": 0.0938296765089035, "learning_rate": 3.8230147049355147e-05, "loss": 1.1187, "step": 80 }, { "epoch": 0.37656903765690375, "grad_norm": 0.09680171310901642, "learning_rate": 3.650927032485101e-05, "loss": 1.1838, "step": 81 }, { "epoch": 0.3812180381218038, "grad_norm": 0.11150510609149933, "learning_rate": 3.486547522736562e-05, "loss": 1.1177, "step": 82 }, { "epoch": 0.38586703858670385, "grad_norm": 0.13839669525623322, "learning_rate": 3.3300485870962776e-05, "loss": 1.087, "step": 83 }, { "epoch": 0.3905160390516039, "grad_norm": 0.1885904222726822, "learning_rate": 3.1815943713366404e-05, "loss": 1.0186, "step": 84 }, { "epoch": 0.39516503951650395, "grad_norm": 0.243554025888443, "learning_rate": 3.041340583429789e-05, "loss": 0.8965, "step": 85 }, { "epoch": 0.399814039981404, "grad_norm": 0.27477556467056274, "learning_rate": 2.9094343302314432e-05, "loss": 0.918, "step": 86 }, { "epoch": 0.40446304044630405, "grad_norm": 0.31194981932640076, "learning_rate": 2.78601396318614e-05, "loss": 0.7524, "step": 87 }, { "epoch": 0.40911204091120407, "grad_norm": 0.3639957308769226, "learning_rate": 2.6712089332156633e-05, "loss": 0.9365, "step": 88 }, { "epoch": 0.41376104137610414, "grad_norm": 0.08689970523118973, "learning_rate": 2.5651396549429086e-05, "loss": 1.0602, "step": 89 }, { "epoch": 0.41841004184100417, "grad_norm": 0.08937124907970428, "learning_rate": 2.4679173803935662e-05, "loss": 1.0554, "step": 90 }, { "epoch": 0.42305904230590424, "grad_norm": 0.09696152806282043, "learning_rate": 2.3796440823081167e-05, "loss": 1.1068, "step": 91 }, { "epoch": 0.42770804277080426, "grad_norm": 0.09235560148954391, "learning_rate": 2.3004123471865e-05, "loss": 1.1125, "step": 92 }, { "epoch": 0.43235704323570434, "grad_norm": 0.10225149989128113, "learning_rate": 2.2303052781776664e-05, "loss": 1.1077, "step": 93 }, { "epoch": 0.43700604370060436, "grad_norm": 0.11181271821260452, "learning_rate": 2.169396407915849e-05, "loss": 1.1302, "step": 94 }, { "epoch": 0.44165504416550444, "grad_norm": 0.14333005249500275, "learning_rate": 2.1177496213949837e-05, "loss": 1.1268, "step": 95 }, { "epoch": 0.44630404463040446, "grad_norm": 0.19770824909210205, "learning_rate": 2.0754190889621745e-05, "loss": 1.0529, "step": 96 }, { "epoch": 0.4509530450953045, "grad_norm": 0.2224006950855255, "learning_rate": 2.0424492095004746e-05, "loss": 0.9715, "step": 97 }, { "epoch": 0.45560204556020456, "grad_norm": 0.26830530166625977, "learning_rate": 2.0188745638605954e-05, "loss": 0.8829, "step": 98 }, { "epoch": 0.4602510460251046, "grad_norm": 0.2952977120876312, "learning_rate": 2.0047198785903658e-05, "loss": 0.8337, "step": 99 }, { "epoch": 0.46490004649000466, "grad_norm": 0.5423007607460022, "learning_rate": 2e-05, "loss": 0.8743, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.384528854810624e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }