| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.004016908381793129, |
| "eval_steps": 102, |
| "global_step": 86, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 4.6708236997594524e-05, |
| "grad_norm": 0.24815663695335388, |
| "learning_rate": 0.0, |
| "loss": 1.5212, |
| "mean_token_accuracy": 0.6624278724193573, |
| "num_tokens": 6610.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 9.341647399518905e-05, |
| "grad_norm": 0.22391362488269806, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.3169, |
| "mean_token_accuracy": 0.6899301111698151, |
| "num_tokens": 13855.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.00014012471099278357, |
| "grad_norm": 0.24716292321681976, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.5019, |
| "mean_token_accuracy": 0.6771045923233032, |
| "num_tokens": 19204.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0001868329479903781, |
| "grad_norm": 0.20678554475307465, |
| "learning_rate": 3e-06, |
| "loss": 1.2039, |
| "mean_token_accuracy": 0.7312012910842896, |
| "num_tokens": 27787.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.00023354118498797262, |
| "grad_norm": 0.16259518265724182, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.234, |
| "mean_token_accuracy": 0.7289687991142273, |
| "num_tokens": 38962.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.00028024942198556714, |
| "grad_norm": 0.18290941417217255, |
| "learning_rate": 5e-06, |
| "loss": 1.3001, |
| "mean_token_accuracy": 0.685863584280014, |
| "num_tokens": 46738.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.00032695765898316167, |
| "grad_norm": 0.22102956473827362, |
| "learning_rate": 4.998754476483098e-06, |
| "loss": 1.1942, |
| "mean_token_accuracy": 0.7294453680515289, |
| "num_tokens": 54364.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0003736658959807562, |
| "grad_norm": 0.23882372677326202, |
| "learning_rate": 4.9950192123145654e-06, |
| "loss": 1.3233, |
| "mean_token_accuracy": 0.7093064486980438, |
| "num_tokens": 60646.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0004203741329783507, |
| "grad_norm": 0.20760418474674225, |
| "learning_rate": 4.988798125270709e-06, |
| "loss": 1.8058, |
| "mean_token_accuracy": 0.6353311538696289, |
| "num_tokens": 70548.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.00046708236997594524, |
| "grad_norm": 0.18925875425338745, |
| "learning_rate": 4.980097740412761e-06, |
| "loss": 1.6046, |
| "mean_token_accuracy": 0.6708937883377075, |
| "num_tokens": 78213.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0005137906069735398, |
| "grad_norm": 0.2128538340330124, |
| "learning_rate": 4.968927183242991e-06, |
| "loss": 1.4572, |
| "mean_token_accuracy": 0.6536244750022888, |
| "num_tokens": 85224.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.0005604988439711343, |
| "grad_norm": 0.19441735744476318, |
| "learning_rate": 4.955298170133318e-06, |
| "loss": 1.2837, |
| "mean_token_accuracy": 0.702611893415451, |
| "num_tokens": 93493.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0006072070809687288, |
| "grad_norm": 0.2139551192522049, |
| "learning_rate": 4.93922499603645e-06, |
| "loss": 1.2754, |
| "mean_token_accuracy": 0.6683319509029388, |
| "num_tokens": 102255.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.0006539153179663233, |
| "grad_norm": 0.2441701591014862, |
| "learning_rate": 4.920724519492452e-06, |
| "loss": 1.3914, |
| "mean_token_accuracy": 0.6826636791229248, |
| "num_tokens": 109224.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.0007006235549639179, |
| "grad_norm": 0.21002362668514252, |
| "learning_rate": 4.89981614494647e-06, |
| "loss": 1.7201, |
| "mean_token_accuracy": 0.6224022209644318, |
| "num_tokens": 115222.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0007473317919615124, |
| "grad_norm": 0.22687377035617828, |
| "learning_rate": 4.876521802396143e-06, |
| "loss": 1.5563, |
| "mean_token_accuracy": 0.6480969190597534, |
| "num_tokens": 121037.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.0007940400289591069, |
| "grad_norm": 0.23102138936519623, |
| "learning_rate": 4.850865924390067e-06, |
| "loss": 1.9085, |
| "mean_token_accuracy": 0.5840953290462494, |
| "num_tokens": 126199.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0008407482659567014, |
| "grad_norm": 0.20352718234062195, |
| "learning_rate": 4.822875420401423e-06, |
| "loss": 1.6604, |
| "mean_token_accuracy": 0.6558521389961243, |
| "num_tokens": 133602.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.000887456502954296, |
| "grad_norm": 0.2476186901330948, |
| "learning_rate": 4.792579648603658e-06, |
| "loss": 1.3569, |
| "mean_token_accuracy": 0.6907783448696136, |
| "num_tokens": 139069.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.0009341647399518905, |
| "grad_norm": 0.2237555831670761, |
| "learning_rate": 4.760010385077814e-06, |
| "loss": 1.3358, |
| "mean_token_accuracy": 0.6829327940940857, |
| "num_tokens": 144727.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.000980872976949485, |
| "grad_norm": 0.1982390135526657, |
| "learning_rate": 4.725201790483807e-06, |
| "loss": 1.3553, |
| "mean_token_accuracy": 0.6921784579753876, |
| "num_tokens": 151639.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.0010275812139470795, |
| "grad_norm": 0.21326391398906708, |
| "learning_rate": 4.688190374230609e-06, |
| "loss": 1.2572, |
| "mean_token_accuracy": 0.7189462780952454, |
| "num_tokens": 160851.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.0010742894509446742, |
| "grad_norm": 0.2238379716873169, |
| "learning_rate": 4.649014956182927e-06, |
| "loss": 1.5684, |
| "mean_token_accuracy": 0.6826211810112, |
| "num_tokens": 167413.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.0011209976879422686, |
| "grad_norm": 0.20658670365810394, |
| "learning_rate": 4.607716625944519e-06, |
| "loss": 1.3036, |
| "mean_token_accuracy": 0.6903411149978638, |
| "num_tokens": 174102.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.0011677059249398632, |
| "grad_norm": 0.21213635802268982, |
| "learning_rate": 4.5643386997608765e-06, |
| "loss": 1.4276, |
| "mean_token_accuracy": 0.6844733953475952, |
| "num_tokens": 182236.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0012144141619374576, |
| "grad_norm": 0.24115009605884552, |
| "learning_rate": 4.518926675086462e-06, |
| "loss": 1.9658, |
| "mean_token_accuracy": 0.5898375511169434, |
| "num_tokens": 189613.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.0012611223989350522, |
| "grad_norm": 0.22647137939929962, |
| "learning_rate": 4.471528182864168e-06, |
| "loss": 1.5963, |
| "mean_token_accuracy": 0.6704545319080353, |
| "num_tokens": 196900.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.0013078306359326467, |
| "grad_norm": 0.17949607968330383, |
| "learning_rate": 4.422192937567027e-06, |
| "loss": 1.2397, |
| "mean_token_accuracy": 0.7182678580284119, |
| "num_tokens": 204513.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.0013545388729302413, |
| "grad_norm": 0.2529963552951813, |
| "learning_rate": 4.3709726850546015e-06, |
| "loss": 1.7072, |
| "mean_token_accuracy": 0.6386008858680725, |
| "num_tokens": 210803.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.0014012471099278357, |
| "grad_norm": 0.1803470253944397, |
| "learning_rate": 4.3179211482987196e-06, |
| "loss": 1.2488, |
| "mean_token_accuracy": 0.7105345129966736, |
| "num_tokens": 218783.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.0014479553469254303, |
| "grad_norm": 0.1690954566001892, |
| "learning_rate": 4.2630939710354985e-06, |
| "loss": 1.1768, |
| "mean_token_accuracy": 0.7205143570899963, |
| "num_tokens": 228472.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.0014946635839230248, |
| "grad_norm": 0.2781330943107605, |
| "learning_rate": 4.206548659402743e-06, |
| "loss": 1.2211, |
| "mean_token_accuracy": 0.7092916369438171, |
| "num_tokens": 233617.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.0015413718209206194, |
| "grad_norm": 0.194204643368721, |
| "learning_rate": 4.148344521623957e-06, |
| "loss": 1.4836, |
| "mean_token_accuracy": 0.6891669929027557, |
| "num_tokens": 241676.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.0015880800579182138, |
| "grad_norm": 0.1841108798980713, |
| "learning_rate": 4.088542605802202e-06, |
| "loss": 1.3113, |
| "mean_token_accuracy": 0.7001610100269318, |
| "num_tokens": 251517.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.0016347882949158084, |
| "grad_norm": 0.19068853557109833, |
| "learning_rate": 4.0272056358890665e-06, |
| "loss": 1.3375, |
| "mean_token_accuracy": 0.7067949175834656, |
| "num_tokens": 260718.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.0016814965319134029, |
| "grad_norm": 0.22810104489326477, |
| "learning_rate": 3.964397945895903e-06, |
| "loss": 1.5774, |
| "mean_token_accuracy": 0.6743293404579163, |
| "num_tokens": 267063.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.0017282047689109975, |
| "grad_norm": 0.2199760228395462, |
| "learning_rate": 3.900185412416337e-06, |
| "loss": 1.7639, |
| "mean_token_accuracy": 0.6289554536342621, |
| "num_tokens": 274278.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.001774913005908592, |
| "grad_norm": 0.23150336742401123, |
| "learning_rate": 3.834635385530813e-06, |
| "loss": 2.4011, |
| "mean_token_accuracy": 0.5110038071870804, |
| "num_tokens": 279752.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.0018216212429061865, |
| "grad_norm": 0.2343943864107132, |
| "learning_rate": 3.7678166181656624e-06, |
| "loss": 1.3846, |
| "mean_token_accuracy": 0.6946818828582764, |
| "num_tokens": 287145.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.001868329479903781, |
| "grad_norm": 0.1842251569032669, |
| "learning_rate": 3.6997991939807804e-06, |
| "loss": 1.5058, |
| "mean_token_accuracy": 0.6877120733261108, |
| "num_tokens": 295574.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0019150377169013756, |
| "grad_norm": 0.21155694127082825, |
| "learning_rate": 3.63065445386154e-06, |
| "loss": 1.7073, |
| "mean_token_accuracy": 0.6352330446243286, |
| "num_tokens": 302707.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.00196174595389897, |
| "grad_norm": 0.25946730375289917, |
| "learning_rate": 3.5604549210920576e-06, |
| "loss": 1.5031, |
| "mean_token_accuracy": 0.6669047772884369, |
| "num_tokens": 307612.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.0020084541908965644, |
| "grad_norm": 0.18682092428207397, |
| "learning_rate": 3.489274225288284e-06, |
| "loss": 1.3015, |
| "mean_token_accuracy": 0.7169822454452515, |
| "num_tokens": 319133.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.002055162427894159, |
| "grad_norm": 0.2042531669139862, |
| "learning_rate": 3.4171870251706995e-06, |
| "loss": 2.0483, |
| "mean_token_accuracy": 0.5778546035289764, |
| "num_tokens": 326268.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.0021018706648917537, |
| "grad_norm": 0.2018388956785202, |
| "learning_rate": 3.344268930257633e-06, |
| "loss": 1.4825, |
| "mean_token_accuracy": 0.6688913702964783, |
| "num_tokens": 332923.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.0021485789018893483, |
| "grad_norm": 0.21121808886528015, |
| "learning_rate": 3.2705964215613145e-06, |
| "loss": 1.4353, |
| "mean_token_accuracy": 0.68389692902565, |
| "num_tokens": 338889.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.0021952871388869425, |
| "grad_norm": 0.2037765383720398, |
| "learning_rate": 3.196246771369853e-06, |
| "loss": 1.1499, |
| "mean_token_accuracy": 0.7377910912036896, |
| "num_tokens": 346176.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.002241995375884537, |
| "grad_norm": 0.1926920861005783, |
| "learning_rate": 3.121297962199279e-06, |
| "loss": 1.4606, |
| "mean_token_accuracy": 0.6756649613380432, |
| "num_tokens": 354291.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.0022887036128821318, |
| "grad_norm": 0.23878051340579987, |
| "learning_rate": 3.0458286050006548e-06, |
| "loss": 1.6098, |
| "mean_token_accuracy": 0.6611288487911224, |
| "num_tokens": 361893.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.0023354118498797264, |
| "grad_norm": 0.2237703949213028, |
| "learning_rate": 2.96991785670804e-06, |
| "loss": 1.7951, |
| "mean_token_accuracy": 0.6226321458816528, |
| "num_tokens": 369849.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.0023821200868773206, |
| "grad_norm": 0.21574032306671143, |
| "learning_rate": 2.8936453372138006e-06, |
| "loss": 1.2809, |
| "mean_token_accuracy": 0.71575528383255, |
| "num_tokens": 376459.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.0024288283238749152, |
| "grad_norm": 0.1896604299545288, |
| "learning_rate": 2.8170910458583355e-06, |
| "loss": 1.7823, |
| "mean_token_accuracy": 0.6189461052417755, |
| "num_tokens": 383519.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.00247553656087251, |
| "grad_norm": 0.17899923026561737, |
| "learning_rate": 2.740335277521815e-06, |
| "loss": 1.6108, |
| "mean_token_accuracy": 0.6556365489959717, |
| "num_tokens": 391616.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.0025222447978701045, |
| "grad_norm": 0.22684413194656372, |
| "learning_rate": 2.6634585384059415e-06, |
| "loss": 1.7163, |
| "mean_token_accuracy": 0.629599392414093, |
| "num_tokens": 399812.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.002568953034867699, |
| "grad_norm": 0.18395479023456573, |
| "learning_rate": 2.5865414615940594e-06, |
| "loss": 1.2312, |
| "mean_token_accuracy": 0.7152631878852844, |
| "num_tokens": 407387.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.0026156612718652933, |
| "grad_norm": 0.17716501653194427, |
| "learning_rate": 2.509664722478186e-06, |
| "loss": 1.6189, |
| "mean_token_accuracy": 0.6399194896221161, |
| "num_tokens": 415514.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.002662369508862888, |
| "grad_norm": 0.2080097496509552, |
| "learning_rate": 2.4329089541416655e-06, |
| "loss": 1.5846, |
| "mean_token_accuracy": 0.663497120141983, |
| "num_tokens": 422039.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.0027090777458604826, |
| "grad_norm": 0.19784171879291534, |
| "learning_rate": 2.3563546627862e-06, |
| "loss": 1.3426, |
| "mean_token_accuracy": 0.6939470767974854, |
| "num_tokens": 428566.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.0027557859828580772, |
| "grad_norm": 0.1710215061903, |
| "learning_rate": 2.2800821432919614e-06, |
| "loss": 1.5268, |
| "mean_token_accuracy": 0.6385573744773865, |
| "num_tokens": 436877.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.0028024942198556714, |
| "grad_norm": 0.19530710577964783, |
| "learning_rate": 2.204171394999346e-06, |
| "loss": 1.7253, |
| "mean_token_accuracy": 0.6284772753715515, |
| "num_tokens": 444515.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.002849202456853266, |
| "grad_norm": 0.20333561301231384, |
| "learning_rate": 2.1287020378007216e-06, |
| "loss": 1.6405, |
| "mean_token_accuracy": 0.6564561724662781, |
| "num_tokens": 450241.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.0028959106938508607, |
| "grad_norm": 0.24220742285251617, |
| "learning_rate": 2.0537532286301483e-06, |
| "loss": 1.7357, |
| "mean_token_accuracy": 0.613721638917923, |
| "num_tokens": 455245.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.0029426189308484553, |
| "grad_norm": 0.25850263237953186, |
| "learning_rate": 1.9794035784386857e-06, |
| "loss": 1.2092, |
| "mean_token_accuracy": 0.7280539572238922, |
| "num_tokens": 461165.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.0029893271678460495, |
| "grad_norm": 0.1687914878129959, |
| "learning_rate": 1.9057310697423676e-06, |
| "loss": 1.4988, |
| "mean_token_accuracy": 0.6855064630508423, |
| "num_tokens": 470696.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.003036035404843644, |
| "grad_norm": 0.22230161726474762, |
| "learning_rate": 1.8328129748293017e-06, |
| "loss": 1.9035, |
| "mean_token_accuracy": 0.5855222791433334, |
| "num_tokens": 475858.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.0030827436418412388, |
| "grad_norm": 0.21983900666236877, |
| "learning_rate": 1.7607257747117174e-06, |
| "loss": 1.7954, |
| "mean_token_accuracy": 0.6393535435199738, |
| "num_tokens": 481280.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.0031294518788388334, |
| "grad_norm": 0.28015828132629395, |
| "learning_rate": 1.6895450789079434e-06, |
| "loss": 1.254, |
| "mean_token_accuracy": 0.7204155027866364, |
| "num_tokens": 488210.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.0031761601158364276, |
| "grad_norm": 0.21209849417209625, |
| "learning_rate": 1.6193455461384617e-06, |
| "loss": 1.9336, |
| "mean_token_accuracy": 0.6008668541908264, |
| "num_tokens": 494581.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.0032228683528340222, |
| "grad_norm": 0.25260934233665466, |
| "learning_rate": 1.5502008060192202e-06, |
| "loss": 1.7713, |
| "mean_token_accuracy": 0.592128798365593, |
| "num_tokens": 499120.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.003269576589831617, |
| "grad_norm": 0.19961196184158325, |
| "learning_rate": 1.4821833818343378e-06, |
| "loss": 1.3307, |
| "mean_token_accuracy": 0.7077135443687439, |
| "num_tokens": 506700.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.0033162848268292115, |
| "grad_norm": 0.21448221802711487, |
| "learning_rate": 1.4153646144691887e-06, |
| "loss": 1.4005, |
| "mean_token_accuracy": 0.6793873608112335, |
| "num_tokens": 513622.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.0033629930638268057, |
| "grad_norm": 0.19134606420993805, |
| "learning_rate": 1.3498145875836636e-06, |
| "loss": 1.8944, |
| "mean_token_accuracy": 0.6288950145244598, |
| "num_tokens": 520502.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.0034097013008244003, |
| "grad_norm": 0.20466452836990356, |
| "learning_rate": 1.285602054104097e-06, |
| "loss": 1.9276, |
| "mean_token_accuracy": 0.57016322016716, |
| "num_tokens": 528001.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.003456409537821995, |
| "grad_norm": 0.20611968636512756, |
| "learning_rate": 1.2227943641109345e-06, |
| "loss": 1.8754, |
| "mean_token_accuracy": 0.5942300260066986, |
| "num_tokens": 533840.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.0035031177748195896, |
| "grad_norm": 0.22362089157104492, |
| "learning_rate": 1.1614573941977975e-06, |
| "loss": 2.0853, |
| "mean_token_accuracy": 0.5632732808589935, |
| "num_tokens": 540459.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.003549826011817184, |
| "grad_norm": 0.16187940537929535, |
| "learning_rate": 1.1016554783760433e-06, |
| "loss": 1.3277, |
| "mean_token_accuracy": 0.7023499011993408, |
| "num_tokens": 548772.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.0035965342488147784, |
| "grad_norm": 0.22978803515434265, |
| "learning_rate": 2.2033109567520866e-06, |
| "loss": 1.8168, |
| "mean_token_accuracy": 0.6008757054805756, |
| "num_tokens": 554584.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.003643242485812373, |
| "grad_norm": 0.1978112757205963, |
| "learning_rate": 2.0419772472095698e-06, |
| "loss": 1.5754, |
| "mean_token_accuracy": 0.6501273214817047, |
| "num_tokens": 561654.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.0036899507228099677, |
| "grad_norm": 0.2072610855102539, |
| "learning_rate": 1.652483217564065e-06, |
| "loss": 1.7213, |
| "mean_token_accuracy": 0.6490782797336578, |
| "num_tokens": 567900.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.003736658959807562, |
| "grad_norm": 0.22125592827796936, |
| "learning_rate": 1.2629891879185599e-06, |
| "loss": 1.4386, |
| "mean_token_accuracy": 0.6827996075153351, |
| "num_tokens": 576574.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.0037833671968051565, |
| "grad_norm": 0.18630705773830414, |
| "learning_rate": 1.1016554783760433e-06, |
| "loss": 1.0299, |
| "mean_token_accuracy": 0.7590331435203552, |
| "num_tokens": 583830.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.003830075433802751, |
| "grad_norm": 0.20747390389442444, |
| "learning_rate": 2.2033109567520866e-06, |
| "loss": 1.45, |
| "mean_token_accuracy": 0.6852848827838898, |
| "num_tokens": 592239.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.003876783670800346, |
| "grad_norm": 0.1848316192626953, |
| "learning_rate": 2.0419772472095698e-06, |
| "loss": 1.8091, |
| "mean_token_accuracy": 0.6223555505275726, |
| "num_tokens": 599697.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.00392349190779794, |
| "grad_norm": 0.17624878883361816, |
| "learning_rate": 1.652483217564065e-06, |
| "loss": 1.1183, |
| "mean_token_accuracy": 0.7234238386154175, |
| "num_tokens": 610087.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.003970200144795535, |
| "grad_norm": 0.169888436794281, |
| "learning_rate": 1.2629891879185599e-06, |
| "loss": 1.7984, |
| "mean_token_accuracy": 0.6403481364250183, |
| "num_tokens": 617837.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.004016908381793129, |
| "grad_norm": 0.2139935940504074, |
| "learning_rate": 1.1016554783760433e-06, |
| "loss": 1.7612, |
| "mean_token_accuracy": 0.614133208990097, |
| "num_tokens": 623438.0, |
| "step": 86 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 102, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2417724847292416.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|