| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 171, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.017543859649122806, |
| "grad_norm": 3.9338779838849725, |
| "learning_rate": 0.0, |
| "loss": 1.2337, |
| "num_tokens": 415561.0, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.03508771929824561, |
| "grad_norm": 4.061961880981431, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 1.2551, |
| "num_tokens": 811930.0, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.05263157894736842, |
| "grad_norm": 4.126747331618448, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 1.2772, |
| "num_tokens": 1198366.0, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.07017543859649122, |
| "grad_norm": 3.410910565563502, |
| "learning_rate": 5e-06, |
| "loss": 1.1776, |
| "num_tokens": 1608933.0, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.08771929824561403, |
| "grad_norm": 2.233718748318594, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.9942, |
| "num_tokens": 2068099.0, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.10526315789473684, |
| "grad_norm": 1.470021030676499, |
| "learning_rate": 8.333333333333334e-06, |
| "loss": 0.8448, |
| "num_tokens": 2506575.0, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.12280701754385964, |
| "grad_norm": 1.4565364805839704, |
| "learning_rate": 1e-05, |
| "loss": 0.8213, |
| "num_tokens": 2932014.0, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.14035087719298245, |
| "grad_norm": 2.3516845691230217, |
| "learning_rate": 9.999184354855868e-06, |
| "loss": 0.6275, |
| "num_tokens": 3370254.0, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.15789473684210525, |
| "grad_norm": 1.8135724347288096, |
| "learning_rate": 9.996737715102133e-06, |
| "loss": 0.5931, |
| "num_tokens": 3782161.0, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.17543859649122806, |
| "grad_norm": 1.4106997469551075, |
| "learning_rate": 9.99266096766761e-06, |
| "loss": 0.4933, |
| "num_tokens": 4211970.0, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.19298245614035087, |
| "grad_norm": 0.8131031118317348, |
| "learning_rate": 9.98695559040975e-06, |
| "loss": 0.4229, |
| "num_tokens": 4613367.0, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.21052631578947367, |
| "grad_norm": 0.7764134903704204, |
| "learning_rate": 9.979623651578881e-06, |
| "loss": 0.3888, |
| "num_tokens": 5012484.0, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.22807017543859648, |
| "grad_norm": 0.37508885186302826, |
| "learning_rate": 9.970667809068476e-06, |
| "loss": 0.3783, |
| "num_tokens": 5420781.0, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.24561403508771928, |
| "grad_norm": 0.3209831897748014, |
| "learning_rate": 9.960091309451626e-06, |
| "loss": 0.3308, |
| "num_tokens": 5823019.0, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.2631578947368421, |
| "grad_norm": 0.2796386507143645, |
| "learning_rate": 9.947897986804131e-06, |
| "loss": 0.3456, |
| "num_tokens": 6241736.0, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.2807017543859649, |
| "grad_norm": 0.25191330470494944, |
| "learning_rate": 9.93409226131462e-06, |
| "loss": 0.3172, |
| "num_tokens": 6680804.0, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.2982456140350877, |
| "grad_norm": 0.2415593584049499, |
| "learning_rate": 9.91867913768218e-06, |
| "loss": 0.3253, |
| "num_tokens": 7107939.0, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.3157894736842105, |
| "grad_norm": 0.3003770654260172, |
| "learning_rate": 9.901664203302126e-06, |
| "loss": 0.3213, |
| "num_tokens": 7524281.0, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.3539478460817712, |
| "learning_rate": 9.883053626240503e-06, |
| "loss": 0.2966, |
| "num_tokens": 7939796.0, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.3508771929824561, |
| "grad_norm": 0.2280690675113219, |
| "learning_rate": 9.862854152998112e-06, |
| "loss": 0.3043, |
| "num_tokens": 8366406.0, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.3684210526315789, |
| "grad_norm": 0.21683554455375648, |
| "learning_rate": 9.841073106064852e-06, |
| "loss": 0.3057, |
| "num_tokens": 8772526.0, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.38596491228070173, |
| "grad_norm": 0.19203110899204243, |
| "learning_rate": 9.81771838126524e-06, |
| "loss": 0.2899, |
| "num_tokens": 9193705.0, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.40350877192982454, |
| "grad_norm": 0.21290762042544037, |
| "learning_rate": 9.792798444896107e-06, |
| "loss": 0.3073, |
| "num_tokens": 9641419.0, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.42105263157894735, |
| "grad_norm": 0.1917960836643891, |
| "learning_rate": 9.766322330657499e-06, |
| "loss": 0.2921, |
| "num_tokens": 10041552.0, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.43859649122807015, |
| "grad_norm": 0.18577210434865676, |
| "learning_rate": 9.738299636377863e-06, |
| "loss": 0.28, |
| "num_tokens": 10476244.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.45614035087719296, |
| "grad_norm": 0.18309063662126532, |
| "learning_rate": 9.70874052053476e-06, |
| "loss": 0.2785, |
| "num_tokens": 10863935.0, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.47368421052631576, |
| "grad_norm": 0.18434080604136077, |
| "learning_rate": 9.677655698572326e-06, |
| "loss": 0.2661, |
| "num_tokens": 11259192.0, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.49122807017543857, |
| "grad_norm": 0.16956692098933204, |
| "learning_rate": 9.645056439016827e-06, |
| "loss": 0.273, |
| "num_tokens": 11708724.0, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.5087719298245614, |
| "grad_norm": 0.16434051233500432, |
| "learning_rate": 9.610954559391704e-06, |
| "loss": 0.2667, |
| "num_tokens": 12137403.0, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 0.17669427582556774, |
| "learning_rate": 9.57536242193364e-06, |
| "loss": 0.2584, |
| "num_tokens": 12543239.0, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.543859649122807, |
| "grad_norm": 0.1652310917436867, |
| "learning_rate": 9.538292929111114e-06, |
| "loss": 0.2569, |
| "num_tokens": 12940013.0, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.5614035087719298, |
| "grad_norm": 0.15646296852545927, |
| "learning_rate": 9.499759518947156e-06, |
| "loss": 0.2607, |
| "num_tokens": 13409215.0, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.5789473684210527, |
| "grad_norm": 0.16249861759922887, |
| "learning_rate": 9.459776160147941e-06, |
| "loss": 0.2461, |
| "num_tokens": 13806115.0, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.5964912280701754, |
| "grad_norm": 0.15613005334949157, |
| "learning_rate": 9.418357347038999e-06, |
| "loss": 0.2493, |
| "num_tokens": 14248072.0, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.6140350877192983, |
| "grad_norm": 0.16063917924130028, |
| "learning_rate": 9.375518094310904e-06, |
| "loss": 0.255, |
| "num_tokens": 14680852.0, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.631578947368421, |
| "grad_norm": 0.15840497254274827, |
| "learning_rate": 9.331273931576306e-06, |
| "loss": 0.2459, |
| "num_tokens": 15109781.0, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.6491228070175439, |
| "grad_norm": 0.1549313386657812, |
| "learning_rate": 9.285640897740316e-06, |
| "loss": 0.2461, |
| "num_tokens": 15556489.0, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.15967707273841447, |
| "learning_rate": 9.238635535186247e-06, |
| "loss": 0.2358, |
| "num_tokens": 15975315.0, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.6842105263157895, |
| "grad_norm": 0.1664153879515682, |
| "learning_rate": 9.19027488377886e-06, |
| "loss": 0.2453, |
| "num_tokens": 16366263.0, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.7017543859649122, |
| "grad_norm": 0.16085937077964602, |
| "learning_rate": 9.140576474687263e-06, |
| "loss": 0.2339, |
| "num_tokens": 16780696.0, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7192982456140351, |
| "grad_norm": 0.1525506357194124, |
| "learning_rate": 9.0895583240297e-06, |
| "loss": 0.2454, |
| "num_tokens": 17226365.0, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.7368421052631579, |
| "grad_norm": 0.15414442592489289, |
| "learning_rate": 9.037238926342544e-06, |
| "loss": 0.2315, |
| "num_tokens": 17651462.0, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.7543859649122807, |
| "grad_norm": 0.160113618784668, |
| "learning_rate": 8.983637247875872e-06, |
| "loss": 0.24, |
| "num_tokens": 18064676.0, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.7719298245614035, |
| "grad_norm": 0.1592020616116995, |
| "learning_rate": 8.92877271971802e-06, |
| "loss": 0.236, |
| "num_tokens": 18474727.0, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 0.15245337800135356, |
| "learning_rate": 8.872665230751644e-06, |
| "loss": 0.2405, |
| "num_tokens": 18902046.0, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.8070175438596491, |
| "grad_norm": 0.20723449258057428, |
| "learning_rate": 8.815335120443822e-06, |
| "loss": 0.224, |
| "num_tokens": 19319711.0, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.8245614035087719, |
| "grad_norm": 0.1474862726874557, |
| "learning_rate": 8.756803171472817e-06, |
| "loss": 0.2412, |
| "num_tokens": 19757131.0, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.8421052631578947, |
| "grad_norm": 0.15052042969864995, |
| "learning_rate": 8.69709060219416e-06, |
| "loss": 0.2327, |
| "num_tokens": 20194869.0, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.8596491228070176, |
| "grad_norm": 0.15385476085770403, |
| "learning_rate": 8.636219058948823e-06, |
| "loss": 0.2266, |
| "num_tokens": 20597224.0, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 0.15228109362462297, |
| "learning_rate": 8.574210608216206e-06, |
| "loss": 0.239, |
| "num_tokens": 21031895.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.8947368421052632, |
| "grad_norm": 0.14813164351615135, |
| "learning_rate": 8.511087728614863e-06, |
| "loss": 0.2271, |
| "num_tokens": 21445108.0, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.9122807017543859, |
| "grad_norm": 0.14876791381975332, |
| "learning_rate": 8.446873302753783e-06, |
| "loss": 0.2277, |
| "num_tokens": 21886020.0, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.9298245614035088, |
| "grad_norm": 0.14949287901477407, |
| "learning_rate": 8.381590608937251e-06, |
| "loss": 0.2395, |
| "num_tokens": 22300331.0, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.9473684210526315, |
| "grad_norm": 0.1413875737037552, |
| "learning_rate": 8.315263312726248e-06, |
| "loss": 0.2321, |
| "num_tokens": 22731352.0, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.9649122807017544, |
| "grad_norm": 0.14923397563157811, |
| "learning_rate": 8.247915458359473e-06, |
| "loss": 0.2169, |
| "num_tokens": 23159887.0, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.9824561403508771, |
| "grad_norm": 0.15363902325932008, |
| "learning_rate": 8.179571460037096e-06, |
| "loss": 0.2357, |
| "num_tokens": 23618660.0, |
| "step": 56 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.14687085036662337, |
| "learning_rate": 8.110256093070393e-06, |
| "loss": 0.2334, |
| "num_tokens": 24039673.0, |
| "step": 57 |
| }, |
| { |
| "epoch": 1.0175438596491229, |
| "grad_norm": 0.1476041876990848, |
| "learning_rate": 8.039994484900463e-06, |
| "loss": 0.2011, |
| "num_tokens": 24434236.0, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.0350877192982457, |
| "grad_norm": 0.1419449695352204, |
| "learning_rate": 7.968812105989316e-06, |
| "loss": 0.2114, |
| "num_tokens": 24853504.0, |
| "step": 59 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 0.15337353318491537, |
| "learning_rate": 7.896734760586599e-06, |
| "loss": 0.2052, |
| "num_tokens": 25255898.0, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.0701754385964912, |
| "grad_norm": 0.14113455068915454, |
| "learning_rate": 7.82378857737533e-06, |
| "loss": 0.2098, |
| "num_tokens": 25667644.0, |
| "step": 61 |
| }, |
| { |
| "epoch": 1.087719298245614, |
| "grad_norm": 0.14898616979585055, |
| "learning_rate": 7.75e-06, |
| "loss": 0.2041, |
| "num_tokens": 26083880.0, |
| "step": 62 |
| }, |
| { |
| "epoch": 1.1052631578947367, |
| "grad_norm": 0.15200676955309492, |
| "learning_rate": 7.675395777480538e-06, |
| "loss": 0.1911, |
| "num_tokens": 26465737.0, |
| "step": 63 |
| }, |
| { |
| "epoch": 1.1228070175438596, |
| "grad_norm": 0.15217717203165435, |
| "learning_rate": 7.600002954515532e-06, |
| "loss": 0.2113, |
| "num_tokens": 26905038.0, |
| "step": 64 |
| }, |
| { |
| "epoch": 1.1403508771929824, |
| "grad_norm": 0.1392806691058708, |
| "learning_rate": 7.523848861678297e-06, |
| "loss": 0.1981, |
| "num_tokens": 27327803.0, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.1578947368421053, |
| "grad_norm": 0.14823210297550762, |
| "learning_rate": 7.446961105509289e-06, |
| "loss": 0.199, |
| "num_tokens": 27743615.0, |
| "step": 66 |
| }, |
| { |
| "epoch": 1.1754385964912282, |
| "grad_norm": 0.15025342094871685, |
| "learning_rate": 7.36936755850849e-06, |
| "loss": 0.2035, |
| "num_tokens": 28159990.0, |
| "step": 67 |
| }, |
| { |
| "epoch": 1.1929824561403508, |
| "grad_norm": 0.1424694767142918, |
| "learning_rate": 7.2910963490313815e-06, |
| "loss": 0.1997, |
| "num_tokens": 28556831.0, |
| "step": 68 |
| }, |
| { |
| "epoch": 1.2105263157894737, |
| "grad_norm": 0.14322092181407153, |
| "learning_rate": 7.212175851092154e-06, |
| "loss": 0.1961, |
| "num_tokens": 28995194.0, |
| "step": 69 |
| }, |
| { |
| "epoch": 1.2280701754385965, |
| "grad_norm": 0.1526511754370368, |
| "learning_rate": 7.132634674077884e-06, |
| "loss": 0.1921, |
| "num_tokens": 29410916.0, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.2456140350877192, |
| "grad_norm": 0.139735447301268, |
| "learning_rate": 7.052501652377368e-06, |
| "loss": 0.2063, |
| "num_tokens": 29913120.0, |
| "step": 71 |
| }, |
| { |
| "epoch": 1.263157894736842, |
| "grad_norm": 0.14344116097027, |
| "learning_rate": 6.971805834928399e-06, |
| "loss": 0.2027, |
| "num_tokens": 30351179.0, |
| "step": 72 |
| }, |
| { |
| "epoch": 1.280701754385965, |
| "grad_norm": 0.1426174151835526, |
| "learning_rate": 6.890576474687264e-06, |
| "loss": 0.2064, |
| "num_tokens": 30791604.0, |
| "step": 73 |
| }, |
| { |
| "epoch": 1.2982456140350878, |
| "grad_norm": 0.14994921794042249, |
| "learning_rate": 6.808843018024296e-06, |
| "loss": 0.2065, |
| "num_tokens": 31261315.0, |
| "step": 74 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 0.15092392748993957, |
| "learning_rate": 6.726635094049291e-06, |
| "loss": 0.1908, |
| "num_tokens": 31659426.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.14611148416178402, |
| "learning_rate": 6.643982503870693e-06, |
| "loss": 0.2051, |
| "num_tokens": 32057379.0, |
| "step": 76 |
| }, |
| { |
| "epoch": 1.3508771929824561, |
| "grad_norm": 0.1427497694651295, |
| "learning_rate": 6.560915209792424e-06, |
| "loss": 0.1992, |
| "num_tokens": 32476948.0, |
| "step": 77 |
| }, |
| { |
| "epoch": 1.368421052631579, |
| "grad_norm": 0.14625754427445514, |
| "learning_rate": 6.477463324452286e-06, |
| "loss": 0.1966, |
| "num_tokens": 32880627.0, |
| "step": 78 |
| }, |
| { |
| "epoch": 1.3859649122807016, |
| "grad_norm": 0.14428604329114683, |
| "learning_rate": 6.393657099905854e-06, |
| "loss": 0.2, |
| "num_tokens": 33298268.0, |
| "step": 79 |
| }, |
| { |
| "epoch": 1.4035087719298245, |
| "grad_norm": 0.14115165460163148, |
| "learning_rate": 6.309526916659843e-06, |
| "loss": 0.1961, |
| "num_tokens": 33735278.0, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.4210526315789473, |
| "grad_norm": 0.14007382911380342, |
| "learning_rate": 6.225103272658889e-06, |
| "loss": 0.199, |
| "num_tokens": 34180174.0, |
| "step": 81 |
| }, |
| { |
| "epoch": 1.4385964912280702, |
| "grad_norm": 0.1464893959125568, |
| "learning_rate": 6.140416772229785e-06, |
| "loss": 0.1996, |
| "num_tokens": 34600325.0, |
| "step": 82 |
| }, |
| { |
| "epoch": 1.456140350877193, |
| "grad_norm": 0.14708924135818197, |
| "learning_rate": 6.0554981149871276e-06, |
| "loss": 0.1978, |
| "num_tokens": 35028867.0, |
| "step": 83 |
| }, |
| { |
| "epoch": 1.4736842105263157, |
| "grad_norm": 0.15137327410210355, |
| "learning_rate": 5.970378084704441e-06, |
| "loss": 0.1971, |
| "num_tokens": 35465932.0, |
| "step": 84 |
| }, |
| { |
| "epoch": 1.4912280701754386, |
| "grad_norm": 0.14187565136169733, |
| "learning_rate": 5.88508753815478e-06, |
| "loss": 0.1922, |
| "num_tokens": 35857083.0, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.5087719298245614, |
| "grad_norm": 0.14750567085422958, |
| "learning_rate": 5.799657393924869e-06, |
| "loss": 0.1886, |
| "num_tokens": 36243816.0, |
| "step": 86 |
| }, |
| { |
| "epoch": 1.526315789473684, |
| "grad_norm": 0.13584876516210329, |
| "learning_rate": 5.714118621206843e-06, |
| "loss": 0.1949, |
| "num_tokens": 36682515.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.543859649122807, |
| "grad_norm": 0.13138592337104565, |
| "learning_rate": 5.6285022285716325e-06, |
| "loss": 0.1848, |
| "num_tokens": 37108482.0, |
| "step": 88 |
| }, |
| { |
| "epoch": 1.5614035087719298, |
| "grad_norm": 0.14022318239136972, |
| "learning_rate": 5.542839252728096e-06, |
| "loss": 0.1986, |
| "num_tokens": 37546779.0, |
| "step": 89 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 0.14886854943483496, |
| "learning_rate": 5.457160747271906e-06, |
| "loss": 0.2025, |
| "num_tokens": 37965357.0, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.5964912280701755, |
| "grad_norm": 0.14038303393485826, |
| "learning_rate": 5.371497771428368e-06, |
| "loss": 0.1975, |
| "num_tokens": 38406113.0, |
| "step": 91 |
| }, |
| { |
| "epoch": 1.6140350877192984, |
| "grad_norm": 0.14458491708947513, |
| "learning_rate": 5.2858813787931605e-06, |
| "loss": 0.1872, |
| "num_tokens": 38800413.0, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.631578947368421, |
| "grad_norm": 0.1480350806263145, |
| "learning_rate": 5.2003426060751324e-06, |
| "loss": 0.1968, |
| "num_tokens": 39200782.0, |
| "step": 93 |
| }, |
| { |
| "epoch": 1.6491228070175439, |
| "grad_norm": 0.13730868120439885, |
| "learning_rate": 5.114912461845223e-06, |
| "loss": 0.2098, |
| "num_tokens": 39666940.0, |
| "step": 94 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.13357583490843425, |
| "learning_rate": 5.02962191529556e-06, |
| "loss": 0.2005, |
| "num_tokens": 40103554.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.6842105263157894, |
| "grad_norm": 0.13277577033293314, |
| "learning_rate": 4.944501885012875e-06, |
| "loss": 0.1894, |
| "num_tokens": 40523764.0, |
| "step": 96 |
| }, |
| { |
| "epoch": 1.7017543859649122, |
| "grad_norm": 0.1445531104714593, |
| "learning_rate": 4.859583227770218e-06, |
| "loss": 0.1966, |
| "num_tokens": 40923093.0, |
| "step": 97 |
| }, |
| { |
| "epoch": 1.719298245614035, |
| "grad_norm": 0.1346892995124459, |
| "learning_rate": 4.774896727341113e-06, |
| "loss": 0.2085, |
| "num_tokens": 41388977.0, |
| "step": 98 |
| }, |
| { |
| "epoch": 1.736842105263158, |
| "grad_norm": 0.13950660376280916, |
| "learning_rate": 4.6904730833401575e-06, |
| "loss": 0.1993, |
| "num_tokens": 41804049.0, |
| "step": 99 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 0.13419027046897863, |
| "learning_rate": 4.606342900094147e-06, |
| "loss": 0.197, |
| "num_tokens": 42212384.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.7719298245614035, |
| "grad_norm": 0.13471704895191902, |
| "learning_rate": 4.5225366755477165e-06, |
| "loss": 0.1991, |
| "num_tokens": 42626890.0, |
| "step": 101 |
| }, |
| { |
| "epoch": 1.7894736842105263, |
| "grad_norm": 0.1370431857295371, |
| "learning_rate": 4.439084790207577e-06, |
| "loss": 0.181, |
| "num_tokens": 43045185.0, |
| "step": 102 |
| }, |
| { |
| "epoch": 1.807017543859649, |
| "grad_norm": 0.13585245130540036, |
| "learning_rate": 4.35601749612931e-06, |
| "loss": 0.1926, |
| "num_tokens": 43469318.0, |
| "step": 103 |
| }, |
| { |
| "epoch": 1.8245614035087718, |
| "grad_norm": 0.14177193142263728, |
| "learning_rate": 4.273364905950711e-06, |
| "loss": 0.1883, |
| "num_tokens": 43860735.0, |
| "step": 104 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 0.13934124481189936, |
| "learning_rate": 4.191156981975704e-06, |
| "loss": 0.186, |
| "num_tokens": 44259177.0, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.8596491228070176, |
| "grad_norm": 0.13757581593440663, |
| "learning_rate": 4.109423525312738e-06, |
| "loss": 0.1813, |
| "num_tokens": 44652826.0, |
| "step": 106 |
| }, |
| { |
| "epoch": 1.8771929824561404, |
| "grad_norm": 0.12994085331633362, |
| "learning_rate": 4.028194165071603e-06, |
| "loss": 0.2007, |
| "num_tokens": 45110456.0, |
| "step": 107 |
| }, |
| { |
| "epoch": 1.8947368421052633, |
| "grad_norm": 0.1315821088883057, |
| "learning_rate": 3.9474983476226335e-06, |
| "loss": 0.1984, |
| "num_tokens": 45561128.0, |
| "step": 108 |
| }, |
| { |
| "epoch": 1.912280701754386, |
| "grad_norm": 0.13466926193354203, |
| "learning_rate": 3.867365325922116e-06, |
| "loss": 0.195, |
| "num_tokens": 45999442.0, |
| "step": 109 |
| }, |
| { |
| "epoch": 1.9298245614035088, |
| "grad_norm": 0.1407044279595099, |
| "learning_rate": 3.7878241489078473e-06, |
| "loss": 0.1957, |
| "num_tokens": 46405471.0, |
| "step": 110 |
| }, |
| { |
| "epoch": 1.9473684210526314, |
| "grad_norm": 0.13869286010098864, |
| "learning_rate": 3.7089036509686216e-06, |
| "loss": 0.2096, |
| "num_tokens": 46837446.0, |
| "step": 111 |
| }, |
| { |
| "epoch": 1.9649122807017543, |
| "grad_norm": 0.13744222256799704, |
| "learning_rate": 3.630632441491512e-06, |
| "loss": 0.1874, |
| "num_tokens": 47238683.0, |
| "step": 112 |
| }, |
| { |
| "epoch": 1.9824561403508771, |
| "grad_norm": 0.13584540109090137, |
| "learning_rate": 3.5530388944907124e-06, |
| "loss": 0.1944, |
| "num_tokens": 47661310.0, |
| "step": 113 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.14078927259649465, |
| "learning_rate": 3.476151138321705e-06, |
| "loss": 0.1893, |
| "num_tokens": 48079346.0, |
| "step": 114 |
| }, |
| { |
| "epoch": 2.017543859649123, |
| "grad_norm": 0.14801647762626233, |
| "learning_rate": 3.3999970454844688e-06, |
| "loss": 0.1833, |
| "num_tokens": 48481444.0, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.0350877192982457, |
| "grad_norm": 0.14207211024797958, |
| "learning_rate": 3.3246042225194626e-06, |
| "loss": 0.1972, |
| "num_tokens": 48904379.0, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.0526315789473686, |
| "grad_norm": 0.13459996651406178, |
| "learning_rate": 3.2500000000000015e-06, |
| "loss": 0.1734, |
| "num_tokens": 49314280.0, |
| "step": 117 |
| }, |
| { |
| "epoch": 2.0701754385964914, |
| "grad_norm": 0.13420885166445595, |
| "learning_rate": 3.176211422624672e-06, |
| "loss": 0.1748, |
| "num_tokens": 49720807.0, |
| "step": 118 |
| }, |
| { |
| "epoch": 2.087719298245614, |
| "grad_norm": 0.1396854400821284, |
| "learning_rate": 3.103265239413401e-06, |
| "loss": 0.1781, |
| "num_tokens": 50151950.0, |
| "step": 119 |
| }, |
| { |
| "epoch": 2.1052631578947367, |
| "grad_norm": 0.14321980578287785, |
| "learning_rate": 3.0311878940106864e-06, |
| "loss": 0.182, |
| "num_tokens": 50574817.0, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.1228070175438596, |
| "grad_norm": 0.1402669344348115, |
| "learning_rate": 2.9600055150995397e-06, |
| "loss": 0.1804, |
| "num_tokens": 50991178.0, |
| "step": 121 |
| }, |
| { |
| "epoch": 2.1403508771929824, |
| "grad_norm": 0.14268249286817555, |
| "learning_rate": 2.889743906929609e-06, |
| "loss": 0.1701, |
| "num_tokens": 51370437.0, |
| "step": 122 |
| }, |
| { |
| "epoch": 2.1578947368421053, |
| "grad_norm": 0.13769468594260603, |
| "learning_rate": 2.820428539962905e-06, |
| "loss": 0.1803, |
| "num_tokens": 51807382.0, |
| "step": 123 |
| }, |
| { |
| "epoch": 2.175438596491228, |
| "grad_norm": 0.1418977810735795, |
| "learning_rate": 2.7520845416405285e-06, |
| "loss": 0.1867, |
| "num_tokens": 52214420.0, |
| "step": 124 |
| }, |
| { |
| "epoch": 2.192982456140351, |
| "grad_norm": 0.1357482680971467, |
| "learning_rate": 2.6847366872737535e-06, |
| "loss": 0.1855, |
| "num_tokens": 52648228.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.2105263157894735, |
| "grad_norm": 0.13758984714319883, |
| "learning_rate": 2.618409391062751e-06, |
| "loss": 0.1928, |
| "num_tokens": 53085345.0, |
| "step": 126 |
| }, |
| { |
| "epoch": 2.2280701754385963, |
| "grad_norm": 0.13835249455110674, |
| "learning_rate": 2.5531266972462176e-06, |
| "loss": 0.1753, |
| "num_tokens": 53493140.0, |
| "step": 127 |
| }, |
| { |
| "epoch": 2.245614035087719, |
| "grad_norm": 0.13785872746102823, |
| "learning_rate": 2.4889122713851397e-06, |
| "loss": 0.1734, |
| "num_tokens": 53922832.0, |
| "step": 128 |
| }, |
| { |
| "epoch": 2.263157894736842, |
| "grad_norm": 0.1369141853241696, |
| "learning_rate": 2.425789391783796e-06, |
| "loss": 0.1771, |
| "num_tokens": 54319035.0, |
| "step": 129 |
| }, |
| { |
| "epoch": 2.280701754385965, |
| "grad_norm": 0.13274261857238875, |
| "learning_rate": 2.36378094105118e-06, |
| "loss": 0.1759, |
| "num_tokens": 54754161.0, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.2982456140350878, |
| "grad_norm": 0.13374187150373806, |
| "learning_rate": 2.302909397805841e-06, |
| "loss": 0.1757, |
| "num_tokens": 55177972.0, |
| "step": 131 |
| }, |
| { |
| "epoch": 2.3157894736842106, |
| "grad_norm": 0.14056010545232225, |
| "learning_rate": 2.2431968285271843e-06, |
| "loss": 0.1762, |
| "num_tokens": 55567532.0, |
| "step": 132 |
| }, |
| { |
| "epoch": 2.3333333333333335, |
| "grad_norm": 0.14440093167538778, |
| "learning_rate": 2.1846648795561777e-06, |
| "loss": 0.1789, |
| "num_tokens": 55997257.0, |
| "step": 133 |
| }, |
| { |
| "epoch": 2.3508771929824563, |
| "grad_norm": 0.13209921359488944, |
| "learning_rate": 2.1273347692483574e-06, |
| "loss": 0.1683, |
| "num_tokens": 56435780.0, |
| "step": 134 |
| }, |
| { |
| "epoch": 2.3684210526315788, |
| "grad_norm": 0.13075669290172312, |
| "learning_rate": 2.071227280281982e-06, |
| "loss": 0.1766, |
| "num_tokens": 56880449.0, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.3859649122807016, |
| "grad_norm": 0.15520159951824955, |
| "learning_rate": 2.016362752124129e-06, |
| "loss": 0.1766, |
| "num_tokens": 57298925.0, |
| "step": 136 |
| }, |
| { |
| "epoch": 2.4035087719298245, |
| "grad_norm": 0.13674994090087955, |
| "learning_rate": 1.9627610736574575e-06, |
| "loss": 0.1689, |
| "num_tokens": 57702294.0, |
| "step": 137 |
| }, |
| { |
| "epoch": 2.4210526315789473, |
| "grad_norm": 0.13321588166669115, |
| "learning_rate": 1.9104416759703017e-06, |
| "loss": 0.1758, |
| "num_tokens": 58154418.0, |
| "step": 138 |
| }, |
| { |
| "epoch": 2.43859649122807, |
| "grad_norm": 0.13632315461262803, |
| "learning_rate": 1.8594235253127373e-06, |
| "loss": 0.1767, |
| "num_tokens": 58579683.0, |
| "step": 139 |
| }, |
| { |
| "epoch": 2.456140350877193, |
| "grad_norm": 0.1288153252214618, |
| "learning_rate": 1.8097251162211405e-06, |
| "loss": 0.1811, |
| "num_tokens": 59051562.0, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.473684210526316, |
| "grad_norm": 0.12987465966750822, |
| "learning_rate": 1.7613644648137543e-06, |
| "loss": 0.1774, |
| "num_tokens": 59501698.0, |
| "step": 141 |
| }, |
| { |
| "epoch": 2.4912280701754383, |
| "grad_norm": 0.1374425426257984, |
| "learning_rate": 1.7143591022596846e-06, |
| "loss": 0.1944, |
| "num_tokens": 59955592.0, |
| "step": 142 |
| }, |
| { |
| "epoch": 2.5087719298245617, |
| "grad_norm": 0.13531966438989285, |
| "learning_rate": 1.6687260684236943e-06, |
| "loss": 0.1805, |
| "num_tokens": 60381274.0, |
| "step": 143 |
| }, |
| { |
| "epoch": 2.526315789473684, |
| "grad_norm": 0.134925096600037, |
| "learning_rate": 1.6244819056890975e-06, |
| "loss": 0.171, |
| "num_tokens": 60788438.0, |
| "step": 144 |
| }, |
| { |
| "epoch": 2.543859649122807, |
| "grad_norm": 0.1407719819848176, |
| "learning_rate": 1.5816426529610035e-06, |
| "loss": 0.1714, |
| "num_tokens": 61184551.0, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.56140350877193, |
| "grad_norm": 0.1359063010745857, |
| "learning_rate": 1.5402238398520614e-06, |
| "loss": 0.1758, |
| "num_tokens": 61619839.0, |
| "step": 146 |
| }, |
| { |
| "epoch": 2.5789473684210527, |
| "grad_norm": 0.13246835060504009, |
| "learning_rate": 1.5002404810528452e-06, |
| "loss": 0.1739, |
| "num_tokens": 62045889.0, |
| "step": 147 |
| }, |
| { |
| "epoch": 2.5964912280701755, |
| "grad_norm": 0.13964950522945552, |
| "learning_rate": 1.4617070708888882e-06, |
| "loss": 0.1718, |
| "num_tokens": 62437900.0, |
| "step": 148 |
| }, |
| { |
| "epoch": 2.6140350877192984, |
| "grad_norm": 0.13372294299780066, |
| "learning_rate": 1.4246375780663613e-06, |
| "loss": 0.1661, |
| "num_tokens": 62843119.0, |
| "step": 149 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 0.13805400712254826, |
| "learning_rate": 1.389045440608296e-06, |
| "loss": 0.1828, |
| "num_tokens": 63262284.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.6491228070175437, |
| "grad_norm": 0.13808339180910903, |
| "learning_rate": 1.354943560983175e-06, |
| "loss": 0.1803, |
| "num_tokens": 63701914.0, |
| "step": 151 |
| }, |
| { |
| "epoch": 2.6666666666666665, |
| "grad_norm": 0.13165183261262675, |
| "learning_rate": 1.3223443014276738e-06, |
| "loss": 0.1774, |
| "num_tokens": 64149759.0, |
| "step": 152 |
| }, |
| { |
| "epoch": 2.6842105263157894, |
| "grad_norm": 0.1319243140393186, |
| "learning_rate": 1.2912594794652406e-06, |
| "loss": 0.1799, |
| "num_tokens": 64602514.0, |
| "step": 153 |
| }, |
| { |
| "epoch": 2.7017543859649122, |
| "grad_norm": 0.13538494698388223, |
| "learning_rate": 1.2617003636221394e-06, |
| "loss": 0.1694, |
| "num_tokens": 64992832.0, |
| "step": 154 |
| }, |
| { |
| "epoch": 2.719298245614035, |
| "grad_norm": 0.13498616412912304, |
| "learning_rate": 1.2336776693425028e-06, |
| "loss": 0.1707, |
| "num_tokens": 65389001.0, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.736842105263158, |
| "grad_norm": 0.12924445948277097, |
| "learning_rate": 1.2072015551038933e-06, |
| "loss": 0.1692, |
| "num_tokens": 65842297.0, |
| "step": 156 |
| }, |
| { |
| "epoch": 2.754385964912281, |
| "grad_norm": 0.13206490698518994, |
| "learning_rate": 1.1822816187347625e-06, |
| "loss": 0.1719, |
| "num_tokens": 66254756.0, |
| "step": 157 |
| }, |
| { |
| "epoch": 2.7719298245614032, |
| "grad_norm": 0.13946311479316087, |
| "learning_rate": 1.1589268939351499e-06, |
| "loss": 0.1824, |
| "num_tokens": 66653349.0, |
| "step": 158 |
| }, |
| { |
| "epoch": 2.7894736842105265, |
| "grad_norm": 0.12958606648427337, |
| "learning_rate": 1.1371458470018896e-06, |
| "loss": 0.1758, |
| "num_tokens": 67089072.0, |
| "step": 159 |
| }, |
| { |
| "epoch": 2.807017543859649, |
| "grad_norm": 0.12836201287391627, |
| "learning_rate": 1.1169463737594995e-06, |
| "loss": 0.1725, |
| "num_tokens": 67530318.0, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.824561403508772, |
| "grad_norm": 0.1312877766550337, |
| "learning_rate": 1.0983357966978747e-06, |
| "loss": 0.17, |
| "num_tokens": 67943670.0, |
| "step": 161 |
| }, |
| { |
| "epoch": 2.8421052631578947, |
| "grad_norm": 0.1292736067143446, |
| "learning_rate": 1.0813208623178199e-06, |
| "loss": 0.1759, |
| "num_tokens": 68380170.0, |
| "step": 162 |
| }, |
| { |
| "epoch": 2.8596491228070176, |
| "grad_norm": 0.13227684289793154, |
| "learning_rate": 1.0659077386853817e-06, |
| "loss": 0.1719, |
| "num_tokens": 68808114.0, |
| "step": 163 |
| }, |
| { |
| "epoch": 2.8771929824561404, |
| "grad_norm": 0.13694616326909123, |
| "learning_rate": 1.0521020131958692e-06, |
| "loss": 0.168, |
| "num_tokens": 69191512.0, |
| "step": 164 |
| }, |
| { |
| "epoch": 2.8947368421052633, |
| "grad_norm": 0.135480809490847, |
| "learning_rate": 1.0399086905483752e-06, |
| "loss": 0.1659, |
| "num_tokens": 69582710.0, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.912280701754386, |
| "grad_norm": 0.13656650923592858, |
| "learning_rate": 1.0293321909315242e-06, |
| "loss": 0.1764, |
| "num_tokens": 69995390.0, |
| "step": 166 |
| }, |
| { |
| "epoch": 2.9298245614035086, |
| "grad_norm": 0.13037307085567856, |
| "learning_rate": 1.0203763484211196e-06, |
| "loss": 0.1737, |
| "num_tokens": 70418613.0, |
| "step": 167 |
| }, |
| { |
| "epoch": 2.9473684210526314, |
| "grad_norm": 0.13386161162103719, |
| "learning_rate": 1.0130444095902514e-06, |
| "loss": 0.1731, |
| "num_tokens": 70848952.0, |
| "step": 168 |
| }, |
| { |
| "epoch": 2.9649122807017543, |
| "grad_norm": 0.13477603098195323, |
| "learning_rate": 1.0073390323323897e-06, |
| "loss": 0.1859, |
| "num_tokens": 71284648.0, |
| "step": 169 |
| }, |
| { |
| "epoch": 2.982456140350877, |
| "grad_norm": 0.13335006132129937, |
| "learning_rate": 1.0032622848978689e-06, |
| "loss": 0.1727, |
| "num_tokens": 71712021.0, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.13952787638079295, |
| "learning_rate": 1.000815645144134e-06, |
| "loss": 0.1803, |
| "num_tokens": 72119019.0, |
| "step": 171 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 171, |
| "total_flos": 2.3168881532705178e+17, |
| "train_loss": 0.25350178002614027, |
| "train_runtime": 2712.8956, |
| "train_samples_per_second": 8.068, |
| "train_steps_per_second": 0.063 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 171, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.3168881532705178e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|