Gencode-MxDNA / checkpoint-750 /trainer_state.json
andyjzhao's picture
Upload folder using huggingface_hub
4dcb67b verified
{
"best_global_step": 750,
"best_metric": 1.2186306715011597,
"best_model_checkpoint": "/gpfs/scratch/guoh/DNAFM/output/gencode_human_12.8k_12800/Gencode-MxDNA/checkpoint-750",
"epoch": 0.21275086873271398,
"eval_steps": 125,
"global_step": 750,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0005673356499539039,
"grad_norm": 8450.4345703125,
"loss": 876.9911,
"lr": 2e-06,
"step": 2,
"tokens_trained": 0.000985992
},
{
"epoch": 0.0011346712999078079,
"grad_norm": 8980.888671875,
"loss": 779.4711,
"lr": 6e-06,
"step": 4,
"tokens_trained": 0.001968088
},
{
"epoch": 0.001702006949861712,
"grad_norm": 7489.92529296875,
"loss": 488.6157,
"lr": 1e-05,
"step": 6,
"tokens_trained": 0.002953808
},
{
"epoch": 0.0022693425998156157,
"grad_norm": 1952.1917724609375,
"loss": 237.0602,
"lr": 1.4e-05,
"step": 8,
"tokens_trained": 0.003935728
},
{
"epoch": 0.0028366782497695198,
"grad_norm": 1418.443603515625,
"loss": 159.0854,
"lr": 1.8e-05,
"step": 10,
"tokens_trained": 0.004916488
},
{
"epoch": 0.003404013899723424,
"grad_norm": 874.7195434570312,
"loss": 91.9563,
"lr": 2.2e-05,
"step": 12,
"tokens_trained": 0.005902792
},
{
"epoch": 0.003971349549677328,
"grad_norm": 1339.8248291015625,
"loss": 40.3366,
"lr": 2.6e-05,
"step": 14,
"tokens_trained": 0.0068856
},
{
"epoch": 0.0045386851996312315,
"grad_norm": 2936.7607421875,
"loss": 22.7436,
"lr": 3e-05,
"step": 16,
"tokens_trained": 0.007868248
},
{
"epoch": 0.005106020849585136,
"grad_norm": 1531.3807373046875,
"loss": 23.4797,
"lr": 3.4000000000000007e-05,
"step": 18,
"tokens_trained": 0.008849296
},
{
"epoch": 0.0056733564995390395,
"grad_norm": 3027.4189453125,
"loss": 38.7379,
"lr": 3.8e-05,
"step": 20,
"tokens_trained": 0.009830984
},
{
"epoch": 0.006240692149492944,
"grad_norm": 2435.890625,
"loss": 26.2427,
"lr": 4.2000000000000004e-05,
"step": 22,
"tokens_trained": 0.01081364
},
{
"epoch": 0.006808027799446848,
"grad_norm": 3217.990478515625,
"loss": 31.0263,
"lr": 4.6e-05,
"step": 24,
"tokens_trained": 0.01179036
},
{
"epoch": 0.007375363449400752,
"grad_norm": 3854.00634765625,
"loss": 33.8781,
"lr": 5e-05,
"step": 26,
"tokens_trained": 0.012774504
},
{
"epoch": 0.007942699099354656,
"grad_norm": 3197.489990234375,
"loss": 27.7927,
"lr": 5.4e-05,
"step": 28,
"tokens_trained": 0.013759992
},
{
"epoch": 0.00851003474930856,
"grad_norm": 3034.156494140625,
"loss": 37.9083,
"lr": 5.800000000000001e-05,
"step": 30,
"tokens_trained": 0.014740536
},
{
"epoch": 0.009077370399262463,
"grad_norm": 3040.314453125,
"loss": 34.0659,
"lr": 6.2e-05,
"step": 32,
"tokens_trained": 0.015725984
},
{
"epoch": 0.009644706049216368,
"grad_norm": 3065.5791015625,
"loss": 27.7768,
"lr": 6.6e-05,
"step": 34,
"tokens_trained": 0.016706864
},
{
"epoch": 0.010212041699170272,
"grad_norm": 2454.293701171875,
"loss": 35.1143,
"lr": 7.000000000000001e-05,
"step": 36,
"tokens_trained": 0.017688816
},
{
"epoch": 0.010779377349124175,
"grad_norm": 3100.7802734375,
"loss": 42.2603,
"lr": 7.4e-05,
"step": 38,
"tokens_trained": 0.018669072
},
{
"epoch": 0.011346712999078079,
"grad_norm": 2749.84423828125,
"loss": 39.3879,
"lr": 7.8e-05,
"step": 40,
"tokens_trained": 0.019652072
},
{
"epoch": 0.011914048649031984,
"grad_norm": 1519.9908447265625,
"loss": 35.0735,
"lr": 8.2e-05,
"step": 42,
"tokens_trained": 0.020633112
},
{
"epoch": 0.012481384298985888,
"grad_norm": 1474.4244384765625,
"loss": 25.8965,
"lr": 8.599999999999999e-05,
"step": 44,
"tokens_trained": 0.021616192
},
{
"epoch": 0.013048719948939792,
"grad_norm": 2962.500244140625,
"loss": 51.0784,
"lr": 8.999999999999999e-05,
"step": 46,
"tokens_trained": 0.022597288
},
{
"epoch": 0.013616055598893695,
"grad_norm": 2419.41455078125,
"loss": 43.0334,
"lr": 9.400000000000001e-05,
"step": 48,
"tokens_trained": 0.02357572
},
{
"epoch": 0.014183391248847599,
"grad_norm": 1267.87451171875,
"loss": 21.8063,
"lr": 9.800000000000001e-05,
"step": 50,
"tokens_trained": 0.024553376
},
{
"epoch": 0.014750726898801504,
"grad_norm": 1573.944091796875,
"loss": 52.9693,
"lr": 0.000102,
"step": 52,
"tokens_trained": 0.025536728
},
{
"epoch": 0.015318062548755408,
"grad_norm": 1509.650146484375,
"loss": 50.0825,
"lr": 0.000106,
"step": 54,
"tokens_trained": 0.026517
},
{
"epoch": 0.01588539819870931,
"grad_norm": 2334.765380859375,
"loss": 42.1982,
"lr": 0.00011,
"step": 56,
"tokens_trained": 0.027504728
},
{
"epoch": 0.016452733848663217,
"grad_norm": 1594.16259765625,
"loss": 39.0562,
"lr": 0.000114,
"step": 58,
"tokens_trained": 0.028485416
},
{
"epoch": 0.01702006949861712,
"grad_norm": 1628.082275390625,
"loss": 35.0488,
"lr": 0.000118,
"step": 60,
"tokens_trained": 0.029468696
},
{
"epoch": 0.017587405148571024,
"grad_norm": 2496.6455078125,
"loss": 49.4241,
"lr": 0.000122,
"step": 62,
"tokens_trained": 0.030453584
},
{
"epoch": 0.018154740798524926,
"grad_norm": 2521.721435546875,
"loss": 69.0275,
"lr": 0.000126,
"step": 64,
"tokens_trained": 0.031432864
},
{
"epoch": 0.01872207644847883,
"grad_norm": 2179.571533203125,
"loss": 63.1409,
"lr": 0.00013000000000000002,
"step": 66,
"tokens_trained": 0.032418416
},
{
"epoch": 0.019289412098432736,
"grad_norm": 899.7137451171875,
"loss": 38.4131,
"lr": 0.000134,
"step": 68,
"tokens_trained": 0.033402136
},
{
"epoch": 0.01985674774838664,
"grad_norm": 2109.377685546875,
"loss": 51.0044,
"lr": 0.00013800000000000002,
"step": 70,
"tokens_trained": 0.03438832
},
{
"epoch": 0.020424083398340544,
"grad_norm": 1649.1873779296875,
"loss": 32.1408,
"lr": 0.00014199999999999998,
"step": 72,
"tokens_trained": 0.035374464
},
{
"epoch": 0.020991419048294446,
"grad_norm": 1807.994140625,
"loss": 28.8357,
"lr": 0.000146,
"step": 74,
"tokens_trained": 0.03635784
},
{
"epoch": 0.02155875469824835,
"grad_norm": 998.9485473632812,
"loss": 23.0343,
"lr": 0.00015,
"step": 76,
"tokens_trained": 0.037340248
},
{
"epoch": 0.022126090348202256,
"grad_norm": 2240.17578125,
"loss": 32.0397,
"lr": 0.000154,
"step": 78,
"tokens_trained": 0.038321968
},
{
"epoch": 0.022693425998156158,
"grad_norm": 1606.0067138671875,
"loss": 32.1776,
"lr": 0.000158,
"step": 80,
"tokens_trained": 0.039304992
},
{
"epoch": 0.023260761648110063,
"grad_norm": 1685.1015625,
"loss": 24.3428,
"lr": 0.000162,
"step": 82,
"tokens_trained": 0.040286808
},
{
"epoch": 0.02382809729806397,
"grad_norm": 1761.7890625,
"loss": 23.9261,
"lr": 0.00016600000000000002,
"step": 84,
"tokens_trained": 0.041271776
},
{
"epoch": 0.02439543294801787,
"grad_norm": 2036.0982666015625,
"loss": 27.7196,
"lr": 0.00017,
"step": 86,
"tokens_trained": 0.042252784
},
{
"epoch": 0.024962768597971776,
"grad_norm": 1564.3870849609375,
"loss": 25.3722,
"lr": 0.000174,
"step": 88,
"tokens_trained": 0.04323596
},
{
"epoch": 0.025530104247925678,
"grad_norm": 1508.349853515625,
"loss": 18.4107,
"lr": 0.000178,
"step": 90,
"tokens_trained": 0.044218984
},
{
"epoch": 0.026097439897879583,
"grad_norm": 1955.011474609375,
"loss": 28.8456,
"lr": 0.000182,
"step": 92,
"tokens_trained": 0.045202144
},
{
"epoch": 0.02666477554783349,
"grad_norm": 1679.9423828125,
"loss": 23.6139,
"lr": 0.000186,
"step": 94,
"tokens_trained": 0.046192336
},
{
"epoch": 0.02723211119778739,
"grad_norm": 1517.5731201171875,
"loss": 42.145,
"lr": 0.00019,
"step": 96,
"tokens_trained": 0.047174312
},
{
"epoch": 0.027799446847741296,
"grad_norm": 1535.3076171875,
"loss": 31.9711,
"lr": 0.000194,
"step": 98,
"tokens_trained": 0.048158944
},
{
"epoch": 0.028366782497695198,
"grad_norm": 1475.2569580078125,
"loss": 37.645,
"lr": 0.00019800000000000002,
"step": 100,
"tokens_trained": 0.04914364
},
{
"epoch": 0.028934118147649103,
"grad_norm": 1918.4088134765625,
"loss": 69.4053,
"lr": 0.000202,
"step": 102,
"tokens_trained": 0.050123488
},
{
"epoch": 0.02950145379760301,
"grad_norm": 1631.6231689453125,
"loss": 50.9725,
"lr": 0.000206,
"step": 104,
"tokens_trained": 0.051105512
},
{
"epoch": 0.03006878944755691,
"grad_norm": 1291.6376953125,
"loss": 22.6527,
"lr": 0.00021,
"step": 106,
"tokens_trained": 0.052091704
},
{
"epoch": 0.030636125097510816,
"grad_norm": 1224.9625244140625,
"loss": 60.2725,
"lr": 0.000214,
"step": 108,
"tokens_trained": 0.053074824
},
{
"epoch": 0.031203460747464717,
"grad_norm": 1218.2022705078125,
"loss": 75.8728,
"lr": 0.000218,
"step": 110,
"tokens_trained": 0.054057104
},
{
"epoch": 0.03177079639741862,
"grad_norm": 1761.8861083984375,
"loss": 61.6427,
"lr": 0.000222,
"step": 112,
"tokens_trained": 0.055039128
},
{
"epoch": 0.03233813204737253,
"grad_norm": 1482.4256591796875,
"loss": 35.3351,
"lr": 0.00022600000000000002,
"step": 114,
"tokens_trained": 0.05602388
},
{
"epoch": 0.03290546769732643,
"grad_norm": 563.6399536132812,
"loss": 40.1461,
"lr": 0.00023,
"step": 116,
"tokens_trained": 0.057005376
},
{
"epoch": 0.03347280334728033,
"grad_norm": 1266.058837890625,
"loss": 24.0657,
"lr": 0.00023400000000000002,
"step": 118,
"tokens_trained": 0.057985136
},
{
"epoch": 0.03404013899723424,
"grad_norm": 918.206298828125,
"loss": 23.9626,
"lr": 0.00023799999999999998,
"step": 120,
"tokens_trained": 0.058968288
},
{
"epoch": 0.03460747464718814,
"grad_norm": 1495.7191162109375,
"loss": 19.798,
"lr": 0.000242,
"step": 122,
"tokens_trained": 0.05995348
},
{
"epoch": 0.03517481029714205,
"grad_norm": 1264.302734375,
"loss": 31.5342,
"lr": 0.000246,
"step": 124,
"tokens_trained": 0.060935832
},
{
"epoch": 0.035458478122119,
"eval_loss": 5.312118053436279,
"eval_runtime": 21.3065,
"step": 125,
"tokens_trained": 0.061426608
},
{
"epoch": 0.03574214594709595,
"grad_norm": 907.4861450195312,
"loss": 25.1262,
"lr": 0.00025,
"step": 126,
"tokens_trained": 0.061918184
},
{
"epoch": 0.03630948159704985,
"grad_norm": 1287.6158447265625,
"loss": 26.963,
"lr": 0.000254,
"step": 128,
"tokens_trained": 0.062902328
},
{
"epoch": 0.03687681724700376,
"grad_norm": 1260.570556640625,
"loss": 24.9633,
"lr": 0.00025800000000000004,
"step": 130,
"tokens_trained": 0.063883456
},
{
"epoch": 0.03744415289695766,
"grad_norm": 1436.82373046875,
"loss": 23.1028,
"lr": 0.000262,
"step": 132,
"tokens_trained": 0.06486748
},
{
"epoch": 0.03801148854691157,
"grad_norm": 812.9523315429688,
"loss": 20.5496,
"lr": 0.000266,
"step": 134,
"tokens_trained": 0.065847104
},
{
"epoch": 0.03857882419686547,
"grad_norm": 1336.5322265625,
"loss": 23.673,
"lr": 0.00027,
"step": 136,
"tokens_trained": 0.066829928
},
{
"epoch": 0.03914615984681937,
"grad_norm": 1381.282470703125,
"loss": 32.0373,
"lr": 0.00027400000000000005,
"step": 138,
"tokens_trained": 0.067814024
},
{
"epoch": 0.03971349549677328,
"grad_norm": 972.7861938476562,
"loss": 26.9454,
"lr": 0.00027800000000000004,
"step": 140,
"tokens_trained": 0.068797744
},
{
"epoch": 0.04028083114672718,
"grad_norm": 1347.2249755859375,
"loss": 22.3578,
"lr": 0.00028199999999999997,
"step": 142,
"tokens_trained": 0.069780072
},
{
"epoch": 0.04084816679668109,
"grad_norm": 829.525390625,
"loss": 37.9879,
"lr": 0.00028599999999999996,
"step": 144,
"tokens_trained": 0.070759896
},
{
"epoch": 0.04141550244663499,
"grad_norm": 1094.1033935546875,
"loss": 21.1972,
"lr": 0.00029,
"step": 146,
"tokens_trained": 0.0717452
},
{
"epoch": 0.04198283809658889,
"grad_norm": 717.107421875,
"loss": 21.7774,
"lr": 0.000294,
"step": 148,
"tokens_trained": 0.072727432
},
{
"epoch": 0.042550173746542796,
"grad_norm": 744.4456787109375,
"loss": 20.3235,
"lr": 0.000298,
"step": 150,
"tokens_trained": 0.073712128
},
{
"epoch": 0.0431175093964967,
"grad_norm": 904.1460571289062,
"loss": 22.7878,
"lr": 0.000302,
"step": 152,
"tokens_trained": 0.074695296
},
{
"epoch": 0.04368484504645061,
"grad_norm": 1352.303955078125,
"loss": 20.9757,
"lr": 0.000306,
"step": 154,
"tokens_trained": 0.0756798
},
{
"epoch": 0.04425218069640451,
"grad_norm": 997.0473022460938,
"loss": 17.4647,
"lr": 0.00031,
"step": 156,
"tokens_trained": 0.076666504
},
{
"epoch": 0.04481951634635841,
"grad_norm": 1206.387939453125,
"loss": 21.1846,
"lr": 0.000314,
"step": 158,
"tokens_trained": 0.07764868
},
{
"epoch": 0.045386851996312316,
"grad_norm": 1029.6807861328125,
"loss": 17.8853,
"lr": 0.00031800000000000003,
"step": 160,
"tokens_trained": 0.07863548
},
{
"epoch": 0.04595418764626622,
"grad_norm": 1136.4635009765625,
"loss": 30.057,
"lr": 0.000322,
"step": 162,
"tokens_trained": 0.079618928
},
{
"epoch": 0.04652152329622013,
"grad_norm": 834.3464965820312,
"loss": 28.1782,
"lr": 0.000326,
"step": 164,
"tokens_trained": 0.0806032
},
{
"epoch": 0.04708885894617403,
"grad_norm": 1177.8365478515625,
"loss": 16.4267,
"lr": 0.00033,
"step": 166,
"tokens_trained": 0.081583752
},
{
"epoch": 0.04765619459612794,
"grad_norm": 572.501708984375,
"loss": 16.5752,
"lr": 0.00033400000000000004,
"step": 168,
"tokens_trained": 0.082568184
},
{
"epoch": 0.048223530246081836,
"grad_norm": 437.6822814941406,
"loss": 11.5509,
"lr": 0.00033800000000000003,
"step": 170,
"tokens_trained": 0.083553352
},
{
"epoch": 0.04879086589603574,
"grad_norm": 1119.0416259765625,
"loss": 16.2689,
"lr": 0.000342,
"step": 172,
"tokens_trained": 0.084536352
},
{
"epoch": 0.04935820154598965,
"grad_norm": 895.4021606445312,
"loss": 12.6663,
"lr": 0.000346,
"step": 174,
"tokens_trained": 0.085517312
},
{
"epoch": 0.04992553719594355,
"grad_norm": 995.6289672851562,
"loss": 26.0663,
"lr": 0.00035,
"step": 176,
"tokens_trained": 0.086496088
},
{
"epoch": 0.05049287284589746,
"grad_norm": 839.6610717773438,
"loss": 21.5115,
"lr": 0.000354,
"step": 178,
"tokens_trained": 0.087480632
},
{
"epoch": 0.051060208495851356,
"grad_norm": 734.1155395507812,
"loss": 29.3287,
"lr": 0.000358,
"step": 180,
"tokens_trained": 0.088460408
},
{
"epoch": 0.05162754414580526,
"grad_norm": 721.4505615234375,
"loss": 26.0801,
"lr": 0.000362,
"step": 182,
"tokens_trained": 0.08944248
},
{
"epoch": 0.052194879795759166,
"grad_norm": 845.9672241210938,
"loss": 19.0639,
"lr": 0.000366,
"step": 184,
"tokens_trained": 0.090427832
},
{
"epoch": 0.05276221544571307,
"grad_norm": 1210.9969482421875,
"loss": 23.9036,
"lr": 0.00037,
"step": 186,
"tokens_trained": 0.091411504
},
{
"epoch": 0.05332955109566698,
"grad_norm": 1079.1690673828125,
"loss": 23.5588,
"lr": 0.000374,
"step": 188,
"tokens_trained": 0.092392672
},
{
"epoch": 0.053896886745620876,
"grad_norm": 596.111328125,
"loss": 20.8275,
"lr": 0.000378,
"step": 190,
"tokens_trained": 0.093374696
},
{
"epoch": 0.05446422239557478,
"grad_norm": 761.8096923828125,
"loss": 22.512,
"lr": 0.000382,
"step": 192,
"tokens_trained": 0.094361912
},
{
"epoch": 0.055031558045528686,
"grad_norm": 1081.9832763671875,
"loss": 32.335,
"lr": 0.000386,
"step": 194,
"tokens_trained": 0.095342992
},
{
"epoch": 0.05559889369548259,
"grad_norm": 304.3534240722656,
"loss": 11.5275,
"lr": 0.00039000000000000005,
"step": 196,
"tokens_trained": 0.096323512
},
{
"epoch": 0.0561662293454365,
"grad_norm": 586.6314086914062,
"loss": 16.2663,
"lr": 0.00039400000000000004,
"step": 198,
"tokens_trained": 0.097308864
},
{
"epoch": 0.056733564995390395,
"grad_norm": 624.9953002929688,
"loss": 16.627,
"lr": 0.000398,
"step": 200,
"tokens_trained": 0.098289064
},
{
"epoch": 0.0573009006453443,
"grad_norm": 585.9645385742188,
"loss": 15.8359,
"lr": 0.000402,
"step": 202,
"tokens_trained": 0.099269696
},
{
"epoch": 0.057868236295298206,
"grad_norm": 537.9913330078125,
"loss": 20.0779,
"lr": 0.00040600000000000006,
"step": 204,
"tokens_trained": 0.100248448
},
{
"epoch": 0.05843557194525211,
"grad_norm": 805.04931640625,
"loss": 21.4524,
"lr": 0.00041,
"step": 206,
"tokens_trained": 0.101231248
},
{
"epoch": 0.05900290759520602,
"grad_norm": 439.1418151855469,
"loss": 23.9852,
"lr": 0.000414,
"step": 208,
"tokens_trained": 0.102210688
},
{
"epoch": 0.059570243245159915,
"grad_norm": 502.684814453125,
"loss": 17.6273,
"lr": 0.00041799999999999997,
"step": 210,
"tokens_trained": 0.103192176
},
{
"epoch": 0.06013757889511382,
"grad_norm": 849.9979858398438,
"loss": 33.7517,
"lr": 0.000422,
"step": 212,
"tokens_trained": 0.104172824
},
{
"epoch": 0.060704914545067726,
"grad_norm": 939.583740234375,
"loss": 26.2559,
"lr": 0.000426,
"step": 214,
"tokens_trained": 0.105156672
},
{
"epoch": 0.06127225019502163,
"grad_norm": 525.0505981445312,
"loss": 20.0923,
"lr": 0.00043,
"step": 216,
"tokens_trained": 0.106141368
},
{
"epoch": 0.061839585844975536,
"grad_norm": 420.296630859375,
"loss": 17.9608,
"lr": 0.00043400000000000003,
"step": 218,
"tokens_trained": 0.107124088
},
{
"epoch": 0.062406921494929435,
"grad_norm": 711.3380737304688,
"loss": 19.387,
"lr": 0.000438,
"step": 220,
"tokens_trained": 0.108112632
},
{
"epoch": 0.06297425714488335,
"grad_norm": 759.183349609375,
"loss": 17.8061,
"lr": 0.000442,
"step": 222,
"tokens_trained": 0.1090934
},
{
"epoch": 0.06354159279483725,
"grad_norm": 790.025146484375,
"loss": 13.8539,
"lr": 0.000446,
"step": 224,
"tokens_trained": 0.110079512
},
{
"epoch": 0.06410892844479114,
"grad_norm": 769.8306274414062,
"loss": 22.1258,
"lr": 0.00045000000000000004,
"step": 226,
"tokens_trained": 0.111060152
},
{
"epoch": 0.06467626409474506,
"grad_norm": 656.8352661132812,
"loss": 14.8646,
"lr": 0.00045400000000000003,
"step": 228,
"tokens_trained": 0.112044144
},
{
"epoch": 0.06524359974469895,
"grad_norm": 498.92010498046875,
"loss": 23.1558,
"lr": 0.000458,
"step": 230,
"tokens_trained": 0.113022928
},
{
"epoch": 0.06581093539465287,
"grad_norm": 764.0186157226562,
"loss": 16.7089,
"lr": 0.000462,
"step": 232,
"tokens_trained": 0.114003832
},
{
"epoch": 0.06637827104460677,
"grad_norm": 491.5793762207031,
"loss": 12.3979,
"lr": 0.00046600000000000005,
"step": 234,
"tokens_trained": 0.114991008
},
{
"epoch": 0.06694560669456066,
"grad_norm": 679.9217529296875,
"loss": 14.9037,
"lr": 0.00047,
"step": 236,
"tokens_trained": 0.115971888
},
{
"epoch": 0.06751294234451458,
"grad_norm": 491.0369567871094,
"loss": 7.7603,
"lr": 0.000474,
"step": 238,
"tokens_trained": 0.116952616
},
{
"epoch": 0.06808027799446847,
"grad_norm": 369.2186279296875,
"loss": 8.2256,
"lr": 0.00047799999999999996,
"step": 240,
"tokens_trained": 0.117935816
},
{
"epoch": 0.06864761364442239,
"grad_norm": 312.72137451171875,
"loss": 7.5486,
"lr": 0.000482,
"step": 242,
"tokens_trained": 0.118919392
},
{
"epoch": 0.06921494929437629,
"grad_norm": 596.1439208984375,
"loss": 11.7351,
"lr": 0.000486,
"step": 244,
"tokens_trained": 0.119901856
},
{
"epoch": 0.06978228494433018,
"grad_norm": 467.5667419433594,
"loss": 11.8403,
"lr": 0.00049,
"step": 246,
"tokens_trained": 0.120884624
},
{
"epoch": 0.0703496205942841,
"grad_norm": 430.50048828125,
"loss": 13.8081,
"lr": 0.000494,
"step": 248,
"tokens_trained": 0.121869224
},
{
"epoch": 0.070916956244238,
"grad_norm": 522.242919921875,
"loss": 14.1892,
"lr": 0.000498,
"step": 250,
"tokens_trained": 0.122853584
},
{
"epoch": 0.070916956244238,
"eval_loss": 1.9294606447219849,
"eval_runtime": 20.4162,
"step": 250,
"tokens_trained": 0.122853584
},
{
"epoch": 0.0714842918941919,
"grad_norm": 835.2765502929688,
"loss": 13.2462,
"lr": 0.0005020000000000001,
"step": 252,
"tokens_trained": 0.123835544
},
{
"epoch": 0.0720516275441458,
"grad_norm": 714.8098754882812,
"loss": 20.0498,
"lr": 0.000506,
"step": 254,
"tokens_trained": 0.124821616
},
{
"epoch": 0.0726189631940997,
"grad_norm": 701.512939453125,
"loss": 18.3664,
"lr": 0.00051,
"step": 256,
"tokens_trained": 0.125807608
},
{
"epoch": 0.07318629884405362,
"grad_norm": 773.987060546875,
"loss": 21.3807,
"lr": 0.000514,
"step": 258,
"tokens_trained": 0.126791464
},
{
"epoch": 0.07375363449400751,
"grad_norm": 826.422119140625,
"loss": 22.6403,
"lr": 0.000518,
"step": 260,
"tokens_trained": 0.127771752
},
{
"epoch": 0.07432097014396143,
"grad_norm": 742.8673095703125,
"loss": 20.1504,
"lr": 0.000522,
"step": 262,
"tokens_trained": 0.128755448
},
{
"epoch": 0.07488830579391532,
"grad_norm": 797.79296875,
"loss": 26.7343,
"lr": 0.000526,
"step": 264,
"tokens_trained": 0.129741088
},
{
"epoch": 0.07545564144386922,
"grad_norm": 673.9141235351562,
"loss": 12.505,
"lr": 0.0005300000000000001,
"step": 266,
"tokens_trained": 0.130727504
},
{
"epoch": 0.07602297709382314,
"grad_norm": 310.6510925292969,
"loss": 12.6344,
"lr": 0.0005340000000000001,
"step": 268,
"tokens_trained": 0.131710296
},
{
"epoch": 0.07659031274377703,
"grad_norm": 312.40966796875,
"loss": 14.254,
"lr": 0.0005380000000000001,
"step": 270,
"tokens_trained": 0.132695352
},
{
"epoch": 0.07715764839373095,
"grad_norm": 492.2834777832031,
"loss": 19.0979,
"lr": 0.0005420000000000001,
"step": 272,
"tokens_trained": 0.133677928
},
{
"epoch": 0.07772498404368484,
"grad_norm": 628.457763671875,
"loss": 21.7735,
"lr": 0.000546,
"step": 274,
"tokens_trained": 0.134655504
},
{
"epoch": 0.07829231969363874,
"grad_norm": 382.8389892578125,
"loss": 12.5128,
"lr": 0.00055,
"step": 276,
"tokens_trained": 0.135640208
},
{
"epoch": 0.07885965534359266,
"grad_norm": 483.12335205078125,
"loss": 15.2589,
"lr": 0.000554,
"step": 278,
"tokens_trained": 0.136624232
},
{
"epoch": 0.07942699099354655,
"grad_norm": 640.658447265625,
"loss": 12.1341,
"lr": 0.000558,
"step": 280,
"tokens_trained": 0.13760628
},
{
"epoch": 0.07999432664350047,
"grad_norm": 410.0824279785156,
"loss": 12.5723,
"lr": 0.0005620000000000001,
"step": 282,
"tokens_trained": 0.13858832
},
{
"epoch": 0.08056166229345436,
"grad_norm": 513.2861328125,
"loss": 14.8461,
"lr": 0.000566,
"step": 284,
"tokens_trained": 0.139568424
},
{
"epoch": 0.08112899794340826,
"grad_norm": 564.547607421875,
"loss": 12.5792,
"lr": 0.00057,
"step": 286,
"tokens_trained": 0.140557016
},
{
"epoch": 0.08169633359336217,
"grad_norm": 451.3592834472656,
"loss": 16.5433,
"lr": 0.000574,
"step": 288,
"tokens_trained": 0.141540248
},
{
"epoch": 0.08226366924331607,
"grad_norm": 404.2495422363281,
"loss": 16.4138,
"lr": 0.000578,
"step": 290,
"tokens_trained": 0.142528272
},
{
"epoch": 0.08283100489326999,
"grad_norm": 566.5219116210938,
"loss": 16.4743,
"lr": 0.0005819999999999999,
"step": 292,
"tokens_trained": 0.143513096
},
{
"epoch": 0.08339834054322388,
"grad_norm": 559.6517333984375,
"loss": 16.421,
"lr": 0.0005859999999999999,
"step": 294,
"tokens_trained": 0.144494472
},
{
"epoch": 0.08396567619317778,
"grad_norm": 260.874755859375,
"loss": 11.2214,
"lr": 0.00059,
"step": 296,
"tokens_trained": 0.14547876
},
{
"epoch": 0.0845330118431317,
"grad_norm": 272.02899169921875,
"loss": 10.3491,
"lr": 0.000594,
"step": 298,
"tokens_trained": 0.146465864
},
{
"epoch": 0.08510034749308559,
"grad_norm": 556.9845581054688,
"loss": 10.4348,
"lr": 0.000598,
"step": 300,
"tokens_trained": 0.147446344
},
{
"epoch": 0.0856676831430395,
"grad_norm": 273.35772705078125,
"loss": 8.3292,
"lr": 0.000602,
"step": 302,
"tokens_trained": 0.14843244
},
{
"epoch": 0.0862350187929934,
"grad_norm": 246.6316680908203,
"loss": 9.9362,
"lr": 0.000606,
"step": 304,
"tokens_trained": 0.149415976
},
{
"epoch": 0.0868023544429473,
"grad_norm": 564.4365844726562,
"loss": 9.2621,
"lr": 0.00061,
"step": 306,
"tokens_trained": 0.150398728
},
{
"epoch": 0.08736969009290121,
"grad_norm": 396.0948791503906,
"loss": 11.8526,
"lr": 0.000614,
"step": 308,
"tokens_trained": 0.151385104
},
{
"epoch": 0.08793702574285511,
"grad_norm": 488.6072692871094,
"loss": 11.8473,
"lr": 0.0006180000000000001,
"step": 310,
"tokens_trained": 0.152373672
},
{
"epoch": 0.08850436139280903,
"grad_norm": 346.70660400390625,
"loss": 12.0897,
"lr": 0.000622,
"step": 312,
"tokens_trained": 0.153356256
},
{
"epoch": 0.08907169704276292,
"grad_norm": 382.40679931640625,
"loss": 9.271,
"lr": 0.000626,
"step": 314,
"tokens_trained": 0.154342632
},
{
"epoch": 0.08963903269271682,
"grad_norm": 288.7908935546875,
"loss": 9.185,
"lr": 0.00063,
"step": 316,
"tokens_trained": 0.1553238
},
{
"epoch": 0.09020636834267073,
"grad_norm": 337.5335388183594,
"loss": 12.0555,
"lr": 0.000634,
"step": 318,
"tokens_trained": 0.156313168
},
{
"epoch": 0.09077370399262463,
"grad_norm": 349.25531005859375,
"loss": 8.51,
"lr": 0.000638,
"step": 320,
"tokens_trained": 0.157299448
},
{
"epoch": 0.09134103964257854,
"grad_norm": 471.7824401855469,
"loss": 14.1888,
"lr": 0.000642,
"step": 322,
"tokens_trained": 0.158285264
},
{
"epoch": 0.09190837529253244,
"grad_norm": 284.94036865234375,
"loss": 10.1593,
"lr": 0.000646,
"step": 324,
"tokens_trained": 0.159267512
},
{
"epoch": 0.09247571094248634,
"grad_norm": 510.90478515625,
"loss": 13.5744,
"lr": 0.0006500000000000001,
"step": 326,
"tokens_trained": 0.160250856
},
{
"epoch": 0.09304304659244025,
"grad_norm": 373.82965087890625,
"loss": 8.4999,
"lr": 0.0006540000000000001,
"step": 328,
"tokens_trained": 0.161231832
},
{
"epoch": 0.09361038224239415,
"grad_norm": 219.3827362060547,
"loss": 8.4436,
"lr": 0.0006580000000000001,
"step": 330,
"tokens_trained": 0.162217656
},
{
"epoch": 0.09417771789234806,
"grad_norm": 433.0914001464844,
"loss": 11.2019,
"lr": 0.000662,
"step": 332,
"tokens_trained": 0.163199096
},
{
"epoch": 0.09474505354230196,
"grad_norm": 242.65907287597656,
"loss": 9.0666,
"lr": 0.000666,
"step": 334,
"tokens_trained": 0.164178512
},
{
"epoch": 0.09531238919225588,
"grad_norm": 446.07916259765625,
"loss": 8.6546,
"lr": 0.00067,
"step": 336,
"tokens_trained": 0.165162464
},
{
"epoch": 0.09587972484220977,
"grad_norm": 231.8892364501953,
"loss": 7.5819,
"lr": 0.000674,
"step": 338,
"tokens_trained": 0.166141536
},
{
"epoch": 0.09644706049216367,
"grad_norm": 100.7306137084961,
"loss": 6.7047,
"lr": 0.0006780000000000001,
"step": 340,
"tokens_trained": 0.167123944
},
{
"epoch": 0.09701439614211758,
"grad_norm": 78.11279296875,
"loss": 5.9308,
"lr": 0.0006820000000000001,
"step": 342,
"tokens_trained": 0.168105264
},
{
"epoch": 0.09758173179207148,
"grad_norm": 271.466064453125,
"loss": 6.9141,
"lr": 0.0006860000000000001,
"step": 344,
"tokens_trained": 0.169088912
},
{
"epoch": 0.0981490674420254,
"grad_norm": 252.54478454589844,
"loss": 6.3281,
"lr": 0.00069,
"step": 346,
"tokens_trained": 0.170077368
},
{
"epoch": 0.0987164030919793,
"grad_norm": 305.8559875488281,
"loss": 6.443,
"lr": 0.000694,
"step": 348,
"tokens_trained": 0.171057232
},
{
"epoch": 0.09928373874193319,
"grad_norm": 227.74374389648438,
"loss": 6.552,
"lr": 0.0006979999999999999,
"step": 350,
"tokens_trained": 0.172041376
},
{
"epoch": 0.0998510743918871,
"grad_norm": 446.7601623535156,
"loss": 10.8184,
"lr": 0.0007019999999999999,
"step": 352,
"tokens_trained": 0.173023624
},
{
"epoch": 0.100418410041841,
"grad_norm": 353.0849609375,
"loss": 8.6327,
"lr": 0.0007059999999999999,
"step": 354,
"tokens_trained": 0.174005992
},
{
"epoch": 0.10098574569179491,
"grad_norm": 367.9427185058594,
"loss": 9.3898,
"lr": 0.00071,
"step": 356,
"tokens_trained": 0.174988304
},
{
"epoch": 0.10155308134174881,
"grad_norm": 224.4961700439453,
"loss": 8.284,
"lr": 0.000714,
"step": 358,
"tokens_trained": 0.175969816
},
{
"epoch": 0.10212041699170271,
"grad_norm": 221.86537170410156,
"loss": 7.0578,
"lr": 0.000718,
"step": 360,
"tokens_trained": 0.176952688
},
{
"epoch": 0.10268775264165662,
"grad_norm": 331.0989685058594,
"loss": 6.9561,
"lr": 0.000722,
"step": 362,
"tokens_trained": 0.177935144
},
{
"epoch": 0.10325508829161052,
"grad_norm": 171.6498260498047,
"loss": 7.203,
"lr": 0.000726,
"step": 364,
"tokens_trained": 0.178916776
},
{
"epoch": 0.10382242394156443,
"grad_norm": 284.2208557128906,
"loss": 10.3517,
"lr": 0.00073,
"step": 366,
"tokens_trained": 0.179903432
},
{
"epoch": 0.10438975959151833,
"grad_norm": 354.8574523925781,
"loss": 9.3888,
"lr": 0.000734,
"step": 368,
"tokens_trained": 0.180883224
},
{
"epoch": 0.10495709524147223,
"grad_norm": 344.82574462890625,
"loss": 10.5933,
"lr": 0.000738,
"step": 370,
"tokens_trained": 0.181863808
},
{
"epoch": 0.10552443089142614,
"grad_norm": 302.6838073730469,
"loss": 10.2832,
"lr": 0.000742,
"step": 372,
"tokens_trained": 0.182843712
},
{
"epoch": 0.10609176654138004,
"grad_norm": 323.0387878417969,
"loss": 6.4864,
"lr": 0.000746,
"step": 374,
"tokens_trained": 0.183825832
},
{
"epoch": 0.10637543436635699,
"eval_loss": 1.4430732727050781,
"eval_runtime": 20.5468,
"step": 375,
"tokens_trained": 0.184317744
},
{
"epoch": 0.10665910219133395,
"grad_norm": 133.74822998046875,
"loss": 5.4176,
"lr": 0.00075,
"step": 376,
"tokens_trained": 0.184811352
},
{
"epoch": 0.10722643784128785,
"grad_norm": 180.3372344970703,
"loss": 5.5641,
"lr": 0.000754,
"step": 378,
"tokens_trained": 0.185792528
},
{
"epoch": 0.10779377349124175,
"grad_norm": 250.83999633789062,
"loss": 5.8612,
"lr": 0.000758,
"step": 380,
"tokens_trained": 0.186777112
},
{
"epoch": 0.10836110914119566,
"grad_norm": 293.51959228515625,
"loss": 6.0418,
"lr": 0.000762,
"step": 382,
"tokens_trained": 0.18775724
},
{
"epoch": 0.10892844479114956,
"grad_norm": 292.56207275390625,
"loss": 6.1812,
"lr": 0.0007660000000000001,
"step": 384,
"tokens_trained": 0.188733568
},
{
"epoch": 0.10949578044110347,
"grad_norm": 121.82467651367188,
"loss": 6.0855,
"lr": 0.0007700000000000001,
"step": 386,
"tokens_trained": 0.189718512
},
{
"epoch": 0.11006311609105737,
"grad_norm": 124.30497741699219,
"loss": 5.7734,
"lr": 0.0007740000000000001,
"step": 388,
"tokens_trained": 0.190703776
},
{
"epoch": 0.11063045174101127,
"grad_norm": 143.64004516601562,
"loss": 5.7641,
"lr": 0.000778,
"step": 390,
"tokens_trained": 0.191689888
},
{
"epoch": 0.11119778739096518,
"grad_norm": 160.06784057617188,
"loss": 5.6025,
"lr": 0.000782,
"step": 392,
"tokens_trained": 0.192673992
},
{
"epoch": 0.11176512304091908,
"grad_norm": 226.97988891601562,
"loss": 6.0049,
"lr": 0.000786,
"step": 394,
"tokens_trained": 0.193656272
},
{
"epoch": 0.112332458690873,
"grad_norm": 223.26898193359375,
"loss": 5.6972,
"lr": 0.00079,
"step": 396,
"tokens_trained": 0.194639144
},
{
"epoch": 0.11289979434082689,
"grad_norm": 249.34912109375,
"loss": 5.7348,
"lr": 0.0007940000000000001,
"step": 398,
"tokens_trained": 0.195621256
},
{
"epoch": 0.11346712999078079,
"grad_norm": 161.34271240234375,
"loss": 5.6689,
"lr": 0.0007980000000000001,
"step": 400,
"tokens_trained": 0.196604136
},
{
"epoch": 0.1140344656407347,
"grad_norm": 148.53176879882812,
"loss": 5.702,
"lr": 0.0008020000000000001,
"step": 402,
"tokens_trained": 0.197586784
},
{
"epoch": 0.1146018012906886,
"grad_norm": 144.40835571289062,
"loss": 6.2402,
"lr": 0.0008060000000000001,
"step": 404,
"tokens_trained": 0.198570824
},
{
"epoch": 0.11516913694064251,
"grad_norm": 306.57562255859375,
"loss": 7.1739,
"lr": 0.0008100000000000001,
"step": 406,
"tokens_trained": 0.199548328
},
{
"epoch": 0.11573647259059641,
"grad_norm": 308.79180908203125,
"loss": 6.0972,
"lr": 0.0008139999999999999,
"step": 408,
"tokens_trained": 0.200532496
},
{
"epoch": 0.11630380824055031,
"grad_norm": 197.76791381835938,
"loss": 6.3533,
"lr": 0.0008179999999999999,
"step": 410,
"tokens_trained": 0.201514648
},
{
"epoch": 0.11687114389050422,
"grad_norm": 129.5694580078125,
"loss": 6.9628,
"lr": 0.0008219999999999999,
"step": 412,
"tokens_trained": 0.2024994
},
{
"epoch": 0.11743847954045812,
"grad_norm": 446.0195617675781,
"loss": 11.7562,
"lr": 0.000826,
"step": 414,
"tokens_trained": 0.20348012
},
{
"epoch": 0.11800581519041203,
"grad_norm": 355.5342712402344,
"loss": 8.8055,
"lr": 0.00083,
"step": 416,
"tokens_trained": 0.20446356
},
{
"epoch": 0.11857315084036593,
"grad_norm": 456.2491149902344,
"loss": 9.606,
"lr": 0.000834,
"step": 418,
"tokens_trained": 0.205445288
},
{
"epoch": 0.11914048649031983,
"grad_norm": 369.8676452636719,
"loss": 8.385,
"lr": 0.000838,
"step": 420,
"tokens_trained": 0.206427832
},
{
"epoch": 0.11970782214027374,
"grad_norm": 262.19073486328125,
"loss": 9.0956,
"lr": 0.000842,
"step": 422,
"tokens_trained": 0.207409848
},
{
"epoch": 0.12027515779022764,
"grad_norm": 120.3193130493164,
"loss": 5.4937,
"lr": 0.000846,
"step": 424,
"tokens_trained": 0.208391752
},
{
"epoch": 0.12084249344018155,
"grad_norm": 222.1111297607422,
"loss": 8.9367,
"lr": 0.00085,
"step": 426,
"tokens_trained": 0.20937384
},
{
"epoch": 0.12140982909013545,
"grad_norm": 137.16819763183594,
"loss": 7.5876,
"lr": 0.000854,
"step": 428,
"tokens_trained": 0.210358576
},
{
"epoch": 0.12197716474008935,
"grad_norm": 267.61846923828125,
"loss": 8.817,
"lr": 0.000858,
"step": 430,
"tokens_trained": 0.211340064
},
{
"epoch": 0.12254450039004326,
"grad_norm": 472.72906494140625,
"loss": 8.203,
"lr": 0.000862,
"step": 432,
"tokens_trained": 0.212321144
},
{
"epoch": 0.12311183603999716,
"grad_norm": 297.1420593261719,
"loss": 10.987,
"lr": 0.000866,
"step": 434,
"tokens_trained": 0.213300312
},
{
"epoch": 0.12367917168995107,
"grad_norm": 281.7297668457031,
"loss": 7.6117,
"lr": 0.00087,
"step": 436,
"tokens_trained": 0.214287624
},
{
"epoch": 0.12424650733990497,
"grad_norm": 203.09678649902344,
"loss": 6.5638,
"lr": 0.000874,
"step": 438,
"tokens_trained": 0.215272136
},
{
"epoch": 0.12481384298985887,
"grad_norm": 155.7823944091797,
"loss": 6.1131,
"lr": 0.000878,
"step": 440,
"tokens_trained": 0.216256392
},
{
"epoch": 0.12538117863981277,
"grad_norm": 189.86196899414062,
"loss": 8.2565,
"lr": 0.000882,
"step": 442,
"tokens_trained": 0.217242504
},
{
"epoch": 0.1259485142897667,
"grad_norm": 247.4568634033203,
"loss": 7.1005,
"lr": 0.0008860000000000001,
"step": 444,
"tokens_trained": 0.218226008
},
{
"epoch": 0.1265158499397206,
"grad_norm": 179.72825622558594,
"loss": 6.3379,
"lr": 0.0008900000000000001,
"step": 446,
"tokens_trained": 0.219210584
},
{
"epoch": 0.1270831855896745,
"grad_norm": 212.96356201171875,
"loss": 7.2514,
"lr": 0.000894,
"step": 448,
"tokens_trained": 0.220193952
},
{
"epoch": 0.1276505212396284,
"grad_norm": 105.67095947265625,
"loss": 5.456,
"lr": 0.000898,
"step": 450,
"tokens_trained": 0.221176936
},
{
"epoch": 0.1282178568895823,
"grad_norm": 302.9122619628906,
"loss": 6.4018,
"lr": 0.000902,
"step": 452,
"tokens_trained": 0.222161952
},
{
"epoch": 0.12878519253953621,
"grad_norm": 215.66561889648438,
"loss": 6.2853,
"lr": 0.000906,
"step": 454,
"tokens_trained": 0.223144912
},
{
"epoch": 0.1293525281894901,
"grad_norm": 272.9984130859375,
"loss": 7.3902,
"lr": 0.00091,
"step": 456,
"tokens_trained": 0.224127392
},
{
"epoch": 0.129919863839444,
"grad_norm": 200.7503662109375,
"loss": 6.1637,
"lr": 0.0009140000000000001,
"step": 458,
"tokens_trained": 0.22511648
},
{
"epoch": 0.1304871994893979,
"grad_norm": 93.23990631103516,
"loss": 6.4867,
"lr": 0.0009180000000000001,
"step": 460,
"tokens_trained": 0.226098144
},
{
"epoch": 0.1310545351393518,
"grad_norm": 274.37164306640625,
"loss": 8.99,
"lr": 0.0009220000000000001,
"step": 462,
"tokens_trained": 0.227081848
},
{
"epoch": 0.13162187078930573,
"grad_norm": 186.66322326660156,
"loss": 8.7122,
"lr": 0.0009260000000000001,
"step": 464,
"tokens_trained": 0.22806636
},
{
"epoch": 0.13218920643925963,
"grad_norm": 586.1035766601562,
"loss": 9.1045,
"lr": 0.00093,
"step": 466,
"tokens_trained": 0.229047872
},
{
"epoch": 0.13275654208921353,
"grad_norm": 227.55996704101562,
"loss": 9.7276,
"lr": 0.000934,
"step": 468,
"tokens_trained": 0.230031144
},
{
"epoch": 0.13332387773916743,
"grad_norm": 229.26609802246094,
"loss": 6.6244,
"lr": 0.0009379999999999999,
"step": 470,
"tokens_trained": 0.2310158
},
{
"epoch": 0.13389121338912133,
"grad_norm": 145.16331481933594,
"loss": 5.759,
"lr": 0.000942,
"step": 472,
"tokens_trained": 0.2319996
},
{
"epoch": 0.13445854903907525,
"grad_norm": 109.9937744140625,
"loss": 5.4838,
"lr": 0.000946,
"step": 474,
"tokens_trained": 0.232983808
},
{
"epoch": 0.13502588468902915,
"grad_norm": 135.74899291992188,
"loss": 6.2738,
"lr": 0.00095,
"step": 476,
"tokens_trained": 0.233963016
},
{
"epoch": 0.13559322033898305,
"grad_norm": 142.99449157714844,
"loss": 5.8459,
"lr": 0.000954,
"step": 478,
"tokens_trained": 0.234948864
},
{
"epoch": 0.13616055598893695,
"grad_norm": 198.66883850097656,
"loss": 6.6626,
"lr": 0.000958,
"step": 480,
"tokens_trained": 0.235932392
},
{
"epoch": 0.13672789163889085,
"grad_norm": 260.76507568359375,
"loss": 6.9299,
"lr": 0.000962,
"step": 482,
"tokens_trained": 0.236915664
},
{
"epoch": 0.13729522728884477,
"grad_norm": 267.97589111328125,
"loss": 6.4343,
"lr": 0.000966,
"step": 484,
"tokens_trained": 0.237896904
},
{
"epoch": 0.13786256293879867,
"grad_norm": 89.8781967163086,
"loss": 6.3203,
"lr": 0.0009699999999999999,
"step": 486,
"tokens_trained": 0.238874528
},
{
"epoch": 0.13842989858875257,
"grad_norm": 225.62985229492188,
"loss": 6.2778,
"lr": 0.000974,
"step": 488,
"tokens_trained": 0.2398588
},
{
"epoch": 0.13899723423870647,
"grad_norm": 85.84110260009766,
"loss": 5.2786,
"lr": 0.000978,
"step": 490,
"tokens_trained": 0.240839968
},
{
"epoch": 0.13956456988866037,
"grad_norm": 141.4368438720703,
"loss": 5.5525,
"lr": 0.000982,
"step": 492,
"tokens_trained": 0.241823544
},
{
"epoch": 0.1401319055386143,
"grad_norm": 94.9535140991211,
"loss": 5.4386,
"lr": 0.0009860000000000001,
"step": 494,
"tokens_trained": 0.242805456
},
{
"epoch": 0.1406992411885682,
"grad_norm": 157.4557647705078,
"loss": 5.9786,
"lr": 0.00099,
"step": 496,
"tokens_trained": 0.243792496
},
{
"epoch": 0.1412665768385221,
"grad_norm": 319.5025634765625,
"loss": 7.04,
"lr": 0.000994,
"step": 498,
"tokens_trained": 0.244772472
},
{
"epoch": 0.141833912488476,
"grad_norm": 282.26824951171875,
"loss": 9.4037,
"lr": 0.000998,
"step": 500,
"tokens_trained": 0.245758968
},
{
"epoch": 0.141833912488476,
"eval_loss": 2.152184247970581,
"eval_runtime": 21.2772,
"step": 500,
"tokens_trained": 0.245758968
},
{
"epoch": 0.1424012481384299,
"grad_norm": 306.0666809082031,
"loss": 7.8845,
"lr": 0.00099986013986014,
"step": 502,
"tokens_trained": 0.246739024
},
{
"epoch": 0.1429685837883838,
"grad_norm": 188.89024353027344,
"loss": 6.8118,
"lr": 0.0009995804195804196,
"step": 504,
"tokens_trained": 0.247726552
},
{
"epoch": 0.1435359194383377,
"grad_norm": 228.97474670410156,
"loss": 6.8475,
"lr": 0.0009993006993006994,
"step": 506,
"tokens_trained": 0.24870688
},
{
"epoch": 0.1441032550882916,
"grad_norm": 229.80029296875,
"loss": 6.2171,
"lr": 0.000999020979020979,
"step": 508,
"tokens_trained": 0.249689096
},
{
"epoch": 0.1446705907382455,
"grad_norm": 157.30340576171875,
"loss": 6.2281,
"lr": 0.0009987412587412587,
"step": 510,
"tokens_trained": 0.250671768
},
{
"epoch": 0.1452379263881994,
"grad_norm": 176.64683532714844,
"loss": 6.5993,
"lr": 0.0009984615384615386,
"step": 512,
"tokens_trained": 0.25165608
},
{
"epoch": 0.14580526203815333,
"grad_norm": 197.20526123046875,
"loss": 5.7267,
"lr": 0.0009981818181818182,
"step": 514,
"tokens_trained": 0.252639712
},
{
"epoch": 0.14637259768810723,
"grad_norm": 54.713260650634766,
"loss": 5.7911,
"lr": 0.000997902097902098,
"step": 516,
"tokens_trained": 0.253622816
},
{
"epoch": 0.14693993333806113,
"grad_norm": 185.74923706054688,
"loss": 7.0055,
"lr": 0.0009976223776223777,
"step": 518,
"tokens_trained": 0.254602792
},
{
"epoch": 0.14750726898801503,
"grad_norm": 240.31021118164062,
"loss": 6.452,
"lr": 0.0009973426573426573,
"step": 520,
"tokens_trained": 0.255584736
},
{
"epoch": 0.14807460463796893,
"grad_norm": 160.2477264404297,
"loss": 7.6556,
"lr": 0.000997062937062937,
"step": 522,
"tokens_trained": 0.256563792
},
{
"epoch": 0.14864194028792285,
"grad_norm": 283.0034484863281,
"loss": 6.5345,
"lr": 0.0009967832167832168,
"step": 524,
"tokens_trained": 0.257546656
},
{
"epoch": 0.14920927593787675,
"grad_norm": 245.537109375,
"loss": 6.3281,
"lr": 0.0009965034965034964,
"step": 526,
"tokens_trained": 0.258530832
},
{
"epoch": 0.14977661158783065,
"grad_norm": 162.1538848876953,
"loss": 7.4072,
"lr": 0.0009962237762237763,
"step": 528,
"tokens_trained": 0.259514528
},
{
"epoch": 0.15034394723778455,
"grad_norm": 107.25792694091797,
"loss": 5.356,
"lr": 0.000995944055944056,
"step": 530,
"tokens_trained": 0.260500912
},
{
"epoch": 0.15091128288773845,
"grad_norm": 173.73353576660156,
"loss": 6.8625,
"lr": 0.0009956643356643356,
"step": 532,
"tokens_trained": 0.26148632
},
{
"epoch": 0.15147861853769237,
"grad_norm": 178.33541870117188,
"loss": 5.8794,
"lr": 0.0009953846153846154,
"step": 534,
"tokens_trained": 0.262468816
},
{
"epoch": 0.15204595418764627,
"grad_norm": 181.2533416748047,
"loss": 7.0243,
"lr": 0.000995104895104895,
"step": 536,
"tokens_trained": 0.263446696
},
{
"epoch": 0.15261328983760017,
"grad_norm": 208.79293823242188,
"loss": 5.8908,
"lr": 0.000994825174825175,
"step": 538,
"tokens_trained": 0.26443108
},
{
"epoch": 0.15318062548755407,
"grad_norm": 148.66285705566406,
"loss": 6.0831,
"lr": 0.0009945454545454546,
"step": 540,
"tokens_trained": 0.265414496
},
{
"epoch": 0.15374796113750797,
"grad_norm": 165.044189453125,
"loss": 5.5594,
"lr": 0.0009942657342657344,
"step": 542,
"tokens_trained": 0.266394128
},
{
"epoch": 0.1543152967874619,
"grad_norm": 124.5405502319336,
"loss": 5.2442,
"lr": 0.000993986013986014,
"step": 544,
"tokens_trained": 0.267378768
},
{
"epoch": 0.1548826324374158,
"grad_norm": 68.66510772705078,
"loss": 5.1173,
"lr": 0.0009937062937062937,
"step": 546,
"tokens_trained": 0.268360184
},
{
"epoch": 0.1554499680873697,
"grad_norm": 57.052860260009766,
"loss": 5.2348,
"lr": 0.0009934265734265735,
"step": 548,
"tokens_trained": 0.269345672
},
{
"epoch": 0.1560173037373236,
"grad_norm": 184.9175567626953,
"loss": 6.7748,
"lr": 0.0009931468531468532,
"step": 550,
"tokens_trained": 0.2703288
},
{
"epoch": 0.15658463938727749,
"grad_norm": 72.9861831665039,
"loss": 5.7387,
"lr": 0.000992867132867133,
"step": 552,
"tokens_trained": 0.271309176
},
{
"epoch": 0.1571519750372314,
"grad_norm": 135.864501953125,
"loss": 6.3035,
"lr": 0.0009925874125874127,
"step": 554,
"tokens_trained": 0.27229644
},
{
"epoch": 0.1577193106871853,
"grad_norm": 130.579833984375,
"loss": 5.4434,
"lr": 0.0009923076923076923,
"step": 556,
"tokens_trained": 0.273277904
},
{
"epoch": 0.1582866463371392,
"grad_norm": 206.77345275878906,
"loss": 5.8649,
"lr": 0.000992027972027972,
"step": 558,
"tokens_trained": 0.274261712
},
{
"epoch": 0.1588539819870931,
"grad_norm": 144.0505828857422,
"loss": 5.3459,
"lr": 0.0009917482517482518,
"step": 560,
"tokens_trained": 0.2752468
},
{
"epoch": 0.159421317637047,
"grad_norm": 87.56634521484375,
"loss": 5.6321,
"lr": 0.0009914685314685314,
"step": 562,
"tokens_trained": 0.276232384
},
{
"epoch": 0.15998865328700093,
"grad_norm": 275.2727355957031,
"loss": 6.7515,
"lr": 0.0009911888111888113,
"step": 564,
"tokens_trained": 0.277211608
},
{
"epoch": 0.16055598893695483,
"grad_norm": 97.00019836425781,
"loss": 5.4374,
"lr": 0.000990909090909091,
"step": 566,
"tokens_trained": 0.278196336
},
{
"epoch": 0.16112332458690873,
"grad_norm": 102.91439056396484,
"loss": 5.729,
"lr": 0.0009906293706293705,
"step": 568,
"tokens_trained": 0.279175672
},
{
"epoch": 0.16169066023686263,
"grad_norm": 151.12432861328125,
"loss": 5.4189,
"lr": 0.0009903496503496504,
"step": 570,
"tokens_trained": 0.280161088
},
{
"epoch": 0.16225799588681653,
"grad_norm": 86.6823959350586,
"loss": 5.1704,
"lr": 0.00099006993006993,
"step": 572,
"tokens_trained": 0.28114256
},
{
"epoch": 0.16282533153677045,
"grad_norm": 90.7052230834961,
"loss": 5.3673,
"lr": 0.0009897902097902099,
"step": 574,
"tokens_trained": 0.282128904
},
{
"epoch": 0.16339266718672435,
"grad_norm": 146.92874145507812,
"loss": 5.5971,
"lr": 0.0009895104895104895,
"step": 576,
"tokens_trained": 0.28311528
},
{
"epoch": 0.16396000283667825,
"grad_norm": 189.76296997070312,
"loss": 5.3109,
"lr": 0.0009892307692307694,
"step": 578,
"tokens_trained": 0.284098528
},
{
"epoch": 0.16452733848663215,
"grad_norm": 174.48092651367188,
"loss": 5.68,
"lr": 0.000988951048951049,
"step": 580,
"tokens_trained": 0.285081064
},
{
"epoch": 0.16509467413658604,
"grad_norm": 154.10816955566406,
"loss": 5.3307,
"lr": 0.0009886713286713286,
"step": 582,
"tokens_trained": 0.286067952
},
{
"epoch": 0.16566200978653997,
"grad_norm": 64.28263092041016,
"loss": 5.1676,
"lr": 0.0009883916083916085,
"step": 584,
"tokens_trained": 0.287051384
},
{
"epoch": 0.16622934543649387,
"grad_norm": 103.81795501708984,
"loss": 5.3436,
"lr": 0.0009881118881118881,
"step": 586,
"tokens_trained": 0.28803284
},
{
"epoch": 0.16679668108644777,
"grad_norm": 144.0076904296875,
"loss": 5.3033,
"lr": 0.000987832167832168,
"step": 588,
"tokens_trained": 0.289014824
},
{
"epoch": 0.16736401673640167,
"grad_norm": 88.31237030029297,
"loss": 5.0609,
"lr": 0.0009875524475524476,
"step": 590,
"tokens_trained": 0.289999864
},
{
"epoch": 0.16793135238635556,
"grad_norm": 68.4583740234375,
"loss": 5.0702,
"lr": 0.0009872727272727273,
"step": 592,
"tokens_trained": 0.290983888
},
{
"epoch": 0.1684986880363095,
"grad_norm": 135.28665161132812,
"loss": 5.3962,
"lr": 0.000986993006993007,
"step": 594,
"tokens_trained": 0.291965752
},
{
"epoch": 0.1690660236862634,
"grad_norm": 80.0412368774414,
"loss": 5.0246,
"lr": 0.0009867132867132867,
"step": 596,
"tokens_trained": 0.292946952
},
{
"epoch": 0.1696333593362173,
"grad_norm": 43.29194641113281,
"loss": 5.0051,
"lr": 0.0009864335664335664,
"step": 598,
"tokens_trained": 0.293928976
},
{
"epoch": 0.17020069498617119,
"grad_norm": 220.88687133789062,
"loss": 6.0798,
"lr": 0.0009861538461538462,
"step": 600,
"tokens_trained": 0.294912408
},
{
"epoch": 0.17076803063612508,
"grad_norm": 102.58654022216797,
"loss": 5.1271,
"lr": 0.0009858741258741259,
"step": 602,
"tokens_trained": 0.29589416
},
{
"epoch": 0.171335366286079,
"grad_norm": 119.0067138671875,
"loss": 5.7402,
"lr": 0.0009855944055944055,
"step": 604,
"tokens_trained": 0.296878584
},
{
"epoch": 0.1719027019360329,
"grad_norm": 138.8656005859375,
"loss": 5.1951,
"lr": 0.0009853146853146854,
"step": 606,
"tokens_trained": 0.297864552
},
{
"epoch": 0.1724700375859868,
"grad_norm": 73.5890884399414,
"loss": 5.2522,
"lr": 0.000985034965034965,
"step": 608,
"tokens_trained": 0.298854088
},
{
"epoch": 0.1730373732359407,
"grad_norm": 113.78330993652344,
"loss": 5.6683,
"lr": 0.0009847552447552449,
"step": 610,
"tokens_trained": 0.299835024
},
{
"epoch": 0.1736047088858946,
"grad_norm": 125.20297241210938,
"loss": 5.1812,
"lr": 0.0009844755244755245,
"step": 612,
"tokens_trained": 0.30082032
},
{
"epoch": 0.17417204453584853,
"grad_norm": 67.46041870117188,
"loss": 5.0417,
"lr": 0.0009841958041958043,
"step": 614,
"tokens_trained": 0.301808456
},
{
"epoch": 0.17473938018580243,
"grad_norm": 117.30754852294922,
"loss": 5.3064,
"lr": 0.000983916083916084,
"step": 616,
"tokens_trained": 0.302794456
},
{
"epoch": 0.17530671583575633,
"grad_norm": 124.30754089355469,
"loss": 5.1614,
"lr": 0.0009836363636363636,
"step": 618,
"tokens_trained": 0.303777376
},
{
"epoch": 0.17587405148571023,
"grad_norm": 102.72042083740234,
"loss": 5.1265,
"lr": 0.0009833566433566435,
"step": 620,
"tokens_trained": 0.304758864
},
{
"epoch": 0.17644138713566412,
"grad_norm": 39.332252502441406,
"loss": 5.1078,
"lr": 0.000983076923076923,
"step": 622,
"tokens_trained": 0.30574392
},
{
"epoch": 0.17700872278561805,
"grad_norm": 153.84811401367188,
"loss": 5.7696,
"lr": 0.000982797202797203,
"step": 624,
"tokens_trained": 0.306727584
},
{
"epoch": 0.17729239061059499,
"eval_loss": 1.3463915586471558,
"eval_runtime": 20.8357,
"step": 625,
"tokens_trained": 0.307220496
},
{
"epoch": 0.17757605843557195,
"grad_norm": 160.2552490234375,
"loss": 5.2283,
"lr": 0.0009825174825174826,
"step": 626,
"tokens_trained": 0.307713024
},
{
"epoch": 0.17814339408552585,
"grad_norm": 186.77407836914062,
"loss": 5.2866,
"lr": 0.0009822377622377622,
"step": 628,
"tokens_trained": 0.308700128
},
{
"epoch": 0.17871072973547975,
"grad_norm": 84.55519104003906,
"loss": 5.1106,
"lr": 0.0009819580419580419,
"step": 630,
"tokens_trained": 0.309681208
},
{
"epoch": 0.17927806538543364,
"grad_norm": 20.617040634155273,
"loss": 4.8327,
"lr": 0.0009816783216783217,
"step": 632,
"tokens_trained": 0.310662224
},
{
"epoch": 0.17984540103538757,
"grad_norm": 168.06039428710938,
"loss": 6.0704,
"lr": 0.0009813986013986014,
"step": 634,
"tokens_trained": 0.31164064
},
{
"epoch": 0.18041273668534147,
"grad_norm": 238.23736572265625,
"loss": 5.6188,
"lr": 0.0009811188811188812,
"step": 636,
"tokens_trained": 0.312622568
},
{
"epoch": 0.18098007233529537,
"grad_norm": 140.0707550048828,
"loss": 6.4034,
"lr": 0.0009808391608391608,
"step": 638,
"tokens_trained": 0.313604944
},
{
"epoch": 0.18154740798524927,
"grad_norm": 161.19302368164062,
"loss": 5.4906,
"lr": 0.0009805594405594405,
"step": 640,
"tokens_trained": 0.314592072
},
{
"epoch": 0.18211474363520316,
"grad_norm": 121.9577407836914,
"loss": 5.2097,
"lr": 0.0009802797202797203,
"step": 642,
"tokens_trained": 0.315574392
},
{
"epoch": 0.1826820792851571,
"grad_norm": 121.25574493408203,
"loss": 5.0317,
"lr": 0.00098,
"step": 644,
"tokens_trained": 0.316559008
},
{
"epoch": 0.183249414935111,
"grad_norm": 28.328269958496094,
"loss": 4.932,
"lr": 0.0009797202797202798,
"step": 646,
"tokens_trained": 0.317538776
},
{
"epoch": 0.1838167505850649,
"grad_norm": 127.77408599853516,
"loss": 5.8335,
"lr": 0.0009794405594405595,
"step": 648,
"tokens_trained": 0.31851792
},
{
"epoch": 0.18438408623501878,
"grad_norm": 94.9522933959961,
"loss": 5.1948,
"lr": 0.000979160839160839,
"step": 650,
"tokens_trained": 0.319501576
},
{
"epoch": 0.18495142188497268,
"grad_norm": 110.33658599853516,
"loss": 5.098,
"lr": 0.000978881118881119,
"step": 652,
"tokens_trained": 0.320482392
},
{
"epoch": 0.1855187575349266,
"grad_norm": 67.23124694824219,
"loss": 4.7723,
"lr": 0.0009786013986013986,
"step": 654,
"tokens_trained": 0.32146712
},
{
"epoch": 0.1860860931848805,
"grad_norm": 61.519866943359375,
"loss": 4.7245,
"lr": 0.0009783216783216782,
"step": 656,
"tokens_trained": 0.322449576
},
{
"epoch": 0.1866534288348344,
"grad_norm": 99.51078033447266,
"loss": 4.783,
"lr": 0.000978041958041958,
"step": 658,
"tokens_trained": 0.323432688
},
{
"epoch": 0.1872207644847883,
"grad_norm": 44.619197845458984,
"loss": 4.7495,
"lr": 0.000977762237762238,
"step": 660,
"tokens_trained": 0.324413952
},
{
"epoch": 0.18778810013474223,
"grad_norm": 114.5891342163086,
"loss": 5.1261,
"lr": 0.0009774825174825176,
"step": 662,
"tokens_trained": 0.325394536
},
{
"epoch": 0.18835543578469613,
"grad_norm": 100.3728256225586,
"loss": 4.7883,
"lr": 0.0009772027972027972,
"step": 664,
"tokens_trained": 0.326374672
},
{
"epoch": 0.18892277143465003,
"grad_norm": 51.883033752441406,
"loss": 4.7249,
"lr": 0.0009769230769230768,
"step": 666,
"tokens_trained": 0.327357152
},
{
"epoch": 0.18949010708460393,
"grad_norm": 82.27507019042969,
"loss": 4.8277,
"lr": 0.0009766433566433567,
"step": 668,
"tokens_trained": 0.328342088
},
{
"epoch": 0.19005744273455782,
"grad_norm": 83.53064727783203,
"loss": 4.8338,
"lr": 0.0009763636363636363,
"step": 670,
"tokens_trained": 0.329319248
},
{
"epoch": 0.19062477838451175,
"grad_norm": 76.18387603759766,
"loss": 4.6958,
"lr": 0.0009760839160839161,
"step": 672,
"tokens_trained": 0.330305968
},
{
"epoch": 0.19119211403446565,
"grad_norm": 27.401426315307617,
"loss": 4.6929,
"lr": 0.0009758041958041958,
"step": 674,
"tokens_trained": 0.3312912
},
{
"epoch": 0.19175944968441955,
"grad_norm": 186.770263671875,
"loss": 5.5089,
"lr": 0.0009755244755244756,
"step": 676,
"tokens_trained": 0.332275224
},
{
"epoch": 0.19232678533437345,
"grad_norm": 105.02385711669922,
"loss": 4.8876,
"lr": 0.0009752447552447553,
"step": 678,
"tokens_trained": 0.33325588
},
{
"epoch": 0.19289412098432734,
"grad_norm": 94.96269989013672,
"loss": 5.1235,
"lr": 0.0009749650349650349,
"step": 680,
"tokens_trained": 0.334238408
},
{
"epoch": 0.19346145663428127,
"grad_norm": 92.29356384277344,
"loss": 4.8194,
"lr": 0.0009746853146853148,
"step": 682,
"tokens_trained": 0.335219368
},
{
"epoch": 0.19402879228423517,
"grad_norm": 59.1584358215332,
"loss": 4.7511,
"lr": 0.0009744055944055944,
"step": 684,
"tokens_trained": 0.336207136
},
{
"epoch": 0.19459612793418907,
"grad_norm": 54.759002685546875,
"loss": 4.777,
"lr": 0.0009741258741258742,
"step": 686,
"tokens_trained": 0.337193536
},
{
"epoch": 0.19516346358414297,
"grad_norm": 92.20452880859375,
"loss": 4.8225,
"lr": 0.0009738461538461538,
"step": 688,
"tokens_trained": 0.338179224
},
{
"epoch": 0.19573079923409686,
"grad_norm": 75.97005462646484,
"loss": 4.655,
"lr": 0.0009735664335664336,
"step": 690,
"tokens_trained": 0.339162168
},
{
"epoch": 0.1962981348840508,
"grad_norm": 58.19076919555664,
"loss": 4.6446,
"lr": 0.0009732867132867133,
"step": 692,
"tokens_trained": 0.340138904
},
{
"epoch": 0.1968654705340047,
"grad_norm": 50.81512451171875,
"loss": 4.5866,
"lr": 0.000973006993006993,
"step": 694,
"tokens_trained": 0.34112288
},
{
"epoch": 0.1974328061839586,
"grad_norm": 61.683372497558594,
"loss": 4.6018,
"lr": 0.0009727272727272728,
"step": 696,
"tokens_trained": 0.342111992
},
{
"epoch": 0.19800014183391249,
"grad_norm": 61.01798629760742,
"loss": 4.6007,
"lr": 0.0009724475524475524,
"step": 698,
"tokens_trained": 0.343095912
},
{
"epoch": 0.19856747748386638,
"grad_norm": 96.49671936035156,
"loss": 4.7035,
"lr": 0.0009721678321678323,
"step": 700,
"tokens_trained": 0.344078632
},
{
"epoch": 0.1991348131338203,
"grad_norm": 64.7771224975586,
"loss": 4.8341,
"lr": 0.0009718881118881119,
"step": 702,
"tokens_trained": 0.345060576
},
{
"epoch": 0.1997021487837742,
"grad_norm": 90.1478042602539,
"loss": 4.7739,
"lr": 0.0009716083916083917,
"step": 704,
"tokens_trained": 0.34604112
},
{
"epoch": 0.2002694844337281,
"grad_norm": 67.6308822631836,
"loss": 4.6218,
"lr": 0.0009713286713286713,
"step": 706,
"tokens_trained": 0.347023496
},
{
"epoch": 0.200836820083682,
"grad_norm": 40.50175094604492,
"loss": 4.6008,
"lr": 0.000971048951048951,
"step": 708,
"tokens_trained": 0.348005416
},
{
"epoch": 0.2014041557336359,
"grad_norm": 33.6448860168457,
"loss": 4.5307,
"lr": 0.0009707692307692308,
"step": 710,
"tokens_trained": 0.3489886
},
{
"epoch": 0.20197149138358983,
"grad_norm": 15.484851837158203,
"loss": 4.5065,
"lr": 0.0009704895104895105,
"step": 712,
"tokens_trained": 0.34997024
},
{
"epoch": 0.20253882703354373,
"grad_norm": 109.26301574707031,
"loss": 4.9613,
"lr": 0.0009702097902097903,
"step": 714,
"tokens_trained": 0.350958496
},
{
"epoch": 0.20310616268349763,
"grad_norm": 150.07492065429688,
"loss": 4.8507,
"lr": 0.0009699300699300699,
"step": 716,
"tokens_trained": 0.35193892
},
{
"epoch": 0.20367349833345152,
"grad_norm": 113.43978881835938,
"loss": 5.4494,
"lr": 0.0009696503496503498,
"step": 718,
"tokens_trained": 0.35291908
},
{
"epoch": 0.20424083398340542,
"grad_norm": 123.0071792602539,
"loss": 4.9475,
"lr": 0.0009693706293706294,
"step": 720,
"tokens_trained": 0.353896072
},
{
"epoch": 0.20480816963335935,
"grad_norm": 65.55500793457031,
"loss": 4.7585,
"lr": 0.0009690909090909091,
"step": 722,
"tokens_trained": 0.354878992
},
{
"epoch": 0.20537550528331325,
"grad_norm": 36.11159896850586,
"loss": 4.6323,
"lr": 0.0009688111888111888,
"step": 724,
"tokens_trained": 0.355863728
},
{
"epoch": 0.20594284093326715,
"grad_norm": 30.566436767578125,
"loss": 4.53,
"lr": 0.0009685314685314685,
"step": 726,
"tokens_trained": 0.356845272
},
{
"epoch": 0.20651017658322104,
"grad_norm": 59.01853561401367,
"loss": 4.5283,
"lr": 0.0009682517482517483,
"step": 728,
"tokens_trained": 0.357826656
},
{
"epoch": 0.20707751223317494,
"grad_norm": 91.78115844726562,
"loss": 4.6149,
"lr": 0.000967972027972028,
"step": 730,
"tokens_trained": 0.358809896
},
{
"epoch": 0.20764484788312887,
"grad_norm": 67.97398376464844,
"loss": 4.617,
"lr": 0.0009676923076923078,
"step": 732,
"tokens_trained": 0.359788736
},
{
"epoch": 0.20821218353308277,
"grad_norm": 42.82001876831055,
"loss": 4.6134,
"lr": 0.0009674125874125874,
"step": 734,
"tokens_trained": 0.360771744
},
{
"epoch": 0.20877951918303667,
"grad_norm": 63.52122116088867,
"loss": 4.6995,
"lr": 0.0009671328671328672,
"step": 736,
"tokens_trained": 0.361757656
},
{
"epoch": 0.20934685483299056,
"grad_norm": 116.39544677734375,
"loss": 4.7153,
"lr": 0.0009668531468531469,
"step": 738,
"tokens_trained": 0.362744008
},
{
"epoch": 0.20991419048294446,
"grad_norm": 40.74269485473633,
"loss": 4.7978,
"lr": 0.0009665734265734266,
"step": 740,
"tokens_trained": 0.36372872
},
{
"epoch": 0.2104815261328984,
"grad_norm": 114.29917907714844,
"loss": 5.1683,
"lr": 0.0009662937062937063,
"step": 742,
"tokens_trained": 0.364710536
},
{
"epoch": 0.2110488617828523,
"grad_norm": 115.83326721191406,
"loss": 4.7642,
"lr": 0.000966013986013986,
"step": 744,
"tokens_trained": 0.3656912
},
{
"epoch": 0.21161619743280619,
"grad_norm": 21.708093643188477,
"loss": 4.8244,
"lr": 0.0009657342657342657,
"step": 746,
"tokens_trained": 0.36667388
},
{
"epoch": 0.21218353308276008,
"grad_norm": 182.01918029785156,
"loss": 5.6045,
"lr": 0.0009654545454545455,
"step": 748,
"tokens_trained": 0.3676634
},
{
"epoch": 0.21275086873271398,
"grad_norm": 47.119319915771484,
"loss": 4.7929,
"lr": 0.0009651748251748252,
"step": 750,
"tokens_trained": 0.368647288
},
{
"epoch": 0.21275086873271398,
"eval_loss": 1.2186306715011597,
"eval_runtime": 20.9362,
"step": 750,
"tokens_trained": 0.368647288
}
],
"logging_steps": 2,
"max_steps": 7650,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 750,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}