{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9739884393063583,
"eval_steps": 129,
"global_step": 258,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007707129094412331,
"grad_norm": 27.94101905822754,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.3083,
"step": 1
},
{
"epoch": 0.007707129094412331,
"eval_loss": 2.273209810256958,
"eval_runtime": 27.5737,
"eval_samples_per_second": 1.813,
"eval_steps_per_second": 0.471,
"step": 1
},
{
"epoch": 0.015414258188824663,
"grad_norm": 27.76470184326172,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.2483,
"step": 2
},
{
"epoch": 0.023121387283236993,
"grad_norm": 28.12770652770996,
"learning_rate": 6.000000000000001e-07,
"loss": 2.29,
"step": 3
},
{
"epoch": 0.030828516377649325,
"grad_norm": 29.105730056762695,
"learning_rate": 8.000000000000001e-07,
"loss": 2.3396,
"step": 4
},
{
"epoch": 0.038535645472061654,
"grad_norm": 28.90327262878418,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.2654,
"step": 5
},
{
"epoch": 0.046242774566473986,
"grad_norm": 30.192190170288086,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.2205,
"step": 6
},
{
"epoch": 0.05394990366088632,
"grad_norm": 27.94385528564453,
"learning_rate": 1.4000000000000001e-06,
"loss": 2.2161,
"step": 7
},
{
"epoch": 0.06165703275529865,
"grad_norm": 17.948062896728516,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.0695,
"step": 8
},
{
"epoch": 0.06936416184971098,
"grad_norm": 17.047054290771484,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.0742,
"step": 9
},
{
"epoch": 0.07707129094412331,
"grad_norm": 15.427838325500488,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.9962,
"step": 10
},
{
"epoch": 0.08477842003853564,
"grad_norm": 6.273721694946289,
"learning_rate": 2.2e-06,
"loss": 1.859,
"step": 11
},
{
"epoch": 0.09248554913294797,
"grad_norm": 5.851183891296387,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.8696,
"step": 12
},
{
"epoch": 0.1001926782273603,
"grad_norm": 4.672307968139648,
"learning_rate": 2.6e-06,
"loss": 1.8348,
"step": 13
},
{
"epoch": 0.10789980732177264,
"grad_norm": 2.6562161445617676,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.7719,
"step": 14
},
{
"epoch": 0.11560693641618497,
"grad_norm": 4.172055721282959,
"learning_rate": 3e-06,
"loss": 1.7984,
"step": 15
},
{
"epoch": 0.1233140655105973,
"grad_norm": 4.055249214172363,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.8025,
"step": 16
},
{
"epoch": 0.13102119460500963,
"grad_norm": 3.3719887733459473,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.739,
"step": 17
},
{
"epoch": 0.13872832369942195,
"grad_norm": 2.81038498878479,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.7478,
"step": 18
},
{
"epoch": 0.1464354527938343,
"grad_norm": 2.2064859867095947,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.7384,
"step": 19
},
{
"epoch": 0.15414258188824662,
"grad_norm": 1.940885305404663,
"learning_rate": 4.000000000000001e-06,
"loss": 1.7135,
"step": 20
},
{
"epoch": 0.16184971098265896,
"grad_norm": 1.9488122463226318,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.7108,
"step": 21
},
{
"epoch": 0.16955684007707128,
"grad_norm": 1.7049647569656372,
"learning_rate": 4.4e-06,
"loss": 1.6868,
"step": 22
},
{
"epoch": 0.17726396917148363,
"grad_norm": 1.5429236888885498,
"learning_rate": 4.600000000000001e-06,
"loss": 1.6947,
"step": 23
},
{
"epoch": 0.18497109826589594,
"grad_norm": 1.5304620265960693,
"learning_rate": 4.800000000000001e-06,
"loss": 1.6846,
"step": 24
},
{
"epoch": 0.1926782273603083,
"grad_norm": 1.5696897506713867,
"learning_rate": 5e-06,
"loss": 1.6844,
"step": 25
},
{
"epoch": 0.2003853564547206,
"grad_norm": 1.4362632036209106,
"learning_rate": 5.2e-06,
"loss": 1.6732,
"step": 26
},
{
"epoch": 0.20809248554913296,
"grad_norm": 1.3416928052902222,
"learning_rate": 5.400000000000001e-06,
"loss": 1.6424,
"step": 27
},
{
"epoch": 0.21579961464354527,
"grad_norm": 1.3142507076263428,
"learning_rate": 5.600000000000001e-06,
"loss": 1.677,
"step": 28
},
{
"epoch": 0.22350674373795762,
"grad_norm": 1.342984676361084,
"learning_rate": 5.8e-06,
"loss": 1.6762,
"step": 29
},
{
"epoch": 0.23121387283236994,
"grad_norm": 1.2972025871276855,
"learning_rate": 6e-06,
"loss": 1.6716,
"step": 30
},
{
"epoch": 0.23892100192678228,
"grad_norm": 1.2904590368270874,
"learning_rate": 6.200000000000001e-06,
"loss": 1.6234,
"step": 31
},
{
"epoch": 0.2466281310211946,
"grad_norm": 1.1942962408065796,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.6533,
"step": 32
},
{
"epoch": 0.2543352601156069,
"grad_norm": 1.124014973640442,
"learning_rate": 6.600000000000001e-06,
"loss": 1.6604,
"step": 33
},
{
"epoch": 0.26204238921001927,
"grad_norm": 1.2125813961029053,
"learning_rate": 6.800000000000001e-06,
"loss": 1.6335,
"step": 34
},
{
"epoch": 0.2697495183044316,
"grad_norm": 1.2104367017745972,
"learning_rate": 7e-06,
"loss": 1.6356,
"step": 35
},
{
"epoch": 0.2774566473988439,
"grad_norm": 1.1877591609954834,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.6349,
"step": 36
},
{
"epoch": 0.28516377649325625,
"grad_norm": 1.2402458190917969,
"learning_rate": 7.4e-06,
"loss": 1.6463,
"step": 37
},
{
"epoch": 0.2928709055876686,
"grad_norm": 1.1922346353530884,
"learning_rate": 7.600000000000001e-06,
"loss": 1.5998,
"step": 38
},
{
"epoch": 0.30057803468208094,
"grad_norm": 1.197464942932129,
"learning_rate": 7.800000000000002e-06,
"loss": 1.6265,
"step": 39
},
{
"epoch": 0.30828516377649323,
"grad_norm": 1.291739821434021,
"learning_rate": 8.000000000000001e-06,
"loss": 1.6077,
"step": 40
},
{
"epoch": 0.3159922928709056,
"grad_norm": 1.145663857460022,
"learning_rate": 8.2e-06,
"loss": 1.6152,
"step": 41
},
{
"epoch": 0.3236994219653179,
"grad_norm": 1.1572788953781128,
"learning_rate": 8.400000000000001e-06,
"loss": 1.6058,
"step": 42
},
{
"epoch": 0.33140655105973027,
"grad_norm": 1.3273899555206299,
"learning_rate": 8.6e-06,
"loss": 1.6223,
"step": 43
},
{
"epoch": 0.33911368015414256,
"grad_norm": 1.1160943508148193,
"learning_rate": 8.8e-06,
"loss": 1.5969,
"step": 44
},
{
"epoch": 0.3468208092485549,
"grad_norm": 1.3087902069091797,
"learning_rate": 9e-06,
"loss": 1.6464,
"step": 45
},
{
"epoch": 0.35452793834296725,
"grad_norm": 1.1589637994766235,
"learning_rate": 9.200000000000002e-06,
"loss": 1.5799,
"step": 46
},
{
"epoch": 0.3622350674373796,
"grad_norm": 1.159191370010376,
"learning_rate": 9.4e-06,
"loss": 1.6153,
"step": 47
},
{
"epoch": 0.3699421965317919,
"grad_norm": 1.206766128540039,
"learning_rate": 9.600000000000001e-06,
"loss": 1.5982,
"step": 48
},
{
"epoch": 0.37764932562620424,
"grad_norm": 1.1924678087234497,
"learning_rate": 9.800000000000001e-06,
"loss": 1.6054,
"step": 49
},
{
"epoch": 0.3853564547206166,
"grad_norm": 1.2029445171356201,
"learning_rate": 1e-05,
"loss": 1.6205,
"step": 50
},
{
"epoch": 0.3930635838150289,
"grad_norm": 1.1406632661819458,
"learning_rate": 1.02e-05,
"loss": 1.6158,
"step": 51
},
{
"epoch": 0.4007707129094412,
"grad_norm": 1.1437443494796753,
"learning_rate": 1.04e-05,
"loss": 1.6045,
"step": 52
},
{
"epoch": 0.40847784200385356,
"grad_norm": 1.127734661102295,
"learning_rate": 1.0600000000000002e-05,
"loss": 1.5968,
"step": 53
},
{
"epoch": 0.4161849710982659,
"grad_norm": 1.1851099729537964,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.6045,
"step": 54
},
{
"epoch": 0.4238921001926782,
"grad_norm": 1.1298301219940186,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.5908,
"step": 55
},
{
"epoch": 0.43159922928709055,
"grad_norm": 1.095090627670288,
"learning_rate": 1.1200000000000001e-05,
"loss": 1.5901,
"step": 56
},
{
"epoch": 0.4393063583815029,
"grad_norm": 1.1739152669906616,
"learning_rate": 1.14e-05,
"loss": 1.6275,
"step": 57
},
{
"epoch": 0.44701348747591524,
"grad_norm": 1.1687606573104858,
"learning_rate": 1.16e-05,
"loss": 1.5938,
"step": 58
},
{
"epoch": 0.45472061657032753,
"grad_norm": 1.1895908117294312,
"learning_rate": 1.18e-05,
"loss": 1.6016,
"step": 59
},
{
"epoch": 0.4624277456647399,
"grad_norm": 1.199129581451416,
"learning_rate": 1.2e-05,
"loss": 1.6317,
"step": 60
},
{
"epoch": 0.4701348747591522,
"grad_norm": 1.2785886526107788,
"learning_rate": 1.22e-05,
"loss": 1.5672,
"step": 61
},
{
"epoch": 0.47784200385356457,
"grad_norm": 1.2036688327789307,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.5636,
"step": 62
},
{
"epoch": 0.48554913294797686,
"grad_norm": 1.2586396932601929,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.5806,
"step": 63
},
{
"epoch": 0.4932562620423892,
"grad_norm": 1.1760581731796265,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.5724,
"step": 64
},
{
"epoch": 0.5009633911368016,
"grad_norm": 1.1171916723251343,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.5982,
"step": 65
},
{
"epoch": 0.5086705202312138,
"grad_norm": 1.234012484550476,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.597,
"step": 66
},
{
"epoch": 0.5163776493256262,
"grad_norm": 1.1812013387680054,
"learning_rate": 1.3400000000000002e-05,
"loss": 1.6064,
"step": 67
},
{
"epoch": 0.5240847784200385,
"grad_norm": 1.1740922927856445,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.5915,
"step": 68
},
{
"epoch": 0.5317919075144508,
"grad_norm": 1.277176856994629,
"learning_rate": 1.38e-05,
"loss": 1.5711,
"step": 69
},
{
"epoch": 0.5394990366088632,
"grad_norm": 1.1419289112091064,
"learning_rate": 1.4e-05,
"loss": 1.5934,
"step": 70
},
{
"epoch": 0.5472061657032755,
"grad_norm": 1.2002787590026855,
"learning_rate": 1.4200000000000001e-05,
"loss": 1.6021,
"step": 71
},
{
"epoch": 0.5549132947976878,
"grad_norm": 1.1920689344406128,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.5893,
"step": 72
},
{
"epoch": 0.5626204238921002,
"grad_norm": 1.2546113729476929,
"learning_rate": 1.46e-05,
"loss": 1.558,
"step": 73
},
{
"epoch": 0.5703275529865125,
"grad_norm": 1.2610082626342773,
"learning_rate": 1.48e-05,
"loss": 1.5842,
"step": 74
},
{
"epoch": 0.5780346820809249,
"grad_norm": 1.1725729703903198,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.5746,
"step": 75
},
{
"epoch": 0.5857418111753372,
"grad_norm": 1.1732540130615234,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.5804,
"step": 76
},
{
"epoch": 0.5934489402697495,
"grad_norm": 1.281145691871643,
"learning_rate": 1.54e-05,
"loss": 1.5884,
"step": 77
},
{
"epoch": 0.6011560693641619,
"grad_norm": 1.1668535470962524,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.6048,
"step": 78
},
{
"epoch": 0.6088631984585742,
"grad_norm": 1.2680914402008057,
"learning_rate": 1.58e-05,
"loss": 1.5893,
"step": 79
},
{
"epoch": 0.6165703275529865,
"grad_norm": 1.1659042835235596,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.5791,
"step": 80
},
{
"epoch": 0.6242774566473989,
"grad_norm": 1.2156031131744385,
"learning_rate": 1.62e-05,
"loss": 1.5916,
"step": 81
},
{
"epoch": 0.6319845857418112,
"grad_norm": 1.1217319965362549,
"learning_rate": 1.64e-05,
"loss": 1.5725,
"step": 82
},
{
"epoch": 0.6396917148362236,
"grad_norm": 1.307479977607727,
"learning_rate": 1.66e-05,
"loss": 1.5723,
"step": 83
},
{
"epoch": 0.6473988439306358,
"grad_norm": 1.1636345386505127,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.5906,
"step": 84
},
{
"epoch": 0.6551059730250481,
"grad_norm": 1.3260914087295532,
"learning_rate": 1.7e-05,
"loss": 1.5972,
"step": 85
},
{
"epoch": 0.6628131021194605,
"grad_norm": 1.14360511302948,
"learning_rate": 1.72e-05,
"loss": 1.5621,
"step": 86
},
{
"epoch": 0.6705202312138728,
"grad_norm": 1.4284840822219849,
"learning_rate": 1.7400000000000003e-05,
"loss": 1.571,
"step": 87
},
{
"epoch": 0.6782273603082851,
"grad_norm": 1.1513473987579346,
"learning_rate": 1.76e-05,
"loss": 1.6015,
"step": 88
},
{
"epoch": 0.6859344894026975,
"grad_norm": 1.3102519512176514,
"learning_rate": 1.7800000000000002e-05,
"loss": 1.576,
"step": 89
},
{
"epoch": 0.6936416184971098,
"grad_norm": 1.2329882383346558,
"learning_rate": 1.8e-05,
"loss": 1.5759,
"step": 90
},
{
"epoch": 0.7013487475915221,
"grad_norm": 1.1875412464141846,
"learning_rate": 1.8200000000000002e-05,
"loss": 1.555,
"step": 91
},
{
"epoch": 0.7090558766859345,
"grad_norm": 1.1887799501419067,
"learning_rate": 1.8400000000000003e-05,
"loss": 1.5926,
"step": 92
},
{
"epoch": 0.7167630057803468,
"grad_norm": 1.3002405166625977,
"learning_rate": 1.86e-05,
"loss": 1.5849,
"step": 93
},
{
"epoch": 0.7244701348747592,
"grad_norm": 1.194841980934143,
"learning_rate": 1.88e-05,
"loss": 1.5724,
"step": 94
},
{
"epoch": 0.7321772639691715,
"grad_norm": 1.315577745437622,
"learning_rate": 1.9e-05,
"loss": 1.5296,
"step": 95
},
{
"epoch": 0.7398843930635838,
"grad_norm": 1.239837884902954,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.5845,
"step": 96
},
{
"epoch": 0.7475915221579962,
"grad_norm": 1.3335014581680298,
"learning_rate": 1.94e-05,
"loss": 1.5843,
"step": 97
},
{
"epoch": 0.7552986512524085,
"grad_norm": 1.2278801202774048,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.588,
"step": 98
},
{
"epoch": 0.7630057803468208,
"grad_norm": 1.3168463706970215,
"learning_rate": 1.98e-05,
"loss": 1.5758,
"step": 99
},
{
"epoch": 0.7707129094412332,
"grad_norm": 1.3854187726974487,
"learning_rate": 2e-05,
"loss": 1.5489,
"step": 100
},
{
"epoch": 0.7784200385356455,
"grad_norm": 1.1883262395858765,
"learning_rate": 1.9998023297700656e-05,
"loss": 1.5707,
"step": 101
},
{
"epoch": 0.7861271676300579,
"grad_norm": 1.3683229684829712,
"learning_rate": 1.999209397227302e-05,
"loss": 1.5916,
"step": 102
},
{
"epoch": 0.7938342967244701,
"grad_norm": 1.2290884256362915,
"learning_rate": 1.998221436781933e-05,
"loss": 1.6028,
"step": 103
},
{
"epoch": 0.8015414258188824,
"grad_norm": 1.422328233718872,
"learning_rate": 1.996838839014696e-05,
"loss": 1.5753,
"step": 104
},
{
"epoch": 0.8092485549132948,
"grad_norm": 1.2604609727859497,
"learning_rate": 1.9950621505224276e-05,
"loss": 1.5537,
"step": 105
},
{
"epoch": 0.8169556840077071,
"grad_norm": 1.1725685596466064,
"learning_rate": 1.9928920737019735e-05,
"loss": 1.5745,
"step": 106
},
{
"epoch": 0.8246628131021194,
"grad_norm": 1.284792423248291,
"learning_rate": 1.9903294664725023e-05,
"loss": 1.5868,
"step": 107
},
{
"epoch": 0.8323699421965318,
"grad_norm": 1.1779919862747192,
"learning_rate": 1.9873753419363336e-05,
"loss": 1.5824,
"step": 108
},
{
"epoch": 0.8400770712909441,
"grad_norm": 1.1214483976364136,
"learning_rate": 1.9840308679784207e-05,
"loss": 1.5486,
"step": 109
},
{
"epoch": 0.8477842003853564,
"grad_norm": 1.2342500686645508,
"learning_rate": 1.9802973668046364e-05,
"loss": 1.5867,
"step": 110
},
{
"epoch": 0.8554913294797688,
"grad_norm": 1.1915156841278076,
"learning_rate": 1.976176314419051e-05,
"loss": 1.579,
"step": 111
},
{
"epoch": 0.8631984585741811,
"grad_norm": 1.215820550918579,
"learning_rate": 1.97166934004041e-05,
"loss": 1.6014,
"step": 112
},
{
"epoch": 0.8709055876685935,
"grad_norm": 1.2331247329711914,
"learning_rate": 1.9667782254580373e-05,
"loss": 1.5653,
"step": 113
},
{
"epoch": 0.8786127167630058,
"grad_norm": 1.1282511949539185,
"learning_rate": 1.9615049043274207e-05,
"loss": 1.5584,
"step": 114
},
{
"epoch": 0.8863198458574181,
"grad_norm": 1.2783879041671753,
"learning_rate": 1.955851461405761e-05,
"loss": 1.5956,
"step": 115
},
{
"epoch": 0.8940269749518305,
"grad_norm": 1.2387332916259766,
"learning_rate": 1.949820131727783e-05,
"loss": 1.5604,
"step": 116
},
{
"epoch": 0.9017341040462428,
"grad_norm": 1.3010255098342896,
"learning_rate": 1.9434132997221347e-05,
"loss": 1.5557,
"step": 117
},
{
"epoch": 0.9094412331406551,
"grad_norm": 1.3249139785766602,
"learning_rate": 1.936633498268728e-05,
"loss": 1.561,
"step": 118
},
{
"epoch": 0.9171483622350675,
"grad_norm": 1.2389734983444214,
"learning_rate": 1.9294834076973872e-05,
"loss": 1.5726,
"step": 119
},
{
"epoch": 0.9248554913294798,
"grad_norm": 1.258575677871704,
"learning_rate": 1.921965854728207e-05,
"loss": 1.5504,
"step": 120
},
{
"epoch": 0.9325626204238922,
"grad_norm": 1.2949562072753906,
"learning_rate": 1.9140838113540347e-05,
"loss": 1.576,
"step": 121
},
{
"epoch": 0.9402697495183044,
"grad_norm": 1.2721818685531616,
"learning_rate": 1.9058403936655235e-05,
"loss": 1.5697,
"step": 122
},
{
"epoch": 0.9479768786127167,
"grad_norm": 1.198541522026062,
"learning_rate": 1.8972388606192124e-05,
"loss": 1.5672,
"step": 123
},
{
"epoch": 0.9556840077071291,
"grad_norm": 1.2318319082260132,
"learning_rate": 1.888282612749132e-05,
"loss": 1.5511,
"step": 124
},
{
"epoch": 0.9633911368015414,
"grad_norm": 1.3235722780227661,
"learning_rate": 1.878975190822434e-05,
"loss": 1.5972,
"step": 125
},
{
"epoch": 0.9710982658959537,
"grad_norm": 1.2950528860092163,
"learning_rate": 1.869320274439583e-05,
"loss": 1.5696,
"step": 126
},
{
"epoch": 0.9788053949903661,
"grad_norm": 1.2997064590454102,
"learning_rate": 1.8593216805796612e-05,
"loss": 1.5751,
"step": 127
},
{
"epoch": 0.9865125240847784,
"grad_norm": 1.429874062538147,
"learning_rate": 1.8489833620913644e-05,
"loss": 1.5706,
"step": 128
},
{
"epoch": 0.9942196531791907,
"grad_norm": 1.2658491134643555,
"learning_rate": 1.8383094061302767e-05,
"loss": 1.5681,
"step": 129
},
{
"epoch": 0.9942196531791907,
"eval_loss": 1.590910792350769,
"eval_runtime": 27.4214,
"eval_samples_per_second": 1.823,
"eval_steps_per_second": 0.474,
"step": 129
},
{
"epoch": 1.001926782273603,
"grad_norm": 1.3823826313018799,
"learning_rate": 1.8273040325430575e-05,
"loss": 1.5453,
"step": 130
},
{
"epoch": 1.0096339113680155,
"grad_norm": 1.174560546875,
"learning_rate": 1.8159715921991612e-05,
"loss": 1.5485,
"step": 131
},
{
"epoch": 1.0028901734104045,
"grad_norm": 1.3361918926239014,
"learning_rate": 1.804316565270765e-05,
"loss": 1.511,
"step": 132
},
{
"epoch": 1.010597302504817,
"grad_norm": 1.5681639909744263,
"learning_rate": 1.7923435594615744e-05,
"loss": 1.3985,
"step": 133
},
{
"epoch": 1.0183044315992293,
"grad_norm": 1.3742421865463257,
"learning_rate": 1.7800573081852124e-05,
"loss": 1.4214,
"step": 134
},
{
"epoch": 1.0260115606936415,
"grad_norm": 1.375709056854248,
"learning_rate": 1.767462668693908e-05,
"loss": 1.3857,
"step": 135
},
{
"epoch": 1.033718689788054,
"grad_norm": 1.5805290937423706,
"learning_rate": 1.7545646201582304e-05,
"loss": 1.3975,
"step": 136
},
{
"epoch": 1.0414258188824663,
"grad_norm": 1.4905924797058105,
"learning_rate": 1.7413682616986185e-05,
"loss": 1.3663,
"step": 137
},
{
"epoch": 1.0491329479768785,
"grad_norm": 1.4072234630584717,
"learning_rate": 1.7278788103694944e-05,
"loss": 1.3969,
"step": 138
},
{
"epoch": 1.056840077071291,
"grad_norm": 1.3707107305526733,
"learning_rate": 1.71410159909675e-05,
"loss": 1.3932,
"step": 139
},
{
"epoch": 1.0645472061657033,
"grad_norm": 1.376590609550476,
"learning_rate": 1.7000420745694256e-05,
"loss": 1.3656,
"step": 140
},
{
"epoch": 1.0722543352601157,
"grad_norm": 1.2771958112716675,
"learning_rate": 1.6857057950864134e-05,
"loss": 1.3694,
"step": 141
},
{
"epoch": 1.079961464354528,
"grad_norm": 1.4592013359069824,
"learning_rate": 1.671098428359037e-05,
"loss": 1.3913,
"step": 142
},
{
"epoch": 1.0876685934489403,
"grad_norm": 1.3110437393188477,
"learning_rate": 1.6562257492703756e-05,
"loss": 1.3768,
"step": 143
},
{
"epoch": 1.0953757225433527,
"grad_norm": 1.344575047492981,
"learning_rate": 1.64109363759222e-05,
"loss": 1.3778,
"step": 144
},
{
"epoch": 1.1030828516377649,
"grad_norm": 1.277384638786316,
"learning_rate": 1.62570807566056e-05,
"loss": 1.3499,
"step": 145
},
{
"epoch": 1.1107899807321773,
"grad_norm": 1.2886083126068115,
"learning_rate": 1.6100751460105244e-05,
"loss": 1.3669,
"step": 146
},
{
"epoch": 1.1184971098265897,
"grad_norm": 1.3069369792938232,
"learning_rate": 1.5942010289717108e-05,
"loss": 1.3918,
"step": 147
},
{
"epoch": 1.1262042389210019,
"grad_norm": 1.2955520153045654,
"learning_rate": 1.5780920002248484e-05,
"loss": 1.3645,
"step": 148
},
{
"epoch": 1.1339113680154143,
"grad_norm": 1.3005629777908325,
"learning_rate": 1.561754428320771e-05,
"loss": 1.3522,
"step": 149
},
{
"epoch": 1.1416184971098267,
"grad_norm": 1.413831114768982,
"learning_rate": 1.5451947721626676e-05,
"loss": 1.4064,
"step": 150
},
{
"epoch": 1.1493256262042388,
"grad_norm": 1.2129186391830444,
"learning_rate": 1.5284195784526196e-05,
"loss": 1.3576,
"step": 151
},
{
"epoch": 1.1570327552986512,
"grad_norm": 1.3991036415100098,
"learning_rate": 1.5114354791034225e-05,
"loss": 1.3735,
"step": 152
},
{
"epoch": 1.1647398843930636,
"grad_norm": 1.2813304662704468,
"learning_rate": 1.494249188616723e-05,
"loss": 1.3689,
"step": 153
},
{
"epoch": 1.1724470134874758,
"grad_norm": 1.3265056610107422,
"learning_rate": 1.4768675014285063e-05,
"loss": 1.3714,
"step": 154
},
{
"epoch": 1.1801541425818882,
"grad_norm": 1.244061827659607,
"learning_rate": 1.4592972892229779e-05,
"loss": 1.371,
"step": 155
},
{
"epoch": 1.1878612716763006,
"grad_norm": 1.2477822303771973,
"learning_rate": 1.4415454982159121e-05,
"loss": 1.3705,
"step": 156
},
{
"epoch": 1.1955684007707128,
"grad_norm": 1.3200701475143433,
"learning_rate": 1.4236191464085286e-05,
"loss": 1.3657,
"step": 157
},
{
"epoch": 1.2032755298651252,
"grad_norm": 1.237042784690857,
"learning_rate": 1.405525320812994e-05,
"loss": 1.3602,
"step": 158
},
{
"epoch": 1.2109826589595376,
"grad_norm": 1.30637526512146,
"learning_rate": 1.3872711746506413e-05,
"loss": 1.3758,
"step": 159
},
{
"epoch": 1.21868978805395,
"grad_norm": 1.3186436891555786,
"learning_rate": 1.3688639245240078e-05,
"loss": 1.3907,
"step": 160
},
{
"epoch": 1.2263969171483622,
"grad_norm": 1.2071219682693481,
"learning_rate": 1.3503108475638244e-05,
"loss": 1.3698,
"step": 161
},
{
"epoch": 1.2341040462427746,
"grad_norm": 1.1885581016540527,
"learning_rate": 1.331619278552068e-05,
"loss": 1.3774,
"step": 162
},
{
"epoch": 1.241811175337187,
"grad_norm": 1.1943105459213257,
"learning_rate": 1.3127966070222273e-05,
"loss": 1.3538,
"step": 163
},
{
"epoch": 1.2495183044315992,
"grad_norm": 1.1982208490371704,
"learning_rate": 1.2938502743379212e-05,
"loss": 1.3797,
"step": 164
},
{
"epoch": 1.2572254335260116,
"grad_norm": 1.191636562347412,
"learning_rate": 1.2747877707510252e-05,
"loss": 1.371,
"step": 165
},
{
"epoch": 1.264932562620424,
"grad_norm": 1.2649930715560913,
"learning_rate": 1.2556166324404747e-05,
"loss": 1.3789,
"step": 166
},
{
"epoch": 1.2726396917148362,
"grad_norm": 1.206629753112793,
"learning_rate": 1.2363444385329052e-05,
"loss": 1.4232,
"step": 167
},
{
"epoch": 1.2803468208092486,
"grad_norm": 1.3122280836105347,
"learning_rate": 1.2169788081063181e-05,
"loss": 1.3871,
"step": 168
},
{
"epoch": 1.288053949903661,
"grad_norm": 1.1735293865203857,
"learning_rate": 1.1975273971779528e-05,
"loss": 1.3741,
"step": 169
},
{
"epoch": 1.2957610789980731,
"grad_norm": 1.3187175989151,
"learning_rate": 1.1779978956775507e-05,
"loss": 1.3644,
"step": 170
},
{
"epoch": 1.3034682080924855,
"grad_norm": 1.2720284461975098,
"learning_rate": 1.158398024407215e-05,
"loss": 1.3661,
"step": 171
},
{
"epoch": 1.311175337186898,
"grad_norm": 1.3094247579574585,
"learning_rate": 1.1387355319890685e-05,
"loss": 1.3617,
"step": 172
},
{
"epoch": 1.3188824662813103,
"grad_norm": 1.2710013389587402,
"learning_rate": 1.119018191801905e-05,
"loss": 1.373,
"step": 173
},
{
"epoch": 1.3265895953757225,
"grad_norm": 1.2845216989517212,
"learning_rate": 1.0992537989080618e-05,
"loss": 1.3712,
"step": 174
},
{
"epoch": 1.334296724470135,
"grad_norm": 1.277942419052124,
"learning_rate": 1.0794501669717146e-05,
"loss": 1.3676,
"step": 175
},
{
"epoch": 1.342003853564547,
"grad_norm": 1.190983533859253,
"learning_rate": 1.05961512516982e-05,
"loss": 1.3906,
"step": 176
},
{
"epoch": 1.3497109826589595,
"grad_norm": 1.3649415969848633,
"learning_rate": 1.039756515096926e-05,
"loss": 1.3883,
"step": 177
},
{
"epoch": 1.357418111753372,
"grad_norm": 1.2454570531845093,
"learning_rate": 1.0198821876650702e-05,
"loss": 1.3581,
"step": 178
},
{
"epoch": 1.3651252408477843,
"grad_norm": 1.2593861818313599,
"learning_rate": 1e-05,
"loss": 1.3726,
"step": 179
},
{
"epoch": 1.3728323699421965,
"grad_norm": 1.2473970651626587,
"learning_rate": 9.801178123349298e-06,
"loss": 1.4003,
"step": 180
},
{
"epoch": 1.3805394990366089,
"grad_norm": 1.210317611694336,
"learning_rate": 9.602434849030747e-06,
"loss": 1.3875,
"step": 181
},
{
"epoch": 1.388246628131021,
"grad_norm": 1.2112162113189697,
"learning_rate": 9.403848748301802e-06,
"loss": 1.3769,
"step": 182
},
{
"epoch": 1.3959537572254335,
"grad_norm": 1.1812710762023926,
"learning_rate": 9.205498330282857e-06,
"loss": 1.3521,
"step": 183
},
{
"epoch": 1.4036608863198459,
"grad_norm": 1.2227439880371094,
"learning_rate": 9.007462010919387e-06,
"loss": 1.3781,
"step": 184
},
{
"epoch": 1.4113680154142583,
"grad_norm": 1.2190202474594116,
"learning_rate": 8.809818081980954e-06,
"loss": 1.3529,
"step": 185
},
{
"epoch": 1.4190751445086704,
"grad_norm": 1.1302087306976318,
"learning_rate": 8.61264468010932e-06,
"loss": 1.3937,
"step": 186
},
{
"epoch": 1.4267822736030829,
"grad_norm": 1.1406745910644531,
"learning_rate": 8.416019755927851e-06,
"loss": 1.3715,
"step": 187
},
{
"epoch": 1.4344894026974953,
"grad_norm": 1.1421207189559937,
"learning_rate": 8.2200210432245e-06,
"loss": 1.3441,
"step": 188
},
{
"epoch": 1.4421965317919074,
"grad_norm": 1.1282238960266113,
"learning_rate": 8.024726028220474e-06,
"loss": 1.3484,
"step": 189
},
{
"epoch": 1.4499036608863198,
"grad_norm": 1.1182270050048828,
"learning_rate": 7.83021191893682e-06,
"loss": 1.3736,
"step": 190
},
{
"epoch": 1.4576107899807322,
"grad_norm": 1.1618040800094604,
"learning_rate": 7.636555614670953e-06,
"loss": 1.3481,
"step": 191
},
{
"epoch": 1.4653179190751446,
"grad_norm": 1.1137522459030151,
"learning_rate": 7.443833675595254e-06,
"loss": 1.3523,
"step": 192
},
{
"epoch": 1.4730250481695568,
"grad_norm": 1.2066893577575684,
"learning_rate": 7.252122292489747e-06,
"loss": 1.3616,
"step": 193
},
{
"epoch": 1.4807321772639692,
"grad_norm": 1.1276185512542725,
"learning_rate": 7.061497256620793e-06,
"loss": 1.353,
"step": 194
},
{
"epoch": 1.4884393063583814,
"grad_norm": 1.1631989479064941,
"learning_rate": 6.872033929777731e-06,
"loss": 1.3483,
"step": 195
},
{
"epoch": 1.4961464354527938,
"grad_norm": 1.1466474533081055,
"learning_rate": 6.683807214479323e-06,
"loss": 1.3678,
"step": 196
},
{
"epoch": 1.5038535645472062,
"grad_norm": 1.132791519165039,
"learning_rate": 6.496891524361757e-06,
"loss": 1.3576,
"step": 197
},
{
"epoch": 1.5115606936416186,
"grad_norm": 1.1244217157363892,
"learning_rate": 6.311360754759923e-06,
"loss": 1.3832,
"step": 198
},
{
"epoch": 1.5192678227360308,
"grad_norm": 1.1384022235870361,
"learning_rate": 6.127288253493591e-06,
"loss": 1.3578,
"step": 199
},
{
"epoch": 1.5269749518304432,
"grad_norm": 1.1305923461914062,
"learning_rate": 5.944746791870062e-06,
"loss": 1.368,
"step": 200
},
{
"epoch": 1.5346820809248554,
"grad_norm": 1.1514254808425903,
"learning_rate": 5.7638085359147235e-06,
"loss": 1.3533,
"step": 201
},
{
"epoch": 1.5423892100192678,
"grad_norm": 1.1174412965774536,
"learning_rate": 5.584545017840886e-06,
"loss": 1.3729,
"step": 202
},
{
"epoch": 1.5500963391136802,
"grad_norm": 1.0917550325393677,
"learning_rate": 5.40702710777022e-06,
"loss": 1.3539,
"step": 203
},
{
"epoch": 1.5578034682080926,
"grad_norm": 1.0902245044708252,
"learning_rate": 5.231324985714942e-06,
"loss": 1.3711,
"step": 204
},
{
"epoch": 1.565510597302505,
"grad_norm": 1.1163016557693481,
"learning_rate": 5.057508113832772e-06,
"loss": 1.3782,
"step": 205
},
{
"epoch": 1.5732177263969171,
"grad_norm": 1.1419026851654053,
"learning_rate": 4.885645208965779e-06,
"loss": 1.3825,
"step": 206
},
{
"epoch": 1.5809248554913293,
"grad_norm": 1.1543022394180298,
"learning_rate": 4.7158042154738094e-06,
"loss": 1.3551,
"step": 207
},
{
"epoch": 1.5886319845857417,
"grad_norm": 1.0950229167938232,
"learning_rate": 4.548052278373327e-06,
"loss": 1.3375,
"step": 208
},
{
"epoch": 1.5963391136801541,
"grad_norm": 1.1293272972106934,
"learning_rate": 4.382455716792291e-06,
"loss": 1.3498,
"step": 209
},
{
"epoch": 1.6040462427745665,
"grad_norm": 1.123294472694397,
"learning_rate": 4.219079997751515e-06,
"loss": 1.3519,
"step": 210
},
{
"epoch": 1.611753371868979,
"grad_norm": 1.114963412284851,
"learning_rate": 4.057989710282897e-06,
"loss": 1.3597,
"step": 211
},
{
"epoch": 1.6194605009633911,
"grad_norm": 1.0550687313079834,
"learning_rate": 3.899248539894756e-06,
"loss": 1.3594,
"step": 212
},
{
"epoch": 1.6271676300578035,
"grad_norm": 1.0849530696868896,
"learning_rate": 3.7429192433944016e-06,
"loss": 1.3585,
"step": 213
},
{
"epoch": 1.6348747591522157,
"grad_norm": 1.0992666482925415,
"learning_rate": 3.589063624077802e-06,
"loss": 1.3765,
"step": 214
},
{
"epoch": 1.642581888246628,
"grad_norm": 1.1028841733932495,
"learning_rate": 3.4377425072962467e-06,
"loss": 1.3551,
"step": 215
},
{
"epoch": 1.6502890173410405,
"grad_norm": 1.0943406820297241,
"learning_rate": 3.2890157164096315e-06,
"loss": 1.3426,
"step": 216
},
{
"epoch": 1.657996146435453,
"grad_norm": 1.0819505453109741,
"learning_rate": 3.1429420491358696e-06,
"loss": 1.37,
"step": 217
},
{
"epoch": 1.665703275529865,
"grad_norm": 1.0802693367004395,
"learning_rate": 2.999579254305748e-06,
"loss": 1.363,
"step": 218
},
{
"epoch": 1.6734104046242775,
"grad_norm": 1.0993719100952148,
"learning_rate": 2.8589840090325028e-06,
"loss": 1.373,
"step": 219
},
{
"epoch": 1.6811175337186897,
"grad_norm": 1.1456190347671509,
"learning_rate": 2.721211896305059e-06,
"loss": 1.337,
"step": 220
},
{
"epoch": 1.688824662813102,
"grad_norm": 1.1663914918899536,
"learning_rate": 2.5863173830138212e-06,
"loss": 1.3695,
"step": 221
},
{
"epoch": 1.6965317919075145,
"grad_norm": 1.10584557056427,
"learning_rate": 2.454353798417698e-06,
"loss": 1.336,
"step": 222
},
{
"epoch": 1.7042389210019269,
"grad_norm": 1.0759963989257812,
"learning_rate": 2.325373313060919e-06,
"loss": 1.3436,
"step": 223
},
{
"epoch": 1.7119460500963393,
"grad_norm": 1.0870240926742554,
"learning_rate": 2.19942691814788e-06,
"loss": 1.3458,
"step": 224
},
{
"epoch": 1.7196531791907514,
"grad_norm": 1.101758360862732,
"learning_rate": 2.0765644053842583e-06,
"loss": 1.3562,
"step": 225
},
{
"epoch": 1.7273603082851636,
"grad_norm": 1.0890095233917236,
"learning_rate": 1.9568343472923524e-06,
"loss": 1.3717,
"step": 226
},
{
"epoch": 1.735067437379576,
"grad_norm": 1.0680255889892578,
"learning_rate": 1.840284078008393e-06,
"loss": 1.3402,
"step": 227
},
{
"epoch": 1.7427745664739884,
"grad_norm": 1.0939226150512695,
"learning_rate": 1.7269596745694295e-06,
"loss": 1.3688,
"step": 228
},
{
"epoch": 1.7504816955684008,
"grad_norm": 1.0921400785446167,
"learning_rate": 1.6169059386972342e-06,
"loss": 1.3316,
"step": 229
},
{
"epoch": 1.7581888246628132,
"grad_norm": 1.048248529434204,
"learning_rate": 1.5101663790863597e-06,
"loss": 1.3347,
"step": 230
},
{
"epoch": 1.7658959537572254,
"grad_norm": 1.080112338066101,
"learning_rate": 1.4067831942033904e-06,
"loss": 1.3476,
"step": 231
},
{
"epoch": 1.7736030828516378,
"grad_norm": 1.0681742429733276,
"learning_rate": 1.3067972556041753e-06,
"loss": 1.3833,
"step": 232
},
{
"epoch": 1.78131021194605,
"grad_norm": 1.070648431777954,
"learning_rate": 1.2102480917756632e-06,
"loss": 1.3601,
"step": 233
},
{
"epoch": 1.7890173410404624,
"grad_norm": 1.0644824504852295,
"learning_rate": 1.1171738725086833e-06,
"loss": 1.3503,
"step": 234
},
{
"epoch": 1.7967244701348748,
"grad_norm": 1.0739105939865112,
"learning_rate": 1.0276113938078768e-06,
"loss": 1.3686,
"step": 235
},
{
"epoch": 1.8044315992292872,
"grad_norm": 1.0678924322128296,
"learning_rate": 9.415960633447674e-07,
"loss": 1.348,
"step": 236
},
{
"epoch": 1.8121387283236994,
"grad_norm": 1.0799516439437866,
"learning_rate": 8.591618864596541e-07,
"loss": 1.3571,
"step": 237
},
{
"epoch": 1.8198458574181118,
"grad_norm": 1.0634883642196655,
"learning_rate": 7.803414527179343e-07,
"loss": 1.3383,
"step": 238
},
{
"epoch": 1.827552986512524,
"grad_norm": 1.0771961212158203,
"learning_rate": 7.051659230261299e-07,
"loss": 1.363,
"step": 239
},
{
"epoch": 1.8352601156069364,
"grad_norm": 1.0874431133270264,
"learning_rate": 6.336650173127224e-07,
"loss": 1.3617,
"step": 240
},
{
"epoch": 1.8429672447013488,
"grad_norm": 1.0974795818328857,
"learning_rate": 5.658670027786561e-07,
"loss": 1.3685,
"step": 241
},
{
"epoch": 1.8506743737957612,
"grad_norm": 1.0479196310043335,
"learning_rate": 5.017986827221733e-07,
"loss": 1.3502,
"step": 242
},
{
"epoch": 1.8583815028901736,
"grad_norm": 1.0492548942565918,
"learning_rate": 4.4148538594239176e-07,
"loss": 1.335,
"step": 243
},
{
"epoch": 1.8660886319845857,
"grad_norm": 1.0853266716003418,
"learning_rate": 3.8495095672579584e-07,
"loss": 1.374,
"step": 244
},
{
"epoch": 1.873795761078998,
"grad_norm": 1.0355820655822754,
"learning_rate": 3.322177454196285e-07,
"loss": 1.3624,
"step": 245
},
{
"epoch": 1.8815028901734103,
"grad_norm": 1.0834838151931763,
"learning_rate": 2.8330659959589944e-07,
"loss": 1.3798,
"step": 246
},
{
"epoch": 1.8892100192678227,
"grad_norm": 1.0652782917022705,
"learning_rate": 2.3823685580949273e-07,
"loss": 1.3659,
"step": 247
},
{
"epoch": 1.8969171483622351,
"grad_norm": 1.0404101610183716,
"learning_rate": 1.9702633195363918e-07,
"loss": 1.3512,
"step": 248
},
{
"epoch": 1.9046242774566475,
"grad_norm": 1.0568671226501465,
"learning_rate": 1.5969132021579347e-07,
"loss": 1.3674,
"step": 249
},
{
"epoch": 1.9123314065510597,
"grad_norm": 1.0594927072525024,
"learning_rate": 1.262465806366664e-07,
"loss": 1.3562,
"step": 250
},
{
"epoch": 1.920038535645472,
"grad_norm": 1.0357680320739746,
"learning_rate": 9.670533527498139e-08,
"loss": 1.3609,
"step": 251
},
{
"epoch": 1.9277456647398843,
"grad_norm": 1.043979525566101,
"learning_rate": 7.10792629802659e-08,
"loss": 1.3224,
"step": 252
},
{
"epoch": 1.9354527938342967,
"grad_norm": 1.045696496963501,
"learning_rate": 4.937849477572587e-08,
"loss": 1.3764,
"step": 253
},
{
"epoch": 1.943159922928709,
"grad_norm": 1.0686546564102173,
"learning_rate": 3.161160985304168e-08,
"loss": 1.3563,
"step": 254
},
{
"epoch": 1.9508670520231215,
"grad_norm": 1.0238714218139648,
"learning_rate": 1.77856321806702e-08,
"loss": 1.3303,
"step": 255
},
{
"epoch": 1.9585741811175337,
"grad_norm": 1.0332682132720947,
"learning_rate": 7.906027726981568e-09,
"loss": 1.3412,
"step": 256
},
{
"epoch": 1.966281310211946,
"grad_norm": 1.079695224761963,
"learning_rate": 1.9767022993444353e-09,
"loss": 1.3787,
"step": 257
},
{
"epoch": 1.9739884393063583,
"grad_norm": 1.0400174856185913,
"learning_rate": 0.0,
"loss": 1.3201,
"step": 258
},
{
"epoch": 1.9739884393063583,
"eval_loss": 1.5724855661392212,
"eval_runtime": 27.4882,
"eval_samples_per_second": 1.819,
"eval_steps_per_second": 0.473,
"step": 258
}
],
"logging_steps": 1,
"max_steps": 258,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 43,
"total_flos": 3.045490266539557e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}